@router.get(
    "/observability-contract-matrix",
    response_model=dict[str, Any],
    summary="取得監控合約與降噪機會矩陣",
    description=(
        "讀取最新已提交的 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry "
        "只讀 observability matrix;此端點不修改 alert rules、不呼叫 silence API、"
        "不建立 Grafana dashboard、不改 SigNoz / Sentry 設定、不讀 Secret payload、"
        "不送 Telegram 測試通知、不觸發 monitoring deploy。"
    ),
)
async def get_observability_contract_matrix() -> dict[str, Any]:
    """Serve the newest committed observability contract matrix snapshot.

    The loader is run through ``asyncio.to_thread`` so its file access does not
    run on the event loop.

    Raises:
        HTTPException: 404 when the loader reports ``FileNotFoundError``;
            500 when the snapshot cannot be decoded or fails validation
            (``json.JSONDecodeError`` / ``ValueError``).
    """
    try:
        matrix = await asyncio.to_thread(load_latest_observability_contract_matrix)
    except FileNotFoundError as missing:
        # The loader's own message is passed through as the response detail.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(missing),
        ) from missing
    except (json.JSONDecodeError, ValueError) as invalid:
        # Log the precise reason, but answer with a fixed, generic message.
        logger.error("observability_contract_matrix_invalid", error=str(invalid))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="監控合約與降噪機會矩陣快照無效",
        ) from invalid
    return matrix
def load_latest_observability_contract_matrix(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Return the newest observability contract matrix snapshot after validating it.

    Files matching ``_SNAPSHOT_PATTERN`` are looked up in *evaluations_dir*
    (``_DEFAULT_EVALUATIONS_DIR`` when omitted); the path that compares greatest
    is treated as the newest snapshot.

    Raises:
        FileNotFoundError: no file in the directory matches the pattern.
        ValueError: the document is not a JSON object, or a validator rejects it.
            Malformed JSON surfaces as ``json.JSONDecodeError``, a ``ValueError``
            subclass.
    """
    search_root = evaluations_dir or _DEFAULT_EVALUATIONS_DIR
    newest = max(search_root.glob(_SNAPSHOT_PATTERN), default=None)
    if newest is None:
        raise FileNotFoundError(f"no observability contract matrix snapshots found in {search_root}")

    source = str(newest)
    with newest.open(encoding="utf-8") as stream:
        document = json.load(stream)
    if not isinstance(document, dict):
        raise ValueError(f"{newest}: expected JSON object")

    _require_schema(document, _SCHEMA_VERSION, source)
    # Each validator takes (payload, label); the order decides which problem is
    # reported first when a snapshot violates several rules.
    for validate in (
        _require_read_only_boundaries,
        _require_operation_boundaries,
        _require_rollup_consistency,
        _require_surface_evidence,
        _require_noise_opportunities,
        _require_operator_denials,
        _require_no_plaintext_secret_payload_keys,
    ):
        validate(document, source)
    return document


def _require_schema(payload: dict[str, Any], expected: str, label: str) -> None:
    """Raise ``ValueError`` unless ``payload["schema_version"]`` equals *expected*."""
    found = payload.get("schema_version")
    if found == expected:
        return
    raise ValueError(f"{label}: expected schema_version={expected}, got {found!r}")
# Key-name fragments that must never appear anywhere in a snapshot. Kept at
# module level so the recursive scan does not rebuild the collection per dict.
_FORBIDDEN_SECRET_KEY_FRAGMENTS = (
    "secret_value",
    "token_value",
    "authorization_header",
    "private_key",
    "webhook_secret",
    "runner_token",
    "signoz_token",
    "sentry_dsn",
)


def _object_section(payload: dict[str, Any], key: str, label: str) -> dict[str, Any]:
    """Return ``payload[key]`` as a dict; a missing or empty section becomes ``{}``.

    Raises:
        ValueError: the section is present but is not a JSON object.
    """
    section = payload.get(key) or {}
    if not isinstance(section, dict):
        raise ValueError(f"{label}: {key} must be a JSON object")
    return section


def _object_list(payload: dict[str, Any], key: str, label: str) -> list[dict[str, Any]]:
    """Return ``payload[key]`` as a list of dicts; a missing or empty value becomes ``[]``.

    Raises:
        ValueError: the value is not a list, or one of its rows is not a JSON object.
    """
    rows = payload.get(key) or []
    if not isinstance(rows, (list, tuple)) or not all(isinstance(row, dict) for row in rows):
        raise ValueError(f"{label}: {key} must be a list of JSON objects")
    return list(rows)


def _sorted_ids(values: Any) -> list[Any]:
    """Sort identifiers without failing on missing (``None``) or mixed-type entries.

    Plain ``sorted`` raises ``TypeError`` as soon as two ``None`` values, or a
    ``None`` and a string, have to be compared. Ordering by ``str`` keeps string
    IDs in their natural order and makes every other value sortable.
    """
    return sorted(values, key=str)


def _declared_ids(rollups: dict[str, Any], field: str, label: str) -> list[Any]:
    """Return the sorted ID list stored under ``rollups[field]`` (empty when absent)."""
    declared = rollups.get(field) or []
    if not isinstance(declared, (list, tuple)):
        raise ValueError(f"{label}: rollups.{field} must be a list")
    return _sorted_ids(declared)


def _require_read_only_boundaries(payload: dict[str, Any], label: str) -> None:
    """Require read-only mode and that no approval boundary has been opened.

    Raises:
        ValueError: ``program_status.read_only_mode`` is not exactly ``True``, or
            any entry of ``approval_boundaries`` is not exactly ``False``.
    """
    program_status = _object_section(payload, "program_status", label)
    if program_status.get("read_only_mode") is not True:
        raise ValueError(f"{label}: program_status.read_only_mode must be true")

    # NOTE(review): a snapshot without any ``approval_boundaries`` entries passes
    # this check vacuously, whereas ``operation_boundaries`` names its required
    # flags explicitly. Confirm whether a required key set should be enforced here.
    approval_boundaries = _object_section(payload, "approval_boundaries", label)
    allowed = _sorted_ids(key for key, value in approval_boundaries.items() if value is not False)
    if allowed:
        raise ValueError(f"{label}: approval boundaries must remain false: {allowed}")


def _require_operation_boundaries(payload: dict[str, Any], label: str) -> None:
    """Require that only the read-only API is allowed and every mutating flag is off.

    A blocked flag that is missing counts as a violation, because only an
    explicit ``False`` is accepted.

    Raises:
        ValueError: ``read_only_api_allowed`` is not ``True`` or a blocked flag
            is anything other than ``False``.
    """
    boundaries = _object_section(payload, "operation_boundaries", label)
    if boundaries.get("read_only_api_allowed") is not True:
        raise ValueError(f"{label}: read_only_api_allowed must be true")

    blocked_flags = {
        "prometheus_rule_write_allowed",
        "prometheus_reload_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_receiver_change_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "grafana_api_write_allowed",
        "signoz_query_mutation_allowed",
        "signoz_webhook_change_allowed",
        "sentry_webhook_change_allowed",
        "otel_collector_deploy_allowed",
        "event_exporter_restart_allowed",
        "secret_read_allowed",
        "secret_plaintext_allowed",
        "notification_send_allowed",
        "external_api_call_allowed",
        "live_prometheus_query_allowed",
        "workflow_trigger_allowed",
        "deploy_trigger_allowed",
        "reload_trigger_allowed",
        "runtime_execution_allowed",
    }
    allowed = sorted(flag for flag in blocked_flags if boundaries.get(flag) is not False)
    if allowed:
        raise ValueError(f"{label}: operation boundaries must remain false: {allowed}")


def _require_rollup_consistency(payload: dict[str, Any], label: str) -> None:
    """Require that every figure in ``rollups`` can be recomputed from the detail rows.

    Checked in order: surface total, the four ``by_*`` histograms, the two
    surface ID lists, the opportunity total, the approval-required opportunity
    IDs and the classification gap IDs. ID lists are compared order-insensitively.

    Raises:
        ValueError: a section has the wrong JSON shape or a rollup disagrees with
            the rows it summarises.
    """
    surfaces = _object_list(payload, "observability_surfaces", label)
    opportunities = _object_list(payload, "noise_reduction_opportunities", label)
    gaps = _object_list(payload, "classification_gaps", label)
    rollups = _object_section(payload, "rollups", label)

    if rollups.get("total_surfaces") != len(surfaces):
        raise ValueError(f"{label}: rollups.total_surfaces must match observability_surfaces")
    for field in ("kind", "status", "evidence_status", "noise_policy_status"):
        if rollups.get(f"by_{field}") != _count_by(surfaces, field):
            raise ValueError(f"{label}: rollups.by_{field} must match observability_surfaces")

    action_required = _sorted_ids(
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("status") == "action_required"
    )
    if _declared_ids(rollups, "surface_ids_requiring_action", label) != action_required:
        raise ValueError(f"{label}: rollups.surface_ids_requiring_action must match surfaces")

    proposal_only_surfaces = _sorted_ids(
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("noise_policy_status") == "proposal_only"
    )
    if _declared_ids(rollups, "surface_ids_with_proposal_only_noise_policy", label) != proposal_only_surfaces:
        raise ValueError(
            f"{label}: rollups.surface_ids_with_proposal_only_noise_policy must match surfaces"
        )

    approval_required = _sorted_ids(
        opportunity.get("opportunity_id")
        for opportunity in opportunities
        if opportunity.get("status") == "approval_required"
    )
    if rollups.get("noise_reduction_opportunities_total") != len(opportunities):
        raise ValueError(f"{label}: rollups.noise_reduction_opportunities_total must match opportunities")
    if _declared_ids(rollups, "approval_required_opportunity_ids", label) != approval_required:
        raise ValueError(f"{label}: rollups.approval_required_opportunity_ids must match opportunities")

    gap_ids = _sorted_ids(gap.get("gap_id") for gap in gaps)
    if _declared_ids(rollups, "classification_gap_ids", label) != gap_ids:
        raise ValueError(f"{label}: rollups.classification_gap_ids must match classification_gaps")


def _require_surface_evidence(payload: dict[str, Any], label: str) -> None:
    """Require a coverage contract, evidence refs and a next action on every surface.

    Raises:
        ValueError: lists the ``surface_id`` of each incomplete surface; a surface
            that also lacks its ID is reported as ``None`` rather than crashing
            the report.
    """
    surfaces = _object_list(payload, "observability_surfaces", label)
    missing = _sorted_ids(
        surface.get("surface_id")
        for surface in surfaces
        if not surface.get("coverage_contract")
        or not surface.get("evidence_refs")
        or not surface.get("next_action")
    )
    if missing:
        raise ValueError(f"{label}: observability_surfaces must include contract, evidence, next_action: {missing}")


def _require_noise_opportunities(payload: dict[str, Any], label: str) -> None:
    """Require that every opportunity is proposal-only and the mandatory ones exist.

    Raises:
        ValueError: an opportunity has ``proposal_only`` other than ``True``, or a
            required ``opportunity_id`` is absent (the missing IDs are listed).
    """
    opportunities = _object_list(payload, "noise_reduction_opportunities", label)
    non_proposal = _sorted_ids(
        opportunity.get("opportunity_id")
        for opportunity in opportunities
        if opportunity.get("proposal_only") is not True
    )
    if non_proposal:
        raise ValueError(f"{label}: noise opportunities must stay proposal_only: {non_proposal}")

    required_ids = {
        "prometheus_noise_rule_tuning",
        "alertmanager_grouping_inhibit_tuning",
        "success_notification_quiet_policy",
    }
    # Only string IDs can satisfy a requirement; skipping everything else also
    # keeps unhashable values out of the set.
    candidate_ids = (opportunity.get("opportunity_id") for opportunity in opportunities)
    present = {candidate for candidate in candidate_ids if isinstance(candidate, str)}
    absent = sorted(required_ids - present)
    if absent:
        raise ValueError(f"{label}: missing required noise-reduction opportunities: {absent}")


def _require_operator_denials(payload: dict[str, Any], label: str) -> None:
    """Require the operator contract to carry every mandatory denial and route policy.

    Raises:
        ValueError: ``must_not_interpret_as`` lacks a required denial (the missing
            ones are listed), or ``alertmanager_route_policy`` does not contain
            both ``OpenClaw`` and ``不接收 Alertmanager webhook``.
    """
    contract = _object_section(payload, "operator_contract", label)
    declared = contract.get("must_not_interpret_as") or []
    if not isinstance(declared, (list, tuple, set, frozenset)):
        raise ValueError(f"{label}: operator_contract.must_not_interpret_as must be a list")
    must_not_interpret_as = {entry for entry in declared if isinstance(entry, str)}
    required_denials = {
        "Prometheus alert rule 修改批准",
        "Alertmanager receiver / route 修改批准",
        "Alertmanager 指向 OpenClaw receiver 批准",
        "Silence 建立或維護窗口批准",
        "Grafana dashboard 寫入批准",
        "SigNoz / Sentry webhook 設定修改批准",
        "Secret 已讀取或可輸出",
        "Telegram 測試通知批准",
        "deploy / reload / workflow 觸發批准",
        "runtime execution 授權",
    }
    absent = sorted(required_denials - must_not_interpret_as)
    if absent:
        raise ValueError(
            f"{label}: operator_contract.must_not_interpret_as is missing required denials: {absent}"
        )

    route_policy = str(contract.get("alertmanager_route_policy") or "")
    if "OpenClaw" not in route_policy or "不接收 Alertmanager webhook" not in route_policy:
        raise ValueError(f"{label}: operator_contract.alertmanager_route_policy must block OpenClaw receiver use")


def _require_no_plaintext_secret_payload_keys(value: Any, label: str, path: str = "$") -> None:
    """Recursively reject any dict key whose name contains a forbidden fragment.

    Keys are compared case-insensitively. *path* tracks the current location
    (``$.a[0].b`` style) so the error pinpoints the offending key.

    Raises:
        ValueError: a key at any depth contains one of the forbidden fragments.
    """
    if isinstance(value, dict):
        for key, nested in value.items():
            lowered = str(key).lower()
            if any(fragment in lowered for fragment in _FORBIDDEN_SECRET_KEY_FRAGMENTS):
                raise ValueError(f"{label}: forbidden secret payload key at {path}.{key}")
            _require_no_plaintext_secret_payload_keys(nested, label, f"{path}.{key}")
    elif isinstance(value, list):
        for index, nested in enumerate(value):
            _require_no_plaintext_secret_payload_keys(nested, label, f"{path}[{index}]")


def _count_by(items: list[dict[str, Any]], key: str) -> dict[str, int]:
    """Count *items* by the value stored under *key*.

    Items lacking *key* are counted under ``None``, so they can never match a
    rollup decoded from JSON (whose keys are always strings).

    Raises:
        ValueError: a value cannot be used as a dictionary key (e.g. a list).
    """
    counts: dict[str, int] = {}
    for item in items:
        value = item.get(key)
        try:
            counts[value] = counts.get(value, 0) + 1
        except TypeError as exc:
            raise ValueError(f"cannot count by {key!r}: unhashable value {value!r}") from exc
    return counts
data["program_status"]["overall_completion_percent"] == 78 + assert data["program_status"]["overall_completion_percent"] == 83 assert data["program_status"]["read_only_mode"] is True - assert data["program_status"]["current_task_id"] == "P1-002" - assert data["program_status"]["next_task_id"] == "P1-003" + assert data["program_status"]["current_task_id"] == "P1-003" + assert data["program_status"]["next_task_id"] == "P1-004" assert data["rollups"]["total_items"] == len(data["backlog_items"]) == 23 assert data["rollups"]["by_priority"]["P1"] == 21 - assert data["rollups"]["by_status"]["done"] == 18 + assert data["rollups"]["by_status"]["done"] == 19 assert data["rollups"]["by_gate_status"]["read_only_allowed"] == 20 - assert data["progress_summary"]["overall_percent"] == 78 - assert data["progress_summary"]["done_items"] == 18 + assert data["progress_summary"]["overall_percent"] == 83 + assert data["progress_summary"]["done_items"] == 19 assert data["progress_summary"]["total_items"] == 23 assert data["item_approval_boundary_rollup"]["total_items"] == 23 assert data["item_approval_boundary_rollup"]["items_requiring_explicit_approval"] == [ @@ -51,6 +51,10 @@ def test_ai_agent_automation_backlog_snapshot_endpoint_returns_committed_snapsho assert p1_002["status"] == "done" assert p1_002["next_review"] == "P1-003" assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["evidence_refs"][0] + p1_003 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-003") + assert p1_003["status"] == "done" + assert p1_003["next_review"] == "P1-004" + assert "observability_contract_matrix_2026-06-05.json" in p1_003["evidence_refs"][0] p1_306 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-306") assert p1_306["approval_boundary"]["mode"] == "read_only_allowed" assert "runtime_execution" in p1_306["approval_boundary"]["blocked_actions"] diff --git a/apps/api/tests/test_ai_agent_automation_inventory_snapshot_api.py 
b/apps/api/tests/test_ai_agent_automation_inventory_snapshot_api.py index 9c5c3ee6..34508b0e 100644 --- a/apps/api/tests/test_ai_agent_automation_inventory_snapshot_api.py +++ b/apps/api/tests/test_ai_agent_automation_inventory_snapshot_api.py @@ -18,10 +18,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps assert data["schema_version"] == "ai_agent_automation_inventory_snapshot_v1" assert data["program_status"]["overall_completion_percent"] == 100 assert data["program_status"]["read_only_mode"] is True - assert data["program_status"]["current_task_id"] == "P1-002" - assert data["program_status"]["next_task_id"] == "P1-003" - assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 28 - assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 26 + assert data["program_status"]["current_task_id"] == "P1-003" + assert data["program_status"]["next_task_id"] == "P1-004" + assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 29 + assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 27 assert data["task_approval_boundary_rollup"]["tasks_requiring_explicit_approval"] == [ "P0-001", "P0-004", @@ -37,6 +37,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps assert p1_002["status"] == "done" assert p1_002["approval_boundary"]["mode"] == "read_only_allowed" assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["output"] + p1_003 = next(task for task in data["tasks"] if task["task_id"] == "P1-003") + assert p1_003["status"] == "done" + assert p1_003["approval_boundary"]["mode"] == "read_only_allowed" + assert "observability_contract_matrix_2026-06-05.json" in p1_003["output"] assert any(task["task_id"] == "P1-204" for task in data["tasks"]) assert any(task["task_id"] == "P1-205" for task in data["tasks"]) assert any(task["task_id"] == "P1-206" for task in data["tasks"]) @@ -67,3 +71,4 @@ 
def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps assert any(evidence["evidence_id"] == "backlog_progress_summary_ui" for evidence in data["evidence"]) assert any(evidence["evidence_id"] == "runtime_surface_inventory_api" for evidence in data["evidence"]) assert any(evidence["evidence_id"] == "gitea_workflow_runner_health_api" for evidence in data["evidence"]) + assert any(evidence["evidence_id"] == "observability_contract_matrix_api" for evidence in data["evidence"]) diff --git a/apps/api/tests/test_observability_contract_matrix.py b/apps/api/tests/test_observability_contract_matrix.py new file mode 100644 index 00000000..aacd1483 --- /dev/null +++ b/apps/api/tests/test_observability_contract_matrix.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import json + +import pytest + +from src.services.observability_contract_matrix import load_latest_observability_contract_matrix + + +def test_load_latest_observability_contract_matrix_reads_newest_file(tmp_path): + older = _snapshot(generated_at="2026-06-04T00:00:00+08:00", completion=40) + newer = _snapshot(generated_at="2026-06-05T00:00:00+08:00", completion=100) + (tmp_path / "observability_contract_matrix_2026-06-04.json").write_text( + json.dumps(older), + encoding="utf-8", + ) + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(newer), + encoding="utf-8", + ) + + loaded = load_latest_observability_contract_matrix(tmp_path) + + assert loaded["generated_at"] == "2026-06-05T00:00:00+08:00" + assert loaded["program_status"]["overall_completion_percent"] == 100 + assert loaded["rollups"]["total_surfaces"] == 2 + assert loaded["operation_boundaries"]["alertmanager_to_openclaw_allowed"] is False + + +def test_observability_contract_matrix_requires_read_only_mode(tmp_path): + snapshot = _snapshot() + snapshot["program_status"]["read_only_mode"] = False + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + 
json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="read_only_mode"): + load_latest_observability_contract_matrix(tmp_path) + + +def test_observability_contract_matrix_blocks_route_and_rule_mutations(tmp_path): + snapshot = _snapshot() + snapshot["operation_boundaries"]["prometheus_rule_write_allowed"] = True + snapshot["operation_boundaries"]["alertmanager_to_openclaw_allowed"] = True + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="operation boundaries"): + load_latest_observability_contract_matrix(tmp_path) + + +def test_observability_contract_matrix_requires_rollup_consistency(tmp_path): + snapshot = _snapshot() + snapshot["rollups"]["surface_ids_requiring_action"] = [] + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="surface_ids_requiring_action"): + load_latest_observability_contract_matrix(tmp_path) + + +def test_observability_contract_matrix_requires_noise_candidates_to_be_proposal_only(tmp_path): + snapshot = _snapshot() + snapshot["noise_reduction_opportunities"][0]["proposal_only"] = False + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="proposal_only"): + load_latest_observability_contract_matrix(tmp_path) + + +def test_observability_contract_matrix_requires_openclaw_receiver_denial(tmp_path): + snapshot = _snapshot() + snapshot["operator_contract"]["must_not_interpret_as"].remove( + "Alertmanager 指向 OpenClaw receiver 批准" + ) + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="operator_contract"): + load_latest_observability_contract_matrix(tmp_path) + + +def 
test_observability_contract_matrix_rejects_secret_payload_keys(tmp_path): + snapshot = _snapshot() + snapshot["latest_observations"][0]["webhook_secret"] = "redacted" + (tmp_path / "observability_contract_matrix_2026-06-05.json").write_text( + json.dumps(snapshot), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="forbidden secret payload key"): + load_latest_observability_contract_matrix(tmp_path) + + +def test_observability_contract_matrix_fails_when_missing(tmp_path): + with pytest.raises(FileNotFoundError): + load_latest_observability_contract_matrix(tmp_path) + + +def _snapshot( + *, + generated_at: str = "2026-06-05T00:00:00+08:00", + completion: int = 100, +) -> dict: + surfaces = [ + _surface( + "prometheus_alert_rule_catalog", + "Prometheus 告警規則合約", + "prometheus_rules", + "action_required", + "proposal_only", + ), + _surface( + "alertmanager_awoooi_route", + "Alertmanager → AWOOOI API 路由", + "alertmanager_route", + "verified", + "proposal_only", + ), + ] + opportunities = [ + _opportunity("prometheus_noise_rule_tuning", "approval_required"), + _opportunity("alertmanager_grouping_inhibit_tuning", "approval_required"), + _opportunity("success_notification_quiet_policy", "preserved"), + ] + gaps = [ + { + "gap_id": "prometheus_alert_rule_catalog_seed", + "display_name": "Alert rule catalog seed 未正式產品化", + "status": "action_required", + "severity": "high", + "summary": "只讀矩陣已建立,尚未產生 catalog seed。", + "evidence_refs": ["docs/adr/ADR-090-monitoring-blindspot-governance.md"], + "next_action": "先產 proposal,不改 rule。", + } + ] + return { + "schema_version": "observability_contract_matrix_v1", + "generated_at": generated_at, + "program_status": { + "overall_completion_percent": completion, + "current_priority": "P1", + "current_task_id": "P1-003", + "next_task_id": "P1-004", + "read_only_mode": True, + }, + "source_refs": ["docs/schemas/observability_contract_matrix_v1.schema.json"], + "rollups": { + "total_surfaces": len(surfaces), + "by_kind": 
_count_by(surfaces, "kind"), + "by_status": _count_by(surfaces, "status"), + "by_evidence_status": _count_by(surfaces, "evidence_status"), + "by_noise_policy_status": _count_by(surfaces, "noise_policy_status"), + "surface_ids_requiring_action": ["prometheus_alert_rule_catalog"], + "surface_ids_with_proposal_only_noise_policy": [ + "alertmanager_awoooi_route", + "prometheus_alert_rule_catalog", + ], + "noise_reduction_opportunities_total": len(opportunities), + "approval_required_opportunity_ids": [ + "alertmanager_grouping_inhibit_tuning", + "prometheus_noise_rule_tuning", + ], + "classification_gap_ids": ["prometheus_alert_rule_catalog_seed"], + "read_only_denials_total": 12, + }, + "observability_surfaces": surfaces, + "noise_reduction_opportunities": opportunities, + "classification_gaps": gaps, + "latest_observations": [ + { + "observation_id": "alertmanager_receiver_guard", + "status": "verified", + "summary": "Alertmanager 不得指向 OpenClaw。", + "evidence_refs": ["docs/HARD_RULES.md#alertmanager-routing"], + } + ], + "operator_contract": { + "display_mode": "read_only_observability_contract_matrix", + "must_not_interpret_as": [ + "Prometheus alert rule 修改批准", + "Alertmanager receiver / route 修改批准", + "Alertmanager 指向 OpenClaw receiver 批准", + "Silence 建立或維護窗口批准", + "Grafana dashboard 寫入批准", + "SigNoz / Sentry webhook 設定修改批准", + "Secret 已讀取或可輸出", + "Telegram 測試通知批准", + "deploy / reload / workflow 觸發批准", + "runtime execution 授權", + ], + "secret_display_policy": "只顯示 redacted metadata。", + "alertmanager_route_policy": "OpenClaw 不接收 Alertmanager webhook;receiver 維持 AWOOOI API。", + "noise_reduction_policy": "只產生 proposal。", + "notification_policy": "成功不洗版。", + }, + "operation_boundaries": { + "read_only_api_allowed": True, + "prometheus_rule_write_allowed": False, + "prometheus_reload_allowed": False, + "alertmanager_route_write_allowed": False, + "alertmanager_receiver_change_allowed": False, + "alertmanager_to_openclaw_allowed": False, + "silence_create_allowed": 
False, + "grafana_dashboard_write_allowed": False, + "grafana_api_write_allowed": False, + "signoz_query_mutation_allowed": False, + "signoz_webhook_change_allowed": False, + "sentry_webhook_change_allowed": False, + "otel_collector_deploy_allowed": False, + "event_exporter_restart_allowed": False, + "secret_read_allowed": False, + "secret_plaintext_allowed": False, + "notification_send_allowed": False, + "external_api_call_allowed": False, + "live_prometheus_query_allowed": False, + "workflow_trigger_allowed": False, + "deploy_trigger_allowed": False, + "reload_trigger_allowed": False, + "runtime_execution_allowed": False, + }, + "approval_boundaries": { + "prometheus_rule_change_authorized": False, + "prometheus_reload_authorized": False, + "alertmanager_route_change_authorized": False, + "alertmanager_receiver_change_authorized": False, + "alertmanager_to_openclaw_authorized": False, + "silence_authorized": False, + "grafana_write_authorized": False, + "signoz_write_authorized": False, + "sentry_write_authorized": False, + "otel_deploy_authorized": False, + "event_exporter_restart_authorized": False, + "notification_send_authorized": False, + "external_call_authorized": False, + "secret_plaintext_allowed": False, + "workflow_trigger_authorized": False, + "deploy_reload_authorized": False, + "runtime_execution_authorized": False, + }, + } + + +def _surface( + surface_id: str, + display_name: str, + kind: str, + status: str, + noise_policy_status: str, +) -> dict: + return { + "surface_id": surface_id, + "display_name": display_name, + "kind": kind, + "status": status, + "risk_level": "critical", + "evidence_status": "committed_manifest", + "noise_policy_status": noise_policy_status, + "coverage_contract": "只讀 committed evidence。", + "current_contract": "不得改 live 設定。", + "evidence_refs": ["docs/HARD_RULES.md"], + "next_action": "只產 proposal。", + } + + +def _opportunity(opportunity_id: str, status: str) -> dict: + return { + "opportunity_id": opportunity_id, + 
"display_name": opportunity_id, + "status": status, + "proposal_only": True, + "impact": "降噪提案。", + "evidence_refs": ["docs/HARD_RULES.md"], + "next_action": "人工批准前不執行。", + } + + +def _count_by(items: list[dict], key: str) -> dict[str, int]: + counts: dict[str, int] = {} + for item in items: + value = item[key] + counts[value] = counts.get(value, 0) + 1 + return counts diff --git a/apps/api/tests/test_observability_contract_matrix_api.py b/apps/api/tests/test_observability_contract_matrix_api.py new file mode 100644 index 00000000..147e798c --- /dev/null +++ b/apps/api/tests/test_observability_contract_matrix_api.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.api.v1.agents import router + + +def test_observability_contract_matrix_endpoint_returns_committed_snapshot(): + app = FastAPI() + app.include_router(router, prefix="/api/v1") + client = TestClient(app) + + response = client.get("/api/v1/agents/observability-contract-matrix") + + assert response.status_code == 200 + data = response.json() + assert data["schema_version"] == "observability_contract_matrix_v1" + assert data["program_status"]["overall_completion_percent"] == 100 + assert data["program_status"]["current_task_id"] == "P1-003" + assert data["program_status"]["next_task_id"] == "P1-004" + assert data["program_status"]["read_only_mode"] is True + assert data["rollups"]["total_surfaces"] == len(data["observability_surfaces"]) == 6 + assert data["rollups"]["noise_reduction_opportunities_total"] == 5 + assert data["rollups"]["surface_ids_requiring_action"] == [ + "grafana_dashboard_inventory", + "prometheus_alert_rule_catalog", + ] + assert data["rollups"]["approval_required_opportunity_ids"] == [ + "alertmanager_grouping_inhibit_tuning", + "prometheus_noise_rule_tuning", + ] + assert data["operation_boundaries"]["read_only_api_allowed"] is True + assert 
from __future__ import annotations

from fastapi import FastAPI
from fastapi.testclient import TestClient

from src.api.v1.agents import router


def test_observability_contract_matrix_endpoint_returns_committed_snapshot():
    """The endpoint returns the committed snapshot with every write path closed."""
    application = FastAPI()
    application.include_router(router, prefix="/api/v1")
    response = TestClient(application).get("/api/v1/agents/observability-contract-matrix")

    assert response.status_code == 200
    body = response.json()
    assert body["schema_version"] == "observability_contract_matrix_v1"

    program_status = body["program_status"]
    assert program_status["overall_completion_percent"] == 100
    assert program_status["current_task_id"] == "P1-003"
    assert program_status["next_task_id"] == "P1-004"
    assert program_status["read_only_mode"] is True

    rollups = body["rollups"]
    assert rollups["total_surfaces"] == len(body["observability_surfaces"]) == 6
    assert rollups["noise_reduction_opportunities_total"] == 5
    assert rollups["surface_ids_requiring_action"] == [
        "grafana_dashboard_inventory",
        "prometheus_alert_rule_catalog",
    ]
    assert rollups["approval_required_opportunity_ids"] == [
        "alertmanager_grouping_inhibit_tuning",
        "prometheus_noise_rule_tuning",
    ]

    operations = body["operation_boundaries"]
    assert operations["read_only_api_allowed"] is True
    for flag in (
        "prometheus_rule_write_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "notification_send_allowed",
        "deploy_trigger_allowed",
    ):
        assert operations[flag] is False

    approvals = body["approval_boundaries"]
    for flag in (
        "prometheus_rule_change_authorized",
        "alertmanager_to_openclaw_authorized",
        "deploy_reload_authorized",
    ):
        assert approvals[flag] is False

    alertmanager = next(
        row for row in body["observability_surfaces"] if row["surface_id"] == "alertmanager_awoooi_route"
    )
    assert alertmanager["status"] == "verified"
    assert alertmanager["noise_policy_status"] == "proposal_only"
    assert "OpenClaw 只做 AI 分析" in alertmanager["coverage_contract"]

    operator_contract = body["operator_contract"]
    assert "Alertmanager 指向 OpenClaw receiver 批准" in operator_contract["must_not_interpret_as"]
    assert "不接收 Alertmanager webhook" in operator_contract["alertmanager_route_policy"]

    for opportunity in body["noise_reduction_opportunities"]:
        assert opportunity["proposal_only"] is True
"coverage": "合約覆蓋", + "coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。", + "noise": "降噪路徑", + "noiseDetail": "只產生 proposal,不改 receiver 或 silence。", + "classification": "批准邊界", + "classificationDetail": "降噪候選先進批准包,不直接改規則、receiver 或分類器。", + "safeBoundary": "安全邊界", + "safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。" + }, + "labels": { + "evidence": "證據", + "noise": "降噪" + }, + "values": { + "prometheus_rules": "Prometheus 規則", + "alertmanager_route": "Alertmanager 路由", + "grafana_dashboard": "Grafana Dashboard", + "signoz_clickhouse": "SigNoz / ClickHouse", + "sentry_source_link": "Sentry Source Link", + "otel_event_exporter": "OTEL / Event Exporter", + "verified": "已驗證", + "action_required": "需處置", + "blocked": "阻擋", + "committed_manifest": "已提交 manifest", + "production_readback_recorded": "正式讀回已記錄", + "proposal_only": "僅提案", + "preserved": "已保留", + "needs_proposal": "待提案", + "approval_required": "需批准", + "ready_for_proposal": "提案可審", + "deferred": "延後", + "proposal_required": "需提案" + } } } }, diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index bc8d9042..d5b24a49 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -3259,6 +3259,54 @@ "not_applicable": "不適用", "actionable_only_no_success_noise": "需處置才通知,成功不洗版" } + }, + "observability": { + "title": "監控合約與降噪機會", + "source": "{generated} · {current} → {next}", + "noiseTitle": "降噪 proposal", + "classificationTitle": "分類缺口", + "contractTitle": "不可誤讀合約", + "metrics": { + "surfaces": "監控面", + "actions": "需處置", + "proposals": "降噪提案", + "classificationGaps": "分類缺口", + "approvalRequired": "需批准" + }, + "map": { + "coverage": "合約覆蓋", + "coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。", + "noise": "降噪路徑", + "noiseDetail": "只產生 proposal,不改 receiver 或 silence。", + "classification": "批准邊界", + "classificationDetail": "降噪候選先進批准包,不直接改規則、receiver 或分類器。", + "safeBoundary": "安全邊界", + 
"safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。" + }, + "labels": { + "evidence": "證據", + "noise": "降噪" + }, + "values": { + "prometheus_rules": "Prometheus 規則", + "alertmanager_route": "Alertmanager 路由", + "grafana_dashboard": "Grafana Dashboard", + "signoz_clickhouse": "SigNoz / ClickHouse", + "sentry_source_link": "Sentry Source Link", + "otel_event_exporter": "OTEL / Event Exporter", + "verified": "已驗證", + "action_required": "需處置", + "blocked": "阻擋", + "committed_manifest": "已提交 manifest", + "production_readback_recorded": "正式讀回已記錄", + "proposal_only": "僅提案", + "preserved": "已保留", + "needs_proposal": "待提案", + "approval_required": "需批准", + "ready_for_proposal": "提案可審", + "deferred": "延後", + "proposal_required": "需提案" + } } } }, diff --git a/apps/web/src/app/[locale]/governance/tabs/automation-inventory-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/automation-inventory-tab.tsx index e3f3563d..6ce23c2e 100644 --- a/apps/web/src/app/[locale]/governance/tabs/automation-inventory-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/automation-inventory-tab.tsx @@ -39,6 +39,7 @@ import { type BackupDrTargetInventorySnapshot, type BackupNotificationPolicySnapshot, type GiteaWorkflowRunnerHealthSnapshot, + type ObservabilityContractMatrixSnapshot, type OffsiteEscrowReadinessStatusSnapshot, type RuntimeSurfaceInventorySnapshot, } from '@/lib/api-client' @@ -303,6 +304,7 @@ export function AutomationInventoryTab() { const [offsiteEscrow, setOffsiteEscrow] = useState(null) const [runtimeSurface, setRuntimeSurface] = useState(null) const [giteaHealth, setGiteaHealth] = useState(null) + const [observabilityMatrix, setObservabilityMatrix] = useState(null) const [loading, setLoading] = useState(true) const [error, setError] = useState(false) @@ -317,6 +319,7 @@ export function AutomationInventoryTab() { apiClient.getOffsiteEscrowReadinessStatus(), apiClient.getRuntimeSurfaceInventory(), apiClient.getGiteaWorkflowRunnerHealth(), + 
apiClient.getObservabilityContractMatrix(), ] as const Promise.allSettled(requests) @@ -330,6 +333,7 @@ export function AutomationInventoryTab() { offsiteEscrowResult, runtimeSurfaceResult, giteaHealthResult, + observabilityMatrixResult, ] = results setSnapshot(inventoryResult.status === 'fulfilled' ? inventoryResult.value : null) @@ -340,6 +344,7 @@ export function AutomationInventoryTab() { setOffsiteEscrow(offsiteEscrowResult.status === 'fulfilled' ? offsiteEscrowResult.value : null) setRuntimeSurface(runtimeSurfaceResult.status === 'fulfilled' ? runtimeSurfaceResult.value : null) setGiteaHealth(giteaHealthResult.status === 'fulfilled' ? giteaHealthResult.value : null) + setObservabilityMatrix(observabilityMatrixResult.status === 'fulfilled' ? observabilityMatrixResult.value : null) setError([ inventoryResult, backlogResult, @@ -348,6 +353,7 @@ export function AutomationInventoryTab() { policyResult, offsiteEscrowResult, giteaHealthResult, + observabilityMatrixResult, ].some(result => result.status === 'rejected')) }) .catch(() => setError(true)) @@ -461,6 +467,28 @@ export function AutomationInventoryTab() { }) }, [giteaHealth]) + const visibleObservabilitySurfaces = useMemo(() => { + if (!observabilityMatrix) return [] + const priority = { action_required: 0, blocked: 1, verified: 2 } as Record + return [...observabilityMatrix.observability_surfaces].sort((a, b) => { + const left = priority[a.status] ?? 3 + const right = priority[b.status] ?? 3 + if (left !== right) return left - right + return a.surface_id.localeCompare(b.surface_id) + }) + }, [observabilityMatrix]) + + const visibleNoiseOpportunities = useMemo(() => { + if (!observabilityMatrix) return [] + const priority = { approval_required: 0, ready_for_proposal: 1, preserved: 2, deferred: 3 } as Record + return [...observabilityMatrix.noise_reduction_opportunities].sort((a, b) => { + const left = priority[a.status] ?? 3 + const right = priority[b.status] ?? 
3 + if (left !== right) return left - right + return a.opportunity_id.localeCompare(b.opportunity_id) + }) + }, [observabilityMatrix]) + if (loading) { return (
@@ -474,7 +502,7 @@ export function AutomationInventoryTab() { ) } - if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth) { + if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth || !observabilityMatrix) { return (
@@ -527,6 +555,10 @@ export function AutomationInventoryTab() { const runtimeBoundComponents = runtimeSurface?.rollups.source_components_with_runtime_binding ?? 0 const giteaRunnerActions = giteaHealth.rollups.workflow_ids_requiring_runner_attestation.length const giteaQuietPolicies = giteaHealth.rollups.notification_contracts_quiet_success_count + const observabilityActions = observabilityMatrix.rollups.surface_ids_requiring_action.length + const observabilityProposalCount = observabilityMatrix.rollups.noise_reduction_opportunities_total + const observabilityClassificationGaps = observabilityMatrix.rollups.classification_gap_ids.length + const observabilityApprovalRequired = observabilityMatrix.rollups.approval_required_opportunity_ids.length const backlogProgressPercent = backlog.progress_summary.overall_percent const explicitApprovalItemCount = backlog.item_approval_boundary_rollup.items_requiring_explicit_approval.length const taskBoundaryCount = snapshot.task_approval_boundary_rollup.total_tasks @@ -643,6 +675,14 @@ export function AutomationInventoryTab() { } } + const observabilityValueLabel = (value: string) => { + try { + return t(`observability.values.${value}` as never) + } catch { + return value + } + } + return (
@@ -1450,6 +1490,169 @@ export function AutomationInventoryTab() {
+ +
+
+
+ + + {t('observability.title')} + +
+
+ {t('observability.source', { + generated: formatDateTime(observabilityMatrix.generated_at), + current: observabilityMatrix.program_status.current_task_id, + next: observabilityMatrix.program_status.next_task_id, + })} +
+
+ +
+ } /> + 0 ? 'warn' : 'ok'} icon={} /> + } /> + } /> + 0 ? 'warn' : 'ok'} icon={} /> +
+ +
+ } + /> + } + /> + } + /> + } + /> +
+ +
+
+ {visibleObservabilitySurfaces.map(surface => ( +
+
+
+ + {surface.display_name} + + +
+
+ + + +
+
+ {surface.coverage_contract} +
+ {surface.current_contract ? ( +
+ {surface.current_contract} +
+ ) : null} +
+ {surface.next_action} +
+
+ +
+
+
+ ))} +
+ +
+
+ + {t('observability.noiseTitle')} + + {visibleNoiseOpportunities.slice(0, 5).map(opportunity => ( +
+
+ + {opportunity.display_name} + + +
+
+ {opportunity.impact} +
+
+ ))} +
+ +
+ + {t('observability.classificationTitle')} + + {observabilityMatrix.classification_gaps.map(gap => ( +
+
+ + {gap.display_name} + + +
+
+ {gap.summary} +
+
+ ))} +
+ +
+ + {t('observability.contractTitle')} + +
+ {observabilityMatrix.operator_contract.alertmanager_route_policy} +
+
+ {observabilityMatrix.operator_contract.noise_reduction_policy} +
+
+ {observabilityMatrix.operator_contract.notification_policy} +
+
+ {observabilityMatrix.operator_contract.must_not_interpret_as.slice(0, 6).map(item => ( + + ))} +
+
+
+
+
+
+
@@ -1564,6 +1767,10 @@ export function AutomationInventoryTab() { .automation-inventory-gitea-map-grid, .automation-inventory-gitea-grid, .automation-inventory-gitea-workflow-grid, + .automation-inventory-observability-kpi-grid, + .automation-inventory-observability-map-grid, + .automation-inventory-observability-grid, + .automation-inventory-observability-surface-grid, .automation-inventory-bottom-grid, .automation-inventory-task-grid { grid-template-columns: 1fr !important; diff --git a/apps/web/src/lib/api-client.ts b/apps/web/src/lib/api-client.ts index 942c106e..19f14574 100644 --- a/apps/web/src/lib/api-client.ts +++ b/apps/web/src/lib/api-client.ts @@ -272,6 +272,11 @@ export const apiClient = { return handleResponse(res) }, + async getObservabilityContractMatrix() { + const res = await fetch(`${API_BASE_URL}/agents/observability-contract-matrix`) + return handleResponse(res) + }, + async getBackupDrTargetInventory() { const res = await fetch(`${API_BASE_URL}/agents/backup-dr-target-inventory`) return handleResponse(res) @@ -946,6 +951,80 @@ export interface GiteaWorkflowRunnerHealthSnapshot { approval_boundaries: Record } +export interface ObservabilityContractMatrixSnapshot { + schema_version: 'observability_contract_matrix_v1' + generated_at: string + program_status: { + overall_completion_percent: number + current_priority: 'P0' | 'P1' | 'P2' | 'P3' + current_task_id: string + next_task_id: string + read_only_mode: true + } + source_refs: string[] + rollups: { + total_surfaces: number + by_kind: Record + by_status: Record + by_evidence_status: Record + by_noise_policy_status: Record + surface_ids_requiring_action: string[] + surface_ids_with_proposal_only_noise_policy: string[] + noise_reduction_opportunities_total: number + approval_required_opportunity_ids: string[] + classification_gap_ids: string[] + read_only_denials_total: number + } + observability_surfaces: Array<{ + surface_id: string + display_name: string + kind: string + status: 'verified' | 
'action_required' | 'blocked' + risk_level: 'low' | 'medium' | 'high' | 'critical' + evidence_status: string + noise_policy_status: string + coverage_contract: string + current_contract?: string + evidence_refs: string[] + next_action: string + }> + noise_reduction_opportunities: Array<{ + opportunity_id: string + display_name: string + status: string + proposal_only: true + impact: string + target_surface_ids?: string[] + evidence_refs: string[] + next_action: string + }> + classification_gaps: Array<{ + gap_id: string + display_name: string + status: string + severity: 'low' | 'medium' | 'high' | 'critical' + summary: string + evidence_refs: string[] + next_action: string + }> + latest_observations: Array<{ + observation_id: string + status: string + summary: string + evidence_refs: string[] + }> + operator_contract: { + display_mode: 'read_only_observability_contract_matrix' + must_not_interpret_as: string[] + secret_display_policy: string + alertmanager_route_policy: string + noise_reduction_policy: string + notification_policy: string + } + operation_boundaries: Record + approval_boundaries: Record +} + export interface BackupDrTargetInventorySnapshot { schema_version: 'backup_dr_target_inventory_v1' generated_at: string diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2182a4bc..d0ee6597 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,38 @@ +## 2026-06-05|P1-003 監控合約與降噪矩陣本地完成 + +**背景**:接續 P1-002 Gitea workflow / runner health contract 與決策摘要正式驗證,依工作清單推進 `P1-003`。本段只建立 Prometheus / Alertmanager / SigNoz / Grafana / Sentry / OTEL 的 committed observability matrix、只讀 API 與治理頁顯示,不修改 alert rules、不 reload Prometheus、不改 Alertmanager receiver / route、不建立 silence、不寫 Grafana、不改 SigNoz / Sentry webhook、不發 Telegram 測試通知。 + +**本輪完成**: +- 新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`。 +- 新增 `GET /api/v1/agents/observability-contract-matrix` 與 service guard,強制驗證 read-only mode、operation / 
approval boundaries、rollup consistency、降噪候選只能 proposal、Alertmanager 不得指向 OpenClaw、不得出現 secret payload key。 +- 治理頁 `/zh-TW/governance?tab=automation-inventory` 新增「監控合約與降噪機會」區塊,顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約。 +- 同步 automation backlog / inventory snapshot:current `P1-003`、next `P1-004`、backlog overall `83%`、P1 `90%`、done `19/23`、inventory tasks `29`。 + +**目前數字**: +- Observability surfaces:`6`。 +- 需處置 surfaces:`2`(`grafana_dashboard_inventory`、`prometheus_alert_rule_catalog`)。 +- 降噪候選:`5`。 +- 需人工批准的降噪候選:`2`(Prometheus rule tuning、Alertmanager grouping / inhibit tuning)。 +- Classification gaps:`3`。 +- Read-only denials:`12`。 + +**本地驗證**: +- JSON parse 通過:`observability_contract_matrix_2026-06-05.json`、`observability_contract_matrix_v1.schema.json`、automation backlog / inventory snapshots。 +- 目標測試通過:observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 共 `19 passed`。 +- Python py_compile 通過:`apps/api/src/services/observability_contract_matrix.py`、`apps/api/src/api/v1/agents.py`。 +- zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過。 +- source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過。 +- 本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`;backlog 回 overall `83%`、done `19/23`。 + +**邊界**: +- Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL / Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API / live query、workflow / deploy / reload / runtime execution 全部仍未批准。 +- 成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。 + +**下一步**: +1. Commit 並推 `gitea main`。 +2. 等 deploy marker 後執行 production API / Browser smoke。 +3. 
P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。 + ## 2026-06-05|AI Agent 自動化盤點決策摘要正式上線 **背景**:接續 P1-002 Gitea workflow / runner health contract 與文字換行正式驗證,治理頁 `/zh-TW/governance?tab=automation-inventory` 已能呈現完整資料,但首屏資訊密度偏高,使用者難以快速判讀「目前狀態、拖累因素、下一步」。本段只優化既有資訊呈現,不移除原明細、不新增 API、不改 workflow / runner / secret / runtime 行為。 diff --git a/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md b/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md index 58a8ba0a..229b8822 100644 --- a/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md +++ b/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md @@ -10,10 +10,10 @@ |---|---:|---|---| | Agent 市場治理 | 72% | 進行中 | `agent_market_governance_snapshot_v1`、API、UI 分頁、每週觀察流程 | | Nemotron 實際整合應用 | 30% | 完整回放前仍被關卡擋下 | `blocked_needs_evidence`,下一關是 `refresh_source_evidence_then_5_record_smoke_only` | -| 工具 / 服務 / 套件 AI 自動化 | 78% | P0 已完成,P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示;任務批准邊界、進度彙總、P1-001 執行面只讀矩陣與 P1-002 Gitea 工作流程 / runner 健康合約已完成,下一主線是 P1-003 監控合約與降噪機會 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI 已完成 | +| 工具 / 服務 / 套件 AI 自動化 | 83% | P0 已完成,P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示;任務批准邊界、進度彙總、P1-001 執行面只讀矩陣、P1-002 Gitea 工作流程 / runner 健康合約與 P1-003 監控合約 / 降噪矩陣已完成,下一主線是 P1-004 AI Router / provider route 盤點 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / 
UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI、observability_contract_matrix_v1 schema / snapshot / API / UI 已完成 | | 本工作清單與分析報告 | 100% | 已完成 | 本 MD 文件 | -AI Agent 自動化工作包目前完成度:**78%**。本工作清單文件本身完成度:**100%**。 +AI Agent 自動化工作包目前完成度:**83%**。本工作清單文件本身完成度:**100%**。 完成度計算模型: @@ -868,7 +868,7 @@ UI: |---|---|---:|---|---|---|---| | P1-001 | 完成 | 100 | OpenClaw | 盤點 API / Web / Worker / K8s runtime surface | `runtime_surface_inventory_v1` / `GET /api/v1/agents/runtime-surface-inventory` / 執行面只讀矩陣 | 只讀;不得查 Secret payload、不得 rollout / restart / scale / delete | | P1-002 | 完成 | 100 | Hermes | 盤點 Gitea 工作流程與 runner 健康合約 | `gitea_workflow_runner_health_v1` / `GET /api/v1/agents/gitea-workflow-runner-health` / Gitea 健康合約 UI | 只讀;不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不發通知 | -| P1-003 | 待辦 | 0 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | 可觀測性矩陣 | 只讀 | +| P1-003 | 完成 | 100 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | `observability_contract_matrix_v1` / `GET /api/v1/agents/observability-contract-matrix` / 監控合約與降噪 UI | 只讀;不修改 alert rules、不改 receiver/route、不建立 silence、不寫 Grafana、不發通知 | | P1-004 | 待辦 | 0 | OpenClaw | 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑 | 推理路由矩陣 | 不切 provider | | P1-005 | 待辦 | 0 | OpenClaw | 偵測服務健康缺口與過期端點 | 需處置清單 | 不重啟 | | P1-006 | 待辦 | 0 | Hermes | 在 UI 顯示 service health 證據卡 | 狀態卡 | 瀏覽器驗證 | @@ -1084,10 +1084,25 @@ UI: 下一步:P1-003 盤點監控合約與降噪機會。 ``` +本次同步: + +```text +進度:83%。 +目前優先級:P1。 +目前任務:P1-003 盤點監控合約與降噪機會。 +狀態變更:待辦 -> 完成。 +證據:observability_contract_matrix_v1 schema / snapshot;GET /api/v1/agents/observability-contract-matrix;治理頁監控合約與降噪機會區塊;automation backlog 83%;inventory tasks 29。 +目前數字:observability surfaces 6;需處置 2;降噪候選 5;需人工批准的降噪候選 2;classification gaps 3;backlog done 19/23;overall 83%;P1 90%;WS3 監控自動化 75%。 +驗證:JSON parse 通過;observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`;Python 
py_compile 通過;zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過;source-control-owner-response guard、security-mirror-progress guard、git diff --check 通過;本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`。 +正式驗證:尚未推版;待本地驗證完成後推 `gitea main` 並補 production API / browser smoke。 +阻擋:Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL/Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API/live query、workflow/deploy/reload/runtime execution 仍全部禁止。 +下一步:P1-004 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。 +``` + ## 13. 立即執行順序 -1. P1-003:盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會。 -2. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。 +1. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。 +2. P1-005:偵測服務健康缺口與過期端點。 3. P2 / P3 必須等 P1 服務、監控與 provider runtime surface 可見且關卡穩定後再做。 ## 14. 
目前風險 diff --git a/docs/evaluations/ai_agent_automation_backlog_2026-06-04.json b/docs/evaluations/ai_agent_automation_backlog_2026-06-04.json index fb78257b..5cc5f851 100644 --- a/docs/evaluations/ai_agent_automation_backlog_2026-06-04.json +++ b/docs/evaluations/ai_agent_automation_backlog_2026-06-04.json @@ -1,12 +1,12 @@ { "schema_version": "ai_agent_automation_backlog_v1", - "generated_at": "2026-06-05T10:56:16+08:00", + "generated_at": "2026-06-05T12:34:00+08:00", "source_inventory_snapshot_ref": "docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json", "program_status": { - "overall_completion_percent": 78, + "overall_completion_percent": 83, "current_priority": "P1", - "current_task_id": "P1-002", - "next_task_id": "P1-003", + "current_task_id": "P1-003", + "next_task_id": "P1-004", "read_only_mode": true }, "rollups": { @@ -17,8 +17,8 @@ "P3": 1 }, "by_status": { - "done": 18, - "planned": 5 + "done": 19, + "planned": 4 }, "by_gate_status": { "read_only_allowed": 20, @@ -318,26 +318,31 @@ { "item_id": "AUTO-P1-003", "priority": "P1", - "status": "planned", + "status": "done", "workstream_id": "WS3", "source_asset_id": "prometheus_alertmanager", "source_signal_kind": "health_gap", "title": "盤點監控合約與降噪機會", "owner_agent": "hermes", - "recommended_action": "建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse 的只讀 observability matrix。", + "recommended_action": "已建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀 observability matrix;降噪與分類缺口只產生 proposal,不修改 alert rules。", "action_class": "observe", "gate_status": "read_only_allowed", "risk_level": "high", "evidence_refs": [ - "k8s/monitoring/prometheus.yml", - "ops/monitoring/" + "docs/evaluations/observability_contract_matrix_2026-06-05.json", + "GET /api/v1/agents/observability-contract-matrix", + "k8s/monitoring/", + "ops/alertmanager/alertmanager.yml", + "ops/monitoring/", + "apps/api/src/constants/alert_types.py" ], "acceptance_criteria": [ - "不修改 
alert rules", - "降噪只產生 proposal", - "標出 stale、缺 evidence、過度通知與 classification gap" + "不修改 alert rules、不呼叫 silence API、不送測試通知", + "列出 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀合約", + "降噪只產生 proposal,標出 stale、缺 evidence、過度通知與 classification gap", + "API / UI 僅顯示 committed snapshot 與不可誤讀合約" ], - "next_review": "P1-003", + "next_review": "P1-004", "approval_boundary": { "mode": "read_only_allowed", "display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。", @@ -1170,16 +1175,16 @@ ] }, "progress_summary": { - "overall_percent": 78, - "done_items": 18, - "planned_items": 5, + "overall_percent": 83, + "done_items": 19, + "planned_items": 4, "total_items": 23, "formula": "round(done_items / total_items * 100),只有 status=done 計入完成;planned/in_progress/blocked/deferred/rejected 不計入。", "by_priority": [ { "priority": "P1", - "completion_percent": 86, - "done_items": 18, + "completion_percent": 90, + "done_items": 19, "total_items": 21 }, { @@ -1207,10 +1212,10 @@ { "workstream_id": "WS3", "display_name": "監控自動化", - "completion_percent": 50, - "done_items": 2, + "completion_percent": 75, + "done_items": 3, "total_items": 4, - "next_task_id": "P1-003" + "next_task_id": "P1-004" }, { "workstream_id": "WS4", @@ -1250,7 +1255,7 @@ "completion_percent": 100, "done_items": 2, "total_items": 2, - "next_task_id": "P1-003" + "next_task_id": "P1-004" } ] } diff --git a/docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json b/docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json index 51ec9ed3..47b50571 100644 --- a/docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json +++ b/docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json @@ -1,11 +1,11 @@ { "schema_version": "ai_agent_automation_inventory_snapshot_v1", - "generated_at": "2026-06-05T10:56:16+08:00", + "generated_at": "2026-06-05T12:34:00+08:00", "program_status": { 
"overall_completion_percent": 100, "current_priority": "P1", - "current_task_id": "P1-002", - "next_task_id": "P1-003", + "current_task_id": "P1-003", + "next_task_id": "P1-004", "read_only_mode": true }, "status_taxonomy": { @@ -287,44 +287,50 @@ "domain_id": "observability", "display_name": "Prometheus / Alertmanager", "asset_type": "observability_tool", - "status": "planned", + "status": "done", "gate_status": "read_only_allowed", "owner_agent": "hermes", "risk_level": "high", "evidence_refs": [ - "k8s/monitoring/prometheus.yml", + "docs/evaluations/observability_contract_matrix_2026-06-05.json", + "GET /api/v1/agents/observability-contract-matrix", + "k8s/monitoring/", + "ops/alertmanager/alertmanager.yml", "ops/monitoring/" ], - "next_action": "P1-003 盤點告警合約與降噪機會。" + "next_action": "P1-003 已完成只讀監控合約與降噪機會矩陣;P1-004 盤點 AI Router / provider route。" }, { "asset_id": "signoz_clickhouse", "domain_id": "observability", "display_name": "SigNoz / ClickHouse", "asset_type": "observability_tool", - "status": "planned", + "status": "done", "gate_status": "read_only_allowed", "owner_agent": "hermes", "risk_level": "medium", "evidence_refs": [ - "docs/LOGBOOK.md" + "docs/evaluations/observability_contract_matrix_2026-06-05.json", + "apps/api/src/services/signoz_client.py", + "ops/signoz" ], - "next_action": "P1-003 補 trace / metrics / log 可見性盤點。" + "next_action": "P1-003 已補 trace / metrics / log 可見性只讀合約;live readback 仍需後續人工批准範圍。" }, { "asset_id": "sentry", "domain_id": "tools", "display_name": "Sentry", "asset_type": "external_service", - "status": "planned", + "status": "done", "gate_status": "read_only_allowed", "owner_agent": "hermes", "risk_level": "medium", "evidence_refs": [ - "scripts/backup/backup-sentry.sh", - "apps/web/src/instrumentation.ts" + "docs/evaluations/observability_contract_matrix_2026-06-05.json", + "apps/web/src/instrumentation.ts", + "scripts/backup/backup-sentry.sh" ], - "next_action": "P1-003 盤點錯誤監控與備份狀態。" + "next_action": "P1-003 已補 Sentry error 
monitoring 合約;不讀 DSN secret、不送事件。" }, { "asset_id": "telegram_chain", @@ -472,9 +478,9 @@ { "workstream_id": "WS3", "display_name": "監控自動化", - "completion_percent": 50, + "completion_percent": 75, "status": "in_progress", - "next_task_id": "P1-003" + "next_task_id": "P1-004" }, { "workstream_id": "WS4", @@ -509,7 +515,7 @@ "display_name": "產品 UI", "completion_percent": 94, "status": "in_progress", - "next_task_id": "P1-003" + "next_task_id": "P1-004" } ], "tasks": [ @@ -834,6 +840,38 @@ ] } }, + { + "task_id": "P1-003", + "priority": "P1", + "status": "done", + "completion_percent": 100, + "owner_agent": "hermes", + "title": "盤點監控合約與降噪機會", + "output": "docs/evaluations/observability_contract_matrix_2026-06-05.json + GET /api/v1/agents/observability-contract-matrix", + "gate_status": "read_only_allowed", + "next_action": "完成 committed observability contract matrix;下一步 P1-004 盤點 AI Router / provider route。", + "approval_boundary": { + "mode": "read_only_allowed", + "display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。", + "allowed_actions": [ + "讀取 committed snapshot", + "整理只讀證據", + "顯示治理 UI" + ], + "blocked_actions": [ + "production_write", + "runtime_execution", + "destructive_operation", + "secret_plaintext_collection", + "unapproved_deploy", + "unapproved_external_call" + ], + "requires_operator_approval_for": [ + "任何非只讀操作", + "任何部署、排程、通知或外部呼叫變更" + ] + } + }, { "task_id": "P1-301", "priority": "P1", @@ -1729,6 +1767,13 @@ "kind": "api", "ref": "GET /api/v1/agents/gitea-workflow-runner-health", "result": "只讀 API 回傳 gitea_workflow_runner_health_v1;不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不送通知。" + }, + { + "evidence_id": "observability_contract_matrix_api", + "title": "監控合約與降噪機會只讀 API", + "source_ref": "GET /api/v1/agents/observability-contract-matrix", + "status": "done", + "summary": "只讀呈現 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 合約、降噪 proposal 與分類缺口;不修改 alert rules、不送通知、不讀 Secret。" } ], "approval_boundaries": { @@ -1739,10 
+1784,10 @@ "destructive_operation_allowed": false }, "task_approval_boundary_rollup": { - "total_tasks": 28, + "total_tasks": 29, "by_mode": { "ready_for_operator_review": 1, - "read_only_allowed": 26, + "read_only_allowed": 27, "approval_required": 1 }, "tasks_requiring_explicit_approval": [ @@ -1760,6 +1805,13 @@ "P0-008", "P1-001", "P1-002", + "P1-003", + "P1-301", + "P1-302", + "P1-303", + "P1-304", + "P1-305", + "P1-306", "P1-101", "P1-102", "P1-103", @@ -1771,13 +1823,7 @@ "P1-203", "P1-204", "P1-205", - "P1-206", - "P1-301", - "P1-302", - "P1-303", - "P1-304", - "P1-305", - "P1-306" + "P1-206" ] } } diff --git a/docs/evaluations/observability_contract_matrix_2026-06-05.json b/docs/evaluations/observability_contract_matrix_2026-06-05.json new file mode 100644 index 00000000..79c62b53 --- /dev/null +++ b/docs/evaluations/observability_contract_matrix_2026-06-05.json @@ -0,0 +1,391 @@ +{ + "schema_version": "observability_contract_matrix_v1", + "generated_at": "2026-06-05T12:24:00+08:00", + "program_status": { + "overall_completion_percent": 100, + "current_priority": "P1", + "current_task_id": "P1-003", + "next_task_id": "P1-004", + "read_only_mode": true + }, + "source_refs": [ + "docs/schemas/observability_contract_matrix_v1.schema.json", + "docs/HARD_RULES.md#alertmanager-routing", + "ops/alertmanager/alertmanager.yml", + "ops/monitoring/alerts.yml", + "ops/monitoring/alerts-unified.yml", + "k8s/monitoring/prometheus.yml", + "k8s/monitoring/alert-chain-monitor.yaml", + "ops/grafana/dashboards/ai-monitoring.json", + "ops/grafana/dashboards/infra-monitoring.json", + "ops/signoz/alerting/rules.yaml", + "ops/signoz/alerting/log-rules.md", + "ops/signoz/otel-collector-config-phase-o.yaml", + "k8s/observability/otel-collector-daemonset.yaml", + "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md", + "docs/adr/ADR-053-observability-signoz-unified-architecture.md", + "docs/adr/ADR-090-monitoring-blindspot-governance.md", + "docs/LOGBOOK.md" + ], + "rollups": { + 
"total_surfaces": 6, + "by_kind": { + "prometheus_rules": 1, + "alertmanager_route": 1, + "signoz_clickhouse": 1, + "grafana_dashboard": 1, + "sentry_source_link": 1, + "otel_event_exporter": 1 + }, + "by_status": { + "action_required": 2, + "verified": 4 + }, + "by_evidence_status": { + "committed_manifest": 4, + "production_readback_recorded": 2 + }, + "by_noise_policy_status": { + "proposal_only": 2, + "preserved": 3, + "needs_proposal": 1 + }, + "surface_ids_requiring_action": [ + "grafana_dashboard_inventory", + "prometheus_alert_rule_catalog" + ], + "surface_ids_with_proposal_only_noise_policy": [ + "alertmanager_awoooi_route", + "prometheus_alert_rule_catalog" + ], + "noise_reduction_opportunities_total": 5, + "approval_required_opportunity_ids": [ + "alertmanager_grouping_inhibit_tuning", + "prometheus_noise_rule_tuning" + ], + "classification_gap_ids": [ + "grafana_dashboard_owner_status", + "prometheus_alert_rule_catalog_seed", + "signoz_provider_native_real_alert_gap" + ], + "read_only_denials_total": 12, + "surfaces_requiring_action": [ + "grafana_dashboard_inventory", + "prometheus_alert_rule_catalog" + ], + "proposal_only_count": 5 + }, + "observability_surfaces": [ + { + "surface_id": "prometheus_alert_rule_catalog", + "display_name": "Prometheus 告警規則合約", + "kind": "prometheus_rules", + "status": "action_required", + "risk_level": "critical", + "evidence_status": "committed_manifest", + "noise_policy_status": "proposal_only", + "coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則;本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。", + "current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert;LOGBOOK 曾記錄 production Prometheus rule count 142,需以正式 smoke 讀回確認。", + "evidence_refs": [ + "ops/monitoring/alerts-unified.yml", + "ops/monitoring/alerts.yml", + "k8s/monitoring/alert-chain-monitor.yaml", + "docs/LOGBOOK.md" + ], + "next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal;任何 rule 
調整放到 P2-003 人工批准。" + }, + { + "surface_id": "alertmanager_awoooi_route", + "display_name": "Alertmanager → AWOOOI API 路由", + "kind": "alertmanager_route", + "status": "verified", + "risk_level": "critical", + "evidence_status": "committed_manifest", + "noise_policy_status": "proposal_only", + "coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API;OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。", + "current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑,telegram-direct 僅限 alert-chain/API health 緊急旁路;group_by/team/alertname/severity 已存在。", + "evidence_refs": [ + "docs/HARD_RULES.md#alertmanager-routing", + "ops/alertmanager/alertmanager.yml" + ], + "next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal;不得直接改 receiver、route 或 silence。" + }, + { + "surface_id": "signoz_clickhouse_ingestion", + "display_name": "SigNoz / ClickHouse / Provider Webhook", + "kind": "signoz_clickhouse", + "status": "verified", + "risk_level": "high", + "evidence_status": "production_readback_recorded", + "noise_policy_status": "preserved", + "coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示;heartbeat 不是 provider-native 真實告警。", + "current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules;LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。", + "evidence_refs": [ + "ops/signoz/alerting/rules.yaml", + "ops/signoz/alerting/log-rules.md", + "ops/signoz/otel-collector-config-phase-o.yaml", + "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md", + "docs/adr/ADR-053-observability-signoz-unified-architecture.md", + "docs/LOGBOOK.md" + ], + "next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。" + }, + { + "surface_id": "grafana_dashboard_inventory", + "display_name": "Grafana Dashboard / Alert Chain 視覺化", + "kind": "grafana_dashboard", + "status": "action_required", + "risk_level": 
"medium", + "evidence_status": "committed_manifest", + "noise_policy_status": "needs_proposal", + "coverage_contract": "目前只確認 committed dashboard JSON;本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。", + "current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間;infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。", + "evidence_refs": [ + "ops/grafana/dashboards/ai-monitoring.json", + "ops/grafana/dashboards/infra-monitoring.json" + ], + "next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback;寫入或 import 需另案批准。" + }, + { + "surface_id": "sentry_source_link_canary", + "display_name": "Sentry Webhook / Source Link Canary", + "kind": "sentry_source_link", + "status": "verified", + "risk_level": "high", + "evidence_status": "production_readback_recorded", + "noise_policy_status": "preserved", + "coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。", + "current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。", + "evidence_refs": [ + "docs/adr/ADR-022-sentry-integration-architecture.md", + "docs/LOGBOOK.md" + ], + "next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。" + }, + { + "surface_id": "otel_event_exporter_bridge", + "display_name": "OTEL Collector / Event Exporter", + "kind": "otel_event_exporter", + "status": "verified", + "risk_level": "medium", + "evidence_status": "committed_manifest", + "noise_policy_status": "preserved", + "coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。", + "current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline;LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。", + "evidence_refs": [ + 
"k8s/observability/otel-collector-daemonset.yaml", + "ops/signoz/otel-collector-config-phase-o.yaml", + "docs/LOGBOOK.md" + ], + "next_action": "把 collector/exporter health 放入 observability readiness;任何 deploy / restart 仍需獨立批准。" + } + ], + "noise_reduction_opportunities": [ + { + "opportunity_id": "prometheus_noise_rule_tuning", + "display_name": "Prometheus 告警噪音調整提案", + "status": "approval_required", + "proposal_only": true, + "impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。", + "target_surface_ids": [ + "prometheus_alert_rule_catalog" + ], + "evidence_refs": [ + "ops/monitoring/alerts-unified.yml", + "docs/adr/ADR-090-monitoring-blindspot-governance.md" + ], + "next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。" + }, + { + "opportunity_id": "alertmanager_grouping_inhibit_tuning", + "display_name": "Alertmanager grouping / inhibit 降噪提案", + "status": "approval_required", + "proposal_only": true, + "impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。", + "target_surface_ids": [ + "alertmanager_awoooi_route" + ], + "evidence_refs": [ + "ops/alertmanager/alertmanager.yml", + "docs/HARD_RULES.md#alertmanager-routing" + ], + "next_action": "產生 diff proposal 與 rollback plan;未批准前不得 reload Alertmanager。" + }, + { + "opportunity_id": "provider_heartbeat_real_alert_split", + "display_name": "Provider heartbeat 與真實告警分流", + "status": "ready_for_proposal", + "proposal_only": true, + "impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert,降低假綠與錯誤升級。", + "target_surface_ids": [ + "signoz_clickhouse_ingestion", + "sentry_source_link_canary" + ], + "evidence_refs": [ + "docs/LOGBOOK.md", + "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md" + ], + "next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。" + }, + { + "opportunity_id": "grafana_dashboard_owner_freshness", + "display_name": "Grafana dashboard owner / freshness 標籤", + "status":
"ready_for_proposal", + "proposal_only": true, + "impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。", + "target_surface_ids": [ + "grafana_dashboard_inventory" + ], + "evidence_refs": [ + "ops/grafana/dashboards/ai-monitoring.json", + "ops/grafana/dashboards/infra-monitoring.json" + ], + "next_action": "只讀補 owner/freshness matrix;不寫 Grafana。" + }, + { + "opportunity_id": "success_notification_quiet_policy", + "display_name": "成功不洗版 / 失敗才升級", + "status": "preserved", + "proposal_only": true, + "impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK,失敗才通知。", + "target_surface_ids": [ + "otel_event_exporter_bridge", + "signoz_clickhouse_ingestion" + ], + "evidence_refs": [ + "docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md", + "docs/LOGBOOK.md" + ], + "next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。" + } + ], + "classification_gaps": [ + { + "gap_id": "prometheus_alert_rule_catalog_seed", + "display_name": "Alert rule catalog seed 未正式產品化", + "status": "action_required", + "severity": "high", + "summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id;目前 P1-003 只完成只讀矩陣。", + "evidence_refs": [ + "docs/adr/ADR-090-monitoring-blindspot-governance.md", + "ops/monitoring/alerts-unified.yml" + ], + "next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。" + }, + { + "gap_id": "signoz_provider_native_real_alert_gap", + "display_name": "SigNoz provider-native 真實告警證據缺口", + "status": "action_required", + "severity": "medium", + "summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。", + "evidence_refs": [ + "docs/LOGBOOK.md", + "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md" + ], + "next_action": "只讀列出 provider-native alert coverage;需要 side effect 的 signed canary 另案批准。" + }, + { + "gap_id": "grafana_dashboard_owner_status", + "display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁", + "status": "action_required", + "severity": 
"medium", + "summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。", + "evidence_refs": [ + "ops/grafana/dashboards/ai-monitoring.json", + "ops/grafana/dashboards/infra-monitoring.json" + ], + "next_action": "下一輪只讀補 dashboard readiness,不呼叫 Grafana write API。" + } + ], + "latest_observations": [ + { + "observation_id": "alertmanager_receiver_guard", + "status": "verified", + "summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界;OpenClaw 不得成為 receiver。", + "evidence_refs": [ + "docs/HARD_RULES.md#alertmanager-routing", + "ops/alertmanager/alertmanager.yml" + ] + }, + { + "observation_id": "prometheus_rule_source_split", + "status": "action_required", + "summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring;P1-003 建立 matrix,尚未調整規則或 reload。", + "evidence_refs": [ + "ops/monitoring/alerts-unified.yml", + "k8s/monitoring/alert-chain-monitor.yaml" + ] + }, + { + "observation_id": "post_deploy_observability_smoke_history", + "status": "verified", + "summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。", + "evidence_refs": [ + "docs/LOGBOOK.md" + ] + } + ], + "operator_contract": { + "display_mode": "read_only_observability_contract_matrix", + "must_not_interpret_as": [ + "Prometheus alert rule 修改批准", + "Alertmanager receiver / route 修改批准", + "Alertmanager 指向 OpenClaw receiver 批准", + "Silence 建立或維護窗口批准", + "Grafana dashboard 寫入批准", + "SigNoz / Sentry webhook 設定修改批准", + "Secret 已讀取或可輸出", + "Telegram 測試通知批准", + "deploy / reload / workflow 觸發批准", + "runtime execution 授權" + ], + "secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata;不得顯示 token、webhook secret 或 authorization header。", + "alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API;OpenClaw 不接收 Alertmanager webhook,只能在 API 持久化與分類後參與只讀分析。", + "noise_reduction_policy": "P1-003 僅產生 proposal;P2-003 或任何 
route/rule/silence 變更需人工批准。", + "notification_policy": "成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。" + }, + "operation_boundaries": { + "read_only_api_allowed": true, + "prometheus_rule_write_allowed": false, + "prometheus_reload_allowed": false, + "alertmanager_route_write_allowed": false, + "alertmanager_receiver_change_allowed": false, + "alertmanager_to_openclaw_allowed": false, + "silence_create_allowed": false, + "grafana_dashboard_write_allowed": false, + "grafana_api_write_allowed": false, + "signoz_query_mutation_allowed": false, + "signoz_webhook_change_allowed": false, + "sentry_webhook_change_allowed": false, + "otel_collector_deploy_allowed": false, + "event_exporter_restart_allowed": false, + "secret_read_allowed": false, + "secret_plaintext_allowed": false, + "notification_send_allowed": false, + "external_api_call_allowed": false, + "live_prometheus_query_allowed": false, + "workflow_trigger_allowed": false, + "deploy_trigger_allowed": false, + "reload_trigger_allowed": false, + "runtime_execution_allowed": false + }, + "approval_boundaries": { + "prometheus_rule_change_authorized": false, + "prometheus_reload_authorized": false, + "alertmanager_route_change_authorized": false, + "alertmanager_receiver_change_authorized": false, + "alertmanager_to_openclaw_authorized": false, + "silence_authorized": false, + "grafana_write_authorized": false, + "signoz_write_authorized": false, + "sentry_write_authorized": false, + "otel_deploy_authorized": false, + "event_exporter_restart_authorized": false, + "notification_send_authorized": false, + "external_call_authorized": false, + "secret_plaintext_allowed": false, + "workflow_trigger_authorized": false, + "deploy_reload_authorized": false, + "runtime_execution_authorized": false + } +} diff --git a/docs/schemas/observability_contract_matrix_v1.schema.json b/docs/schemas/observability_contract_matrix_v1.schema.json new file mode 100644 index 00000000..6223ae09 --- /dev/null +++ 
b/docs/schemas/observability_contract_matrix_v1.schema.json @@ -0,0 +1,159 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "urn:awoooi:observability-contract-matrix-v1", + "title": "AWOOOI 監控合約與降噪機會矩陣 v1", + "description": "以 repo 內 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry evidence 建立只讀 observability matrix;不修改 alert rules、不發通知、不打 silence API、不部署 exporter、不觸發 workflow。", + "type": "object", + "required": [ + "schema_version", + "generated_at", + "program_status", + "source_refs", + "rollups", + "observability_surfaces", + "noise_reduction_opportunities", + "classification_gaps", + "latest_observations", + "operator_contract", + "operation_boundaries", + "approval_boundaries" + ], + "properties": { + "schema_version": { + "type": "string", + "const": "observability_contract_matrix_v1" + }, + "generated_at": { + "type": "string", + "minLength": 1 + }, + "program_status": { + "type": "object", + "required": [ + "overall_completion_percent", + "current_priority", + "current_task_id", + "next_task_id", + "read_only_mode" + ], + "properties": { + "overall_completion_percent": { + "type": "integer", + "minimum": 0, + "maximum": 100 + }, + "current_priority": { + "type": "string", + "enum": [ + "P0", + "P1", + "P2", + "P3" + ] + }, + "current_task_id": { + "type": "string", + "minLength": 1 + }, + "next_task_id": { + "type": "string", + "minLength": 1 + }, + "read_only_mode": { + "type": "boolean", + "const": true + } + }, + "additionalProperties": false + }, + "source_refs": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "rollups": { + "type": "object", + "additionalProperties": true + }, + "observability_surfaces": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": [ + "surface_id", + "display_name", + "kind", + "status", + "risk_level", + "evidence_status", + "noise_policy_status", + "coverage_contract", + "evidence_refs", 
+ "next_action" + ], + "additionalProperties": true + } + }, + "noise_reduction_opportunities": { + "type": "array", + "items": { + "type": "object", + "required": [ + "opportunity_id", + "display_name", + "status", + "proposal_only", + "impact", + "evidence_refs", + "next_action" + ], + "additionalProperties": true + } + }, + "classification_gaps": { + "type": "array", + "items": { + "type": "object", + "required": [ + "gap_id", + "display_name", + "status", + "severity", + "summary", + "evidence_refs", + "next_action" + ], + "additionalProperties": true + } + }, + "latest_observations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + "operator_contract": { + "type": "object", + "additionalProperties": true + }, + "operation_boundaries": { + "type": "object", + "additionalProperties": { + "type": "boolean" + } + }, + "approval_boundaries": { + "type": "object", + "additionalProperties": { + "type": "boolean", + "const": false + } + } + }, + "additionalProperties": false +} diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 63f1105d..c3cdc73a 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -3516,3 +3516,21 @@ Phase 6 完成後 1. 
P1-003:盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會。 **裁決:** P1-002 只完成 read-only committed workflow / runner health contract。不得把 `ubuntu-latest` owner attestation 缺口、runner watchdog 草案、stale-job dry-run guard 或 notification contract 解讀成 workflow 修改、runner restart / stop、container stop、runner label change、runner registration、Secret payload collection、Telegram 測試通知、schedule enable、Gitea write、deploy / migration trigger 或任何 runtime execution 授權。 + +### 2026-06-05 下午 (台北) — P1-003 監控合約與降噪矩陣本地完成 + +**觸發**:統帥批准繼續,要求依 `docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md` 的優先順序推進,並同步完成度、工作狀態與正式環境推版。 + +**已推進:** +- P1-003:新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`,以 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry / OTEL evidence 建立只讀監控合約矩陣。 +- P1-003:新增 `GET /api/v1/agents/observability-contract-matrix` 只讀 API 與 service guard,強制拒絕把 snapshot 誤讀成 alert rule 修改、Prometheus reload、Alertmanager receiver / route 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、Secret payload、Telegram 測試通知、deploy / reload / workflow 觸發或 runtime execution 授權。 +- P1-003:治理頁 `/zh-TW/governance?tab=automation-inventory` 新增監控合約與降噪機會區塊;顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約,不新增任何執行按鈕。 +- 目前數字:observability surfaces `6`;需處置 surfaces `2`;降噪候選 `5`;需人工批准的降噪候選 `2`;classification gaps `3`;read-only denials `12`;automation backlog done `19/23`、overall `83%`、P1 `90%`、WS3 `75%`;inventory tasks `29`。 +- 本地驗證:JSON parse 通過;observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`;Python py_compile 通過;zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過;source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過;本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required 
opportunities `2`。 + +**下一步:** +1. Commit 並推 `gitea main`。 +2. 等 deploy marker 後補 production API / Browser smoke。 +3. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。 + +**裁決:** P1-003 只完成 read-only observability contract matrix 與降噪候選顯示。不得把 Prometheus rule count、Alertmanager grouping、SigNoz / Sentry heartbeat、Grafana dashboard JSON、OTEL/Event Exporter evidence 或 classification gap 解讀成 alert rule 變更、receiver/route 變更、OpenClaw receiver 授權、silence、dashboard import、webhook 修改、secret 讀取、通知發送、deploy/reload/workflow 或 runtime execution 批准。成功 smoke 不即時通知洗版;失敗與需處置才進批准流程。