#!/usr/bin/env python3
|
||
"""
|
||
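IwoooS Monitoring / Alerting / Observability post-incident readback read-only plan generator.

This tool reads the monitoring owner response acceptance snapshot and builds a
post-incident readback plan: who changed an alert / receiver / silence, when the
anomaly happened, the alert state before and after the change, whether the
receiver actually received the notification, whether stale alerts or silences
caused a false green, and how to roll back and prevent recurrence.

It does not connect to live Prometheus, does not reload Alertmanager, does not
apply Grafana / SigNoz / Sentry / Langfuse changes, does not reload OTEL, does
not change receiver routes, does not create silences, does not send Telegram
messages, does not fire live alerts, does not run alert chain smoke tests, does
not use SSH or kubectl, does not read secret values, does not store raw alert
payloads, and does not write to production.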
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
from datetime import datetime, timedelta, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
|
||
# Fixed UTC+08:00 offset; build_report uses it for the default "generated_at" timestamp.
TAIPEI = timezone(timedelta(hours=8))
|
||
|
||
# Field names advertised on every readback candidate (see build_candidate,
# which attaches this list under "readback_fields").
READBACK_FIELDS = [
    # Identity and provenance copied from the source acceptance candidate.
    "post_incident_readback_candidate_id",
    "source_acceptance_candidate_id",
    "request_id",
    "surface_id",
    "label",
    "expected_scope",
    "config_kind",
    "observability_scope",
    "control_tier",
    "repo_source_path",
    "repo_sha256",
    "source_line_count",
    "write_capable_surface",
    "requires_live_evidence",
    # Evidence references expected from the post-incident readback.
    "monitoring_incident_or_change_ref",
    "actor_attribution_ref",
    "change_or_outage_time_window_ref",
    "change_intent_or_break_glass_ref",
    "before_alert_state_ref",
    "after_alert_state_ref",
    "rule_datasource_scrape_state_ref",
    "receiver_route_state_ref",
    "reload_or_no_reload_ref",
    "receiver_receipt_readback_ref",
    "stale_pending_resolved_review_ref",
    "silence_mute_dedup_inhibit_review_ref",
    "dashboard_trace_log_freshness_ref",
    "notification_delivery_metadata_ref",
    "alert_chain_health_readback_ref",
    "cross_project_sync_ref",
    "rollback_or_disable_validation_ref",
    "post_change_monitoring_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    # Ownership, review outcome and the non-approval marker.
    "maintenance_window",
    "rollback_owner",
    "reviewer_outcome",
    "followup_owner",
    "not_approval",
]
|
||
|
||
# Fields listed as required in a readback; emitted both per candidate and at
# the top level of the report under "required_readback_fields".
REQUIRED_READBACK_FIELDS = [
    # Evidence references.
    "monitoring_incident_or_change_ref",
    "actor_attribution_ref",
    "change_or_outage_time_window_ref",
    "change_intent_or_break_glass_ref",
    "before_alert_state_ref",
    "after_alert_state_ref",
    "rule_datasource_scrape_state_ref",
    "receiver_route_state_ref",
    "reload_or_no_reload_ref",
    "receiver_receipt_readback_ref",
    "stale_pending_resolved_review_ref",
    "silence_mute_dedup_inhibit_review_ref",
    "dashboard_trace_log_freshness_ref",
    "notification_delivery_metadata_ref",
    "alert_chain_health_readback_ref",
    "cross_project_sync_ref",
    "rollback_or_disable_validation_ref",
    "post_change_monitoring_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    # Ownership.
    "maintenance_window",
    "rollback_owner",
    "followup_owner",
    # Redaction and attestation entries.
    "redacted_evidence_refs",
    "no_secret_value_attestation",
    "no_raw_payload_attestation",
    "no_false_green_attestation",
    "receipt_not_route_only_attestation",
    "independent_postcheck_attestation",
    "noise_budget_or_silence_owner_attestation",
]
|
||
|
||
# Reviewer checklist: each entry pairs a stable check_id with the instruction
# text shown to the reviewer. The instruction strings are runtime output and
# are kept verbatim.
REVIEWER_CHECKS = [
    {"check_id": check_id, "instruction": instruction}
    for check_id, instruction in (
        ("source_owner_response_acceptance_current", "來源 owner response acceptance snapshot 必須是目前版本。"),
        ("incident_or_change_ref_present", "必須有 incident、change、outage、ticket 或 maintenance ref,不能只寫 dashboard 已恢復。"),
        ("actor_attribution_present", "必須標示 actor role / team,不接受匿名 reload、silence、rule apply 或 receiver route change。"),
        ("change_or_outage_time_window_present", "必須有變更 / 異常時間窗,供 alert state、receiver receipt 與跨專案同步對齊。"),
        ("intent_or_break_glass_present", "正常變更需有 change intent;緊急變更需有 break-glass reason,但 break-glass 不等於事前批准。"),
        ("before_after_alert_state_present", "必須有 before / after alert state ref,不得只看目前 resolved。"),
        ("rule_datasource_scrape_state_present", "必須回讀 rule、datasource、scrape、remote write 或 exporter state;不能只看 UI up。"),
        ("receiver_route_state_present", "必須回讀 receiver route、notification policy、webhook 或 Telegram route state。"),
        ("reload_or_no_reload_called_out", "需明確標示是否 reload / deploy;若未 reload 也需列原因與風險。"),
        ("receiver_receipt_readback_present", "必須提供 receiver receipt readback ref;不得用 receiver route 200 取代 receipt。"),
        ("stale_pending_resolved_review_present", "必須檢查 stale、pending、resolved 未清、資料來源停止更新或 silent failure。"),
        ("silence_mute_dedup_inhibit_review_present", "必須檢查 silence、mute、dedup、inhibit 或 maintenance rule 是否造成 false green。"),
        ("dashboard_trace_log_freshness_present", "Grafana / SigNoz / Sentry / Langfuse freshness 必須有 query / trace / log metadata ref。"),
        ("notification_delivery_metadata_present", "notification delivery 只能收脫敏 metadata ref、message id、hash 或 ticket。"),
        ("alert_chain_health_not_route_only", "告警鏈路健康不得只用 public route 200、dashboard up、container up、CD success 或 UI 可見判定。"),
        ("cross_project_sync_present", "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控,需有跨專案同步 ref。"),
        ("rollback_or_disable_validation_present", "必須提供 rollback / disable validation ref,包含 rollback owner 與停損條件。"),
        ("post_change_monitoring_present", "必須有 post-change monitoring window,觀察 alert state、receipt、error、metrics 與 upstream。"),
        ("postcheck_independent", "post-check 必須獨立於原操作人、dashboard 卡片與 UI 顯示。"),
        ("recurrence_guard_present", "必須提出防再發 guard、owner review、change freeze 或 automation block。"),
        ("maintenance_window_present", "後續任何 reload、deploy、rule apply、silence 或 notification send 都需維護窗口。"),
        ("noise_budget_owner_present", "測試通知、silence、dedup、inhibit 與 alert noise 必須有 noise budget owner。"),
        ("redacted_refs_only", "evidence 只能是脫敏 ref、hash、ticket、commit 或 artifact pointer。"),
        ("secret_value_absent", "不得出現 token、Bot token、DSN secret、webhook secret、cookie、private key、env dump 或 partial secret。"),
        ("raw_payload_absent", "不得保存 raw alert payload、raw receiver payload、raw config、raw log、未脫敏 screenshot 或完整 notification body。"),
        ("no_false_green", "不得只用 route 200、dashboard up、container up、receiver reachable、CD success 或 UI 可見當告警鏈路驗收。"),
        ("runtime_stays_zero", "readback plan 不得觸發 reload、receiver route change、silence、Telegram send、alert fire、alert chain smoke、SSH、kubectl 或 production write。"),
        ("counts_transition_safe", "只有 reviewer record 能更新 accepted count,且不得同時開 runtime gate。"),
    )
]
|
||
|
||
# Reviewer outcome lanes: each entry pairs a lane_id with its meaning text.
# The meaning strings are runtime output and are kept verbatim.
OUTCOME_LANES = [
    {"lane_id": lane_id, "meaning": meaning}
    for lane_id, meaning in (
        ("waiting_post_incident_readback", "尚未收到監控 / 告警事故回讀包;所有 accepted / runtime count 維持 0。"),
        ("request_actor_or_time_supplement", "缺 actor、change / outage time window、intent 或 break-glass reason 時要求補件。"),
        ("request_alert_state_supplement", "缺 before / after alert state、rule、datasource、scrape、remote write 或 exporter state 時要求補件。"),
        ("request_receiver_receipt_supplement", "缺 receiver route state、receipt proof、delivery metadata 或 notification owner 時要求補件。"),
        ("request_stale_silence_supplement", "缺 stale / pending / resolved、silence、mute、dedup、inhibit 或 maintenance rule review 時要求補件。"),
        ("request_freshness_or_alert_chain_supplement", "缺 dashboard / trace / log freshness、alert chain health、post-change monitoring 或 rollback validation 時要求補件。"),
        ("quarantine_raw_payload", "收到 secret、raw config、raw alert payload、raw receiver payload、未脫敏 log 或截圖時只能隔離。"),
        ("reject_false_green_claim", "把 route 200、dashboard up、container up、receiver reachable、CD success 或 UI 可見當驗收時拒收。"),
        ("ready_for_monitoring_post_incident_review", "metadata 合格後,只能進 monitoring post-incident reviewer review。"),
        ("recurrence_guard_backfill_required", "需補防再發 guard、owner review、change freeze、automation block 或 noise budget owner。"),
        ("waiting_runtime_gate", "即使 readback accepted,runtime gate 仍需獨立人工批准。"),
    )
]
|
||
|
||
# Action identifiers the plan lists as blocked; emitted per candidate and at
# the top level of the report under "blocked_actions".
BLOCKED_ACTIONS = [
    # Reloads, applies and deploys against monitoring backends.
    "prometheus_reload",
    "alertmanager_reload",
    "grafana_dashboard_apply",
    "signoz_rule_apply",
    "sentry_deploy",
    "langfuse_config_change",
    "otel_collector_reload",
    # Routing, silencing and notification changes.
    "receiver_route_change",
    "silence_policy_change",
    "telegram_send",
    "notification_route_change",
    "webhook_receiver_change",
    "remote_write_change",
    "exporter_deploy",
    # Live probing and host / cluster access.
    "live_alert_fire",
    "alert_chain_smoke",
    "ssh_read",
    "ssh_write",
    "kubectl_action",
    "secret_value_collection",
    "host_write",
    "active_scan",
    "production_write",
    "runtime_gate_open",
    # Evidence-handling and acceptance shortcuts.
    "raw_monitoring_payload_storage",
    "accept_secret_value_evidence",
    "mark_owner_response_accepted_without_reviewer_record",
    "mark_route_200_as_alert_chain_healthy",
    "mark_receiver_healthy_without_receipt",
    "accept_silence_without_owner",
    "accept_stale_alert_without_review",
    "accept_reload_without_postcheck",
    "store_raw_alert_payload",
    "add_action_button",
    "store_raw_live_config",
    "store_full_receiver_payload",
    "collect_bot_token",
    "collect_dsn_secret",
    "collect_webhook_secret",
    # Configuration changes.
    "change_alert_rule",
    "change_scrape_config",
    "change_dashboard",
    "change_notification_policy",
    "change_silence",
    "change_inhibit_rule",
    # Skipped reviews and false-green shortcuts.
    "accept_dashboard_up_as_alert_chain_healthy",
    "skip_receiver_receipt",
    "skip_stale_review",
    "skip_silence_review",
    "skip_rollback_validation",
    "skip_post_change_monitoring",
    "skip_cross_project_sync",
    "live_receiver_probe",
]
|
||
|
||
|
||
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash of the repository at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory.
    The lookup is best-effort: any failure (git missing, non-zero exit,
    unusable directory, ...) yields the literal string ``"unknown"`` instead
    of raising.
    """
    try:
        raw_sha = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            stderr=subprocess.PIPE,  # capture stderr so git errors stay off the console
            text=True,
        )
    except Exception:
        # Deliberately broad: the commit id is informational only.
        return "unknown"
    return raw_sha.strip()
|
||
|
||
|
||
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||
|
||
|
||
def build_candidate(source: dict[str, Any]) -> dict[str, Any]:
    """Build a blank post-incident readback candidate from one acceptance candidate.

    Identity and provenance values are copied from *source*; every evidence
    reference starts as ``None``, ownership fields start as pending
    placeholders, and every acceptance / authorization flag starts as
    ``False``. The module-level field, check, lane and blocked-action
    catalogues are attached by reference.

    Args:
        source: One entry of the source report's acceptance candidates.

    Returns:
        The readback candidate mapping.

    Raises:
        KeyError: If *source* lacks any of the copied keys.
    """
    surface_id = source["surface_id"]
    candidate: dict[str, Any] = {
        "post_incident_readback_candidate_id": f"monitoring_post_incident_readback:{surface_id}",
        "status": "waiting_post_incident_readback",
        "source_acceptance_candidate_id": source["acceptance_candidate_id"],
        "request_id": source["request_id"],
        "surface_id": surface_id,
    }

    # Values carried over unchanged under the same key.
    for copied_key in (
        "label",
        "expected_scope",
        "config_kind",
        "observability_scope",
        "control_tier",
        "repo_source_path",
        "repo_sha256",
        "source_line_count",
        "write_capable_surface",
        "requires_live_evidence",
    ):
        candidate[copied_key] = source[copied_key]

    # Evidence references: nothing has been received yet, so all are None.
    candidate.update(
        dict.fromkeys(
            (
                "monitoring_incident_or_change_ref",
                "actor_attribution_ref",
                "change_or_outage_time_window_ref",
                "change_intent_or_break_glass_ref",
                "before_alert_state_ref",
                "after_alert_state_ref",
                "rule_datasource_scrape_state_ref",
                "receiver_route_state_ref",
                "reload_or_no_reload_ref",
                "receiver_receipt_readback_ref",
                "stale_pending_resolved_review_ref",
                "silence_mute_dedup_inhibit_review_ref",
                "dashboard_trace_log_freshness_ref",
                "notification_delivery_metadata_ref",
                "alert_chain_health_readback_ref",
                "cross_project_sync_ref",
                "rollback_or_disable_validation_ref",
                "post_change_monitoring_ref",
                "postcheck_readback_ref",
                "recurrence_guard_ref",
            )
        )
    )

    # Placeholders, attached catalogues and the non-approval marker.
    candidate.update(
        {
            "maintenance_window": "pending_post_incident_readback",
            "rollback_owner": "pending_post_incident_readback",
            "reviewer_outcome": "waiting_post_incident_readback",
            "followup_owner": "pending_post_incident_readback",
            "readback_fields": READBACK_FIELDS,
            "required_readback_fields": REQUIRED_READBACK_FIELDS,
            "reviewer_checks": [check["check_id"] for check in REVIEWER_CHECKS],
            "outcome_lanes": [lane["lane_id"] for lane in OUTCOME_LANES],
            "blocked_actions": BLOCKED_ACTIONS,
            "not_approval": True,
        }
    )

    # Every acceptance and authorization flag starts out False.
    candidate.update(
        dict.fromkeys(
            (
                "post_incident_readback_received",
                "post_incident_readback_accepted",
                "actor_attribution_accepted",
                "change_or_outage_time_window_accepted",
                "intent_or_break_glass_accepted",
                "before_after_alert_state_accepted",
                "rule_datasource_scrape_state_accepted",
                "receiver_route_state_accepted",
                "reload_or_no_reload_accepted",
                "receiver_receipt_readback_accepted",
                "stale_pending_resolved_review_accepted",
                "silence_mute_dedup_inhibit_review_accepted",
                "dashboard_trace_log_freshness_accepted",
                "notification_delivery_metadata_accepted",
                "alert_chain_health_readback_accepted",
                "cross_project_sync_accepted",
                "rollback_or_disable_validation_accepted",
                "post_change_monitoring_accepted",
                "postcheck_readback_accepted",
                "recurrence_guard_accepted",
                "no_false_green_accepted",
                "prometheus_reload_authorized",
                "alertmanager_reload_authorized",
                "grafana_dashboard_apply_authorized",
                "signoz_rule_apply_authorized",
                "sentry_deploy_authorized",
                "langfuse_config_change_authorized",
                "otel_collector_reload_authorized",
                "receiver_route_change_authorized",
                "silence_policy_change_authorized",
                "telegram_send_authorized",
                "notification_route_change_authorized",
                "webhook_receiver_change_authorized",
                "remote_write_change_authorized",
                "exporter_deploy_authorized",
                "live_alert_fire_authorized",
                "alert_chain_smoke_authorized",
                "ssh_read_authorized",
                "ssh_write_authorized",
                "kubectl_action_authorized",
                "secret_value_collection_allowed",
                "host_write_authorized",
                "active_scan_authorized",
                "production_write_authorized",
                "runtime_gate",
                "action_buttons_allowed",
            ),
            False,
        )
    )
    return candidate
|
||
|
||
|
||
def build_report(root: Path, source_report: dict[str, Any], generated_at: str | None) -> dict[str, Any]:
    """Assemble the read-only post-incident readback plan report.

    Args:
        root: Repository root; used only to look up the current git commit.
        source_report: Parsed owner-response-acceptance snapshot. Each entry
            of its "acceptance_candidates" list becomes a readback candidate,
            and selected "summary" counts are mirrored under "source_*" keys
            (missing counts default to 0).
        generated_at: Timestamp string to embed as-is. When falsy, the current
            time in the TAIPEI zone is used, ISO-formatted to the second.

    Returns:
        The complete report mapping: summary counts, the candidates, the
        reviewer catalogues, the execution boundaries (all denied) and the
        operator notes.
    """
    stamp = generated_at if generated_at else datetime.now(TAIPEI).isoformat(timespec="seconds")
    candidates = list(map(build_candidate, source_report.get("acceptance_candidates", [])))
    total = len(candidates)

    rule_kinds = {"prometheus_alert_rules", "grafana_alert_rules", "signoz_alert_rules"}
    rollout_kinds = {"reload_capable_script", "sentry_deploy_script", "host_deploy_script"}
    write_capable_total = sum(1 for entry in candidates if entry["write_capable_surface"])
    live_evidence_total = sum(1 for entry in candidates if entry["requires_live_evidence"])
    alert_rule_total = sum(1 for entry in candidates if entry["config_kind"] in rule_kinds)
    rollout_total = sum(1 for entry in candidates if entry["config_kind"] in rollout_kinds)

    upstream = source_report.get("summary", {})
    commit = git_short_sha(root)

    # Counts mirrored from the source summary; each lands under "source_" + name.
    summary: dict[str, Any] = {
        f"source_{name}": upstream.get(name, 0)
        for name in (
            "acceptance_candidate_count",
            "write_capable_acceptance_candidate_count",
            "live_evidence_required_candidate_count",
            "acceptance_field_count",
            "required_owner_field_count",
            "reviewer_check_count",
            "outcome_lane_count",
            "blocked_action_count",
            "owner_response_accepted_count",
            "alert_chain_health_accepted_count",
            "receiver_receipt_proof_accepted_count",
            "runtime_gate_count",
        )
    }

    # Counts derived from the candidates and the module-level catalogues.
    summary.update(
        {
            "readback_candidate_count": total,
            "write_capable_readback_candidate_count": write_capable_total,
            "live_evidence_required_readback_candidate_count": live_evidence_total,
            "alert_rule_readback_candidate_count": alert_rule_total,
            "deploy_or_reload_readback_candidate_count": rollout_total,
            "receiver_receipt_review_required_candidate_count": total,
            "stale_silence_review_required_candidate_count": total,
            "freshness_alert_chain_review_required_candidate_count": total,
            "no_false_green_required_candidate_count": total,
            "readback_field_count": len(READBACK_FIELDS),
            "required_readback_field_count": len(REQUIRED_READBACK_FIELDS),
            "reviewer_check_count": len(REVIEWER_CHECKS),
            "outcome_lane_count": len(OUTCOME_LANES),
            "blocked_action_count": len(BLOCKED_ACTIONS),
        }
    )

    # The plan accepts and authorizes nothing, so these counts are fixed at zero.
    summary.update(
        dict.fromkeys(
            (
                "post_incident_readback_received_count",
                "post_incident_readback_accepted_count",
                "actor_attribution_accepted_count",
                "change_or_outage_time_window_accepted_count",
                "intent_or_break_glass_accepted_count",
                "before_after_alert_state_accepted_count",
                "rule_datasource_scrape_state_accepted_count",
                "receiver_route_state_accepted_count",
                "reload_or_no_reload_accepted_count",
                "receiver_receipt_readback_accepted_count",
                "stale_pending_resolved_review_accepted_count",
                "silence_mute_dedup_inhibit_review_accepted_count",
                "dashboard_trace_log_freshness_accepted_count",
                "notification_delivery_metadata_accepted_count",
                "alert_chain_health_readback_accepted_count",
                "cross_project_sync_accepted_count",
                "rollback_or_disable_validation_accepted_count",
                "post_change_monitoring_accepted_count",
                "postcheck_readback_accepted_count",
                "recurrence_guard_accepted_count",
                "no_false_green_accepted_count",
                "prometheus_reload_authorized_count",
                "alertmanager_reload_authorized_count",
                "grafana_dashboard_apply_authorized_count",
                "signoz_rule_apply_authorized_count",
                "sentry_deploy_authorized_count",
                "langfuse_config_change_authorized_count",
                "otel_collector_reload_authorized_count",
                "receiver_route_change_authorized_count",
                "silence_policy_change_authorized_count",
                "telegram_send_authorized_count",
                "notification_route_change_authorized_count",
                "webhook_receiver_change_authorized_count",
                "remote_write_change_authorized_count",
                "exporter_deploy_authorized_count",
                "live_alert_fire_authorized_count",
                "alert_chain_smoke_authorized_count",
                "secret_value_collection_allowed_count",
                "host_write_authorized_count",
                "active_scan_authorized_count",
                "production_write_authorized_count",
                "runtime_gate_count",
                "action_button_count",
            ),
            0,
        )
    )
    summary["coverage_percent_after_readback_plan"] = 70

    # Report-level boundaries: explicitly not an authorization, everything denied.
    boundaries: dict[str, bool] = {"not_authorization": True}
    boundaries.update(
        dict.fromkeys(
            (
                "prometheus_reload_authorized",
                "alertmanager_reload_authorized",
                "grafana_dashboard_apply_authorized",
                "signoz_rule_apply_authorized",
                "sentry_deploy_authorized",
                "langfuse_config_change_authorized",
                "otel_collector_reload_authorized",
                "receiver_route_change_authorized",
                "silence_policy_change_authorized",
                "telegram_send_authorized",
                "notification_route_change_authorized",
                "webhook_receiver_change_authorized",
                "remote_write_change_authorized",
                "exporter_deploy_authorized",
                "live_alert_fire_authorized",
                "alert_chain_smoke_authorized",
                "ssh_read_authorized",
                "ssh_write_authorized",
                "kubectl_action_authorized",
                "secret_value_collection_allowed",
                "host_write_authorized",
                "active_scan_authorized",
                "production_write_authorized",
                "runtime_gate",
                "action_buttons_allowed",
                "raw_alert_payload_storage_allowed",
                "false_green_acceptance_authorized",
            ),
            False,
        )
    )

    report: dict[str, Any] = {
        "schema_version": "monitoring_post_incident_readback_plan_v1",
        "generated_at": stamp,
        "git_commit": commit,
        "source_schema_version": source_report.get("schema_version"),
        "source_status": source_report.get("status"),
        "status": "post_incident_readback_plan_ready_no_runtime_action",
        "summary": summary,
        "readback_candidates": candidates,
        "required_readback_fields": REQUIRED_READBACK_FIELDS,
        "reviewer_checks": REVIEWER_CHECKS,
        "outcome_lanes": OUTCOME_LANES,
        "blocked_actions": BLOCKED_ACTIONS,
        "execution_boundaries": boundaries,
        "source_paths": [
            "docs/security/MONITORING-ALERTING-OBSERVABILITY-INVENTORY.md",
            "docs/security/monitoring-alerting-observability-inventory.snapshot.json",
            "docs/security/MONITORING-OWNER-REQUEST-DRAFT.md",
            "docs/security/monitoring-owner-request-draft.snapshot.json",
            "docs/security/MONITORING-OWNER-RESPONSE-ACCEPTANCE.md",
            "docs/security/monitoring-owner-response-acceptance.snapshot.json",
        ],
        "mode": "metadata_only_no_live_monitoring_no_reload_no_alert_send",
        "operator_interpretation": [
            "此計畫只定義 Monitoring / Alerting / Observability 事故後回讀欄位,不代表 live monitoring 已讀取或可讀取。",
            "route 200、dashboard up、container up、receiver reachable、CD success、UI 可見都不能單獨當成告警鏈路事故驗收。",
            "未來若要 reload、rule apply、receiver route change、silence、Telegram send、alert fire 或 alert chain smoke,必須另有維護窗口、rollback owner 與人工批准。",
        ],
    }
    return report
|
||
|
||
|
||
def main() -> int:
    """Command-line entry point: load the acceptance snapshot and emit the plan.

    Parses ``--root``, ``--source-report``, ``--output`` and
    ``--generated-at``. The source report path is resolved against the
    resolved root. The plan is serialised as indented, key-sorted JSON with
    non-ASCII characters preserved; it is written to ``--output`` when given
    (parent directories are created as needed) and printed to stdout
    otherwise. A one-line status summary is always printed to stderr.

    Returns:
        0 once the plan has been emitted.
    """
    cli = argparse.ArgumentParser(description="IwoooS Monitoring / Alerting 事故後回讀只讀計畫產生器")
    cli.add_argument("--root", default=".", help="repo root")
    cli.add_argument(
        "--source-report",
        default="docs/security/monitoring-owner-response-acceptance.snapshot.json",
        help="monitoring-owner-response-acceptance.py 輸出的 JSON",
    )
    cli.add_argument("--output", help="寫出 JSON 報告")
    cli.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    options = cli.parse_args()

    repo_root = Path(options.root).resolve()
    snapshot = load_json(repo_root / options.source_report)
    plan = build_report(repo_root, snapshot, options.generated_at)
    rendered = json.dumps(plan, ensure_ascii=False, indent=2, sort_keys=True)

    if not options.output:
        print(rendered)
    else:
        destination = Path(options.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered + "\n", encoding="utf-8")

    counts = plan["summary"]
    status_line = (
        "MONITORING_POST_INCIDENT_READBACK_PLAN_OK "
        f"candidates={counts['readback_candidate_count']} "
        f"write_capable={counts['write_capable_readback_candidate_count']} "
        f"checks={counts['reviewer_check_count']} "
        f"lanes={counts['outcome_lane_count']} "
        f"accepted={counts['post_incident_readback_accepted_count']} "
        f"runtime_gate={counts['runtime_gate_count']}"
    )
    # The status line goes to stderr so it never mixes with JSON printed on stdout.
    print(status_line, file=sys.stderr)
    return 0
|
||
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|