#!/usr/bin/env python3
"""Read-only plan generator for IwoooS Monitoring / Alerting / Observability
post-incident readback.

This tool reads the monitoring owner response acceptance snapshot and builds a
post-incident readback plan: who changed an alert / receiver / silence, when
the anomaly happened, the alert state before and after the change, whether the
receiver actually received the notification, whether stale data or a silence
caused a false green, and how to roll back and prevent recurrence.

It does not connect to live Prometheus, reload Alertmanager, apply Grafana /
SigNoz / Sentry / Langfuse changes, reload OTEL, change receiver routes,
create silences, send Telegram messages, fire live alerts, run alert-chain
smoke tests, SSH, run kubectl, read secret values, store raw alert payloads,
or write to production.
"""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any

# Fixed UTC+8 offset used for the default ``generated_at`` timestamp.
TAIPEI = timezone(timedelta(hours=8))

# Every field name advertised on a readback candidate.
READBACK_FIELDS = [
    "post_incident_readback_candidate_id",
    "source_acceptance_candidate_id",
    "request_id",
    "surface_id",
    "label",
    "expected_scope",
    "config_kind",
    "observability_scope",
    "control_tier",
    "repo_source_path",
    "repo_sha256",
    "source_line_count",
    "write_capable_surface",
    "requires_live_evidence",
    "monitoring_incident_or_change_ref",
    "actor_attribution_ref",
    "change_or_outage_time_window_ref",
    "change_intent_or_break_glass_ref",
    "before_alert_state_ref",
    "after_alert_state_ref",
    "rule_datasource_scrape_state_ref",
    "receiver_route_state_ref",
    "reload_or_no_reload_ref",
    "receiver_receipt_readback_ref",
    "stale_pending_resolved_review_ref",
    "silence_mute_dedup_inhibit_review_ref",
    "dashboard_trace_log_freshness_ref",
    "notification_delivery_metadata_ref",
    "alert_chain_health_readback_ref",
    "cross_project_sync_ref",
    "rollback_or_disable_validation_ref",
    "post_change_monitoring_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_owner",
    "reviewer_outcome",
    "followup_owner",
    "not_approval",
]

# Field names listed as required in a readback response (refs, owners and
# attestations).
REQUIRED_READBACK_FIELDS = [
    "monitoring_incident_or_change_ref",
    "actor_attribution_ref",
    "change_or_outage_time_window_ref",
    "change_intent_or_break_glass_ref",
    "before_alert_state_ref",
    "after_alert_state_ref",
    "rule_datasource_scrape_state_ref",
    "receiver_route_state_ref",
    "reload_or_no_reload_ref",
    "receiver_receipt_readback_ref",
    "stale_pending_resolved_review_ref",
    "silence_mute_dedup_inhibit_review_ref",
    "dashboard_trace_log_freshness_ref",
    "notification_delivery_metadata_ref",
    "alert_chain_health_readback_ref",
    "cross_project_sync_ref",
    "rollback_or_disable_validation_ref",
    "post_change_monitoring_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_owner",
    "followup_owner",
    "redacted_evidence_refs",
    "no_secret_value_attestation",
    "no_raw_payload_attestation",
    "no_false_green_attestation",
    "receipt_not_route_only_attestation",
    "independent_postcheck_attestation",
    "noise_budget_or_silence_owner_attestation",
]

# Reviewer checklist; the instruction text is emitted verbatim in the report.
REVIEWER_CHECKS = [
    {
        "check_id": "source_owner_response_acceptance_current",
        "instruction": "來源 owner response acceptance snapshot 必須是目前版本。",
    },
    {
        "check_id": "incident_or_change_ref_present",
        "instruction": "必須有 incident、change、outage、ticket 或 maintenance ref,不能只寫 dashboard 已恢復。",
    },
    {
        "check_id": "actor_attribution_present",
        "instruction": "必須標示 actor role / team,不接受匿名 reload、silence、rule apply 或 receiver route change。",
    },
    {
        "check_id": "change_or_outage_time_window_present",
        "instruction": "必須有變更 / 異常時間窗,供 alert state、receiver receipt 與跨專案同步對齊。",
    },
    {
        "check_id": "intent_or_break_glass_present",
        "instruction": "正常變更需有 change intent;緊急變更需有 break-glass reason,但 break-glass 不等於事前批准。",
    },
    {
        "check_id": "before_after_alert_state_present",
        "instruction": "必須有 before / after alert state ref,不得只看目前 resolved。",
    },
    {
        "check_id": "rule_datasource_scrape_state_present",
        "instruction": "必須回讀 rule、datasource、scrape、remote write 或 exporter state;不能只看 UI up。",
    },
    {
        "check_id": "receiver_route_state_present",
        "instruction": "必須回讀 receiver route、notification policy、webhook 或 Telegram route state。",
    },
    {
        "check_id": "reload_or_no_reload_called_out",
        "instruction": "需明確標示是否 reload / deploy;若未 reload 也需列原因與風險。",
    },
    {
        "check_id": "receiver_receipt_readback_present",
        "instruction": "必須提供 receiver receipt readback ref;不得用 receiver route 200 取代 receipt。",
    },
    {
        "check_id": "stale_pending_resolved_review_present",
        "instruction": "必須檢查 stale、pending、resolved 未清、資料來源停止更新或 silent failure。",
    },
    {
        "check_id": "silence_mute_dedup_inhibit_review_present",
        "instruction": "必須檢查 silence、mute、dedup、inhibit 或 maintenance rule 是否造成 false green。",
    },
    {
        "check_id": "dashboard_trace_log_freshness_present",
        "instruction": "Grafana / SigNoz / Sentry / Langfuse freshness 必須有 query / trace / log metadata ref。",
    },
    {
        "check_id": "notification_delivery_metadata_present",
        "instruction": "notification delivery 只能收脫敏 metadata ref、message id、hash 或 ticket。",
    },
    {
        "check_id": "alert_chain_health_not_route_only",
        "instruction": "告警鏈路健康不得只用 public route 200、dashboard up、container up、CD success 或 UI 可見判定。",
    },
    {
        "check_id": "cross_project_sync_present",
        "instruction": "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控,需有跨專案同步 ref。",
    },
    {
        "check_id": "rollback_or_disable_validation_present",
        "instruction": "必須提供 rollback / disable validation ref,包含 rollback owner 與停損條件。",
    },
    {
        "check_id": "post_change_monitoring_present",
        "instruction": "必須有 post-change monitoring window,觀察 alert state、receipt、error、metrics 與 upstream。",
    },
    {
        "check_id": "postcheck_independent",
        "instruction": "post-check 必須獨立於原操作人、dashboard 卡片與 UI 顯示。",
    },
    {
        "check_id": "recurrence_guard_present",
        "instruction": "必須提出防再發 guard、owner review、change freeze 或 automation block。",
    },
    {
        "check_id": "maintenance_window_present",
        "instruction": "後續任何 reload、deploy、rule apply、silence 或 notification send 都需維護窗口。",
    },
    {
        "check_id": "noise_budget_owner_present",
        "instruction": "測試通知、silence、dedup、inhibit 與 alert noise 必須有 noise budget owner。",
    },
    {
        "check_id": "redacted_refs_only",
        "instruction": "evidence 只能是脫敏 ref、hash、ticket、commit 或 artifact pointer。",
    },
    {
        "check_id": "secret_value_absent",
        "instruction": "不得出現 token、Bot token、DSN secret、webhook secret、cookie、private key、env dump 或 partial secret。",
    },
    {
        "check_id": "raw_payload_absent",
        "instruction": "不得保存 raw alert payload、raw receiver payload、raw config、raw log、未脫敏 screenshot 或完整 notification body。",
    },
    {
        "check_id": "no_false_green",
        "instruction": "不得只用 route 200、dashboard up、container up、receiver reachable、CD success 或 UI 可見當告警鏈路驗收。",
    },
    {
        "check_id": "runtime_stays_zero",
        "instruction": "readback plan 不得觸發 reload、receiver route change、silence、Telegram send、alert fire、alert chain smoke、SSH、kubectl 或 production write。",
    },
    {
        "check_id": "counts_transition_safe",
        "instruction": "只有 reviewer record 能更新 accepted count,且不得同時開 runtime gate。",
    },
]

# Possible reviewer outcome lanes; the meaning text is emitted verbatim.
OUTCOME_LANES = [
    {
        "lane_id": "waiting_post_incident_readback",
        "meaning": "尚未收到監控 / 告警事故回讀包;所有 accepted / runtime count 維持 0。",
    },
    {
        "lane_id": "request_actor_or_time_supplement",
        "meaning": "缺 actor、change / outage time window、intent 或 break-glass reason 時要求補件。",
    },
    {
        "lane_id": "request_alert_state_supplement",
        "meaning": "缺 before / after alert state、rule、datasource、scrape、remote write 或 exporter state 時要求補件。",
    },
    {
        "lane_id": "request_receiver_receipt_supplement",
        "meaning": "缺 receiver route state、receipt proof、delivery metadata 或 notification owner 時要求補件。",
    },
    {
        "lane_id": "request_stale_silence_supplement",
        "meaning": "缺 stale / pending / resolved、silence、mute、dedup、inhibit 或 maintenance rule review 時要求補件。",
    },
    {
        "lane_id": "request_freshness_or_alert_chain_supplement",
        "meaning": "缺 dashboard / trace / log freshness、alert chain health、post-change monitoring 或 rollback validation 時要求補件。",
    },
    {
        "lane_id": "quarantine_raw_payload",
        "meaning": "收到 secret、raw config、raw alert payload、raw receiver payload、未脫敏 log 或截圖時只能隔離。",
    },
    {
        "lane_id": "reject_false_green_claim",
        "meaning": "把 route 200、dashboard up、container up、receiver reachable、CD success 或 UI 可見當驗收時拒收。",
    },
    {
        "lane_id": "ready_for_monitoring_post_incident_review",
        "meaning": "metadata 合格後,只能進 monitoring post-incident reviewer review。",
    },
    {
        "lane_id": "recurrence_guard_backfill_required",
        "meaning": "需補防再發 guard、owner review、change freeze、automation block 或 noise budget owner。",
    },
    {
        "lane_id": "waiting_runtime_gate",
        "meaning": "即使 readback accepted,runtime gate 仍需獨立人工批准。",
    },
]

# Action identifiers that the generated plan lists as blocked.
BLOCKED_ACTIONS = [
    "prometheus_reload",
    "alertmanager_reload",
    "grafana_dashboard_apply",
    "signoz_rule_apply",
    "sentry_deploy",
    "langfuse_config_change",
    "otel_collector_reload",
    "receiver_route_change",
    "silence_policy_change",
    "telegram_send",
    "notification_route_change",
    "webhook_receiver_change",
    "remote_write_change",
    "exporter_deploy",
    "live_alert_fire",
    "alert_chain_smoke",
    "ssh_read",
    "ssh_write",
    "kubectl_action",
    "secret_value_collection",
    "host_write",
    "active_scan",
    "production_write",
    "runtime_gate_open",
    "raw_monitoring_payload_storage",
    "accept_secret_value_evidence",
    "mark_owner_response_accepted_without_reviewer_record",
    "mark_route_200_as_alert_chain_healthy",
    "mark_receiver_healthy_without_receipt",
    "accept_silence_without_owner",
    "accept_stale_alert_without_review",
    "accept_reload_without_postcheck",
    "store_raw_alert_payload",
    "add_action_button",
    "store_raw_live_config",
    "store_full_receiver_payload",
    "collect_bot_token",
    "collect_dsn_secret",
    "collect_webhook_secret",
    "change_alert_rule",
    "change_scrape_config",
    "change_dashboard",
    "change_notification_policy",
    "change_silence",
    "change_inhibit_rule",
    "accept_dashboard_up_as_alert_chain_healthy",
    "skip_receiver_receipt",
    "skip_stale_review",
    "skip_silence_review",
    "skip_rollback_validation",
    "skip_post_change_monitoring",
    "skip_cross_project_sync",
    "live_receiver_probe",
]


def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit of the repo at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory.
    Returns the literal string ``"unknown"`` when the command cannot be run or
    fails for any reason.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
        return result.stdout.strip()
    except Exception:
        # Deliberately best-effort: the commit id is informational only, so a
        # missing git binary, missing directory or non-repo must not abort
        # report generation.
        return "unknown"


def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document."""
    return json.loads(path.read_text(encoding="utf-8"))


def build_candidate(source: dict[str, Any]) -> dict[str, Any]:
    """Build one readback candidate from one source acceptance candidate.

    Identity and inventory fields are copied from *source*; every readback
    ref starts as ``None``, every ``*_accepted`` / ``*_authorized`` flag starts
    as ``False``, and the status is ``waiting_post_incident_readback``.

    Raises:
        KeyError: if *source* lacks one of the copied keys.
    """
    surface_id = source["surface_id"]
    return {
        "post_incident_readback_candidate_id": f"monitoring_post_incident_readback:{surface_id}",
        "status": "waiting_post_incident_readback",
        "source_acceptance_candidate_id": source["acceptance_candidate_id"],
        "request_id": source["request_id"],
        "surface_id": surface_id,
        "label": source["label"],
        "expected_scope": source["expected_scope"],
        "config_kind": source["config_kind"],
        "observability_scope": source["observability_scope"],
        "control_tier": source["control_tier"],
        "repo_source_path": source["repo_source_path"],
        "repo_sha256": source["repo_sha256"],
        "source_line_count": source["source_line_count"],
        "write_capable_surface": source["write_capable_surface"],
        "requires_live_evidence": source["requires_live_evidence"],
        "monitoring_incident_or_change_ref": None,
        "actor_attribution_ref": None,
        "change_or_outage_time_window_ref": None,
        "change_intent_or_break_glass_ref": None,
        "before_alert_state_ref": None,
        "after_alert_state_ref": None,
        "rule_datasource_scrape_state_ref": None,
        "receiver_route_state_ref": None,
        "reload_or_no_reload_ref": None,
        "receiver_receipt_readback_ref": None,
        "stale_pending_resolved_review_ref": None,
        "silence_mute_dedup_inhibit_review_ref": None,
        "dashboard_trace_log_freshness_ref": None,
        "notification_delivery_metadata_ref": None,
        "alert_chain_health_readback_ref": None,
        "cross_project_sync_ref": None,
        "rollback_or_disable_validation_ref": None,
        "post_change_monitoring_ref": None,
        "postcheck_readback_ref": None,
        "recurrence_guard_ref": None,
        "maintenance_window": "pending_post_incident_readback",
        "rollback_owner": "pending_post_incident_readback",
        "reviewer_outcome": "waiting_post_incident_readback",
        "followup_owner": "pending_post_incident_readback",
        # Copies, not the module-level lists themselves: a consumer mutating
        # one candidate must not alter the constants or sibling candidates.
        "readback_fields": list(READBACK_FIELDS),
        "required_readback_fields": list(REQUIRED_READBACK_FIELDS),
        "reviewer_checks": [item["check_id"] for item in REVIEWER_CHECKS],
        "outcome_lanes": [item["lane_id"] for item in OUTCOME_LANES],
        "blocked_actions": list(BLOCKED_ACTIONS),
        "not_approval": True,
        "post_incident_readback_received": False,
        "post_incident_readback_accepted": False,
        "actor_attribution_accepted": False,
        "change_or_outage_time_window_accepted": False,
        "intent_or_break_glass_accepted": False,
        "before_after_alert_state_accepted": False,
        "rule_datasource_scrape_state_accepted": False,
        "receiver_route_state_accepted": False,
        "reload_or_no_reload_accepted": False,
        "receiver_receipt_readback_accepted": False,
        "stale_pending_resolved_review_accepted": False,
        "silence_mute_dedup_inhibit_review_accepted": False,
        "dashboard_trace_log_freshness_accepted": False,
        "notification_delivery_metadata_accepted": False,
        "alert_chain_health_readback_accepted": False,
        "cross_project_sync_accepted": False,
        "rollback_or_disable_validation_accepted": False,
        "post_change_monitoring_accepted": False,
        "postcheck_readback_accepted": False,
        "recurrence_guard_accepted": False,
        "no_false_green_accepted": False,
        "prometheus_reload_authorized": False,
        "alertmanager_reload_authorized": False,
        "grafana_dashboard_apply_authorized": False,
        "signoz_rule_apply_authorized": False,
        "sentry_deploy_authorized": False,
        "langfuse_config_change_authorized": False,
        "otel_collector_reload_authorized": False,
        "receiver_route_change_authorized": False,
        "silence_policy_change_authorized": False,
        "telegram_send_authorized": False,
        "notification_route_change_authorized": False,
        "webhook_receiver_change_authorized": False,
        "remote_write_change_authorized": False,
        "exporter_deploy_authorized": False,
        "live_alert_fire_authorized": False,
        "alert_chain_smoke_authorized": False,
        "ssh_read_authorized": False,
        "ssh_write_authorized": False,
        "kubectl_action_authorized": False,
        "secret_value_collection_allowed": False,
        "host_write_authorized": False,
        "active_scan_authorized": False,
        "production_write_authorized": False,
        "runtime_gate": False,
        "action_buttons_allowed": False,
    }


def build_report(root: Path, source_report: dict[str, Any], generated_at: str | None) -> dict[str, Any]:
    """Assemble the full post-incident readback plan report.

    Args:
        root: directory passed to ``git_short_sha`` for the ``git_commit`` field.
        source_report: parsed owner response acceptance snapshot; its
            ``acceptance_candidates`` and ``summary`` entries are read.
        generated_at: timestamp to embed verbatim; when falsy, the current
            time in the ``TAIPEI`` offset (seconds precision) is used.

    Returns:
        A JSON-serialisable dict with candidates, summary counts, checklists
        and execution boundaries. All accepted / authorized counts are 0.
    """
    report_time = generated_at or datetime.now(TAIPEI).isoformat(timespec="seconds")
    # ``or`` also covers an explicit JSON null, not only a missing key.
    acceptance_candidates = source_report.get("acceptance_candidates") or []
    source_summary = source_report.get("summary") or {}

    candidates = [build_candidate(item) for item in acceptance_candidates]
    write_capable = [item for item in candidates if item["write_capable_surface"]]
    live_evidence = [item for item in candidates if item["requires_live_evidence"]]
    alert_rule = [
        item
        for item in candidates
        if item["config_kind"] in {"prometheus_alert_rules", "grafana_alert_rules", "signoz_alert_rules"}
    ]
    deploy_or_reload = [
        item
        for item in candidates
        if item["config_kind"] in {"reload_capable_script", "sentry_deploy_script", "host_deploy_script"}
    ]

    return {
        "schema_version": "monitoring_post_incident_readback_plan_v1",
        "generated_at": report_time,
        "git_commit": git_short_sha(root),
        "source_schema_version": source_report.get("schema_version"),
        "source_status": source_report.get("status"),
        "status": "post_incident_readback_plan_ready_no_runtime_action",
        "summary": {
            "source_acceptance_candidate_count": source_summary.get("acceptance_candidate_count", 0),
            "source_write_capable_acceptance_candidate_count": source_summary.get(
                "write_capable_acceptance_candidate_count", 0
            ),
            "source_live_evidence_required_candidate_count": source_summary.get(
                "live_evidence_required_candidate_count", 0
            ),
            "source_acceptance_field_count": source_summary.get("acceptance_field_count", 0),
            "source_required_owner_field_count": source_summary.get("required_owner_field_count", 0),
            "source_reviewer_check_count": source_summary.get("reviewer_check_count", 0),
            "source_outcome_lane_count": source_summary.get("outcome_lane_count", 0),
            "source_blocked_action_count": source_summary.get("blocked_action_count", 0),
            "source_owner_response_accepted_count": source_summary.get("owner_response_accepted_count", 0),
            "source_alert_chain_health_accepted_count": source_summary.get(
                "alert_chain_health_accepted_count", 0
            ),
            "source_receiver_receipt_proof_accepted_count": source_summary.get(
                "receiver_receipt_proof_accepted_count", 0
            ),
            "source_runtime_gate_count": source_summary.get("runtime_gate_count", 0),
            "readback_candidate_count": len(candidates),
            "write_capable_readback_candidate_count": len(write_capable),
            "live_evidence_required_readback_candidate_count": len(live_evidence),
            "alert_rule_readback_candidate_count": len(alert_rule),
            "deploy_or_reload_readback_candidate_count": len(deploy_or_reload),
            # These four reviews apply to every candidate, hence the total.
            "receiver_receipt_review_required_candidate_count": len(candidates),
            "stale_silence_review_required_candidate_count": len(candidates),
            "freshness_alert_chain_review_required_candidate_count": len(candidates),
            "no_false_green_required_candidate_count": len(candidates),
            "readback_field_count": len(READBACK_FIELDS),
            "required_readback_field_count": len(REQUIRED_READBACK_FIELDS),
            "reviewer_check_count": len(REVIEWER_CHECKS),
            "outcome_lane_count": len(OUTCOME_LANES),
            "blocked_action_count": len(BLOCKED_ACTIONS),
            "post_incident_readback_received_count": 0,
            "post_incident_readback_accepted_count": 0,
            "actor_attribution_accepted_count": 0,
            "change_or_outage_time_window_accepted_count": 0,
            "intent_or_break_glass_accepted_count": 0,
            "before_after_alert_state_accepted_count": 0,
            "rule_datasource_scrape_state_accepted_count": 0,
            "receiver_route_state_accepted_count": 0,
            "reload_or_no_reload_accepted_count": 0,
            "receiver_receipt_readback_accepted_count": 0,
            "stale_pending_resolved_review_accepted_count": 0,
            "silence_mute_dedup_inhibit_review_accepted_count": 0,
            "dashboard_trace_log_freshness_accepted_count": 0,
            "notification_delivery_metadata_accepted_count": 0,
            "alert_chain_health_readback_accepted_count": 0,
            "cross_project_sync_accepted_count": 0,
            "rollback_or_disable_validation_accepted_count": 0,
            "post_change_monitoring_accepted_count": 0,
            "postcheck_readback_accepted_count": 0,
            "recurrence_guard_accepted_count": 0,
            "no_false_green_accepted_count": 0,
            "prometheus_reload_authorized_count": 0,
            "alertmanager_reload_authorized_count": 0,
            "grafana_dashboard_apply_authorized_count": 0,
            "signoz_rule_apply_authorized_count": 0,
            "sentry_deploy_authorized_count": 0,
            "langfuse_config_change_authorized_count": 0,
            "otel_collector_reload_authorized_count": 0,
            "receiver_route_change_authorized_count": 0,
            "silence_policy_change_authorized_count": 0,
            "telegram_send_authorized_count": 0,
            "notification_route_change_authorized_count": 0,
            "webhook_receiver_change_authorized_count": 0,
            "remote_write_change_authorized_count": 0,
            "exporter_deploy_authorized_count": 0,
            "live_alert_fire_authorized_count": 0,
            "alert_chain_smoke_authorized_count": 0,
            "secret_value_collection_allowed_count": 0,
            "host_write_authorized_count": 0,
            "active_scan_authorized_count": 0,
            "production_write_authorized_count": 0,
            "runtime_gate_count": 0,
            "action_button_count": 0,
            "coverage_percent_after_readback_plan": 70,
        },
        "readback_candidates": candidates,
        # Copies so that mutating the report cannot alter module constants.
        "required_readback_fields": list(REQUIRED_READBACK_FIELDS),
        "reviewer_checks": [dict(item) for item in REVIEWER_CHECKS],
        "outcome_lanes": [dict(item) for item in OUTCOME_LANES],
        "blocked_actions": list(BLOCKED_ACTIONS),
        "execution_boundaries": {
            "not_authorization": True,
            "prometheus_reload_authorized": False,
            "alertmanager_reload_authorized": False,
            "grafana_dashboard_apply_authorized": False,
            "signoz_rule_apply_authorized": False,
            "sentry_deploy_authorized": False,
            "langfuse_config_change_authorized": False,
            "otel_collector_reload_authorized": False,
            "receiver_route_change_authorized": False,
            "silence_policy_change_authorized": False,
            "telegram_send_authorized": False,
            "notification_route_change_authorized": False,
            "webhook_receiver_change_authorized": False,
            "remote_write_change_authorized": False,
            "exporter_deploy_authorized": False,
            "live_alert_fire_authorized": False,
            "alert_chain_smoke_authorized": False,
            "ssh_read_authorized": False,
            "ssh_write_authorized": False,
            "kubectl_action_authorized": False,
            "secret_value_collection_allowed": False,
            "host_write_authorized": False,
            "active_scan_authorized": False,
            "production_write_authorized": False,
            "runtime_gate": False,
            "action_buttons_allowed": False,
            "raw_alert_payload_storage_allowed": False,
            "false_green_acceptance_authorized": False,
        },
        "source_paths": [
            "docs/security/MONITORING-ALERTING-OBSERVABILITY-INVENTORY.md",
            "docs/security/monitoring-alerting-observability-inventory.snapshot.json",
            "docs/security/MONITORING-OWNER-REQUEST-DRAFT.md",
            "docs/security/monitoring-owner-request-draft.snapshot.json",
            "docs/security/MONITORING-OWNER-RESPONSE-ACCEPTANCE.md",
            "docs/security/monitoring-owner-response-acceptance.snapshot.json",
        ],
        "mode": "metadata_only_no_live_monitoring_no_reload_no_alert_send",
        "operator_interpretation": [
            "此計畫只定義 Monitoring / Alerting / Observability 事故後回讀欄位,不代表 live monitoring 已讀取或可讀取。",
            "route 200、dashboard up、container up、receiver reachable、CD success、UI 可見都不能單獨當成告警鏈路事故驗收。",
            "未來若要 reload、rule apply、receiver route change、silence、Telegram send、alert fire 或 alert chain smoke,必須另有維護窗口、rollback owner 與人工批准。",
        ],
    }


def main() -> int:
    """CLI entry point: load the source snapshot and emit the plan as JSON.

    The JSON goes to ``--output`` when given (parent directories are created),
    otherwise to stdout. A one-line status summary is always written to
    stderr. Returns 0 on success.
    """
    parser = argparse.ArgumentParser(description="IwoooS Monitoring / Alerting 事故後回讀只讀計畫產生器")
    parser.add_argument("--root", default=".", help="repo root")
    parser.add_argument(
        "--source-report",
        default="docs/security/monitoring-owner-response-acceptance.snapshot.json",
        help="monitoring-owner-response-acceptance.py 輸出的 JSON",
    )
    parser.add_argument("--output", help="寫出 JSON 報告")
    parser.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    args = parser.parse_args()

    root = Path(args.root).resolve()
    source_report = load_json(root / args.source_report)
    report = build_report(root, source_report, args.generated_at)
    payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)

    if args.output:
        output = Path(args.output)
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(payload + "\n", encoding="utf-8")
    else:
        print(payload)

    summary = report["summary"]
    # Status goes to stderr so stdout stays pure JSON when no --output is set.
    print(
        "MONITORING_POST_INCIDENT_READBACK_PLAN_OK "
        f"candidates={summary['readback_candidate_count']} "
        f"write_capable={summary['write_capable_readback_candidate_count']} "
        f"checks={summary['reviewer_check_count']} "
        f"lanes={summary['outcome_lane_count']} "
        f"accepted={summary['post_incident_readback_accepted_count']} "
        f"runtime_gate={summary['runtime_gate_count']}",
        file=sys.stderr,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())