Files
awoooi/scripts/security/monitoring-owner-response-acceptance.py
Your Name 8c1f9dca0f
All checks were successful
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / tests (push) Successful in 1m37s
CD Pipeline / build-and-deploy (push) Successful in 3m49s
CD Pipeline / post-deploy-checks (push) Successful in 1m29s
feat(iwooos): 強化告警鏈路 no-false-green gate
2026-06-15 17:45:28 +08:00

441 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
IwoooS Monitoring / Alerting / Observability owner response acceptance 只讀帳本產生器。
本工具讀取 monitoring / alerting / observability inventory 與 owner request
draft,建立未來 owner response 如何收件、補件、隔離、拒收或進入
monitoring reviewer review 的 metadata-only acceptance ledger。它不連 live
Prometheus、不 reload Alertmanager、不套用 Grafana / SigNoz / Sentry /
Langfuse、不 reload OTEL、不改 receiver route、不建立 silence、不送
Telegram、不 fire live alert、不跑 alert chain smoke、不 SSH、不 kubectl、
不讀 secret value、不寫 production。
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
# Fixed UTC+08:00 offset; build_report uses it to stamp `generated_at` when no
# explicit timestamp is supplied.
TAIPEI = timezone(timedelta(hours=8))
# Field names published under "acceptance_fields" in every acceptance candidate
# and in the top-level report; the list length is reported as
# "acceptance_field_count".
ACCEPTANCE_FIELDS = [
"acceptance_candidate_id",
"request_id",
"surface_id",
"label",
"expected_scope",
"config_kind",
"observability_scope",
"control_tier",
"repo_source_path",
"repo_sha256",
"source_line_count",
"write_capable_surface",
"requires_live_evidence",
"owner_response_ref",
"owner_role_or_team",
"decision",
"decision_reason",
"affected_scope",
"redacted_evidence_refs",
"live_config_hash_ref",
"reload_owner",
"receiver_owner",
"route_smoke_plan",
"incident_context_ref",
"alert_chain_health_ref",
"receiver_receipt_proof_ref",
"stale_alert_review",
"silence_or_dedup_review",
"false_green_risk_review",
"post_reload_readback_plan",
"cross_project_notification_ref",
"maintenance_window",
"rollback_owner",
"validation_plan",
"noise_budget_owner",
"reviewer_outcome",
"followup_owner",
"not_approval",
]
# Reviewer checklist. Each entry pairs a stable "check_id" with a human-readable
# "instruction". The full entries are emitted in the report; each acceptance
# candidate carries only the list of check_id values.
REVIEWER_CHECKS = [
{"check_id": "owner_identity_present", "instruction": "owner role / team 必須可追溯。"},
{"check_id": "decision_reason_present", "instruction": "decision 與 decision reason 必須同時存在。"},
{"check_id": "affected_scope_matches_surface", "instruction": "affected scope 必須能對回 committed surface_id。"},
{"check_id": "redacted_refs_only", "instruction": "evidence 只能是脫敏 ref、hash、ticket、commit 或 artifact pointer。"},
{"check_id": "secret_value_absent", "instruction": "不得出現 token、Bot token、DSN secret、cookie、private key、env dump 或 partial secret。"},
{"check_id": "live_config_hash_metadata_only", "instruction": "live config hash 只能是 owner-provided metadata ref不得貼 raw config。"},
{"check_id": "reload_owner_present", "instruction": "Prometheus / Alertmanager / OTEL / Sentry 類變更必須有 reload / deploy owner。"},
{"check_id": "receiver_owner_present", "instruction": "receiver route、Telegram receipt 與 notification policy 必須有 receiver owner。"},
{"check_id": "route_smoke_plan_present", "instruction": "route smoke / receipt proof 必須是計畫或脫敏證據 ref不得直接 fire alert。"},
{"check_id": "incident_context_present", "instruction": "事故回補必須有 incident / change / outage context ref不得只寫服務已恢復。"},
{"check_id": "alert_chain_health_not_route_only", "instruction": "告警鏈路健康不得只用 public route 200 或 UI 可見判定,必須包含 rule、state、receiver 與 receipt/ref。"},
{"check_id": "receiver_receipt_proof_metadata_only", "instruction": "receiver receipt proof 只能是脫敏 ref、hash、message id 或 ticket不得貼 raw notification payload。"},
{"check_id": "stale_alert_review_present", "instruction": "必須確認告警是否 stale、pending、resolved 未清或資料來源停止更新。"},
{"check_id": "silence_or_dedup_review_present", "instruction": "必須確認 silence、mute、dedup、inhibit 或 maintenance rule 是否造成 false green。"},
{"check_id": "false_green_risk_review_present", "instruction": "必須列 no-false-green 判定,避免把 route up、容器 up 或 dashboard up 當作告警鏈路 up。"},
{"check_id": "post_reload_readback_plan_present", "instruction": "若後續有 reload / deploy必須先有 post-reload readback plan 與 stop condition。"},
{"check_id": "cross_project_notification_present", "instruction": "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控需有跨專案通知 ref。"},
{"check_id": "noise_budget_owner_present", "instruction": "告警噪音、silence、dedup 與測試通知必須有 noise budget owner。"},
{"check_id": "maintenance_window_present", "instruction": "reload、deploy、route change、smoke 或 notification send 必須另有維護窗口。"},
{"check_id": "rollback_owner_present", "instruction": "rollback owner、rollback ref 或 disable path 必須存在。"},
{"check_id": "validation_plan_present", "instruction": "validation / post-check 必須列 route、receipt、alert state、metrics 與 rollback stop condition。"},
{"check_id": "no_runtime_request", "instruction": "夾帶 reload、receiver route change、Telegram send、alert smoke、SSH 或 kubectl 要求時拒收。"},
{"check_id": "counts_transition_safe", "instruction": "只有 reviewer record 可更新 received / accepted / rejected不得同時開 runtime gate。"},
]
# Possible review outcomes. Each entry pairs a stable "lane_id" with a
# human-readable "meaning". The full entries are emitted in the report; each
# acceptance candidate carries only the list of lane_id values.
OUTCOME_LANES = [
{"lane_id": "waiting_owner_response", "meaning": "尚未收到 owner response所有 accepted / runtime count 維持 0。"},
{"lane_id": "quarantine_secret_or_raw_payload", "meaning": "收到 secret、raw config、raw receiver payload、未脫敏 log 或截圖時隔離。"},
{"lane_id": "reject_execution_request", "meaning": "夾帶 reload、deploy、route change、Telegram send、fire alert、SSH 或 kubectl 要求時拒收。"},
{"lane_id": "request_supplement", "meaning": "欄位不足、scope 不清、reload / receiver / route smoke / rollback / noise owner 缺失時要求補件。"},
{"lane_id": "ready_for_monitoring_review", "meaning": "metadata 合格後,只能進 monitoring reviewer review。"},
{"lane_id": "incident_recovery_backfill_required", "meaning": "事故後只看到服務恢復時,要求補 incident context、receiver receipt、stale / silence review 與 post-check refs。"},
{"lane_id": "no_false_green_supplement_required", "meaning": "只用 route 200、container up、dashboard up 或 UI 可見代表告警健康時,要求補 no-false-green 證據。"},
{"lane_id": "receiver_receipt_gap_required", "meaning": "缺 Telegram / webhook / receiver receipt proof 或 delivery state 時,不得接受告警鏈路健康。"},
{"lane_id": "stale_or_silenced_alert_review_required", "meaning": "缺 stale alert、silence、dedup、inhibit 或 maintenance rule review 時,要求補件。"},
{"lane_id": "post_reload_readback_required", "meaning": "reload / deploy / rule change 後必須有 readback、metrics、receipt 與 rollback stop condition。"},
{"lane_id": "owner_review_only_update", "meaning": "只允許更新只讀 owner review ledger不得 reload、send Telegram 或 fire alert。"},
{"lane_id": "waiting_runtime_gate", "meaning": "即使 owner response acceptedruntime gate 仍等待獨立人工批准。"},
]
# Action identifiers published under "blocked_actions" in every acceptance
# candidate and in the top-level report; the list length is reported as
# "blocked_action_count". This script only lists them; it performs none of them.
BLOCKED_ACTIONS = [
"prometheus_reload",
"alertmanager_reload",
"grafana_dashboard_apply",
"signoz_rule_apply",
"sentry_deploy",
"langfuse_config_change",
"otel_collector_reload",
"receiver_route_change",
"silence_policy_change",
"telegram_send",
"notification_route_change",
"webhook_receiver_change",
"remote_write_change",
"exporter_deploy",
"live_alert_fire",
"alert_chain_smoke",
"ssh_read",
"ssh_write",
"kubectl_action",
"secret_value_collection",
"host_write",
"active_scan",
"production_write",
"runtime_gate_open",
"raw_monitoring_payload_storage",
"accept_secret_value_evidence",
"mark_owner_response_accepted_without_reviewer_record",
"mark_route_200_as_alert_chain_healthy",
"mark_receiver_healthy_without_receipt",
"accept_silence_without_owner",
"accept_stale_alert_without_review",
"accept_reload_without_postcheck",
"store_raw_alert_payload",
"add_action_button",
]
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash of the repository at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory.
    The lookup is best-effort: when git cannot be started, exits non-zero,
    produces undecodable output, or prints nothing, the literal string
    ``"unknown"`` is returned instead of raising.

    Args:
        root: Directory in which the git command is executed.

    Returns:
        The stripped command output, or ``"unknown"`` on any git failure.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: git is not installed or *root* is not a usable directory.
        # ValueError: invalid arguments or output that cannot be decoded as text.
        # SubprocessError: non-zero exit status (CalledProcessError).
        # Anything else is a programming error and is allowed to propagate.
        return "unknown"
    # Never hand back an empty commit id; callers embed this value in the report.
    return result.stdout.strip() or "unknown"
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the JSON object it contains.

    Args:
        path: Location of the JSON document.

    Returns:
        The decoded top-level JSON object.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content is not valid UTF-8 / JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or if the
            top-level value is not a JSON object.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        # The report builder calls dict methods on the result; reject other
        # shapes here so the error names the offending file.
        raise ValueError(
            f"{path}: expected a JSON object at top level, got {type(data).__name__}"
        )
    return data
def acceptance_candidate(request: dict[str, Any]) -> dict[str, Any]:
    """Build the initial acceptance-ledger entry for one owner request draft.

    Descriptive fields are copied from *request*. Every owner-supplied field
    starts as ``None``, an empty list, or the placeholder
    ``"pending_owner_response"``, and every received / accepted / authorized
    flag starts as ``False``.

    Args:
        request: One request draft. It must contain ``surface_id``,
            ``request_id``, the descriptive keys copied below, and
            ``required_owner_fields``.

    Returns:
        A new dict describing the candidate in its waiting state.

    Raises:
        KeyError: If *request* lacks any of the keys read here.
    """
    pending = "pending_owner_response"
    waiting = "waiting_owner_response"

    # Read in the same order as the ledger keys are emitted.
    surface_id = request["surface_id"]
    entry: dict[str, Any] = {
        "acceptance_candidate_id": f"monitoring_owner_response_acceptance:{surface_id}",
        "status": waiting,
        "request_id": request["request_id"],
        "surface_id": surface_id,
    }

    # Descriptive metadata carried over unchanged from the request draft.
    for copied_key in (
        "label",
        "expected_scope",
        "config_kind",
        "observability_scope",
        "control_tier",
        "repo_source_path",
        "repo_sha256",
        "source_line_count",
        "write_capable_surface",
        "requires_live_evidence",
    ):
        entry[copied_key] = request[copied_key]

    # Owner-response fields: nothing has been received yet.
    entry.update(
        {
            "owner_response_ref": None,
            "owner_role_or_team": pending,
            "decision": pending,
            "decision_reason": pending,
            "affected_scope": pending,
            "redacted_evidence_refs": [],
            "live_config_hash_ref": None,
            "reload_owner": pending,
            "receiver_owner": pending,
            "route_smoke_plan": pending,
            "incident_context_ref": None,
            "alert_chain_health_ref": None,
            "receiver_receipt_proof_ref": None,
            "stale_alert_review": pending,
            "silence_or_dedup_review": pending,
            "false_green_risk_review": pending,
            "post_reload_readback_plan": pending,
            "cross_project_notification_ref": None,
            "maintenance_window": pending,
            "rollback_owner": pending,
            "validation_plan": pending,
            "noise_budget_owner": pending,
            "reviewer_outcome": waiting,
            "followup_owner": pending,
        }
    )

    # Review scaffolding shared by every candidate.
    entry["acceptance_fields"] = ACCEPTANCE_FIELDS
    entry["required_owner_fields"] = request["required_owner_fields"]
    entry["reviewer_checks"] = [check["check_id"] for check in REVIEWER_CHECKS]
    entry["outcome_lanes"] = [lane["lane_id"] for lane in OUTCOME_LANES]
    entry["blocked_actions"] = BLOCKED_ACTIONS
    entry["not_approval"] = True

    # Every progress / acceptance / authorization flag starts closed.
    closed_flags = (
        "request_sent",
        "recipient_confirmed",
        "owner_response_received",
        "owner_response_accepted",
        "owner_response_rejected",
        "owner_response_quarantined",
        "supplement_requested",
        "live_evidence_received",
        "live_config_hash_accepted",
        "reload_owner_accepted",
        "receiver_owner_accepted",
        "route_smoke_accepted",
        "incident_context_accepted",
        "alert_chain_health_accepted",
        "receiver_receipt_proof_accepted",
        "stale_alert_review_accepted",
        "silence_or_dedup_review_accepted",
        "false_green_risk_review_accepted",
        "post_reload_readback_plan_accepted",
        "cross_project_notification_accepted",
        "maintenance_window_accepted",
        "rollback_owner_accepted",
        "validation_plan_accepted",
        "noise_budget_owner_accepted",
        "prometheus_reload_authorized",
        "alertmanager_reload_authorized",
        "grafana_dashboard_apply_authorized",
        "signoz_rule_apply_authorized",
        "sentry_deploy_authorized",
        "langfuse_config_change_authorized",
        "otel_collector_reload_authorized",
        "receiver_route_change_authorized",
        "silence_policy_change_authorized",
        "telegram_send_authorized",
        "notification_route_change_authorized",
        "webhook_receiver_change_authorized",
        "remote_write_change_authorized",
        "exporter_deploy_authorized",
        "live_alert_fire_authorized",
        "alert_chain_smoke_authorized",
        "ssh_read_authorized",
        "ssh_write_authorized",
        "kubectl_action_authorized",
        "secret_value_collection_allowed",
        "host_write_authorized",
        "active_scan_authorized",
        "production_write_authorized",
        "runtime_gate",
        "action_buttons_allowed",
    )
    entry.update(dict.fromkeys(closed_flags, False))
    return entry
def build_report(
    root: Path,
    inventory: dict[str, Any],
    request_draft_report: dict[str, Any],
    generated_at: str | None,
) -> dict[str, Any]:
    """Assemble the owner-response acceptance ledger report.

    One acceptance candidate is created per entry of
    ``request_draft_report["request_drafts"]``. All received / accepted /
    authorized counters in the summary are fixed at 0, and every execution
    boundary is ``False`` apart from ``not_authorization``, which is ``True``.

    Args:
        root: Repository root, used only to look up the current git commit.
        inventory: Parsed inventory report; its ``schema_version``, ``status``
            and ``summary.surface_count`` are echoed into the result.
        request_draft_report: Parsed owner request draft report. It must
            contain ``required_owner_fields``.
        generated_at: Timestamp to record. When falsy, the current time in the
            ``TAIPEI`` offset is used, formatted to whole seconds.

    Returns:
        The report as a JSON-serialisable dict.

    Raises:
        KeyError: If ``required_owner_fields`` is missing from
            *request_draft_report*, or a request draft lacks a required key.
    """
    if generated_at:
        timestamp = generated_at
    else:
        timestamp = datetime.now(TAIPEI).isoformat(timespec="seconds")

    drafts = request_draft_report.get("request_drafts", [])
    candidates = [acceptance_candidate(draft) for draft in drafts]
    write_capable_total = sum(1 for entry in candidates if entry["write_capable_surface"])
    live_evidence_total = sum(1 for entry in candidates if entry["requires_live_evidence"])

    commit = git_short_sha(root)
    required_owner_fields = request_draft_report["required_owner_fields"]

    summary: dict[str, Any] = {
        "source_surface_count": inventory.get("summary", {}).get("surface_count", 0),
        "source_request_draft_count": request_draft_report.get("summary", {}).get("request_draft_count", 0),
        "acceptance_candidate_count": len(candidates),
        "write_capable_acceptance_candidate_count": write_capable_total,
        "live_evidence_required_candidate_count": live_evidence_total,
        "acceptance_field_count": len(ACCEPTANCE_FIELDS),
        "required_owner_field_count": len(required_owner_fields),
        "reviewer_check_count": len(REVIEWER_CHECKS),
        "outcome_lane_count": len(OUTCOME_LANES),
        "blocked_action_count": len(BLOCKED_ACTIONS),
    }
    # Nothing has been sent, received, accepted or authorized by this tool.
    zero_counters = (
        "request_sent_count",
        "recipient_confirmed_count",
        "owner_response_received_count",
        "owner_response_accepted_count",
        "owner_response_rejected_count",
        "owner_response_quarantined_count",
        "supplement_requested_count",
        "live_evidence_received_count",
        "live_config_hash_accepted_count",
        "reload_owner_accepted_count",
        "receiver_owner_accepted_count",
        "route_smoke_accepted_count",
        "incident_context_accepted_count",
        "alert_chain_health_accepted_count",
        "receiver_receipt_proof_accepted_count",
        "stale_alert_review_accepted_count",
        "silence_or_dedup_review_accepted_count",
        "false_green_risk_review_accepted_count",
        "post_reload_readback_plan_accepted_count",
        "cross_project_notification_accepted_count",
        "maintenance_window_accepted_count",
        "rollback_owner_accepted_count",
        "validation_plan_accepted_count",
        "noise_budget_owner_accepted_count",
        "prometheus_reload_authorized_count",
        "alertmanager_reload_authorized_count",
        "grafana_dashboard_apply_authorized_count",
        "signoz_rule_apply_authorized_count",
        "sentry_deploy_authorized_count",
        "langfuse_config_change_authorized_count",
        "otel_collector_reload_authorized_count",
        "receiver_route_change_authorized_count",
        "silence_policy_change_authorized_count",
        "telegram_send_authorized_count",
        "notification_route_change_authorized_count",
        "webhook_receiver_change_authorized_count",
        "remote_write_change_authorized_count",
        "exporter_deploy_authorized_count",
        "live_alert_fire_authorized_count",
        "alert_chain_smoke_authorized_count",
        "ssh_read_authorized_count",
        "ssh_write_authorized_count",
        "kubectl_action_authorized_count",
        "secret_value_collection_allowed_count",
        "host_write_authorized_count",
        "active_scan_authorized_count",
        "production_write_authorized_count",
        "runtime_gate_count",
        "action_button_count",
    )
    summary.update(dict.fromkeys(zero_counters, 0))

    # Every execution boundary is reported as closed.
    denied_boundaries = (
        "request_dispatch_authorized",
        "owner_response_accepted",
        "live_evidence_received",
        "prometheus_reload_authorized",
        "alertmanager_reload_authorized",
        "grafana_dashboard_apply_authorized",
        "signoz_rule_apply_authorized",
        "sentry_deploy_authorized",
        "langfuse_config_change_authorized",
        "otel_collector_reload_authorized",
        "receiver_route_change_authorized",
        "silence_policy_change_authorized",
        "telegram_send_authorized",
        "notification_route_change_authorized",
        "webhook_receiver_change_authorized",
        "remote_write_change_authorized",
        "exporter_deploy_authorized",
        "live_alert_fire_authorized",
        "alert_chain_smoke_authorized",
        "ssh_read_authorized",
        "ssh_write_authorized",
        "kubectl_action_authorized",
        "secret_value_collection_allowed",
        "host_write_authorized",
        "active_scan_authorized",
        "production_write_authorized",
        "runtime_execution_authorized",
        "route_200_alert_chain_health_accepted",
        "receiver_health_without_receipt_accepted",
        "false_green_acceptance_authorized",
        "raw_alert_payload_storage_allowed",
        "action_buttons_allowed",
    )
    execution_boundaries: dict[str, bool] = dict.fromkeys(denied_boundaries, False)
    execution_boundaries["not_authorization"] = True

    next_steps = [
        "等待 owner response未收到前不得更新 received / accepted / rejected count。",
        "收到回覆後先做欄位完整性、敏感 payload 隔離、reload / send / smoke execution request 拒收。",
        "metadata 合格也只能進 monitoring reviewer reviewreload、receiver route change、Telegram send、live alert fire、alert chain smoke 與 production write 仍需獨立人工批准。",
    ]

    return {
        "schema_version": "monitoring_owner_response_acceptance_v1",
        "generated_at": timestamp,
        "git_commit": commit,
        "source_inventory_schema_version": inventory.get("schema_version"),
        "source_inventory_status": inventory.get("status"),
        "source_owner_request_schema_version": request_draft_report.get("schema_version"),
        "source_owner_request_status": request_draft_report.get("status"),
        "status": "owner_response_acceptance_ledger_ready_no_runtime_action",
        "summary": summary,
        "execution_boundaries": execution_boundaries,
        "acceptance_fields": ACCEPTANCE_FIELDS,
        "required_owner_fields": required_owner_fields,
        "reviewer_checks": REVIEWER_CHECKS,
        "outcome_lanes": OUTCOME_LANES,
        "blocked_actions": BLOCKED_ACTIONS,
        "acceptance_candidates": candidates,
        "next_steps": next_steps,
    }
def main() -> int:
    """Command-line entry point: build the acceptance ledger and emit it as JSON.

    Both input reports are loaded relative to ``--root``. The report is written
    to ``--output`` (parent directories are created) or, when no output path is
    given, printed to stdout. A one-line status summary is always printed to
    stderr.

    Returns:
        0 on success.
    """
    cli = argparse.ArgumentParser(description="IwoooS Monitoring owner response acceptance 只讀帳本產生器")
    cli.add_argument("--root", default=".", help="repo root")
    cli.add_argument(
        "--inventory-report",
        default="docs/security/monitoring-alerting-observability-inventory.snapshot.json",
        help="monitoring-alerting-observability-inventory.py 輸出的 JSON",
    )
    cli.add_argument(
        "--owner-request-report",
        default="docs/security/monitoring-owner-request-draft.snapshot.json",
        help="monitoring-owner-request-draft.py 輸出的 JSON",
    )
    cli.add_argument("--output", help="寫出 JSON 報告")
    cli.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    options = cli.parse_args()

    repo_root = Path(options.root).resolve()
    inventory = load_json(repo_root / options.inventory_report)
    request_draft_report = load_json(repo_root / options.owner_request_report)
    report = build_report(repo_root, inventory, request_draft_report, options.generated_at)

    # Sorted keys keep the serialized report stable between runs.
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    if not options.output:
        print(serialized)
    else:
        target = Path(options.output)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(serialized + "\n", encoding="utf-8")

    # Status line goes to stderr so it never mixes with JSON printed on stdout.
    totals = report["summary"]
    status_fields = (
        ("candidates", "acceptance_candidate_count"),
        ("write_capable", "write_capable_acceptance_candidate_count"),
        ("checks", "reviewer_check_count"),
        ("lanes", "outcome_lane_count"),
        ("accepted", "owner_response_accepted_count"),
        ("runtime_gate", "runtime_gate_count"),
    )
    status_line = " ".join(
        ["MONITORING_OWNER_RESPONSE_ACCEPTANCE_OK"]
        + [f"{label}={totals[key]}" for label, key in status_fields]
    )
    print(status_line, file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())