Files
awoooi/scripts/security/host-service-post-incident-readback-plan.py
Your Name abda0ef617
All checks were successful
Code Review / ai-code-review (push) Successful in 18s
CD Pipeline / tests (push) Successful in 1m41s
CD Pipeline / build-and-deploy (push) Successful in 5m21s
CD Pipeline / post-deploy-checks (push) Successful in 2m49s
feat(iwooos): 新增主機服務事故回讀 gate
2026-06-15 20:01:14 +08:00

411 lines
20 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
IwoooS Docker / systemd / host service post-incident readback 只讀計畫產生器。
本工具讀取 host service change evidence acceptance snapshot建立事故後回讀
計畫:誰動了 Docker / systemd / compose / repair-bot、何時動、改前改後狀態、
哪些 public / admin route、AI provider、monitoring 與產品受影響、如何恢復、
如何防再發。它不 SSH、不讀 live host、不執行 docker / systemctl、不呼叫
repair-bot、不跑 Ansible、不做 route smoke、不保存 raw log / raw config
也不把「服務變綠」誤判成 runtime authorization。
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
# Fixed UTC+08:00 offset; build_report uses it when it has to stamp the
# report with the current time because no --generated-at value was given.
TAIPEI = timezone(timedelta(hours=8))
# Field names listed on every readback candidate under "readback_fields"
# (see build_candidate). Its length is reported in the summary as
# "readback_field_count". The order here is the order that gets serialized.
READBACK_FIELDS = [
    "readback_candidate_id",
    "source_change_evidence_candidate_id",
    "surface_id",
    "label",
    "expected_host_scope",
    "config_kind",
    "service_scope",
    "control_tier",
    "write_capable_surface",
    "requires_live_evidence",
    "change_or_incident_ref",
    "actor_attribution_ref",
    "boot_time_ref",
    "restart_or_recovery_window_ref",
    "before_service_state_ref",
    "after_service_state_ref",
    "docker_daemon_state_ref",
    "compose_stack_state_ref",
    "systemd_unit_state_ref",
    "failed_unit_review_ref",
    "port_binding_state_ref",
    "dependency_impact_ref",
    "public_route_recovery_ref",
    "admin_route_recovery_ref",
    "agent_provider_health_ref",
    "monitoring_alert_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "restoration_evidence_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_owner",
    "reviewer_outcome",
    "followup_owner",
    "not_approval",
]
# Field names emitted as "required_readback_fields", both at the top level of
# the report and on every candidate; the length is reported in the summary as
# "required_readback_field_count".
# NOTE(review): the last four entries ("redacted_evidence_refs" and the three
# "*_attestation" names) do not appear in READBACK_FIELDS and are not keys of
# the candidate built by build_candidate -- confirm that this is intentional.
REQUIRED_READBACK_FIELDS = [
    "change_or_incident_ref",
    "actor_attribution_ref",
    "boot_time_ref",
    "restart_or_recovery_window_ref",
    "before_service_state_ref",
    "after_service_state_ref",
    "docker_daemon_state_ref",
    "compose_stack_state_ref",
    "systemd_unit_state_ref",
    "failed_unit_review_ref",
    "port_binding_state_ref",
    "dependency_impact_ref",
    "public_route_recovery_ref",
    "admin_route_recovery_ref",
    "agent_provider_health_ref",
    "monitoring_alert_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "restoration_evidence_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_owner",
    "followup_owner",
    "redacted_evidence_refs",
    "no_secret_value_attestation",
    "no_raw_log_or_config_attestation",
    "no_false_green_attestation",
]
# Reviewer checklist. Each entry pairs a stable "check_id" with a
# human-readable "instruction". The full entries are emitted at the top level
# of the report ("reviewer_checks"); each candidate carries only the list of
# check_id values. The instruction strings are output data and are written to
# the snapshot verbatim, so they must not be edited casually.
REVIEWER_CHECKS = [
    {"check_id": "source_change_evidence_current", "instruction": "來源 host service change evidence snapshot 必須是目前版本。"},
    {"check_id": "incident_ref_present", "instruction": "必須有可追溯 change / incident ref。"},
    {"check_id": "actor_not_anonymous", "instruction": "必須標示 actor role / team不接受匿名 restart、kill、start、compose 或 daemon 操作。"},
    {"check_id": "boot_or_recovery_window_present", "instruction": "boot time、restart window 或 recovery window 必須有脫敏 ref。"},
    {"check_id": "before_after_service_state_present", "instruction": "必須有 before / after service state ref不能只寫服務已恢復。"},
    {"check_id": "docker_daemon_state_present", "instruction": "Docker daemon active、starting、failed、socket、contention 或 API reachable 狀態必須有摘要 ref。"},
    {"check_id": "compose_stack_state_present", "instruction": "Compose stack / container state 只能收脫敏狀態摘要 ref不保存 raw docker ps dump。"},
    {"check_id": "systemd_unit_state_present", "instruction": "systemd failed unit、restart policy 或 degraded state 必須有摘要 ref。"},
    {"check_id": "failed_unit_review_present", "instruction": "必須說明 failed unit 是否與事故、restart 或服務恢復相關。"},
    {"check_id": "port_binding_state_present", "instruction": "必須確認 host port、container port、proxy、gateway 與 firewall 狀態是否一致。"},
    {"check_id": "dependency_impact_present", "instruction": "必須列出上游、下游、DB、queue、registry、AI provider、public route 與 monitoring 影響。"},
    {"check_id": "public_route_recovery_present", "instruction": "public route 受影響時需有恢復 ref無影響也需明確不適用。"},
    {"check_id": "admin_route_recovery_present", "instruction": "admin / internal operator route 受影響時需有恢復 ref無影響也需明確不適用。"},
    {"check_id": "agent_provider_health_present", "instruction": "Ollama、AI provider、agent route 或 webhook 受影響時需有健康 readback ref。"},
    {"check_id": "monitoring_alert_ref_present", "instruction": "需列 monitoring / alert / dashboard / incident ref不能只靠人工觀察。"},
    {"check_id": "operator_notification_present", "instruction": "需提供已通知受影響產品、owner 或 Session 的脫敏 ref。"},
    {"check_id": "cross_project_sync_present", "instruction": "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控需有跨專案同步 ref。"},
    {"check_id": "restoration_evidence_present", "instruction": "已恢復事故需提供恢復時間與恢復證據;未恢復需提供 still-degraded ref。"},
    {"check_id": "postcheck_independent", "instruction": "post-check 必須獨立於原操作人與 UI 卡片。"},
    {"check_id": "recurrence_guard_present", "instruction": "必須提出防再發 guard、change freeze、owner review 或自動化阻擋。"},
    {"check_id": "runner_repair_bot_contention_present", "instruction": "必須確認 runner、repair-bot、backup job、iptables / xtables 或 compose action 是否競爭。"},
    {"check_id": "maintenance_window_present", "instruction": "後續任何 restart / repair / compose / systemd 操作都需維護窗口。"},
    {"check_id": "rollback_owner_present", "instruction": "rollback owner 與 rollback plan 必須同時存在。"},
    {"check_id": "no_false_green_route_or_container", "instruction": "不得只用 route 200、container up、Docker API 回應、dashboard up 或 service healthy 當成事故已驗收。"},
    {"check_id": "raw_log_config_absent", "instruction": "不得保存 raw docker logs、raw journal、raw compose、raw systemd unit、env dump 或未脫敏 host config。"},
    {"check_id": "secret_or_key_value_absent", "instruction": "不得包含 secret、SSH key、token、cookie、private key、hash 或 partial secret。"},
    {"check_id": "counts_transition_safe", "instruction": "只有 reviewer record 能更新 accepted count且不得同時開 runtime gate。"},
    {"check_id": "runtime_stays_zero", "instruction": "readback plan 不得觸發任何 SSH、Docker、systemctl、repair-bot、Ansible、route smoke 或 production write。"},
]
# Possible review outcomes. Each entry pairs a stable "lane_id" with a
# human-readable "meaning". The full entries are emitted at the top level of
# the report ("outcome_lanes"); each candidate carries only the lane_id list.
# The first lane_id is also the literal initial "status" / "reviewer_outcome"
# value that build_candidate assigns. The meaning strings are output data and
# are written to the snapshot verbatim.
OUTCOME_LANES = [
    {"lane_id": "waiting_post_incident_readback", "meaning": "尚未收到主機服務事故回讀包;所有 accepted / runtime count 維持 0。"},
    {"lane_id": "request_actor_supplement", "meaning": "缺 actor / owner / decision 時要求補件。"},
    {"lane_id": "request_before_after_supplement", "meaning": "缺 before / after、boot time、restart window 或 restoration evidence 時要求補件。"},
    {"lane_id": "request_service_state_supplement", "meaning": "缺 Docker daemon、compose、systemd、failed unit、port binding 或 dependency 狀態時要求補件。"},
    {"lane_id": "request_impact_supplement", "meaning": "缺 public/admin route、AI provider、monitoring、operator notification 或 cross-project sync 時要求補件。"},
    {"lane_id": "quarantine_raw_payload", "meaning": "收到 secret、env dump、raw log、raw journal、raw compose 或未脫敏 host config 時只能隔離。"},
    {"lane_id": "reject_unattributed_restart", "meaning": "無 actor、無 affected scope、無 rollback 或無 notification 的 restart / kill / compose action 不得驗收。"},
    {"lane_id": "ready_for_host_service_post_incident_review", "meaning": "metadata 合格後,只能進 reviewer review。"},
    {"lane_id": "recurrence_guard_backfill_required", "meaning": "需補防再發 guard、owner review、change freeze 或 automation block。"},
    {"lane_id": "waiting_runtime_gate", "meaning": "即使 readback acceptedruntime gate 仍需獨立人工批准。"},
]
# Identifiers of actions the readback plan declares as blocked. The list is
# emitted as "blocked_actions" at the top level of the report and on every
# candidate, and its length is reported as "blocked_action_count".
# Within this file the list is only serialized; no code here consults it to
# gate behaviour.
BLOCKED_ACTIONS = [
    "ssh_read",
    "ssh_write",
    "live_host_read",
    "docker_ps_live_read",
    "docker_restart",
    "docker_kill",
    "docker_start",
    "docker_compose_up",
    "docker_compose_down",
    "docker_compose_pull",
    "systemctl_restart",
    "systemctl_reload",
    "systemctl_kill",
    "systemctl_start",
    "repair_bot_execute",
    "ansible_apply",
    "sudo_action",
    "host_file_write",
    "firewall_change",
    "port_change",
    "route_smoke",
    "public_gateway_reload",
    "nginx_reload",
    "active_scan",
    "secret_value_collection",
    "raw_live_config_storage",
    "raw_docker_log_storage",
    "raw_journal_storage",
    "raw_env_dump_storage",
    "accept_restart_without_actor",
    "accept_recovery_without_before_after",
    "accept_service_healthy_as_config_accepted",
    "accept_route_200_as_all_green",
    "accept_container_up_as_all_green",
    "skip_dependency_map_review",
    "skip_port_binding_review",
    "hide_daemon_runner_contention",
    "mark_readback_accepted_without_reviewer_record",
    "open_runtime_gate",
    "add_action_button",
    "production_write",
]
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash of the repository at *root*.

    The value is best-effort report metadata: when the hash cannot be
    obtained the placeholder ``"unknown"`` is returned instead of raising.

    Args:
        root: Directory used as the working directory for
            ``git rev-parse --short HEAD``.

    Returns:
        The command's standard output with surrounding whitespace removed,
        or ``"unknown"`` when git cannot be started, exits with a non-zero
        status, produces undecodable output, or prints nothing.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # OSError: git is not installed or *root* cannot be used as cwd.
        # SubprocessError: covers CalledProcessError raised by check=True.
        # UnicodeDecodeError: text=True could not decode the captured output.
        # Anything else is a programming error and is allowed to surface.
        return "unknown"
    # An empty string would be a misleading commit id in the report.
    return result.stdout.strip() or "unknown"
def load_json(path: Path) -> dict[str, Any]:
    """Read the file at *path* as UTF-8 text and parse it as JSON.

    Args:
        path: Location of the JSON document.

    Returns:
        The parsed document. The ``dict`` annotation is not enforced here;
        whatever top-level value the JSON contains is returned as-is.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def build_candidate(source: dict[str, Any]) -> dict[str, Any]:
    """Create the blank post-incident readback candidate for one source entry.

    Identity and scope values are copied from *source*. Every evidence
    reference starts as ``None``, owner/window slots start as a pending
    placeholder, every acceptance or authorization flag starts as ``False``,
    and ``not_approval`` is ``True``.

    Args:
        source: One change-evidence candidate. It must provide the keys
            ``surface_id``, ``change_evidence_candidate_id``, ``label``,
            ``expected_host_scope``, ``config_kind``, ``service_scope``,
            ``control_tier``, ``write_capable_surface`` and
            ``requires_live_evidence``.

    Returns:
        The candidate mapping. Its ``readback_fields``,
        ``required_readback_fields`` and ``blocked_actions`` values are the
        module-level lists themselves, not copies.

    Raises:
        KeyError: If *source* lacks one of the keys listed above.
    """
    # Values taken over from the source entry unchanged, under the same key.
    copied_keys = (
        "label",
        "expected_host_scope",
        "config_kind",
        "service_scope",
        "control_tier",
        "write_capable_surface",
        "requires_live_evidence",
    )
    # Evidence references that nobody has supplied yet.
    empty_ref_keys = (
        "change_or_incident_ref",
        "actor_attribution_ref",
        "boot_time_ref",
        "restart_or_recovery_window_ref",
        "before_service_state_ref",
        "after_service_state_ref",
        "docker_daemon_state_ref",
        "compose_stack_state_ref",
        "systemd_unit_state_ref",
        "failed_unit_review_ref",
        "port_binding_state_ref",
        "dependency_impact_ref",
        "public_route_recovery_ref",
        "admin_route_recovery_ref",
        "agent_provider_health_ref",
        "monitoring_alert_ref",
        "operator_notification_ref",
        "cross_project_sync_ref",
        "restoration_evidence_ref",
        "postcheck_readback_ref",
        "recurrence_guard_ref",
    )
    # Acceptance and authorization switches; all of them start switched off.
    disabled_flag_keys = (
        "post_incident_readback_received",
        "post_incident_readback_accepted",
        "actor_attribution_accepted",
        "before_after_state_accepted",
        "docker_daemon_state_accepted",
        "compose_stack_state_accepted",
        "systemd_unit_state_accepted",
        "failed_unit_review_accepted",
        "port_binding_state_accepted",
        "dependency_impact_accepted",
        "public_route_recovery_accepted",
        "admin_route_recovery_accepted",
        "agent_provider_health_accepted",
        "monitoring_alert_accepted",
        "operator_notification_accepted",
        "cross_project_sync_accepted",
        "restoration_evidence_accepted",
        "postcheck_readback_accepted",
        "recurrence_guard_accepted",
        "maintenance_window_accepted",
        "rollback_owner_accepted",
        "no_false_green_accepted",
        "ssh_read_authorized",
        "ssh_write_authorized",
        "live_host_read_authorized",
        "docker_action_authorized",
        "systemctl_action_authorized",
        "repair_bot_execution_authorized",
        "ansible_apply_authorized",
        "route_smoke_authorized",
        "secret_value_collection_allowed",
        "active_scan_authorized",
        "runtime_gate",
        "action_buttons_allowed",
        "production_write_authorized",
    )
    waiting = "waiting_post_incident_readback"
    pending = "pending_post_incident_readback"

    surface = source["surface_id"]
    # The dict is assembled in stages, but in the same key order throughout.
    candidate: dict[str, Any] = {
        "readback_candidate_id": f"host_service_post_incident_readback:{surface}",
        "status": waiting,
        "source_change_evidence_candidate_id": source["change_evidence_candidate_id"],
        "surface_id": surface,
    }
    for key in copied_keys:
        candidate[key] = source[key]
    candidate.update(dict.fromkeys(empty_ref_keys))
    candidate.update(
        {
            "maintenance_window": pending,
            "rollback_owner": pending,
            "reviewer_outcome": waiting,
            "followup_owner": pending,
            "readback_fields": READBACK_FIELDS,
            "required_readback_fields": REQUIRED_READBACK_FIELDS,
            "reviewer_checks": [check["check_id"] for check in REVIEWER_CHECKS],
            "outcome_lanes": [lane["lane_id"] for lane in OUTCOME_LANES],
            "blocked_actions": BLOCKED_ACTIONS,
            "not_approval": True,
        }
    )
    candidate.update(dict.fromkeys(disabled_flag_keys, False))
    return candidate
def build_report(
    root: Path,
    source_report: dict[str, Any],
    generated_at: str | None,
) -> dict[str, Any]:
    """Assemble the complete post-incident readback plan report.

    Args:
        root: Repository directory, used only to look up the git commit.
        source_report: Parsed change-evidence snapshot. The keys
            ``change_evidence_candidates`` (treated as empty when absent),
            ``schema_version`` and ``status`` are read.
        generated_at: Timestamp to record. When falsy, the current time in
            the ``TAIPEI`` offset is used, formatted to whole seconds.

    Returns:
        The report mapping: metadata, a ``summary`` of counts, the shared
        field/check/lane/action tables, one readback candidate per source
        candidate, and a ``boundaries`` section whose authorization switches
        are all ``False``.

    Raises:
        KeyError: Propagated from ``build_candidate`` when a source candidate
            lacks a required key.
    """
    # Summary counters that this generator always reports as zero.
    zero_count_keys = (
        "post_incident_readback_received_count",
        "post_incident_readback_accepted_count",
        "actor_attribution_accepted_count",
        "before_after_state_accepted_count",
        "docker_daemon_state_accepted_count",
        "compose_stack_state_accepted_count",
        "systemd_unit_state_accepted_count",
        "failed_unit_review_accepted_count",
        "port_binding_state_accepted_count",
        "dependency_impact_accepted_count",
        "public_route_recovery_accepted_count",
        "admin_route_recovery_accepted_count",
        "agent_provider_health_accepted_count",
        "monitoring_alert_accepted_count",
        "operator_notification_accepted_count",
        "cross_project_sync_accepted_count",
        "restoration_evidence_accepted_count",
        "postcheck_readback_accepted_count",
        "recurrence_guard_accepted_count",
        "no_false_green_accepted_count",
        "runtime_gate_count",
        "action_button_count",
    )
    # Boundary switches that are always emitted as False.
    denied_boundary_keys = (
        "ssh_read_authorized",
        "ssh_write_authorized",
        "live_host_read_authorized",
        "docker_action_authorized",
        "docker_restart_authorized",
        "docker_kill_authorized",
        "docker_start_authorized",
        "docker_compose_action_authorized",
        "systemctl_action_authorized",
        "systemctl_restart_authorized",
        "repair_bot_execution_authorized",
        "ansible_apply_authorized",
        "route_smoke_authorized",
        "public_gateway_reload_authorized",
        "nginx_reload_authorized",
        "active_scan_authorized",
        "secret_value_collection_allowed",
        "raw_log_or_config_storage_allowed",
        "runtime_execution_authorized",
        "production_write_authorized",
        "action_buttons_allowed",
    )

    if generated_at:
        timestamp = generated_at
    else:
        timestamp = datetime.now(TAIPEI).isoformat(timespec="seconds")

    candidates = list(map(build_candidate, source_report.get("change_evidence_candidates", [])))
    candidate_total = len(candidates)
    write_capable_total = sum(1 for entry in candidates if entry["write_capable_surface"])
    live_required_total = sum(1 for entry in candidates if entry["requires_live_evidence"])

    summary: dict[str, Any] = {
        "readback_candidate_count": candidate_total,
        "write_capable_readback_candidate_count": write_capable_total,
        "live_evidence_required_readback_candidate_count": live_required_total,
        # These three review requirements apply to every candidate.
        "recovery_health_impact_review_required_candidate_count": candidate_total,
        "cross_project_sync_required_candidate_count": candidate_total,
        "no_false_green_required_candidate_count": candidate_total,
        "readback_field_count": len(READBACK_FIELDS),
        "required_readback_field_count": len(REQUIRED_READBACK_FIELDS),
        "reviewer_check_count": len(REVIEWER_CHECKS),
        "outcome_lane_count": len(OUTCOME_LANES),
        "blocked_action_count": len(BLOCKED_ACTIONS),
    }
    summary.update(dict.fromkeys(zero_count_keys, 0))
    # Hard-coded figure; how it is derived is not established in this file.
    summary["coverage_percent_after_readback_plan"] = 64

    boundaries = {"not_authorization": True}
    boundaries.update(dict.fromkeys(denied_boundary_keys, False))

    return {
        "schema_version": "host_service_post_incident_readback_plan_v1",
        "generated_at": timestamp,
        "git_commit": git_short_sha(root),
        "status": "post_incident_readback_plan_ready_no_runtime_action",
        "source_schema_version": source_report.get("schema_version"),
        "source_status": source_report.get("status"),
        "source_paths": [
            "docs/security/HOST-SERVICE-CHANGE-EVIDENCE-ACCEPTANCE.md",
            "docs/security/host-service-change-evidence-acceptance.snapshot.json",
            "docs/security/HOST-SERVICE-OWNER-RESPONSE-ACCEPTANCE.md",
            "docs/security/host-service-owner-response-acceptance.snapshot.json",
        ],
        "summary": summary,
        "required_readback_fields": REQUIRED_READBACK_FIELDS,
        "reviewer_checks": REVIEWER_CHECKS,
        "outcome_lanes": OUTCOME_LANES,
        "blocked_actions": BLOCKED_ACTIONS,
        "readback_candidates": candidates,
        "boundaries": boundaries,
    }
def main() -> int:
    """Generate the readback plan snapshot from the command line.

    Parses the process arguments, loads the source change-evidence snapshot
    (path relative to ``--root``), builds the report, writes it as sorted,
    indented, non-ASCII-preserving JSON with a trailing newline to
    ``--output`` (parent directories are created as needed), and prints a
    one-line status summary. The module docstring is used as the help
    description.

    Returns:
        ``0`` on success. Failures surface as exceptions.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--root", default=".")
    cli.add_argument(
        "--source-change-evidence-report",
        default="docs/security/host-service-change-evidence-acceptance.snapshot.json",
    )
    cli.add_argument(
        "--output",
        default="docs/security/host-service-post-incident-readback-plan.snapshot.json",
    )
    cli.add_argument("--generated-at")
    options = cli.parse_args()

    base_dir = Path(options.root).resolve()
    snapshot = load_json(base_dir / options.source_change_evidence_report)
    report = build_report(base_dir, snapshot, options.generated_at)

    # sort_keys keeps the written snapshot stable regardless of dict order.
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    destination = base_dir / options.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(serialized + "\n", encoding="utf-8")

    counts = report["summary"]
    status_line = (
        "HOST_SERVICE_POST_INCIDENT_READBACK_PLAN_OK "
        f"candidates={counts['readback_candidate_count']} "
        f"checks={counts['reviewer_check_count']} "
        f"lanes={counts['outcome_lane_count']} "
        f"accepted={counts['post_incident_readback_accepted_count']} "
        f"runtime_gate={counts['runtime_gate_count']}"
    )
    print(status_line)
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())