411 lines
20 KiB
Python
411 lines
20 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
IwoooS Docker / systemd / host service post-incident readback 只讀計畫產生器。
|
||
|
||
本工具讀取 host service change evidence acceptance snapshot,建立事故後回讀
|
||
計畫:誰動了 Docker / systemd / compose / repair-bot、何時動、改前改後狀態、
|
||
哪些 public / admin route、AI provider、monitoring 與產品受影響、如何恢復、
|
||
如何防再發。它不 SSH、不讀 live host、不執行 docker / systemctl、不呼叫
|
||
repair-bot、不跑 Ansible、不做 route smoke、不保存 raw log / raw config,
|
||
也不把「服務變綠」誤判成 runtime authorization。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
from datetime import datetime, timedelta, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
|
||
# Fixed UTC+08:00 offset; build_report() uses it to stamp "generated_at" when
# the caller does not supply a timestamp.
TAIPEI = timezone(timedelta(hours=8))
|
||
|
||
# Field names published under each candidate's "readback_fields" key; the
# report summary also exposes the length of this list.
READBACK_FIELDS = [
    # Candidate identity and scope (build_candidate copies these from the source entry).
    "readback_candidate_id",
    "source_change_evidence_candidate_id",
    "surface_id",
    "label",
    "expected_host_scope",
    "config_kind",
    "service_scope",
    "control_tier",
    "write_capable_surface",
    "requires_live_evidence",
    # Evidence references (build_candidate initialises each of these to None).
    "change_or_incident_ref",
    "actor_attribution_ref",
    "boot_time_ref",
    "restart_or_recovery_window_ref",
    "before_service_state_ref",
    "after_service_state_ref",
    "docker_daemon_state_ref",
    "compose_stack_state_ref",
    "systemd_unit_state_ref",
    "failed_unit_review_ref",
    "port_binding_state_ref",
    "dependency_impact_ref",
    "public_route_recovery_ref",
    "admin_route_recovery_ref",
    "agent_provider_health_ref",
    "monitoring_alert_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "restoration_evidence_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    # Ownership / outcome fields and the not-an-approval marker.
    "maintenance_window",
    "rollback_owner",
    "reviewer_outcome",
    "followup_owner",
    "not_approval",
]
|
||
|
||
# Field names published as "required_readback_fields", both on every candidate
# and at the top level of the report.
# NOTE(review): the last four names (redacted_evidence_refs and the three
# *_attestation entries) do not appear in READBACK_FIELDS, and build_candidate()
# emits no keys for them — confirm this is intended.
REQUIRED_READBACK_FIELDS = [
    "change_or_incident_ref",
    "actor_attribution_ref",
    "boot_time_ref",
    "restart_or_recovery_window_ref",
    "before_service_state_ref",
    "after_service_state_ref",
    "docker_daemon_state_ref",
    "compose_stack_state_ref",
    "systemd_unit_state_ref",
    "failed_unit_review_ref",
    "port_binding_state_ref",
    "dependency_impact_ref",
    "public_route_recovery_ref",
    "admin_route_recovery_ref",
    "agent_provider_health_ref",
    "monitoring_alert_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "restoration_evidence_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_owner",
    "followup_owner",
    "redacted_evidence_refs",
    "no_secret_value_attestation",
    "no_raw_log_or_config_attestation",
    "no_false_green_attestation",
]
|
||
|
||
# Reviewer checklist. Each entry pairs a stable check_id with the instruction
# text shown to the reviewer. The full entries are emitted at report level;
# candidates carry only the check_id values.
REVIEWER_CHECKS = [
    {"check_id": check_id, "instruction": instruction}
    for check_id, instruction in (
        ("source_change_evidence_current", "來源 host service change evidence snapshot 必須是目前版本。"),
        ("incident_ref_present", "必須有可追溯 change / incident ref。"),
        ("actor_not_anonymous", "必須標示 actor role / team,不接受匿名 restart、kill、start、compose 或 daemon 操作。"),
        ("boot_or_recovery_window_present", "boot time、restart window 或 recovery window 必須有脫敏 ref。"),
        ("before_after_service_state_present", "必須有 before / after service state ref,不能只寫服務已恢復。"),
        ("docker_daemon_state_present", "Docker daemon active、starting、failed、socket、contention 或 API reachable 狀態必須有摘要 ref。"),
        ("compose_stack_state_present", "Compose stack / container state 只能收脫敏狀態摘要 ref,不保存 raw docker ps dump。"),
        ("systemd_unit_state_present", "systemd failed unit、restart policy 或 degraded state 必須有摘要 ref。"),
        ("failed_unit_review_present", "必須說明 failed unit 是否與事故、restart 或服務恢復相關。"),
        ("port_binding_state_present", "必須確認 host port、container port、proxy、gateway 與 firewall 狀態是否一致。"),
        ("dependency_impact_present", "必須列出上游、下游、DB、queue、registry、AI provider、public route 與 monitoring 影響。"),
        ("public_route_recovery_present", "public route 受影響時需有恢復 ref;無影響也需明確不適用。"),
        ("admin_route_recovery_present", "admin / internal operator route 受影響時需有恢復 ref;無影響也需明確不適用。"),
        ("agent_provider_health_present", "Ollama、AI provider、agent route 或 webhook 受影響時需有健康 readback ref。"),
        ("monitoring_alert_ref_present", "需列 monitoring / alert / dashboard / incident ref,不能只靠人工觀察。"),
        ("operator_notification_present", "需提供已通知受影響產品、owner 或 Session 的脫敏 ref。"),
        ("cross_project_sync_present", "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控,需有跨專案同步 ref。"),
        ("restoration_evidence_present", "已恢復事故需提供恢復時間與恢復證據;未恢復需提供 still-degraded ref。"),
        ("postcheck_independent", "post-check 必須獨立於原操作人與 UI 卡片。"),
        ("recurrence_guard_present", "必須提出防再發 guard、change freeze、owner review 或自動化阻擋。"),
        ("runner_repair_bot_contention_present", "必須確認 runner、repair-bot、backup job、iptables / xtables 或 compose action 是否競爭。"),
        ("maintenance_window_present", "後續任何 restart / repair / compose / systemd 操作都需維護窗口。"),
        ("rollback_owner_present", "rollback owner 與 rollback plan 必須同時存在。"),
        ("no_false_green_route_or_container", "不得只用 route 200、container up、Docker API 回應、dashboard up 或 service healthy 當成事故已驗收。"),
        ("raw_log_config_absent", "不得保存 raw docker logs、raw journal、raw compose、raw systemd unit、env dump 或未脫敏 host config。"),
        ("secret_or_key_value_absent", "不得包含 secret、SSH key、token、cookie、private key、hash 或 partial secret。"),
        ("counts_transition_safe", "只有 reviewer record 能更新 accepted count,且不得同時開 runtime gate。"),
        ("runtime_stays_zero", "readback plan 不得觸發任何 SSH、Docker、systemctl、repair-bot、Ansible、route smoke 或 production write。"),
    )
]
|
||
|
||
# Outcome lanes. Each entry pairs a lane_id with its meaning text. The full
# entries are emitted at report level; candidates carry only the lane_id values.
OUTCOME_LANES = [
    {"lane_id": lane_id, "meaning": meaning}
    for lane_id, meaning in (
        ("waiting_post_incident_readback", "尚未收到主機服務事故回讀包;所有 accepted / runtime count 維持 0。"),
        ("request_actor_supplement", "缺 actor / owner / decision 時要求補件。"),
        ("request_before_after_supplement", "缺 before / after、boot time、restart window 或 restoration evidence 時要求補件。"),
        ("request_service_state_supplement", "缺 Docker daemon、compose、systemd、failed unit、port binding 或 dependency 狀態時要求補件。"),
        ("request_impact_supplement", "缺 public/admin route、AI provider、monitoring、operator notification 或 cross-project sync 時要求補件。"),
        ("quarantine_raw_payload", "收到 secret、env dump、raw log、raw journal、raw compose 或未脫敏 host config 時只能隔離。"),
        ("reject_unattributed_restart", "無 actor、無 affected scope、無 rollback 或無 notification 的 restart / kill / compose action 不得驗收。"),
        ("ready_for_host_service_post_incident_review", "metadata 合格後,只能進 reviewer review。"),
        ("recurrence_guard_backfill_required", "需補防再發 guard、owner review、change freeze 或 automation block。"),
        ("waiting_runtime_gate", "即使 readback accepted,runtime gate 仍需獨立人工批准。"),
    )
]
|
||
|
||
# Action identifiers published verbatim as "blocked_actions" on every candidate
# and at report level. In the code visible here they are only emitted as data;
# nothing dispatches on them.
BLOCKED_ACTIONS = [
    # Host access.
    "ssh_read",
    "ssh_write",
    "live_host_read",
    # Docker / compose operations.
    "docker_ps_live_read",
    "docker_restart",
    "docker_kill",
    "docker_start",
    "docker_compose_up",
    "docker_compose_down",
    "docker_compose_pull",
    # systemd operations.
    "systemctl_restart",
    "systemctl_reload",
    "systemctl_kill",
    "systemctl_start",
    # Automation, privileged and network-facing operations.
    "repair_bot_execute",
    "ansible_apply",
    "sudo_action",
    "host_file_write",
    "firewall_change",
    "port_change",
    "route_smoke",
    "public_gateway_reload",
    "nginx_reload",
    "active_scan",
    # Collection or storage of secrets and raw payloads.
    "secret_value_collection",
    "raw_live_config_storage",
    "raw_docker_log_storage",
    "raw_journal_storage",
    "raw_env_dump_storage",
    # Review shortcuts.
    "accept_restart_without_actor",
    "accept_recovery_without_before_after",
    "accept_service_healthy_as_config_accepted",
    "accept_route_200_as_all_green",
    "accept_container_up_as_all_green",
    "skip_dependency_map_review",
    "skip_port_binding_review",
    "hide_daemon_runner_contention",
    "mark_readback_accepted_without_reviewer_record",
    # Runtime gate, UI actions and production writes.
    "open_runtime_gate",
    "add_action_button",
    "production_write",
]
|
||
|
||
|
||
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash of the repository at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory.
    The value is best-effort report metadata, so expected failures do not
    propagate.

    Args:
        root: Directory in which the git command is executed.

    Returns:
        The stripped command output, or ``"unknown"`` when the command cannot
        be started (for example git is not installed or *root* does not
        exist) or exits with a non-zero status.
    """
    try:
        completed = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: the executable or working directory is unavailable.
        # SubprocessError: includes CalledProcessError raised by check=True.
        # Anything else is a programming error and should surface.
        return "unknown"
    return completed.stdout.strip()
|
||
|
||
|
||
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 encoded JSON and return its top-level object.

    Args:
        path: Location of the JSON document.

    Returns:
        The decoded top-level JSON object.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
        ValueError: If the document's top-level value is not a JSON object.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(document, dict):
        # Fail here with a clear message rather than later with an
        # AttributeError when a caller invokes .get() on a list or scalar.
        raise ValueError(
            f"{path}: expected a JSON object at the top level, "
            f"got {type(document).__name__}"
        )
    return document
|
||
|
||
|
||
def build_candidate(source: dict[str, Any]) -> dict[str, Any]:
    """Build the initial, nothing-accepted readback candidate for one source entry.

    Args:
        source: One change-evidence candidate from the source snapshot. It
            must provide ``surface_id``, ``change_evidence_candidate_id``,
            ``label``, ``expected_host_scope``, ``config_kind``,
            ``service_scope``, ``control_tier``, ``write_capable_surface``
            and ``requires_live_evidence``.

    Returns:
        A dict whose identity and scope values are copied from *source*,
        whose evidence refs are all ``None``, whose owner / window / outcome
        fields hold pending placeholders, whose ``not_approval`` is ``True``
        and whose acceptance and authorization flags are all ``False``. The
        module-level field and action lists are referenced, not copied.

    Raises:
        KeyError: If *source* lacks any of the keys listed above.
    """
    sid = source["surface_id"]
    candidate: dict[str, Any] = {
        "readback_candidate_id": f"host_service_post_incident_readback:{sid}",
        "status": "waiting_post_incident_readback",
        "source_change_evidence_candidate_id": source["change_evidence_candidate_id"],
        "surface_id": sid,
    }

    # Scope attributes carried over unchanged from the source candidate.
    for copied_key in (
        "label",
        "expected_host_scope",
        "config_kind",
        "service_scope",
        "control_tier",
        "write_capable_surface",
        "requires_live_evidence",
    ):
        candidate[copied_key] = source[copied_key]

    # Evidence references start empty.
    candidate.update(
        dict.fromkeys(
            (
                "change_or_incident_ref",
                "actor_attribution_ref",
                "boot_time_ref",
                "restart_or_recovery_window_ref",
                "before_service_state_ref",
                "after_service_state_ref",
                "docker_daemon_state_ref",
                "compose_stack_state_ref",
                "systemd_unit_state_ref",
                "failed_unit_review_ref",
                "port_binding_state_ref",
                "dependency_impact_ref",
                "public_route_recovery_ref",
                "admin_route_recovery_ref",
                "agent_provider_health_ref",
                "monitoring_alert_ref",
                "operator_notification_ref",
                "cross_project_sync_ref",
                "restoration_evidence_ref",
                "postcheck_readback_ref",
                "recurrence_guard_ref",
            ),
            None,
        )
    )

    # Placeholders for ownership and outcome, then the shared catalogues.
    candidate["maintenance_window"] = "pending_post_incident_readback"
    candidate["rollback_owner"] = "pending_post_incident_readback"
    candidate["reviewer_outcome"] = "waiting_post_incident_readback"
    candidate["followup_owner"] = "pending_post_incident_readback"
    candidate["readback_fields"] = READBACK_FIELDS
    candidate["required_readback_fields"] = REQUIRED_READBACK_FIELDS
    candidate["reviewer_checks"] = [check["check_id"] for check in REVIEWER_CHECKS]
    candidate["outcome_lanes"] = [lane["lane_id"] for lane in OUTCOME_LANES]
    candidate["blocked_actions"] = BLOCKED_ACTIONS
    candidate["not_approval"] = True

    # Every received / accepted / authorized flag starts as False.
    candidate.update(
        dict.fromkeys(
            (
                "post_incident_readback_received",
                "post_incident_readback_accepted",
                "actor_attribution_accepted",
                "before_after_state_accepted",
                "docker_daemon_state_accepted",
                "compose_stack_state_accepted",
                "systemd_unit_state_accepted",
                "failed_unit_review_accepted",
                "port_binding_state_accepted",
                "dependency_impact_accepted",
                "public_route_recovery_accepted",
                "admin_route_recovery_accepted",
                "agent_provider_health_accepted",
                "monitoring_alert_accepted",
                "operator_notification_accepted",
                "cross_project_sync_accepted",
                "restoration_evidence_accepted",
                "postcheck_readback_accepted",
                "recurrence_guard_accepted",
                "maintenance_window_accepted",
                "rollback_owner_accepted",
                "no_false_green_accepted",
                "ssh_read_authorized",
                "ssh_write_authorized",
                "live_host_read_authorized",
                "docker_action_authorized",
                "systemctl_action_authorized",
                "repair_bot_execution_authorized",
                "ansible_apply_authorized",
                "route_smoke_authorized",
                "secret_value_collection_allowed",
                "active_scan_authorized",
                "runtime_gate",
                "action_buttons_allowed",
                "production_write_authorized",
            ),
            False,
        )
    )
    return candidate
|
||
|
||
|
||
def build_report(root: Path, source_report: dict[str, Any], generated_at: str | None) -> dict[str, Any]:
    """Assemble the post-incident readback plan from the source snapshot.

    Args:
        root: Directory handed to ``git_short_sha`` to obtain ``git_commit``.
        source_report: Decoded source snapshot. Its
            ``change_evidence_candidates`` list (empty when the key is
            absent) yields one readback candidate per entry, and its
            ``schema_version`` / ``status`` values are echoed into the report.
        generated_at: Timestamp string to record. When falsy, the current
            time in the ``TAIPEI`` offset is used, formatted as ISO 8601 with
            second precision.

    Returns:
        The report dict: metadata, a ``summary`` of counts, the shared
        catalogues, the candidate list and a ``boundaries`` map in which
        every authorization is ``False``.
    """
    if generated_at:
        stamp = generated_at
    else:
        stamp = datetime.now(TAIPEI).isoformat(timespec="seconds")

    candidates = [
        build_candidate(entry)
        for entry in source_report.get("change_evidence_candidates", [])
    ]
    total = len(candidates)
    write_capable_total = sum(1 for entry in candidates if entry["write_capable_surface"])
    live_required_total = sum(1 for entry in candidates if entry["requires_live_evidence"])

    summary: dict[str, Any] = {
        "readback_candidate_count": total,
        "write_capable_readback_candidate_count": write_capable_total,
        "live_evidence_required_readback_candidate_count": live_required_total,
        # These three reviews are counted for every candidate.
        "recovery_health_impact_review_required_candidate_count": total,
        "cross_project_sync_required_candidate_count": total,
        "no_false_green_required_candidate_count": total,
        "readback_field_count": len(READBACK_FIELDS),
        "required_readback_field_count": len(REQUIRED_READBACK_FIELDS),
        "reviewer_check_count": len(REVIEWER_CHECKS),
        "outcome_lane_count": len(OUTCOME_LANES),
        "blocked_action_count": len(BLOCKED_ACTIONS),
    }
    # The received / accepted / gate counters are constants in this plan.
    # NOTE(review): candidates carry maintenance_window_accepted and
    # rollback_owner_accepted flags, but no matching *_count appears here —
    # confirm that is intended.
    summary.update(
        dict.fromkeys(
            (
                "post_incident_readback_received_count",
                "post_incident_readback_accepted_count",
                "actor_attribution_accepted_count",
                "before_after_state_accepted_count",
                "docker_daemon_state_accepted_count",
                "compose_stack_state_accepted_count",
                "systemd_unit_state_accepted_count",
                "failed_unit_review_accepted_count",
                "port_binding_state_accepted_count",
                "dependency_impact_accepted_count",
                "public_route_recovery_accepted_count",
                "admin_route_recovery_accepted_count",
                "agent_provider_health_accepted_count",
                "monitoring_alert_accepted_count",
                "operator_notification_accepted_count",
                "cross_project_sync_accepted_count",
                "restoration_evidence_accepted_count",
                "postcheck_readback_accepted_count",
                "recurrence_guard_accepted_count",
                "no_false_green_accepted_count",
                "runtime_gate_count",
                "action_button_count",
            ),
            0,
        )
    )
    summary["coverage_percent_after_readback_plan"] = 64

    boundaries: dict[str, bool] = {"not_authorization": True}
    boundaries.update(
        dict.fromkeys(
            (
                "ssh_read_authorized",
                "ssh_write_authorized",
                "live_host_read_authorized",
                "docker_action_authorized",
                "docker_restart_authorized",
                "docker_kill_authorized",
                "docker_start_authorized",
                "docker_compose_action_authorized",
                "systemctl_action_authorized",
                "systemctl_restart_authorized",
                "repair_bot_execution_authorized",
                "ansible_apply_authorized",
                "route_smoke_authorized",
                "public_gateway_reload_authorized",
                "nginx_reload_authorized",
                "active_scan_authorized",
                "secret_value_collection_allowed",
                "raw_log_or_config_storage_allowed",
                "runtime_execution_authorized",
                "production_write_authorized",
                "action_buttons_allowed",
            ),
            False,
        )
    )

    return {
        "schema_version": "host_service_post_incident_readback_plan_v1",
        "generated_at": stamp,
        "git_commit": git_short_sha(root),
        "status": "post_incident_readback_plan_ready_no_runtime_action",
        "source_schema_version": source_report.get("schema_version"),
        "source_status": source_report.get("status"),
        "source_paths": [
            "docs/security/HOST-SERVICE-CHANGE-EVIDENCE-ACCEPTANCE.md",
            "docs/security/host-service-change-evidence-acceptance.snapshot.json",
            "docs/security/HOST-SERVICE-OWNER-RESPONSE-ACCEPTANCE.md",
            "docs/security/host-service-owner-response-acceptance.snapshot.json",
        ],
        "summary": summary,
        "required_readback_fields": REQUIRED_READBACK_FIELDS,
        "reviewer_checks": REVIEWER_CHECKS,
        "outcome_lanes": OUTCOME_LANES,
        "blocked_actions": BLOCKED_ACTIONS,
        "readback_candidates": candidates,
        "boundaries": boundaries,
    }
|
||
|
||
|
||
def main() -> int:
    """Parse the command line, write the readback plan and print a status line.

    The module docstring is used as the argparse description. ``--root`` is
    resolved to an absolute path, and both the source snapshot path and the
    output path are joined onto it. The plan is serialized as sorted,
    2-space-indented JSON with non-ASCII characters preserved, followed by a
    trailing newline.

    Returns:
        ``0`` once the file has been written and the status line printed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--root", default=".")
    cli.add_argument(
        "--source-change-evidence-report",
        default="docs/security/host-service-change-evidence-acceptance.snapshot.json",
    )
    cli.add_argument(
        "--output",
        default="docs/security/host-service-post-incident-readback-plan.snapshot.json",
    )
    cli.add_argument("--generated-at")
    options = cli.parse_args()

    base_dir = Path(options.root).resolve()
    snapshot = load_json(base_dir / options.source_change_evidence_report)
    plan = build_report(base_dir, snapshot, options.generated_at)
    # Serialize before touching the filesystem so a serialization failure
    # leaves no directory behind.
    serialized = json.dumps(plan, ensure_ascii=False, indent=2, sort_keys=True)

    destination = base_dir / options.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(serialized + "\n", encoding="utf-8")

    counts = plan["summary"]
    status_fields = (
        ("candidates", "readback_candidate_count"),
        ("checks", "reviewer_check_count"),
        ("lanes", "outcome_lane_count"),
        ("accepted", "post_incident_readback_accepted_count"),
        ("runtime_gate", "runtime_gate_count"),
    )
    status_parts = ["HOST_SERVICE_POST_INCIDENT_READBACK_PLAN_OK"]
    status_parts.extend(f"{label}={counts[key]}" for label, key in status_fields)
    print(" ".join(status_parts))
    return 0
|
||
|
||
|
||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|