Files
awoooi/scripts/security/k8s-argocd-post-incident-readback-plan.py
Your Name 45c2b8ebe6
All checks were successful
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / tests (push) Successful in 1m48s
CD Pipeline / build-and-deploy (push) Successful in 6m0s
CD Pipeline / post-deploy-checks (push) Successful in 2m19s
feat(iwooos): 新增 K8s ArgoCD 事故回讀 gate
2026-06-15 20:42:12 +08:00

477 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
IwoooS K8s / ArgoCD post-incident readback 只讀計畫產生器。
本工具讀取 K8s / ArgoCD GitOps 變更證據驗收 snapshot,建立事故後回讀
計畫:誰觸發 GitOps / rollout、ArgoCD 是否 Synced / Degraded、Pod 是否
Pending、是否有 image pull / scheduling / drift scanner / CronJob / route /
AI provider / monitoring 影響,以及如何防止把 UI 可見、route 200 或 CD
success 誤判成 runtime authorization。它不呼叫 ArgoCD API、不讀 live
cluster、不執行 kubectl / helm / kustomize、不改 NetworkPolicy / NodePort /
RBAC、不保存 raw manifest / kubeconfig / Secret value。
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
# Fixed UTC+08:00 offset; build_report() uses it to timestamp a report when
# no explicit generation time is supplied.
TAIPEI = timezone(timedelta(hours=8))
# Names of the fields a post-incident readback record is expected to carry.
# The list is published verbatim under "readback_fields" on every candidate
# and on the report, and its length feeds "readback_field_count".
# Nothing in the visible code validates values against these names.
READBACK_FIELDS = [
    # Identity / provenance of the candidate.
    "post_incident_readback_candidate_id",
    "source_change_evidence_candidate_id",
    "group_id",
    "root",
    "control_tier",
    "gate_tags",
    # Evidence references (build_candidate() initialises each of these to None).
    "k8s_incident_or_change_ref",
    "actor_attribution_ref",
    "argocd_app_health_ref",
    "argocd_sync_status_ref",
    "degraded_state_ref",
    "pending_workload_ref",
    "image_pull_or_scheduling_ref",
    "rollout_before_ref",
    "rollout_after_ref",
    "event_summary_ref",
    "metrics_alert_ref",
    "drift_scanner_ref",
    "cronjob_schedule_ref",
    "backup_restore_impact_ref",
    "secret_metadata_parity_ref",
    "network_policy_service_impact_ref",
    "rbac_serviceaccount_impact_ref",
    "public_admin_route_impact_ref",
    "ai_provider_monitoring_impact_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "recovery_or_still_degraded_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    # Follow-up ownership and review outcome.
    "maintenance_window",
    "rollback_revision",
    "rollback_owner",
    "reviewer_outcome",
    "followup_owner",
    "not_approval",
]
# Names of the fields published under "required_readback_fields" on every
# candidate and on the report; its length feeds "required_readback_field_count".
# NOTE(review): this is not a strict subset of READBACK_FIELDS. It omits
# "backup_restore_impact_ref" and adds "redacted_evidence_refs" plus three
# "*_attestation" names that READBACK_FIELDS does not list. Confirm that the
# difference is intentional.
REQUIRED_READBACK_FIELDS = [
    "k8s_incident_or_change_ref",
    "actor_attribution_ref",
    "argocd_app_health_ref",
    "argocd_sync_status_ref",
    "degraded_state_ref",
    "pending_workload_ref",
    "image_pull_or_scheduling_ref",
    "rollout_before_ref",
    "rollout_after_ref",
    "event_summary_ref",
    "metrics_alert_ref",
    "drift_scanner_ref",
    "cronjob_schedule_ref",
    "secret_metadata_parity_ref",
    "network_policy_service_impact_ref",
    "rbac_serviceaccount_impact_ref",
    "public_admin_route_impact_ref",
    "ai_provider_monitoring_impact_ref",
    "operator_notification_ref",
    "cross_project_sync_ref",
    "recovery_or_still_degraded_ref",
    "postcheck_readback_ref",
    "recurrence_guard_ref",
    "maintenance_window",
    "rollback_revision",
    "rollback_owner",
    "followup_owner",
    # Names that appear only in this list, not in READBACK_FIELDS.
    "redacted_evidence_refs",
    "no_secret_value_attestation",
    "no_raw_manifest_or_kubeconfig_attestation",
    "no_false_green_attestation",
]
# Reviewer checklist. Each entry pairs a stable "check_id" with a
# human-readable "instruction" (Traditional Chinese, emitted verbatim in the
# JSON report). Candidates carry only the check ids; the report carries the
# full entries, and the list length feeds "reviewer_check_count".
# Clauses inside an instruction are separated with the fullwidth comma ","
# so that an ASCII term is never fused with the following clause.
REVIEWER_CHECKS = [
    {"check_id": "source_change_evidence_current", "instruction": "來源 GitOps change evidence snapshot 必須是目前版本。"},
    {"check_id": "incident_or_change_ref_present", "instruction": "必須有 incident / change / deploy marker ref,不能只寫服務已恢復。"},
    {"check_id": "actor_attribution_present", "instruction": "必須標示 actor role / team,不接受匿名 ArgoCD sync、kubectl、Helm、rollout 或 image 變更。"},
    {"check_id": "argocd_sync_and_health_present", "instruction": "ArgoCD sync status、health status 與 revision 必須有脫敏 ref。"},
    {"check_id": "degraded_state_review_present", "instruction": "Synced / Degraded、OutOfSync、Progressing、Suspended 或 Unknown 狀態都需說明。"},
    {"check_id": "pending_workload_review_present", "instruction": "Pending Pod / Job / CronJob / Deployment 需列出 image pull、scheduling、quota、node、PVC 或 RBAC 檢查摘要。"},
    {"check_id": "rollout_before_after_present", "instruction": "必須有 rollout before / after ref,不能只用 CD success 或 deploy marker。"},
    {"check_id": "event_summary_redacted", "instruction": "只能收事件摘要 ref,不保存 raw event dump、raw manifest、raw pod log 或 raw kubeconfig。"},
    {"check_id": "metrics_alert_ref_present", "instruction": "需有 metric / alert / dashboard / incident ref,不能只靠人工觀察。"},
    {"check_id": "drift_scanner_ref_present", "instruction": "涉及 drift-scanner、配置漂移或 GitOps parity 時需有 readback ref,不適用也需明確說明。"},
    {"check_id": "cronjob_schedule_review_present", "instruction": "涉及 CronJob / scheduled job / backup schedule 時需確認 schedule、suspend、last run 與 missed run 摘要。"},
    {"check_id": "backup_restore_impact_called_out", "instruction": "涉及 Velero、backup、restore、PVC 或 object storage 時需列 backup / restore 影響。"},
    {"check_id": "secret_metadata_only", "instruction": "Secret 只能收 metadata parity ref,不得收 value、hash、partial token、DSN、cookie 或 kubeconfig。"},
    {"check_id": "network_policy_service_impact_present", "instruction": "涉及 NetworkPolicy、Service、Ingress、NodePort 或 DNS 時需列 route / port 影響。"},
    {"check_id": "rbac_serviceaccount_impact_present", "instruction": "涉及 RBAC / ServiceAccount / RoleBinding 時需列權限與 fallback 影響。"},
    {"check_id": "public_admin_route_impact_present", "instruction": "public / admin / API route 受影響時需有 route recovery ref,無影響也需明確不適用。"},
    {"check_id": "ai_provider_monitoring_impact_present", "instruction": "AI provider、Ollama、monitoring、alert route 或 webhook 受影響時需列 impact ref。"},
    {"check_id": "operator_notification_present", "instruction": "需提供已通知受影響產品、owner 或 Session 的脫敏 ref。"},
    {"check_id": "cross_project_sync_present", "instruction": "若影響 AwoooP、IwoooS、agent-bounty、StockPlatform、公開網站或監控需有跨專案同步 ref。"},
    {"check_id": "recovery_or_still_degraded_present", "instruction": "已恢復需提供恢復時間與恢復證據;未恢復需提供 still-degraded ref。"},
    {"check_id": "postcheck_independent", "instruction": "post-check 必須獨立於原操作人與 UI 卡片。"},
    {"check_id": "recurrence_guard_present", "instruction": "必須提出防再發 guard、change freeze、owner review 或 automation block。"},
    {"check_id": "maintenance_window_present", "instruction": "後續任何 sync / rollout / rollback / apply 操作都需維護窗口。"},
    {"check_id": "rollback_revision_present", "instruction": "rollback revision 與 rollback owner 必須同時存在。"},
    {"check_id": "no_false_green", "instruction": "不得只用 route 200、Pod Running、ArgoCD Synced、CD success、dashboard up 或 UI 可見當成事故已驗收。"},
    {"check_id": "raw_payload_absent", "instruction": "不得保存 raw manifest、raw kubeconfig、raw event dump、raw pod log、raw Secret 或未脫敏 screenshot。"},
    {"check_id": "runtime_stays_zero", "instruction": "readback plan 不得觸發 ArgoCD API、kubectl、Helm、Kustomize、NetworkPolicy / NodePort / RBAC 變更或 production write。"},
    {"check_id": "counts_transition_safe", "instruction": "只有 reviewer record 能更新 accepted count,且不得同時開 runtime gate。"},
]
# Possible review outcomes. Each entry pairs a stable "lane_id" with a
# human-readable "meaning" (Traditional Chinese, emitted verbatim in the JSON
# report). Candidates carry only the lane ids; the report carries the full
# entries, and the list length feeds "outcome_lane_count".
# "waiting_post_incident_readback" is also the literal initial value that
# build_candidate() assigns to "status" and "reviewer_outcome".
OUTCOME_LANES = [
    {"lane_id": "waiting_post_incident_readback", "meaning": "尚未收到 K8s / ArgoCD 事故回讀包;所有 accepted / runtime count 維持 0。"},
    {"lane_id": "request_actor_or_revision_supplement", "meaning": "缺 actor、deploy marker、ArgoCD revision 或 rollback revision 時要求補件。"},
    {"lane_id": "request_degraded_pending_supplement", "meaning": "缺 Degraded / Pending / image pull / scheduling / rollout before-after 時要求補件。"},
    {"lane_id": "request_event_metric_supplement", "meaning": "缺 event summary、metrics、alert、drift scanner 或 CronJob schedule 時要求補件。"},
    {"lane_id": "request_route_dependency_supplement", "meaning": "缺 public/admin route、AI provider、monitoring、backup / restore、network / RBAC 影響時要求補件。"},
    {"lane_id": "quarantine_raw_payload", "meaning": "收到 Secret value、kubeconfig、raw manifest、raw log、raw event 或未脫敏截圖時只能隔離。"},
    {"lane_id": "reject_runtime_claim", "meaning": "把 CD success、ArgoCD Synced、route 200、Pod Running 或 UI 可見當驗收時拒收。"},
    {"lane_id": "ready_for_k8s_post_incident_review", "meaning": "metadata 合格後,只能進 reviewer review。"},
    {"lane_id": "recurrence_guard_backfill_required", "meaning": "需補防再發 guard、owner review、change freeze 或 automation block。"},
    # The separator after "accepted" keeps the two clauses from reading as one word.
    {"lane_id": "waiting_runtime_gate", "meaning": "即使 readback accepted,runtime gate 仍需獨立人工批准。"},
]
# Identifiers of actions this plan declares as blocked. The list is published
# verbatim under "blocked_actions" on every candidate and on the report, and
# its length feeds "blocked_action_count". It is descriptive metadata only:
# nothing in the visible code enforces these names.
BLOCKED_ACTIONS = [
    # ArgoCD and live-cluster access.
    "argocd_api_read",
    "argocd_sync",
    "argocd_rollback",
    "live_cluster_read",
    # kubectl / Helm / Kustomize operations.
    "kubectl_get_live",
    "kubectl_apply",
    "kubectl_patch",
    "kubectl_delete",
    "kubectl_rollout_restart",
    "kubectl_scale",
    "helm_upgrade",
    "helm_rollback",
    "kustomize_image_set",
    # Configuration changes.
    "change_network_policy",
    "change_nodeport",
    "change_service_type",
    "change_ingress_route",
    "change_rbac",
    "change_serviceaccount",
    "change_configmap_runtime",
    "change_secret_metadata",
    # Collection or storage of sensitive / raw material.
    "secret_value_collection",
    "kubeconfig_collection",
    "raw_manifest_storage",
    "raw_event_dump_storage",
    "raw_pod_log_storage",
    "raw_secret_storage",
    # Restore, monitoring and production-side operations.
    "velero_restore",
    "restore_backup",
    "prometheus_rule_apply",
    "alertmanager_reload",
    "route_smoke",
    "production_write",
    # "False green" shortcuts and gate bypasses.
    "accept_synced_as_healthy",
    "accept_route_200_as_all_green",
    "accept_pod_running_as_all_green",
    "accept_cd_success_as_security_acceptance",
    "skip_degraded_pending_review",
    "mark_readback_accepted_without_reviewer_record",
    "open_runtime_gate",
    "add_action_button",
]
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash of the repository at *root*.

    The value is best-effort report metadata: an expected failure to obtain
    the hash yields the literal string ``"unknown"`` instead of raising.

    Args:
        root: Directory in which ``git rev-parse --short HEAD`` is executed.

    Returns:
        The stripped command output, or ``"unknown"`` when git cannot be
        started, exits with a non-zero status, produces output that cannot be
        decoded as text, or prints nothing.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: git is not installed or *root* cannot be used as cwd.
        # SubprocessError: includes CalledProcessError raised by check=True.
        # ValueError: includes UnicodeDecodeError from text=True decoding.
        # Anything else is a programming error and is allowed to propagate.
        return "unknown"
    # Never hand an empty commit id to the report.
    return result.stdout.strip() or "unknown"
def load_json(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 encoded JSON document stored at *path*.

    Args:
        path: File to read.

    Returns:
        The decoded JSON value. The annotation says ``dict``, but no check is
        made that the top-level value really is an object.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def build_candidate(source: dict[str, Any]) -> dict[str, Any]:
    """Create the initial, unreviewed readback candidate for one source entry.

    Identity fields are copied from *source*. Every evidence reference starts
    as ``None``, every ownership field starts as a "pending" placeholder, and
    every acceptance / authorization flag starts as ``False``.

    Args:
        source: One change-evidence candidate. It must contain ``group_id``,
            ``change_evidence_candidate_id``, ``root`` and ``control_tier``.
            ``gate_tags`` defaults to ``[]`` and ``write_capable`` defaults
            to ``True`` when absent.

    Returns:
        The candidate dict. The ``readback_fields``,
        ``required_readback_fields`` and ``blocked_actions`` values are the
        module-level lists themselves, not copies.

    Raises:
        KeyError: If a mandatory key is missing from *source*.
    """
    waiting = "waiting_post_incident_readback"
    pending = "pending_post_incident_readback"
    group = source["group_id"]

    # Evidence references that have not been supplied yet.
    evidence_refs = (
        "k8s_incident_or_change_ref",
        "actor_attribution_ref",
        "argocd_app_health_ref",
        "argocd_sync_status_ref",
        "degraded_state_ref",
        "pending_workload_ref",
        "image_pull_or_scheduling_ref",
        "rollout_before_ref",
        "rollout_after_ref",
        "event_summary_ref",
        "metrics_alert_ref",
        "drift_scanner_ref",
        "cronjob_schedule_ref",
        "backup_restore_impact_ref",
        "secret_metadata_parity_ref",
        "network_policy_service_impact_ref",
        "rbac_serviceaccount_impact_ref",
        "public_admin_route_impact_ref",
        "ai_provider_monitoring_impact_ref",
        "operator_notification_ref",
        "cross_project_sync_ref",
        "recovery_or_still_degraded_ref",
        "postcheck_readback_ref",
        "recurrence_guard_ref",
    )
    # Acceptance and authorization flags; all of them start closed.
    closed_flags = (
        "post_incident_readback_received",
        "post_incident_readback_accepted",
        "actor_attribution_accepted",
        "argocd_app_health_accepted",
        "argocd_sync_status_accepted",
        "degraded_state_accepted",
        "pending_workload_accepted",
        "image_pull_or_scheduling_accepted",
        "rollout_before_after_accepted",
        "event_summary_accepted",
        "metrics_alert_accepted",
        "drift_scanner_accepted",
        "cronjob_schedule_accepted",
        "backup_restore_impact_accepted",
        "secret_metadata_parity_accepted",
        "network_policy_service_impact_accepted",
        "rbac_serviceaccount_impact_accepted",
        "public_admin_route_impact_accepted",
        "ai_provider_monitoring_impact_accepted",
        "operator_notification_accepted",
        "cross_project_sync_accepted",
        "recovery_or_still_degraded_accepted",
        "postcheck_readback_accepted",
        "recurrence_guard_accepted",
        "no_false_green_accepted",
        "argocd_api_read_authorized",
        "argocd_sync_authorized",
        "kubectl_action_authorized",
        "helm_action_authorized",
        "live_cluster_read_authorized",
        "network_policy_change_authorized",
        "nodeport_change_authorized",
        "rbac_change_authorized",
        "secret_value_collection_allowed",
        "route_smoke_authorized",
        "production_write_authorized",
        "runtime_gate",
        "action_buttons_allowed",
    )

    candidate: dict[str, Any] = {
        "post_incident_readback_candidate_id": f"k8s_argocd_post_incident_readback:{group}",
        "status": waiting,
        "source_change_evidence_candidate_id": source["change_evidence_candidate_id"],
        "group_id": group,
        "root": source["root"],
        "control_tier": source["control_tier"],
        "gate_tags": source.get("gate_tags", []),
        "write_capable": source.get("write_capable", True),
    }
    candidate.update(dict.fromkeys(evidence_refs))
    candidate.update(
        {
            "maintenance_window": pending,
            "rollback_revision": pending,
            "rollback_owner": pending,
            "reviewer_outcome": waiting,
            "followup_owner": pending,
            "readback_fields": READBACK_FIELDS,
            "required_readback_fields": REQUIRED_READBACK_FIELDS,
            "reviewer_checks": [check["check_id"] for check in REVIEWER_CHECKS],
            "outcome_lanes": [lane["lane_id"] for lane in OUTCOME_LANES],
            "blocked_actions": BLOCKED_ACTIONS,
            "not_approval": True,
        }
    )
    candidate.update(dict.fromkeys(closed_flags, False))
    return candidate
def build_report(
    root: Path,
    source_report: dict[str, Any],
    generated_at: str | None,
) -> dict[str, Any]:
    """Assemble the read-only post-incident readback plan.

    One readback candidate is produced per entry of
    ``source_report["change_evidence_candidates"]``. Counters taken from
    ``source_report["summary"]`` are passed through (missing ones become 0).
    Every acceptance / authorization counter is the literal ``0`` and every
    boundary flag except ``not_authorization`` is ``False``.

    Args:
        root: Repository root; only used to look up the current git commit.
        source_report: Parsed change-evidence acceptance snapshot.
        generated_at: Timestamp to record as-is. When falsy, the current time
            in the ``TAIPEI`` zone is used, at seconds precision.

    Returns:
        The report as a JSON-serialisable dict.
    """
    # Flags reported both as "<flag>_count" (0) in the summary and as
    # "<flag>" (False) in the boundaries section.
    gate_flags = (
        "post_incident_readback_received",
        "post_incident_readback_accepted",
        "actor_attribution_accepted",
        "argocd_app_health_accepted",
        "argocd_sync_status_accepted",
        "degraded_state_accepted",
        "pending_workload_accepted",
        "image_pull_or_scheduling_accepted",
        "rollout_before_after_accepted",
        "event_summary_accepted",
        "metrics_alert_accepted",
        "drift_scanner_accepted",
        "cronjob_schedule_accepted",
        "backup_restore_impact_accepted",
        "secret_metadata_parity_accepted",
        "network_policy_service_impact_accepted",
        "rbac_serviceaccount_impact_accepted",
        "public_admin_route_impact_accepted",
        "ai_provider_monitoring_impact_accepted",
        "operator_notification_accepted",
        "cross_project_sync_accepted",
        "recovery_or_still_degraded_accepted",
        "postcheck_readback_accepted",
        "recurrence_guard_accepted",
        "no_false_green_accepted",
        "argocd_api_read_authorized",
        "argocd_sync_authorized",
        "kubectl_action_authorized",
        "helm_action_authorized",
        "live_cluster_read_authorized",
        "network_policy_change_authorized",
        "nodeport_change_authorized",
        "rbac_change_authorized",
        "secret_value_collection_allowed",
        "route_smoke_authorized",
        "production_write_authorized",
    )
    # (summary key in this report, key read from the source summary)
    passthrough = (
        ("source_change_evidence_candidate_count", "change_evidence_candidate_count"),
        ("source_c0_change_evidence_candidate_count", "c0_change_evidence_candidate_count"),
        ("source_c1_change_evidence_candidate_count", "c1_change_evidence_candidate_count"),
        ("source_write_capable_candidate_count", "write_capable_candidate_count"),
        ("source_scan_group_count", "source_scan_group_count"),
        ("source_manifest_file_count", "source_manifest_file_count"),
        ("source_yaml_manifest_file_count", "source_yaml_manifest_file_count"),
        ("source_c0_file_count", "source_c0_file_count"),
        ("deployment_object_count", "deployment_object_count"),
        ("cronjob_object_count", "cronjob_object_count"),
        ("secret_object_count", "secret_object_count"),
        ("network_policy_object_count", "network_policy_object_count"),
        ("rbac_object_count", "rbac_object_count"),
        ("argocd_application_count", "argocd_application_count"),
        ("prometheus_rule_count", "prometheus_rule_count"),
    )

    if generated_at:
        report_time = generated_at
    else:
        report_time = datetime.now(TAIPEI).isoformat(timespec="seconds")

    candidates = [
        build_candidate(entry)
        for entry in source_report.get("change_evidence_candidates", [])
    ]
    total = len(candidates)
    c0_total = sum(1 for cand in candidates if cand["control_tier"] == "C0")
    c1_total = sum(1 for cand in candidates if cand["control_tier"] == "C1")
    write_capable_total = sum(1 for cand in candidates if cand["write_capable"])
    upstream = source_report.get("summary", {})

    commit = git_short_sha(root)
    source_schema = source_report.get("schema_version")
    source_status = source_report.get("status")

    summary: dict[str, Any] = {
        out_key: upstream.get(src_key, 0) for out_key, src_key in passthrough
    }
    summary.update(
        {
            "readback_candidate_count": total,
            "c0_readback_candidate_count": c0_total,
            "c1_readback_candidate_count": c1_total,
            "write_capable_readback_candidate_count": write_capable_total,
            # Every candidate requires each of the following reviews.
            "degraded_or_pending_review_required_candidate_count": total,
            "drift_or_schedule_review_required_candidate_count": total,
            "route_ai_monitoring_impact_required_candidate_count": total,
            "cross_project_sync_required_candidate_count": total,
            "no_false_green_required_candidate_count": total,
            "readback_field_count": len(READBACK_FIELDS),
            "required_readback_field_count": len(REQUIRED_READBACK_FIELDS),
            "reviewer_check_count": len(REVIEWER_CHECKS),
            "outcome_lane_count": len(OUTCOME_LANES),
            "blocked_action_count": len(BLOCKED_ACTIONS),
        }
    )
    # Generating the plan accepts and authorizes nothing.
    summary.update({f"{flag}_count": 0 for flag in gate_flags})
    summary.update(
        {
            "runtime_gate_count": 0,
            "action_button_count": 0,
            # Hard-coded value; its derivation is not visible in this file.
            "coverage_percent_after_readback_plan": 66,
        }
    )

    boundaries: dict[str, bool] = dict.fromkeys(gate_flags, False)
    boundaries.update(
        runtime_execution_authorized=False,
        action_buttons_allowed=False,
        not_authorization=True,
    )

    return {
        "schema_version": "k8s_argocd_post_incident_readback_plan_v1",
        "generated_at": report_time,
        "git_commit": commit,
        "source_schema_version": source_schema,
        "source_status": source_status,
        "status": "post_incident_readback_plan_ready_no_runtime_action",
        "summary": summary,
        "boundaries": boundaries,
        "readback_fields": READBACK_FIELDS,
        "required_readback_fields": REQUIRED_READBACK_FIELDS,
        "reviewer_checks": REVIEWER_CHECKS,
        "outcome_lanes": OUTCOME_LANES,
        "blocked_actions": BLOCKED_ACTIONS,
        "readback_candidates": candidates,
        "source_paths": [
            "docs/security/K8S-ARGOCD-CHANGE-EVIDENCE-ACCEPTANCE.md",
            "docs/security/k8s-argocd-change-evidence-acceptance.snapshot.json",
        ],
    }
def main() -> int:
    """Parse the command line, build the readback plan and emit it as JSON.

    The JSON goes to the ``--output`` file when that option is given (parent
    directories are created as needed), otherwise to stdout. A one-line
    status summary is always written to stderr.

    Returns:
        ``0``. Failures surface as uncaught exceptions.
    """
    cli = argparse.ArgumentParser(description="IwoooS K8s / ArgoCD 事故後回讀只讀計畫產生器")
    cli.add_argument("--root", default=".", help="repo root")
    cli.add_argument(
        "--source-change-evidence-report",
        default="docs/security/k8s-argocd-change-evidence-acceptance.snapshot.json",
        help="k8s-argocd-change-evidence-acceptance.py 輸出的 JSON",
    )
    cli.add_argument("--output", help="寫出 JSON 報告")
    cli.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    options = cli.parse_args()

    repo_root = Path(options.root).resolve()
    # The source report path is resolved relative to the repo root.
    source_path = repo_root / options.source_change_evidence_report
    report = build_report(repo_root, load_json(source_path), options.generated_at)
    rendered = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)

    if not options.output:
        print(rendered)
    else:
        # Unlike the source report, the output path is used as given.
        destination = Path(options.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered + "\n", encoding="utf-8")

    counts = report["summary"]
    status_parts = [
        "K8S_ARGOCD_POST_INCIDENT_READBACK_PLAN_OK",
        f"candidates={counts['readback_candidate_count']}",
        f"c0={counts['c0_readback_candidate_count']}",
        f"checks={counts['reviewer_check_count']}",
        f"lanes={counts['outcome_lane_count']}",
        f"accepted={counts['post_incident_readback_accepted_count']}",
        f"runtime_gate={counts['runtime_gate_count']}",
    ]
    # The status line goes to stderr so it never mixes with JSON on stdout.
    print(" ".join(status_parts), file=sys.stderr)
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())