# Files
# awoooi/ops/reboot-recovery/full-stack-cold-start-baseline.yml
# 2026-05-29 12:41:34 +08:00
#
# 205 lines
# 6.3 KiB
# YAML
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Revision label of this baseline. The trailing `.v1` keeps the value a plain
# string rather than a YAML date scalar.
version: 2026-05-06.v1
# Hosts covered by this cold-start baseline. Host keys are quoted so they stay
# strings instead of being parsed as integers.
# NOTE(review): the ids look like the last IPv4 octet of each host — confirm.
scope:
  included_hosts:
    "110": "DevOps, registry, observability, Sentry, runners"
    "120": "K3s control plane and VIP"
    "121": "K3s peer node and DR drill cron"
    "188": "Data, AI, web, momo, SignOz, public nginx gateway"
  # Recorded for reference only; per the description it is not part of the
  # cold-start release gate.
  excluded_hosts:
    "112": "Kali security host; recorded but not part of cold-start release gate"
# Operating rules for a cold-start recovery.
# NOTE(review): this list is declarative; how (or whether) tooling enforces
# each rule is not visible in this file.
principles:
- recover_dependency_chain_before_workloads
- keep_ai_auto_repair_observe_only_until_green
- never_generic_restart_stateful_services
- preserve_corrupt_parts_in_quarantine_not_delete
- release_runners_and_crawlers_last
# Recovery phases. Every entry has an `id` and a numeric `order`
# (0, 10, 20, ...), plus the gates or release conditions for that phase.
phases:
# Reachability of the four in-scope hosts (110, 120, 121, 188).
- id: P0-NETWORK
  order: 0
  gates:
  - ping_110_120_121_188
  - ssh_port_110_120_121_188
  - arp_evidence_or_monitor_mode_fallback
# Services on host 188. `required_before` names the phases that list this one
# as a prerequisite.
- id: P0-188-DATA
  order: 10
  required_before:
  - P1-K3S
  - P2-WORKLOAD-ALERTCHAIN
  gates:
  - containerd_docker_postgresql_redis_ollama_nginx_active
  - postgresql_5432_accepting_connections
  - redis_pong
  - momo_db_not_restarting
  - signoz_http_reachable
  - momo_health_200
# Registry, Git hosting and observability services on host 110. The cAdvisor
# gates correspond to the image tag and CPU cap under resource_guardrails."110".
- id: P0-110-REGISTRY-OBSERVABILITY
  order: 20
  required_before:
  - P1-K3S
  - P3-RUNNER-CD
  gates:
  - docker_active
  - harbor_v2_200_or_401
  - gitea_200_or_302
  - prometheus_ready
  - alertmanager_healthy
  - sentry_http_reachable
  - docker_containers_all_up
  - runner_watchdog_disabled
  - sentry_clickhouse_not_restarting
  - cadvisor_image_v0_47_0
  - cadvisor_cpu_cap_0_3
# K3s gates, including reachability of PostgreSQL on 188 from 120 and the
# presence of the 192.168.0.125 VIP.
- id: P1-K3S
  order: 30
  gates:
  - 120_can_reach_188_postgres
  - mon_and_mon1_ready
  - no_non_running_non_succeeded_pods
  - awoooi_dev_api_nodeport_200
  - vip_192_168_0_125_present
# Application health through the VIP, plus an end-to-end Alertmanager webhook
# check that, per its name, applies when running as the release gate.
- id: P2-WORKLOAD-ALERTCHAIN
  order: 40
  gates:
  - awoooi_api_vip_health_2xx_or_3xx
  - awoooi_web_vip_2xx_or_3xx
  - alertmanager_webhook_e2e_2xx_when_release_gate
# Public HTTPS endpoints. Unlike the other phases, this one lists URLs under
# `public_https_routes` instead of named `gates`.
# NOTE(review): the accepted status codes per route are not stated here.
- id: P2-PUBLIC-ROUTES
  order: 50
  public_https_routes:
  - https://awoooi.wooo.work/api/v1/health
  - https://awoooi.wooo.work/
  - https://mo.wooo.work/
  - https://mo.wooo.work/health
  - https://gitea.wooo.work/
  - https://harbor.wooo.work/
  - https://registry.wooo.work/
  - https://sentry.wooo.work/
  - https://signoz.wooo.work/
  - https://stock.wooo.work/
  - https://langfuse.wooo.work/
  - https://bitan.wooo.work/
  - https://aiops.wooo.work/
# Scheduled work: cron, textfile freshness, backups, Velero, Kubernetes
# CronJobs. Gate names carry their own freshness windows (25h, 48h, 8d, 31d).
- id: P2-SCHEDULES
  order: 60
  gates:
  - cron_active_188_110_120_121
  - docker_restart_textfile_fresh_188
  - docker_stats_textfile_fresh_188_110
  - systemd_units_textfile_fresh_110
  - backup_health_textfile_fresh_188_110
  - backup_from_110_success_under_25h
  - expected_backup_jobs_fresh_188_110
  - host_service_config_backup_success_under_48h
  - sentry_dedicated_backup_success_under_48h
  - backup_integrity_check_success_under_8d
  - backup_restore_drill_success_under_31d
  - velero_schedule_present_and_latest_completed_under_25h
  - velero_restore_test_cron_present
  - momo_scheduler_registered_jobs
  - k8s_cronjobs_unsuspended
  - k8s_failed_jobs_zero
  - dr_drill_cron_present_121
# High-load work. `release_after` lists every P0-P2 phase; `release_conditions`
# adds load and stability checks; `examples` names workloads in this class.
- id: P3-HIGH-LOAD-WORK
  order: 70
  release_after:
  - P0-NETWORK
  - P0-188-DATA
  - P0-110-REGISTRY-OBSERVABILITY
  - P1-K3S
  - P2-WORKLOAD-ALERTCHAIN
  - P2-PUBLIC-ROUTES
  - P2-SCHEDULES
  release_conditions:
  - host_load_per_core_below_1_0_for_15m
  - no_restart_storm
  - clickhouse_merge_or_kafka_lag_not_increasing_two_checks
  examples:
  - sentry_snuba_consumers
  - momo_scheduler_chrome_crawlers
  - gitea_actions_jobs
# Last phase (highest `order`): CI/CD runners. The quota, memory and watchdog
# conditions correspond to resource_guardrails."110".actions_runner_systemd.
- id: P3-RUNNER-CD
  order: 80
  release_conditions:
  - all_previous_gates_green
  - runner_cpuquota_200_percent
  - runner_memorymax_2g
  - watchdogusec_0
  - active_awoooi_cd_or_gitea_actions_task_containers_cpu_capped_during_cold_start
# What automation may do before and after the stack is green.
# NOTE(review): "green" presumably means all gates pass — confirm.
automation_policy:
  before_green:
    ai_auto_repair: observe_only
    alertmanager_smoke_test: manual_or_release_gate_only
    stateful_service_actions: human_approval_required
    generic_restart: forbidden
  after_green:
    ai_auto_repair: limited_execution_for_stateless_exporters_only
    stateful_service_actions: human_in_the_loop
    runner_cd: controlled_release
# Resource limits per host, keyed by the quoted host ids used under `scope`.
# NOTE(review): the scraped copy had lost punctuation in both Chinese notes
# ("pressure110", "endpoint不得", "inactive也不得"). Fullwidth ";" and ","
# were restored as a best guess — confirm against the repository version.
resource_guardrails:
  "110":
    # Same image tag and CPU cap as the P0-110 gates cadvisor_image_v0_47_0
    # and cadvisor_cpu_cap_0_3.
    cadvisor:
      image: gcr.io/cadvisor/cadvisor:v0.47.0
      cpus: 0.3
      mem_limit: 512m
    sentry_snuba_cold_start_consumers:
      cpus: 0.5
      persist_in: /opt/sentry/docker-compose.override.yml
    sentry_self_hosted_memory_limits:
      taskscheduler_mem_limit: 1g
      relay_mem_limit: 2g
      persist_in: /opt/sentry/docker-compose.override.yml
      # English gloss: taskscheduler/relay must not fall back to 512m/1g,
      # which causes long-term >85% memory-limit pressure; host 110 still
      # relies on ClickHouse/Kafka/Snuba CPU caps to prevent cold-start
      # overload.
      note: "taskscheduler/relay 不得回退到 512m/1g 造成長期 >85% memory-limit pressure;110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。"
    # Same values as the P3-RUNNER-CD conditions runner_cpuquota_200_percent,
    # runner_memorymax_2g and watchdogusec_0.
    actions_runner_systemd:
      cpu_quota: 200%
      memory_max: 2G
      watchdog: disabled
  "188":
    ollama_systemd:
      cpu_quota: 300%
      memory_high: 20G
      memory_max: 24G
      max_loaded_models: 1
      num_parallel: 1
      # English gloss: the local Ollama on 188 is a cold-start dependency and
      # the Open-WebUI local endpoint; it must not stay disabled/inactive, nor
      # keep the unrestrained 700%/45G guardrail.
      note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint,不得維持 disabled/inactive,也不得保留 700%/45G 無節制 guardrail。"
    litellm:
      cpus: 1.0
      memory: 1G
      mode: stateless
    momo_scheduler:
      cpus: 2.0
      memory: 2G
    signoz_clickhouse:
      memory: 24G
      note: do_not_lower_during_merge_backlog
# Commands and artefacts used to judge recovery status.
# NOTE(review): indentation was lost in the scraped copy. `textfile_metric` and
# `backup_baseline` are nested here as siblings of the two check commands;
# confirm against the repository version that they are not top-level keys.
authoritative_checks:
  # Both checks run the same script with different flags.
  read_only_monitor:
    command: bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color
    expected_for_cron: PASS>0 WARN=0 BLOCKED=0
  # This variant passes --send-alert-test and expects an exact PASS count.
  release_gate:
    command: SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
    expected: PASS=64 WARN=0 BLOCKED=0
  # `green_metric` is the sample line expected in the textfile at `path`.
  textfile_metric:
    path: /home/wooo/node_exporter_textfiles/cold_start_recovery.prom
    green_metric: awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} 1
  # Companion baseline file and the metric names listed as required for it.
  backup_baseline:
    path: ops/reboot-recovery/full-stack-backup-baseline.yml
    required_metrics:
    - awoooi_backup_health_monitor_up
    - awoooi_backup_job_fresh
    - awoooi_backup_integrity_fresh
    - awoooi_velero_restore_test_cron_present
    - awoooi_velero_restore_test_last_success_fresh