---
# Revision tag of this document (a date plus a ".vN" suffix). Quoted so that
# every loader keeps it a string rather than guessing at a date-like type.
version: "2026-05-06.v1"

# Hosts this plan applies to. Keys are short host identifiers, quoted so they
# stay strings; each value is a free-text summary of what runs on that host.
# NOTE(review): the ids look like the last octet of the host address - confirm.
scope:
  included_hosts:
    "110": "DevOps, registry, observability, Sentry, runners"
    "120": "K3s control plane and VIP"
    "121": "K3s peer node and DR drill cron"
    "188": "Data, AI, web, momo, SignOz, public nginx gateway"
  # Listed for the record only; per its description this host is not part of
  # the cold-start release gate.
  excluded_hosts:
    "112": "Kali security host; recorded but not part of cold-start release gate"

# Ground rules for a recovery run, stored as identifier strings.
# NOTE(review): presumably read by operators and/or tooling - confirm which
# consumer matches on these exact names before renaming any of them.
principles:
  - recover_dependency_chain_before_workloads
  - keep_ai_auto_repair_observe_only_until_green
  - never_generic_restart_stateful_services
  - preserve_corrupt_parts_in_quarantine_not_delete
  - release_runners_and_crawlers_last

# Recovery phases, listed in ascending `order` (0 through 80, in steps of 10).
# Every entry has an `id` and an `order`. The remaining keys vary per phase:
#   gates              - names of checks associated with the phase
#   required_before    - ids of other phases (presumably: those phases must
#                        wait for this one - confirm with the consumer)
#   release_after      - ids of phases that come before this one
#   release_conditions - extra condition names for releasing the phase
#   public_https_routes / examples - plain lists of URLs / workload names
# Gate and condition names are opaque identifiers here.
# NOTE(review): presumably evaluated by the cold-start check script - confirm.
phases:
  - id: P0-NETWORK
    order: 0
    gates:
      - ping_110_120_121_188
      - ssh_port_110_120_121_188
      - arp_evidence_or_monitor_mode_fallback

  - id: P0-188-DATA
    order: 10
    required_before:
      - P1-K3S
      - P2-WORKLOAD-ALERTCHAIN
    gates:
      - containerd_docker_postgresql_redis_ollama_nginx_active
      - postgresql_5432_accepting_connections
      - redis_pong
      - momo_db_not_restarting
      - signoz_http_reachable
      - momo_health_200

  - id: P0-110-REGISTRY-OBSERVABILITY
    order: 20
    required_before:
      - P1-K3S
      - P3-RUNNER-CD
    gates:
      - docker_active
      - harbor_v2_200_or_401
      - gitea_200_or_302
      - prometheus_ready
      - alertmanager_healthy
      - sentry_http_reachable
      - docker_containers_all_up
      - runner_watchdog_disabled
      - sentry_clickhouse_not_restarting
      # The two cadvisor gates match the image tag (v0.47.0) and CPU cap (0.3)
      # recorded under resource_guardrails."110".cadvisor in this file.
      - cadvisor_image_v0_47_0
      - cadvisor_cpu_cap_0_3

  - id: P1-K3S
    order: 30
    gates:
      - 120_can_reach_188_postgres
      - mon_and_mon1_ready
      - no_non_running_non_succeeded_pods
      - awoooi_dev_api_nodeport_200
      - vip_192_168_0_125_present

  - id: P2-WORKLOAD-ALERTCHAIN
    order: 40
    gates:
      - awoooi_api_vip_health_2xx_or_3xx
      - awoooi_web_vip_2xx_or_3xx
      - alertmanager_webhook_e2e_2xx_when_release_gate

  - id: P2-PUBLIC-ROUTES
    order: 50
    # This phase lists URLs instead of named gates.
    public_https_routes:
      - https://awoooi.wooo.work/api/v1/health
      - https://awoooi.wooo.work/
      - https://mo.wooo.work/
      - https://mo.wooo.work/health
      - https://gitea.wooo.work/
      - https://harbor.wooo.work/
      - https://registry.wooo.work/
      - https://sentry.wooo.work/
      - https://signoz.wooo.work/
      - https://stock.wooo.work/
      - https://langfuse.wooo.work/
      - https://bitan.wooo.work/
      - https://aiops.wooo.work/

  - id: P2-SCHEDULES
    order: 60
    gates:
      - cron_active_188_110_120_121
      - docker_restart_textfile_fresh_188
      - docker_stats_textfile_fresh_188_110
      - systemd_units_textfile_fresh_110
      - backup_health_textfile_fresh_188_110
      - backup_from_110_success_under_25h
      - expected_backup_jobs_fresh_188_110
      - host_service_config_backup_success_under_48h
      - sentry_dedicated_backup_success_under_48h
      - backup_integrity_check_success_under_8d
      - backup_restore_drill_success_under_31d
      - velero_schedule_present_and_latest_completed_under_25h
      - velero_restore_test_cron_present
      - momo_scheduler_registered_jobs
      - k8s_cronjobs_unsuspended
      - k8s_failed_jobs_zero
      - dr_drill_cron_present_121

  - id: P3-HIGH-LOAD-WORK
    order: 70
    # Names every P0, P1 and P2 phase defined above.
    release_after:
      - P0-NETWORK
      - P0-188-DATA
      - P0-110-REGISTRY-OBSERVABILITY
      - P1-K3S
      - P2-WORKLOAD-ALERTCHAIN
      - P2-PUBLIC-ROUTES
      - P2-SCHEDULES
    release_conditions:
      - host_load_per_core_below_1_0_for_15m
      - no_restart_storm
      - clickhouse_merge_or_kafka_lag_not_increasing_two_checks
    examples:
      - sentry_snuba_consumers
      - momo_scheduler_chrome_crawlers
      - gitea_actions_jobs

  - id: P3-RUNNER-CD
    order: 80
    release_conditions:
      - all_previous_gates_green
      # The runner limits named here match the values recorded under
      # resource_guardrails."110".actions_runner_systemd in this file
      # (cpu_quota 200%, memory_max 2G, watchdog disabled).
      - runner_cpuquota_200_percent
      - runner_memorymax_2g
      - watchdogusec_0
      - active_awoooi_cd_or_gitea_actions_task_containers_cpu_capped_during_cold_start

# What automation may do, split into two states: before and after "green".
# NOTE(review): "green" presumably means all cold-start gates have passed -
# confirm against the check script's definition.
automation_policy:
  before_green:
    ai_auto_repair: observe_only
    alertmanager_smoke_test: manual_or_release_gate_only
    stateful_service_actions: human_approval_required
    generic_restart: forbidden
  # Only these three keys are set for the post-green state; the smoke-test and
  # generic-restart keys from `before_green` have no counterpart here.
  after_green:
    ai_auto_repair: limited_execution_for_stateless_exporters_only
    stateful_service_actions: human_in_the_loop
    runner_cd: controlled_release

# Per-host resource limits, keyed by host id and then by service name.
# Values are kept exactly as recorded: `cpus` entries are plain numbers, while
# quota and memory entries ("200%", "2G", "512m") are strings.
# NOTE(review): nesting was rebuilt from a flattened copy. Each `note` key is
# placed under the service that precedes it - confirm against the canonical
# file.
resource_guardrails:
  "110":
    cadvisor:
      image: gcr.io/cadvisor/cadvisor:v0.47.0
      cpus: 0.3
      mem_limit: 512m
    sentry_snuba_cold_start_consumers:
      cpus: 0.5
      persist_in: /opt/sentry/docker-compose.override.yml
    sentry_self_hosted_memory_limits:
      taskscheduler_mem_limit: 1g
      relay_mem_limit: 2g
      persist_in: /opt/sentry/docker-compose.override.yml
      # English gloss of the note below: taskscheduler/relay must not be
      # rolled back to 512m/1g, which causes long-term >85% memory-limit
      # pressure; host 110 still relies on ClickHouse/Kafka/Snuba CPU caps to
      # prevent cold-start overload.
      note: "taskscheduler/relay 不得回退到 512m/1g 造成長期 >85% memory-limit pressure;110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。"
    actions_runner_systemd:
      cpu_quota: 200%
      memory_max: 2G
      watchdog: disabled
  "188":
    ollama_systemd:
      cpu_quota: 300%
      memory_high: 20G
      memory_max: 24G
      max_loaded_models: 1
      num_parallel: 1
      # English gloss of the note below: the local Ollama on 188 is a
      # cold-start dependency and the Open-WebUI local endpoint; it must not
      # be left disabled/inactive, nor keep the unrestrained 700%/45G
      # guardrail.
      note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint;不得維持 disabled/inactive,也不得保留 700%/45G 無節制 guardrail。"
    litellm:
      cpus: 1.0
      memory: 1G
      mode: stateless
    momo_scheduler:
      cpus: 2.0
      memory: 2G
    signoz_clickhouse:
      memory: 24G
      note: do_not_lower_during_merge_backlog

# Commands and artefacts recorded as the authoritative checks for this plan.
# Both commands invoke the same script with different flags.
authoritative_checks:
  read_only_monitor:
    command: bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color
    expected_for_cron: PASS>0 WARN=0 BLOCKED=0
  release_gate:
    command: SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
    # The expected result pins an exact PASS count of 64.
    # NOTE(review): presumably this must be updated whenever checks are added
    # to or removed from the script - confirm.
    expected: PASS=64 WARN=0 BLOCKED=0
  textfile_metric:
    path: /home/wooo/node_exporter_textfiles/cold_start_recovery.prom
    # Single-quoted because the sample line contains braces, commas and double
    # quotes; the value itself is unchanged.
    green_metric: 'awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} 1'
  backup_baseline:
    path: ops/reboot-recovery/full-stack-backup-baseline.yml
    required_metrics:
      - awoooi_backup_health_monitor_up
      - awoooi_backup_job_fresh
      - awoooi_backup_integrity_fresh
      - awoooi_velero_restore_test_cron_present
      - awoooi_velero_restore_test_last_success_fresh