fix(cd): keep ops recovery checks on controlled profile
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -532,6 +532,8 @@ jobs:
|
||||
;;
|
||||
ops/runner/verify-awoooi-non110-cd-closure.py)
|
||||
;;
|
||||
docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)
|
||||
;;
|
||||
ops/monitoring/alerts-unified.yml)
|
||||
;;
|
||||
ops/monitoring/alerts.yml)
|
||||
@@ -574,6 +576,10 @@ jobs:
|
||||
;;
|
||||
scripts/reboot-recovery/dr-escrow-evidence-checklist.py)
|
||||
;;
|
||||
scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)
|
||||
;;
|
||||
scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py)
|
||||
;;
|
||||
scripts/reboot-recovery/post-reboot-owner-response-preflight.py)
|
||||
;;
|
||||
scripts/reboot-recovery/post-start-quick-check.sh)
|
||||
@@ -785,8 +791,8 @@ jobs:
|
||||
../../scripts/reboot-recovery/post-reboot-owner-response-preflight.py \
|
||||
../../scripts/reboot-recovery/momo-source-arrival-gate.py \
|
||||
../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py \
|
||||
../../scripts/ops/backup-health-textfile-exporter.py \
|
||||
../../scripts/ops/backup-alert-label-contract-check.py \
|
||||
../../scripts/ops/backup-health-textfile-exporter.py \
|
||||
../../scripts/security/gitea-private-inventory-p0-scorecard.py \
|
||||
../../scripts/security/gitea-authenticated-inventory-payload-validator.py
|
||||
python3.11 -c "import yaml; yaml.safe_load(open('../../ops/monitoring/alerts-unified.yml')); print('alerts-unified YAML OK')"
|
||||
@@ -814,6 +820,7 @@ jobs:
|
||||
../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh \
|
||||
../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh \
|
||||
../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh \
|
||||
../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh \
|
||||
../../scripts/backup/backup-awoooi-frequent.sh \
|
||||
../../scripts/backup/backup-status.sh \
|
||||
../../scripts/backup/gitea-repo-bundle-backup.sh
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
## 2026-07-01 — 23:52 Gitea CD #4315 B5 socket / queue historical blocker 修正
|
||||
|
||||
**照主線修正的問題**:
|
||||
- Gitea CD `#4315` 對 `d658f03a` 失敗於 tests job:`BLOCKER b5_docker_socket_unavailable`,build / deploy / post-deploy 因 tests failed 被跳過;根因是 ops / backup / alert / escrow 類 source patch 未完整列入 controlled-runtime profile,掉回需要 Docker socket 的 B5 lane。
|
||||
- `.gitea/workflows/cd.yaml` 補齊 `d658f03a` 涉及的 ops / backup / alert / escrow source 與 focused checks:backup status、backup alert label contract、host pressure alert contract、credential escrow closeout script、alerts YAML。
|
||||
- `read-public-gitea-actions-queue.py` 修正:當 latest CD 為 Success 時,較舊的 scheduled `harbor-110-local-repair` Failure 只能作為 historical evidence,不得蓋過最新 main CD 的成功狀態;raw stalled evidence 仍保留,但 active queue status 不再被舊 run 拉回。
|
||||
|
||||
**驗證**:
|
||||
- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py scripts/ops/backup-alert-label-contract-check.py scripts/ops/backup-health-textfile-exporter.py`:通過。
|
||||
- `.gitea/workflows/cd.yaml`、`ops/monitoring/alerts-unified.yml`、`ops/monitoring/alerts.yml` YAML parse:通過。
|
||||
- `bash -n scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh scripts/backup/backup-status.sh`:通過。
|
||||
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_cd_controlled_runtime_profile.py scripts/backup/tests/test_backup_status_contract.py scripts/ops/tests/test_backup_health_textfile_exporter.py scripts/ops/tests/test_host_pressure_alert_contract.py scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py -q`:`93 passed`。
|
||||
- `git diff --check`:通過。
|
||||
|
||||
**邊界**:未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未 workflow_dispatch,未 force push。
|
||||
|
||||
**下一步**:正常 push Gitea `main` 觸發下一個 CD;預期 B5 socket blocker 不再重現,latest CD Success 後 queue readback 不再被舊 harbor repair Failure 覆蓋。
|
||||
|
||||
## 2026-07-01 — 23:00 core cold-start GREEN / MOMO source-arrival gate 拆分
|
||||
|
||||
**照主線修正的問題**:
|
||||
|
||||
@@ -400,6 +400,7 @@ def build_readback(
|
||||
latest_harbor_110_repair_log_text
|
||||
)
|
||||
latest_cd_status = latest_cd_run.get("status", "")
|
||||
latest_cd_success = latest_cd_status == "Success"
|
||||
latest_cd_visible_blocked = latest_cd_status == "Blocked"
|
||||
latest_cd_waiting = latest_cd_status == "Waiting"
|
||||
host_pressure_waiting_from_stale_jobs = (
|
||||
@@ -512,7 +513,41 @@ def build_readback(
|
||||
current_cd_waiting_behind_harbor_110_repair_running = (
|
||||
latest_cd_waiting and harbor_110_repair_running
|
||||
)
|
||||
harbor_110_repair_blocked = (
|
||||
harbor_110_repair_historical_after_latest_cd_success = bool(
|
||||
latest_cd_success
|
||||
and latest_cd_run_id
|
||||
and harbor_110_repair_run_id
|
||||
and harbor_110_repair_run_id != latest_cd_run_id
|
||||
)
|
||||
effective_remote_ssh_publickey_auth_stalled = bool(
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_control_channel_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_local_registry_v2_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_public_registry_v2_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_failed = bool(
|
||||
harbor_110_repair_failed
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_jobs_stale_or_mismatched = bool(
|
||||
harbor_110_repair_jobs_stale_or_mismatched
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_visible_failure_jobs_api_stale = bool(
|
||||
harbor_110_repair_visible_failure_jobs_api_stale
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
harbor_110_repair_blocked_raw = (
|
||||
harbor_110_repair_status_blocked
|
||||
or harbor_110_repair_failed
|
||||
or bool(harbor_110_repair_no_matching_runner_label)
|
||||
@@ -520,11 +555,17 @@ def build_readback(
|
||||
or harbor_110_repair_visible_running_jobs_api_stale
|
||||
or bool(harbor_110_repair_log_classifier["failure_classifier"])
|
||||
)
|
||||
harbor_110_repair_blocked = bool(
|
||||
harbor_110_repair_blocked_raw
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
safe_next_action = _queue_safe_next_action(
|
||||
latest_cd_waiting=latest_cd_waiting,
|
||||
latest_cd_status=latest_cd_status,
|
||||
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
|
||||
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
|
||||
cd_jobs_stale_or_mismatched=(
|
||||
cd_jobs_stale_or_mismatched and not latest_cd_success
|
||||
),
|
||||
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
|
||||
effective_host_pressure_classifier=effective_tests_log_classifier[
|
||||
"host_pressure_classifier"
|
||||
@@ -540,34 +581,36 @@ def build_readback(
|
||||
],
|
||||
harbor_110_repair_no_matching_runner_label=(
|
||||
harbor_110_repair_no_matching_runner_label
|
||||
if not harbor_110_repair_historical_after_latest_cd_success
|
||||
else ""
|
||||
),
|
||||
harbor_110_repair_waiting=harbor_110_repair_waiting,
|
||||
harbor_110_repair_running=harbor_110_repair_running,
|
||||
harbor_110_repair_failed=harbor_110_repair_failed,
|
||||
harbor_110_repair_failed=effective_harbor_110_repair_failed,
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker=(
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
harbor_110_repair_jobs_stale_or_mismatched=(
|
||||
harbor_110_repair_jobs_stale_or_mismatched
|
||||
effective_harbor_110_repair_jobs_stale_or_mismatched
|
||||
),
|
||||
harbor_110_repair_jobs_payload_classifier=(
|
||||
harbor_110_repair_jobs_payload_classifier
|
||||
),
|
||||
harbor_110_repair_visible_running_jobs_api_stale=(
|
||||
harbor_110_repair_visible_running_jobs_api_stale
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
harbor_110_repair_visible_failure_jobs_api_stale=(
|
||||
harbor_110_repair_visible_failure_jobs_api_stale
|
||||
effective_harbor_110_repair_visible_failure_jobs_api_stale
|
||||
),
|
||||
current_cd_waiting_behind_harbor_110_repair_running=(
|
||||
current_cd_waiting_behind_harbor_110_repair_running
|
||||
),
|
||||
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
|
||||
"remote_control_channel_unavailable"
|
||||
],
|
||||
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_auth_stalled"
|
||||
],
|
||||
remote_control_channel_unavailable=(
|
||||
effective_remote_control_channel_unavailable
|
||||
),
|
||||
remote_ssh_publickey_auth_stalled=effective_remote_ssh_publickey_auth_stalled,
|
||||
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
],
|
||||
@@ -684,10 +727,16 @@ def build_readback(
|
||||
"latest_visible_harbor_110_repair_waiting": harbor_110_repair_waiting,
|
||||
"latest_visible_harbor_110_repair_running": harbor_110_repair_running,
|
||||
"latest_visible_harbor_110_repair_failed": harbor_110_repair_failed,
|
||||
"latest_visible_harbor_110_repair_historical_after_latest_cd_success": (
|
||||
harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
"latest_visible_harbor_110_repair_status_blocked": (
|
||||
harbor_110_repair_status_blocked
|
||||
),
|
||||
"latest_visible_harbor_110_repair_blocked": harbor_110_repair_blocked,
|
||||
"latest_visible_harbor_110_repair_blocked_raw": (
|
||||
harbor_110_repair_blocked_raw
|
||||
),
|
||||
"latest_visible_harbor_110_repair_log_http_status": (
|
||||
latest_harbor_110_repair_log_http_status
|
||||
),
|
||||
@@ -872,13 +921,13 @@ def build_readback(
|
||||
else "blocked_latest_visible_cd_run"
|
||||
if latest_cd_visible_blocked
|
||||
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
|
||||
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
if effective_remote_ssh_publickey_auth_stalled
|
||||
else "blocked_harbor_110_remote_control_channel_unavailable"
|
||||
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
|
||||
if effective_remote_control_channel_unavailable
|
||||
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
|
||||
if harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
|
||||
if effective_remote_local_registry_v2_unavailable
|
||||
else "blocked_harbor_public_registry_v2_unavailable_after_remote_repair"
|
||||
if harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
|
||||
if effective_remote_public_registry_v2_unavailable
|
||||
else "blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback"
|
||||
if (
|
||||
latest_cd_waiting
|
||||
@@ -891,7 +940,7 @@ def build_readback(
|
||||
else "blocked_harbor_110_repair_failed"
|
||||
if (
|
||||
build_log_classifier["harbor_public_route_blocked_or_retrying"]
|
||||
and harbor_110_repair_failed
|
||||
and effective_harbor_110_repair_failed
|
||||
)
|
||||
else (
|
||||
"blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
|
||||
@@ -925,13 +974,13 @@ def build_readback(
|
||||
else "harbor_110_repair_running"
|
||||
if harbor_110_repair_running
|
||||
else "blocked_harbor_110_repair_failed"
|
||||
if harbor_110_repair_failed
|
||||
if effective_harbor_110_repair_failed
|
||||
else "blocked_harbor_110_repair_run"
|
||||
if harbor_110_repair_blocked
|
||||
else "harbor_110_repair_jobs_stale_or_mismatched"
|
||||
if harbor_110_repair_jobs_stale_or_mismatched
|
||||
if effective_harbor_110_repair_jobs_stale_or_mismatched
|
||||
else "cd_jobs_stale_or_mismatched"
|
||||
if cd_jobs_stale_or_mismatched
|
||||
if cd_jobs_stale_or_mismatched and not latest_cd_success
|
||||
else "no_matching_runner_not_visible"
|
||||
),
|
||||
"readback": readback,
|
||||
@@ -1014,7 +1063,11 @@ def build_readback(
|
||||
"harbor_110_repair_waiting": harbor_110_repair_waiting,
|
||||
"harbor_110_repair_running": harbor_110_repair_running,
|
||||
"harbor_110_repair_failed": harbor_110_repair_failed,
|
||||
"harbor_110_repair_historical_after_latest_cd_success": (
|
||||
harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
"harbor_110_repair_blocked": harbor_110_repair_blocked,
|
||||
"harbor_110_repair_blocked_raw": harbor_110_repair_blocked_raw,
|
||||
"harbor_110_repair_waiting_after_cd_harbor_blocker": (
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker
|
||||
),
|
||||
|
||||
@@ -703,6 +703,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
expected_sources = [
|
||||
"docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md)",
|
||||
"docs/runbooks/FULL-STACK-COLD-START-SOP.md)",
|
||||
"docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)",
|
||||
"docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md)",
|
||||
"ops/monitoring/alerts-unified.yml)",
|
||||
"ops/monitoring/alerts.yml)",
|
||||
@@ -725,6 +726,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
"scripts/reboot-recovery/full-stack-recovery-scorecard.sh)",
|
||||
"scripts/reboot-recovery/awoooi-startup-110.sh)",
|
||||
"scripts/reboot-recovery/harbor-watchdog.sh)",
|
||||
"scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)",
|
||||
"scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)",
|
||||
"scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)",
|
||||
"scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)",
|
||||
@@ -750,6 +752,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
"../../scripts/reboot-recovery/momo-source-arrival-gate.py",
|
||||
"../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh",
|
||||
"../../scripts/reboot-recovery/harbor-watchdog.sh",
|
||||
"../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh",
|
||||
"../../scripts/reboot-recovery/awoooi-startup-110.sh",
|
||||
"../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh",
|
||||
"../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh",
|
||||
|
||||
@@ -142,6 +142,20 @@ def _actions_html_cd_failed_harbor_repair_failed() -> str:
|
||||
)
|
||||
|
||||
|
||||
def _actions_html_cd_success_harbor_repair_failed() -> str:
    """Return Actions-page HTML derived from the running/waiting fixture.

    Starts from ``_actions_html_cd_running_harbor_repair_waiting()`` and
    applies, in order: the first "Running" tooltip becomes "Success", the
    first "Waiting" tooltip becomes "Failure", every "4061" becomes "4314",
    every "4060" becomes "4307", and the commit title is swapped.

    NOTE(review): per the function name this is meant to model a successful
    latest CD run next to a failed harbor repair run; which tooltip belongs
    to which run depends on the base fixture, which is defined elsewhere.
    """
    # (old, new, count) triples applied sequentially; a count of -1 means
    # "replace every occurrence", matching a plain two-argument replace().
    substitutions = (
        ('data-tooltip-content="Running"', 'data-tooltip-content="Success"', 1),
        ('data-tooltip-content="Waiting"', 'data-tooltip-content="Failure"', 1),
        ("4061", "4314", -1),
        ("4060", "4307", -1),
        (
            "fix(cd): keep harbor repair workflow on controlled profile",
            "feat(web): surface AI automation production proof",
            -1,
        ),
    )
    html = _actions_html_cd_running_harbor_repair_waiting()
    for old, new, count in substitutions:
        html = html.replace(old, new, count)
    return html
|
||||
|
||||
|
||||
def _actions_html_harbor_repair_waiting_with_workflow_no_matching() -> str:
|
||||
return """
|
||||
<div class="menu">
|
||||
@@ -696,6 +710,49 @@ def test_harbor_ssh_command_path_ready_overrides_raw_publickey_stall() -> None:
|
||||
assert classifier["failure_classifier"] == ""
|
||||
|
||||
|
||||
def test_latest_cd_success_makes_old_harbor_repair_failure_historical() -> None:
    """Old harbor repair failure stays raw evidence once the latest CD is Success.

    Feeds ``build_readback`` an Actions page with a "Success" CD run and a
    "Failure" harbor repair run plus a publickey-auth-stalled repair log, then
    checks that the raw failure/blocked/stalled flags stay True while the
    effective ``blocked`` flags are False and the top-level status is
    ``no_matching_runner_not_visible``.
    """
    queue_reader = _load_module()
    stalled_log = _harbor_110_repair_publickey_auth_stalled_log()
    result = queue_reader.build_readback(
        actions_html=_actions_html_cd_success_harbor_repair_failed(),
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        harbor_110_repair_jobs_http_status=200,
        harbor_110_repair_jobs_payload=_harbor_110_repair_stale_code_review_jobs(),
        latest_harbor_110_repair_log_http_status=200,
        latest_harbor_110_repair_log_text=stalled_log,
    )

    # Readback section: raw evidence is preserved, effective blocker is cleared.
    readback = result["readback"]
    historical_key = (
        "latest_visible_harbor_110_repair_historical_after_latest_cd_success"
    )
    stalled_key = (
        "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
    )
    assert readback["latest_visible_cd_run_status"] == "Success"
    assert readback[historical_key] is True
    assert readback["latest_visible_harbor_110_repair_failed"] is True
    assert readback["latest_visible_harbor_110_repair_blocked_raw"] is True
    assert readback["latest_visible_harbor_110_repair_blocked"] is False
    assert readback[stalled_key] is True

    # Top-level status must not be pulled back by the old repair run.
    assert result["status"] == "no_matching_runner_not_visible"

    # Rollups mirror the readback flags.
    rollups = result["rollups"]
    assert rollups["harbor_110_repair_historical_after_latest_cd_success"] is True
    assert rollups["harbor_110_repair_blocked"] is False
    assert rollups["harbor_110_repair_blocked_raw"] is True
|
||||
|
||||
|
||||
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
|
||||
module = _load_module()
|
||||
payload = module.build_readback(
|
||||
|
||||
Reference in New Issue
Block a user