fix(cd): keep ops recovery checks on controlled profile
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
ogt
2026-07-01 23:40:18 +08:00
parent 670cf9afd6
commit 9f4f1b417c
5 changed files with 158 additions and 20 deletions

View File

@@ -532,6 +532,8 @@ jobs:
;;
ops/runner/verify-awoooi-non110-cd-closure.py)
;;
docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)
;;
ops/monitoring/alerts-unified.yml)
;;
ops/monitoring/alerts.yml)
@@ -574,6 +576,10 @@ jobs:
;;
scripts/reboot-recovery/dr-escrow-evidence-checklist.py)
;;
scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)
;;
scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py)
;;
scripts/reboot-recovery/post-reboot-owner-response-preflight.py)
;;
scripts/reboot-recovery/post-start-quick-check.sh)
@@ -785,8 +791,8 @@ jobs:
../../scripts/reboot-recovery/post-reboot-owner-response-preflight.py \
../../scripts/reboot-recovery/momo-source-arrival-gate.py \
../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py \
../../scripts/ops/backup-health-textfile-exporter.py \
../../scripts/ops/backup-alert-label-contract-check.py \
../../scripts/ops/backup-health-textfile-exporter.py \
../../scripts/security/gitea-private-inventory-p0-scorecard.py \
../../scripts/security/gitea-authenticated-inventory-payload-validator.py
python3.11 -c "import yaml; yaml.safe_load(open('../../ops/monitoring/alerts-unified.yml')); print('alerts-unified YAML OK')"
@@ -814,6 +820,7 @@ jobs:
../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh \
../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh \
../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh \
../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh \
../../scripts/backup/backup-awoooi-frequent.sh \
../../scripts/backup/backup-status.sh \
../../scripts/backup/gitea-repo-bundle-backup.sh

View File

@@ -1,3 +1,21 @@
## 2026-07-01 — 23:52 Gitea CD #4315 B5 socket / queue historical blocker 修正
**照主線修正的問題**
- Gitea CD `#4315` (`d658f03a`) 失敗於 tests job:`BLOCKER b5_docker_socket_unavailable`;build / deploy / post-deploy 因 tests failed 被跳過;根因是 ops / backup / alert / escrow 類 source patch 未完整列入 controlled-runtime profile,掉回需要 Docker socket 的 B5 lane。
- `.gitea/workflows/cd.yaml` 補齊 `d658f03a` 涉及的 ops / backup / alert / escrow source 與 focused checks:backup status、backup alert label contract、host pressure alert contract、credential escrow closeout script、alerts YAML。
- `read-public-gitea-actions-queue.py` 修正:latest CD Success 時,舊 scheduled `harbor-110-local-repair` Failure 只能作 historical evidence,不得蓋過最新 main CD 成功;raw stalled evidence 保留,但 active queue status 不再被舊 run 拉回。
**驗證**
- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py scripts/ops/backup-alert-label-contract-check.py scripts/ops/backup-health-textfile-exporter.py`:通過。
- `.gitea/workflows/cd.yaml`、`ops/monitoring/alerts-unified.yml`、`ops/monitoring/alerts.yml` YAML parse:通過。
- `bash -n scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh scripts/backup/backup-status.sh`:通過。
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_cd_controlled_runtime_profile.py scripts/backup/tests/test_backup_status_contract.py scripts/ops/tests/test_backup_health_textfile_exporter.py scripts/ops/tests/test_host_pressure_alert_contract.py scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py -q`:`93 passed`。
- `git diff --check`:通過。
**邊界**:未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機;未 restart Docker / Nginx / K3s / DB / firewall;未 workflow_dispatch;未 force push。
**下一步**:正常 push Gitea `main` 觸發下一個 CD,預期 B5 socket blocker 不再重現;latest CD Success 後 queue readback 不再被舊 harbor repair Failure 覆蓋。
## 2026-07-01 — 23:00 core cold-start GREEN / MOMO source-arrival gate 拆分
**照主線修正的問題**

View File

@@ -400,6 +400,7 @@ def build_readback(
latest_harbor_110_repair_log_text
)
latest_cd_status = latest_cd_run.get("status", "")
latest_cd_success = latest_cd_status == "Success"
latest_cd_visible_blocked = latest_cd_status == "Blocked"
latest_cd_waiting = latest_cd_status == "Waiting"
host_pressure_waiting_from_stale_jobs = (
@@ -512,7 +513,41 @@ def build_readback(
current_cd_waiting_behind_harbor_110_repair_running = (
latest_cd_waiting and harbor_110_repair_running
)
harbor_110_repair_blocked = (
harbor_110_repair_historical_after_latest_cd_success = bool(
latest_cd_success
and latest_cd_run_id
and harbor_110_repair_run_id
and harbor_110_repair_run_id != latest_cd_run_id
)
effective_remote_ssh_publickey_auth_stalled = bool(
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_control_channel_unavailable = bool(
harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_local_registry_v2_unavailable = bool(
harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_public_registry_v2_unavailable = bool(
harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_failed = bool(
harbor_110_repair_failed
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_jobs_stale_or_mismatched = bool(
harbor_110_repair_jobs_stale_or_mismatched
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_visible_failure_jobs_api_stale = bool(
harbor_110_repair_visible_failure_jobs_api_stale
and not harbor_110_repair_historical_after_latest_cd_success
)
harbor_110_repair_blocked_raw = (
harbor_110_repair_status_blocked
or harbor_110_repair_failed
or bool(harbor_110_repair_no_matching_runner_label)
@@ -520,11 +555,17 @@ def build_readback(
or harbor_110_repair_visible_running_jobs_api_stale
or bool(harbor_110_repair_log_classifier["failure_classifier"])
)
harbor_110_repair_blocked = bool(
harbor_110_repair_blocked_raw
and not harbor_110_repair_historical_after_latest_cd_success
)
safe_next_action = _queue_safe_next_action(
latest_cd_waiting=latest_cd_waiting,
latest_cd_status=latest_cd_status,
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
cd_jobs_stale_or_mismatched=(
cd_jobs_stale_or_mismatched and not latest_cd_success
),
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
effective_host_pressure_classifier=effective_tests_log_classifier[
"host_pressure_classifier"
@@ -540,34 +581,36 @@ def build_readback(
],
harbor_110_repair_no_matching_runner_label=(
harbor_110_repair_no_matching_runner_label
if not harbor_110_repair_historical_after_latest_cd_success
else ""
),
harbor_110_repair_waiting=harbor_110_repair_waiting,
harbor_110_repair_running=harbor_110_repair_running,
harbor_110_repair_failed=harbor_110_repair_failed,
harbor_110_repair_failed=effective_harbor_110_repair_failed,
harbor_110_repair_waiting_after_cd_harbor_blocker=(
harbor_110_repair_waiting_after_cd_harbor_blocker
and not harbor_110_repair_historical_after_latest_cd_success
),
harbor_110_repair_jobs_stale_or_mismatched=(
harbor_110_repair_jobs_stale_or_mismatched
effective_harbor_110_repair_jobs_stale_or_mismatched
),
harbor_110_repair_jobs_payload_classifier=(
harbor_110_repair_jobs_payload_classifier
),
harbor_110_repair_visible_running_jobs_api_stale=(
harbor_110_repair_visible_running_jobs_api_stale
and not harbor_110_repair_historical_after_latest_cd_success
),
harbor_110_repair_visible_failure_jobs_api_stale=(
harbor_110_repair_visible_failure_jobs_api_stale
effective_harbor_110_repair_visible_failure_jobs_api_stale
),
current_cd_waiting_behind_harbor_110_repair_running=(
current_cd_waiting_behind_harbor_110_repair_running
),
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
"remote_control_channel_unavailable"
],
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
"remote_ssh_publickey_auth_stalled"
],
remote_control_channel_unavailable=(
effective_remote_control_channel_unavailable
),
remote_ssh_publickey_auth_stalled=effective_remote_ssh_publickey_auth_stalled,
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
"remote_ssh_publickey_offer_timeout"
],
@@ -684,10 +727,16 @@ def build_readback(
"latest_visible_harbor_110_repair_waiting": harbor_110_repair_waiting,
"latest_visible_harbor_110_repair_running": harbor_110_repair_running,
"latest_visible_harbor_110_repair_failed": harbor_110_repair_failed,
"latest_visible_harbor_110_repair_historical_after_latest_cd_success": (
harbor_110_repair_historical_after_latest_cd_success
),
"latest_visible_harbor_110_repair_status_blocked": (
harbor_110_repair_status_blocked
),
"latest_visible_harbor_110_repair_blocked": harbor_110_repair_blocked,
"latest_visible_harbor_110_repair_blocked_raw": (
harbor_110_repair_blocked_raw
),
"latest_visible_harbor_110_repair_log_http_status": (
latest_harbor_110_repair_log_http_status
),
@@ -872,13 +921,13 @@ def build_readback(
else "blocked_latest_visible_cd_run"
if latest_cd_visible_blocked
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
if effective_remote_ssh_publickey_auth_stalled
else "blocked_harbor_110_remote_control_channel_unavailable"
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
if effective_remote_control_channel_unavailable
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
if harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
if effective_remote_local_registry_v2_unavailable
else "blocked_harbor_public_registry_v2_unavailable_after_remote_repair"
if harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
if effective_remote_public_registry_v2_unavailable
else "blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback"
if (
latest_cd_waiting
@@ -891,7 +940,7 @@ def build_readback(
else "blocked_harbor_110_repair_failed"
if (
build_log_classifier["harbor_public_route_blocked_or_retrying"]
and harbor_110_repair_failed
and effective_harbor_110_repair_failed
)
else (
"blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
@@ -925,13 +974,13 @@ def build_readback(
else "harbor_110_repair_running"
if harbor_110_repair_running
else "blocked_harbor_110_repair_failed"
if harbor_110_repair_failed
if effective_harbor_110_repair_failed
else "blocked_harbor_110_repair_run"
if harbor_110_repair_blocked
else "harbor_110_repair_jobs_stale_or_mismatched"
if harbor_110_repair_jobs_stale_or_mismatched
if effective_harbor_110_repair_jobs_stale_or_mismatched
else "cd_jobs_stale_or_mismatched"
if cd_jobs_stale_or_mismatched
if cd_jobs_stale_or_mismatched and not latest_cd_success
else "no_matching_runner_not_visible"
),
"readback": readback,
@@ -1014,7 +1063,11 @@ def build_readback(
"harbor_110_repair_waiting": harbor_110_repair_waiting,
"harbor_110_repair_running": harbor_110_repair_running,
"harbor_110_repair_failed": harbor_110_repair_failed,
"harbor_110_repair_historical_after_latest_cd_success": (
harbor_110_repair_historical_after_latest_cd_success
),
"harbor_110_repair_blocked": harbor_110_repair_blocked,
"harbor_110_repair_blocked_raw": harbor_110_repair_blocked_raw,
"harbor_110_repair_waiting_after_cd_harbor_blocker": (
harbor_110_repair_waiting_after_cd_harbor_blocker
),

View File

@@ -703,6 +703,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
expected_sources = [
"docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md)",
"docs/runbooks/FULL-STACK-COLD-START-SOP.md)",
"docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)",
"docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md)",
"ops/monitoring/alerts-unified.yml)",
"ops/monitoring/alerts.yml)",
@@ -725,6 +726,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
"scripts/reboot-recovery/full-stack-recovery-scorecard.sh)",
"scripts/reboot-recovery/awoooi-startup-110.sh)",
"scripts/reboot-recovery/harbor-watchdog.sh)",
"scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)",
"scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)",
"scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)",
"scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)",
@@ -750,6 +752,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
"../../scripts/reboot-recovery/momo-source-arrival-gate.py",
"../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh",
"../../scripts/reboot-recovery/harbor-watchdog.sh",
"../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh",
"../../scripts/reboot-recovery/awoooi-startup-110.sh",
"../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh",
"../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh",

View File

@@ -142,6 +142,20 @@ def _actions_html_cd_failed_harbor_repair_failed() -> str:
)
def _actions_html_cd_success_harbor_repair_failed() -> str:
    """Build Actions-page HTML for the "CD success / harbor repair failed" case.

    Starts from the "CD running / harbor repair waiting" fixture and rewrites
    it: the first "Running" tooltip becomes "Success", the first "Waiting"
    tooltip becomes "Failure", and the embedded numbers and commit title are
    swapped for the ones used by this scenario.
    """
    html = _actions_html_cd_running_harbor_repair_waiting()

    # Only the first occurrence of each status tooltip is rewritten.
    html = html.replace(
        'data-tooltip-content="Running"', 'data-tooltip-content="Success"', 1
    )
    html = html.replace(
        'data-tooltip-content="Waiting"', 'data-tooltip-content="Failure"', 1
    )

    # Presumably run numbers of the CD and harbor repair runs -- confirm
    # against the base fixture. Every occurrence is replaced.
    html = html.replace("4061", "4314")
    html = html.replace("4060", "4307")

    old_title = "fix(cd): keep harbor repair workflow on controlled profile"
    new_title = "feat(web): surface AI automation production proof"
    return html.replace(old_title, new_title)
def _actions_html_harbor_repair_waiting_with_workflow_no_matching() -> str:
return """
<div class="menu">
@@ -696,6 +710,49 @@ def test_harbor_ssh_command_path_ready_overrides_raw_publickey_stall() -> None:
assert classifier["failure_classifier"] == ""
def test_latest_cd_success_makes_old_harbor_repair_failure_historical() -> None:
    """An older failed harbor repair run must not block after a CD success.

    The raw failure evidence (failed flag, raw blocked flag, publickey-stall
    classifier) has to remain visible in the readback, while the effective
    blocked flags and the top-level status no longer report the repair run.
    """
    queue_module = _load_module()
    stalled_log = _harbor_110_repair_publickey_auth_stalled_log()
    result = queue_module.build_readback(
        actions_html=_actions_html_cd_success_harbor_repair_failed(),
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        harbor_110_repair_jobs_http_status=200,
        harbor_110_repair_jobs_payload=_harbor_110_repair_stale_code_review_jobs(),
        latest_harbor_110_repair_log_http_status=200,
        latest_harbor_110_repair_log_text=stalled_log,
    )

    readback = result["readback"]
    assert readback["latest_visible_cd_run_status"] == "Success"
    historical_key = (
        "latest_visible_harbor_110_repair_historical_after_latest_cd_success"
    )
    assert readback[historical_key] is True

    # Raw evidence of the old failure is preserved...
    assert readback["latest_visible_harbor_110_repair_failed"] is True
    assert readback["latest_visible_harbor_110_repair_blocked_raw"] is True
    # ...but the effective blocked flag is cleared.
    assert readback["latest_visible_harbor_110_repair_blocked"] is False
    stalled_key = (
        "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
    )
    assert readback[stalled_key] is True

    assert result["status"] == "no_matching_runner_not_visible"

    rollups = result["rollups"]
    assert rollups["harbor_110_repair_historical_after_latest_cd_success"] is True
    assert rollups["harbor_110_repair_blocked"] is False
    assert rollups["harbor_110_repair_blocked_raw"] is True
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
module = _load_module()
payload = module.build_readback(