fix(runner): read harbor repair jobs truth
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 29s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-30 20:58:01 +08:00
parent 789e78a17f
commit f3a044d848
3 changed files with 198 additions and 3 deletions

View File

@@ -36,11 +36,12 @@
**照主線修正的問題**
- `ops/runner/read-public-gitea-actions-queue.py` 新增 `harbor-110-local-repair.yaml` 專用 readback：run id、status、kind、title、commit、waiting / running / blocked 與 no-matching runner label。
- Rollups 新增 `harbor_110_repair_run_visible`、`harbor_110_repair_run_status`、`harbor_110_repair_waiting`、`harbor_110_repair_running`、`harbor_110_repair_blocked`，避免 110 local repair workflow 只藏在 `top_visible_runs` 靠人眼判讀。
- Live readback 目前精準收斂：CD `#4061` failure、classifier=`harbor_registry_public_route_unavailable`、`registry_v2_status=502`、CD non110 self-heal skip=`not_110_host`；scheduled Harbor repair `#4060` visible 且 `Waiting`。
- 追加 Harbor 110 repair jobs API readback：`harbor_110_repair_jobs_total_count`、`harbor_110_repair_jobs_all_success`、runner labels、runner names 與 `harbor_110_repair_visible_waiting_stale`，避免 public HTML stale `Waiting` 蓋掉 jobs API truth。
- Live readback 目前精準收斂：CD `#4062` failure、classifier=`blocked_harbor_public_route_unavailable_after_harbor_110_repair_success`；scheduled Harbor repair `#4063` jobs `3/3 success`、label=`awoooi-host`、runner=`wooo-runner`，但 public / internal registry `/v2/` 仍為 `502`。
**驗證**
- `py_compile`、`ruff check`、`pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(`15 passed`)。
- `read-public-gitea-actions-queue.py --json` 成功讀出 `latest_visible_harbor_110_repair_run_id=4060`、`latest_visible_harbor_110_repair_run_status=Waiting`。
- `py_compile`、`ruff check`、`pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(`16 passed`)。
- `read-public-gitea-actions-queue.py --json` 成功讀出 `latest_visible_harbor_110_repair_run_id=4063`、`latest_visible_harbor_110_repair_run_status=Waiting`、`harbor_110_repair_jobs_all_success=true`、`harbor_110_repair_visible_waiting_stale=true`。
- `git diff --check` 通過。
**邊界**：只讀 public Gitea / 本機 source；未讀 secret / token / `.env` / raw sessions / SQLite / auth；未使用 GitHub / `gh` / GitHub API；未 workflow_dispatch；未 SSH / Docker / Nginx / K3s / DB / firewall runtime 寫入。

View File

@@ -168,6 +168,8 @@ def build_readback(
actions_list_payload: Any,
cd_jobs_http_status: int,
cd_jobs_payload: Any,
harbor_110_repair_jobs_http_status: int = 0,
harbor_110_repair_jobs_payload: Any | None = None,
latest_cd_build_log_http_status: int = 0,
latest_cd_build_log_text: str = "",
latest_cd_tests_log_http_status: int = 0,
@@ -191,10 +193,21 @@ def build_readback(
{},
)
cd_jobs = cd_jobs_payload if isinstance(cd_jobs_payload, dict) else {}
harbor_110_repair_jobs = (
harbor_110_repair_jobs_payload
if isinstance(harbor_110_repair_jobs_payload, dict)
else {}
)
actions_list = actions_list_payload if isinstance(actions_list_payload, dict) else {}
actions_list_message = str(actions_list.get("message") or "")
jobs_total_count = _int(cd_jobs.get("total_count"))
jobs = cd_jobs.get("jobs") if isinstance(cd_jobs.get("jobs"), list) else []
harbor_jobs_total_count = _int(harbor_110_repair_jobs.get("total_count"))
harbor_jobs = (
harbor_110_repair_jobs.get("jobs")
if isinstance(harbor_110_repair_jobs.get("jobs"), list)
else []
)
latest_cd_run_id = latest_cd_run.get("run_id", "")
latest_cd_commit_sha = latest_cd_run.get("commit_sha", "")
job_head_shas = sorted(
@@ -216,6 +229,25 @@ def build_readback(
if isinstance(job, dict):
conclusion = str(job.get("conclusion") or job.get("status") or "unknown")
job_conclusion_counts[conclusion] = job_conclusion_counts.get(conclusion, 0) + 1
harbor_job_conclusion_counts: dict[str, int] = {}
harbor_job_run_ids: set[str] = set()
harbor_job_labels: set[str] = set()
harbor_job_runner_names: set[str] = set()
for job in harbor_jobs:
if not isinstance(job, dict):
continue
conclusion = str(job.get("conclusion") or job.get("status") or "unknown")
harbor_job_conclusion_counts[conclusion] = (
harbor_job_conclusion_counts.get(conclusion, 0) + 1
)
if job.get("run_id") is not None:
harbor_job_run_ids.add(str(job.get("run_id")))
labels = job.get("labels")
if isinstance(labels, list):
harbor_job_labels.update(str(label) for label in labels if label)
runner_name = str(job.get("runner_name") or "")
if runner_name:
harbor_job_runner_names.add(runner_name)
cd_jobs_head_sha_matches_visible = (
bool(latest_cd_commit_sha)
and latest_cd_commit_sha in job_head_shas
@@ -237,9 +269,23 @@ def build_readback(
tests_log_classifier = classify_cd_tests_log(latest_cd_tests_log_text)
latest_cd_visible_blocked = latest_cd_run.get("status", "") == "Blocked"
harbor_110_repair_status = latest_harbor_110_repair_run.get("status", "")
harbor_110_repair_run_id = latest_harbor_110_repair_run.get("run_id", "")
harbor_110_repair_waiting = harbor_110_repair_status == "Waiting"
harbor_110_repair_running = harbor_110_repair_status == "Running"
harbor_110_repair_blocked = harbor_110_repair_status == "Blocked"
harbor_110_repair_jobs_run_id_matches_visible = (
bool(harbor_110_repair_run_id)
and harbor_110_repair_run_id in harbor_job_run_ids
)
harbor_110_repair_jobs_all_success = (
harbor_110_repair_jobs_http_status == 200
and harbor_jobs_total_count > 0
and harbor_job_conclusion_counts.get("success") == harbor_jobs_total_count
and harbor_110_repair_jobs_run_id_matches_visible
)
harbor_110_repair_visible_waiting_stale = (
harbor_110_repair_waiting and harbor_110_repair_jobs_all_success
)
harbor_110_repair_waiting_after_cd_harbor_blocker = (
build_log_classifier["harbor_public_route_blocked"]
and harbor_110_repair_waiting
@@ -319,6 +365,19 @@ def build_readback(
"harbor_110_repair_waiting_after_cd_harbor_blocker": (
harbor_110_repair_waiting_after_cd_harbor_blocker
),
"harbor_110_repair_jobs_http_status": harbor_110_repair_jobs_http_status,
"harbor_110_repair_jobs_total_count": harbor_jobs_total_count,
"harbor_110_repair_jobs_conclusion_counts": harbor_job_conclusion_counts,
"harbor_110_repair_jobs_run_ids": sorted(harbor_job_run_ids),
"harbor_110_repair_jobs_labels": sorted(harbor_job_labels),
"harbor_110_repair_jobs_runner_names": sorted(harbor_job_runner_names),
"harbor_110_repair_jobs_run_id_matches_visible": (
harbor_110_repair_jobs_run_id_matches_visible
),
"harbor_110_repair_jobs_all_success": harbor_110_repair_jobs_all_success,
"harbor_110_repair_visible_waiting_stale": (
harbor_110_repair_visible_waiting_stale
),
"latest_visible_cd_host_pressure_classifier": tests_log_classifier[
"host_pressure_classifier"
],
@@ -350,6 +409,13 @@ def build_readback(
if no_matching
else "blocked_latest_visible_cd_run"
if latest_cd_visible_blocked
else (
"blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
)
if (
build_log_classifier["harbor_public_route_blocked"]
and harbor_110_repair_jobs_all_success
)
else "blocked_harbor_110_repair_workflow_waiting"
if harbor_110_repair_waiting_after_cd_harbor_blocker
else "blocked_harbor_public_route_unavailable"
@@ -408,6 +474,14 @@ def build_readback(
"harbor_110_repair_no_matching_runner_label": (
latest_harbor_110_repair_run.get("no_matching_runner_label", "")
),
"harbor_110_repair_jobs_total_count": harbor_jobs_total_count,
"harbor_110_repair_jobs_all_success": (
harbor_110_repair_jobs_all_success
),
"harbor_110_repair_jobs_runner_names": sorted(harbor_job_runner_names),
"harbor_110_repair_visible_waiting_stale": (
harbor_110_repair_visible_waiting_stale
),
},
"operation_boundaries": {
"public_gitea_read_only": True,
@@ -571,6 +645,14 @@ def _human_summary(payload: dict[str, Any]) -> str:
"HARBOR_110_REPAIR_WAITING_AFTER_CD_HARBOR_BLOCKER="
f"{int(readback['harbor_110_repair_waiting_after_cd_harbor_blocker'])}"
),
(
"HARBOR_110_REPAIR_JOBS_ALL_SUCCESS="
f"{int(readback['harbor_110_repair_jobs_all_success'])}"
),
(
"HARBOR_110_REPAIR_VISIBLE_WAITING_STALE="
f"{int(readback['harbor_110_repair_visible_waiting_stale'])}"
),
"WRITE_PERFORMED=false",
"TOKEN_COLLECTED=false",
]
@@ -601,6 +683,8 @@ def main(argv: list[str] | None = None) -> int:
parser.add_argument("--actions-list-http-status", type=int)
parser.add_argument("--cd-run-jobs-json-file", type=Path)
parser.add_argument("--cd-run-jobs-http-status", type=int)
parser.add_argument("--harbor-110-repair-jobs-json-file", type=Path)
parser.add_argument("--harbor-110-repair-jobs-http-status", type=int)
parser.add_argument("--cd-build-job-log-file", type=Path)
parser.add_argument("--cd-build-job-log-http-status", type=int)
parser.add_argument("--cd-tests-job-log-file", type=Path)
@@ -650,6 +734,42 @@ def main(argv: list[str] | None = None) -> int:
cd_jobs_http_status = 0
cd_jobs_payload = {"jobs": [], "total_count": 0}
if args.harbor_110_repair_jobs_json_file:
harbor_110_repair_jobs_http_status = (
args.harbor_110_repair_jobs_http_status or 0
)
harbor_110_repair_jobs_payload = load_json_file(
args.harbor_110_repair_jobs_json_file
)
else:
visible_runs_for_harbor_jobs = parse_visible_runs(actions_html)
latest_harbor_110_repair_run_for_jobs = next(
(
run
for run in visible_runs_for_harbor_jobs
if run.get("workflow") == "harbor-110-local-repair.yaml"
),
{},
)
harbor_110_repair_jobs_api_url = derive_jobs_api_url(
args.actions_list_api_url,
latest_harbor_110_repair_run_for_jobs.get("run_id", ""),
)
if harbor_110_repair_jobs_api_url:
harbor_110_repair_jobs_read = fetch_public_url(
harbor_110_repair_jobs_api_url,
args.timeout_seconds,
)
harbor_110_repair_jobs_http_status = (
harbor_110_repair_jobs_read.http_status
)
harbor_110_repair_jobs_payload = load_json_text(
harbor_110_repair_jobs_read.text
)
else:
harbor_110_repair_jobs_http_status = 0
harbor_110_repair_jobs_payload = {"jobs": [], "total_count": 0}
if args.cd_build_job_log_file:
cd_build_job_log_http_status = args.cd_build_job_log_http_status or 0
cd_build_job_log_text = _read_text_file(args.cd_build_job_log_file)
@@ -716,6 +836,8 @@ def main(argv: list[str] | None = None) -> int:
actions_list_payload=actions_list_payload,
cd_jobs_http_status=cd_jobs_http_status,
cd_jobs_payload=cd_jobs_payload,
harbor_110_repair_jobs_http_status=harbor_110_repair_jobs_http_status,
harbor_110_repair_jobs_payload=harbor_110_repair_jobs_payload,
latest_cd_build_log_http_status=cd_build_job_log_http_status,
latest_cd_build_log_text=cd_build_job_log_text,
latest_cd_tests_log_http_status=cd_tests_job_log_http_status,

View File

@@ -164,6 +164,44 @@ def _harbor_blocked_log() -> str:
"""
def _harbor_110_repair_success_jobs() -> dict:
    """Return a jobs payload fixture in which all three jobs succeeded.

    Every job is ``completed`` with conclusion ``success``, carries the
    ``awoooi-host`` label, ran on ``wooo-runner``, and belongs to run 4060 at
    the same head SHA. Only ``id`` and ``name`` differ between the entries.
    """
    job_ids_and_names = (
        (5821, "tests"),
        (5822, "build-and-deploy"),
        (5823, "post-deploy-checks"),
    )
    job_entries = [
        {
            "id": job_id,
            "name": job_name,
            "status": "completed",
            "conclusion": "success",
            # A new list per entry so the jobs never share one labels object.
            "labels": ["awoooi-host"],
            "runner_name": "wooo-runner",
            "run_id": 4060,
            "head_sha": "7c8bb3645bdf1fa5ac1aaa7041c237fce8c19c0e",
        }
        for job_id, job_name in job_ids_and_names
    ]
    return {"total_count": len(job_entries), "jobs": job_entries}
def _host_pressure_waiting_log() -> str:
return """
2026-06-30T11:48:41.7864172Z ⏳ host web/build/smoke pressure detected (attempt 1/60); waiting 10s
@@ -250,6 +288,40 @@ def test_build_readback_surfaces_harbor_110_repair_waiting_run() -> None:
assert payload["operation_boundaries"]["workflow_dispatch_performed"] is False
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
    """Check the status reported when Harbor repair jobs all succeeded.

    The readback is built from a Harbor-blocked CD build log plus a repair
    jobs payload in which every job succeeded. The expected status is the
    dedicated "after_harbor_110_repair_success" blocker, and the visible
    repair run is expected to be flagged as stale ``Waiting``.
    """
    module = _load_module()
    # Only the first "Running" tooltip is rewritten to "Failure"; presumably
    # that is the CD run's row in the fixture - confirm against the helper.
    actions_html_with_failure = (
        _actions_html_cd_running_harbor_repair_waiting().replace(
            'data-tooltip-content="Running"',
            'data-tooltip-content="Failure"',
            1,
        )
    )
    payload = module.build_readback(
        actions_html=actions_html_with_failure,
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        harbor_110_repair_jobs_http_status=200,
        harbor_110_repair_jobs_payload=_harbor_110_repair_success_jobs(),
        latest_cd_build_log_http_status=200,
        latest_cd_build_log_text=_harbor_blocked_log(),
    )

    expected_status = (
        "blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
    )
    assert payload["status"] == expected_status

    readback = payload["readback"]
    assert readback["latest_visible_harbor_110_repair_run_id"] == "4060"
    assert readback["harbor_110_repair_jobs_total_count"] == 3
    assert readback["harbor_110_repair_jobs_all_success"] is True
    assert readback["harbor_110_repair_visible_waiting_stale"] is True
    assert readback["harbor_110_repair_jobs_labels"] == ["awoooi-host"]
    assert readback["harbor_110_repair_jobs_runner_names"] == ["wooo-runner"]

    rollups = payload["rollups"]
    assert rollups["harbor_110_repair_jobs_all_success"] is True
    assert rollups["harbor_110_repair_visible_waiting_stale"] is True

    assert payload["operation_boundaries"]["host_write_performed"] is False
def test_build_readback_classifies_harbor_public_route_blocker() -> None:
module = _load_module()
payload = module.build_readback(