diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index cd7c98ed..84015f90 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -36,11 +36,12 @@ **照主線修正的問題**: - `ops/runner/read-public-gitea-actions-queue.py` 新增 `harbor-110-local-repair.yaml` 專用 readback:run id、status、kind、title、commit、waiting / running / blocked 與 no-matching runner label。 - Rollups 新增 `harbor_110_repair_run_visible`、`harbor_110_repair_run_status`、`harbor_110_repair_waiting`、`harbor_110_repair_running`、`harbor_110_repair_blocked`,避免 110 local repair workflow 只藏在 `top_visible_runs` 靠人眼判讀。 -- Live readback 目前精準收斂:CD `#4061` failure,classifier=`harbor_registry_public_route_unavailable`、`registry_v2_status=502`、CD non110 self-heal skip=`not_110_host`;scheduled Harbor repair `#4060` visible 且 `Waiting`。 +- 追加 Harbor 110 repair jobs API readback:`harbor_110_repair_jobs_total_count`、`harbor_110_repair_jobs_all_success`、runner labels、runner names 與 `harbor_110_repair_visible_waiting_stale`,避免 public HTML stale `Waiting` 蓋掉 jobs API truth。 +- Live readback 目前精準收斂:CD `#4062` failure,classifier=`blocked_harbor_public_route_unavailable_after_harbor_110_repair_success`;scheduled Harbor repair `#4063` jobs `3/3 success`、label=`awoooi-host`、runner=`wooo-runner`,但 public / internal registry `/v2/` 仍為 `502`。 **驗證**: -- `py_compile`、`ruff check`、`pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(`15 passed`)。 -- `read-public-gitea-actions-queue.py --json` 成功讀出 `latest_visible_harbor_110_repair_run_id=4060`、`latest_visible_harbor_110_repair_run_status=Waiting`。 +- `py_compile`、`ruff check`、`pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(`16 passed`)。 +- `read-public-gitea-actions-queue.py --json` 成功讀出 `latest_visible_harbor_110_repair_run_id=4063`、`latest_visible_harbor_110_repair_run_status=Waiting`、`harbor_110_repair_jobs_all_success=true`、`harbor_110_repair_visible_waiting_stale=true`。 - `git diff --check` 通過。 **邊界**:只讀 public Gitea / 本機 source;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH / Docker / Nginx / K3s / DB / firewall runtime 寫入。 diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index 83b86762..73215651 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -168,6 +168,8 @@ def build_readback( actions_list_payload: Any, cd_jobs_http_status: int, cd_jobs_payload: Any, + harbor_110_repair_jobs_http_status: int = 0, + harbor_110_repair_jobs_payload: Any | None = None, latest_cd_build_log_http_status: int = 0, latest_cd_build_log_text: str = "", latest_cd_tests_log_http_status: int = 0, @@ -191,10 +193,21 @@ def build_readback( {}, ) cd_jobs = cd_jobs_payload if isinstance(cd_jobs_payload, dict) else {} + harbor_110_repair_jobs = ( + harbor_110_repair_jobs_payload + if isinstance(harbor_110_repair_jobs_payload, dict) + else {} + ) actions_list = actions_list_payload if isinstance(actions_list_payload, dict) else {} actions_list_message = str(actions_list.get("message") or "") jobs_total_count = _int(cd_jobs.get("total_count")) jobs = cd_jobs.get("jobs") if isinstance(cd_jobs.get("jobs"), list) else [] + harbor_jobs_total_count = _int(harbor_110_repair_jobs.get("total_count")) + harbor_jobs = ( + harbor_110_repair_jobs.get("jobs") + if isinstance(harbor_110_repair_jobs.get("jobs"), list) + else [] + ) latest_cd_run_id = latest_cd_run.get("run_id", "") latest_cd_commit_sha = latest_cd_run.get("commit_sha", "") job_head_shas = sorted( @@ -216,6 +229,25 @@ def build_readback( if isinstance(job, dict): conclusion = str(job.get("conclusion") or job.get("status") or "unknown") job_conclusion_counts[conclusion] = job_conclusion_counts.get(conclusion, 0) + 1 + harbor_job_conclusion_counts: dict[str, int] = {} + harbor_job_run_ids: set[str] = set() + harbor_job_labels: set[str] = set() + harbor_job_runner_names: set[str] = set() + for job in harbor_jobs: + if not isinstance(job, dict): + continue + conclusion = str(job.get("conclusion") or job.get("status") or "unknown") + harbor_job_conclusion_counts[conclusion] = ( + harbor_job_conclusion_counts.get(conclusion, 0) + 1 + ) + if job.get("run_id") is not None: + harbor_job_run_ids.add(str(job.get("run_id"))) + labels = job.get("labels") + if isinstance(labels, list): + harbor_job_labels.update(str(label) for label in labels if label) + runner_name = str(job.get("runner_name") or "") + if runner_name: + harbor_job_runner_names.add(runner_name) cd_jobs_head_sha_matches_visible = ( bool(latest_cd_commit_sha) and latest_cd_commit_sha in job_head_shas @@ -237,9 +269,23 @@ def build_readback( tests_log_classifier = classify_cd_tests_log(latest_cd_tests_log_text) latest_cd_visible_blocked = latest_cd_run.get("status", "") == "Blocked" harbor_110_repair_status = latest_harbor_110_repair_run.get("status", "") + harbor_110_repair_run_id = latest_harbor_110_repair_run.get("run_id", "") harbor_110_repair_waiting = harbor_110_repair_status == "Waiting" harbor_110_repair_running = harbor_110_repair_status == "Running" harbor_110_repair_blocked = harbor_110_repair_status == "Blocked" + harbor_110_repair_jobs_run_id_matches_visible = ( + bool(harbor_110_repair_run_id) + and harbor_110_repair_run_id in harbor_job_run_ids + ) + harbor_110_repair_jobs_all_success = ( + harbor_110_repair_jobs_http_status == 200 + and harbor_jobs_total_count > 0 + and harbor_job_conclusion_counts.get("success") == harbor_jobs_total_count + and harbor_110_repair_jobs_run_id_matches_visible + ) + harbor_110_repair_visible_waiting_stale = ( + harbor_110_repair_waiting and harbor_110_repair_jobs_all_success + ) harbor_110_repair_waiting_after_cd_harbor_blocker = ( build_log_classifier["harbor_public_route_blocked"] and harbor_110_repair_waiting @@ -319,6 +365,19 @@ def build_readback( "harbor_110_repair_waiting_after_cd_harbor_blocker": ( harbor_110_repair_waiting_after_cd_harbor_blocker ), + "harbor_110_repair_jobs_http_status": harbor_110_repair_jobs_http_status, + "harbor_110_repair_jobs_total_count": harbor_jobs_total_count, + "harbor_110_repair_jobs_conclusion_counts": harbor_job_conclusion_counts, + "harbor_110_repair_jobs_run_ids": sorted(harbor_job_run_ids), + "harbor_110_repair_jobs_labels": sorted(harbor_job_labels), + "harbor_110_repair_jobs_runner_names": sorted(harbor_job_runner_names), + "harbor_110_repair_jobs_run_id_matches_visible": ( + harbor_110_repair_jobs_run_id_matches_visible + ), + "harbor_110_repair_jobs_all_success": harbor_110_repair_jobs_all_success, + "harbor_110_repair_visible_waiting_stale": ( + harbor_110_repair_visible_waiting_stale + ), "latest_visible_cd_host_pressure_classifier": tests_log_classifier[ "host_pressure_classifier" ], @@ -350,6 +409,13 @@ def build_readback( if no_matching else "blocked_latest_visible_cd_run" if latest_cd_visible_blocked + else ( + "blocked_harbor_public_route_unavailable_after_harbor_110_repair_success" + ) + if ( + build_log_classifier["harbor_public_route_blocked"] + and harbor_110_repair_jobs_all_success + ) else "blocked_harbor_110_repair_workflow_waiting" if harbor_110_repair_waiting_after_cd_harbor_blocker else "blocked_harbor_public_route_unavailable" @@ -408,6 +474,14 @@ def build_readback( "harbor_110_repair_no_matching_runner_label": ( latest_harbor_110_repair_run.get("no_matching_runner_label", "") ), + "harbor_110_repair_jobs_total_count": harbor_jobs_total_count, + "harbor_110_repair_jobs_all_success": ( + harbor_110_repair_jobs_all_success + ), + "harbor_110_repair_jobs_runner_names": sorted(harbor_job_runner_names), + "harbor_110_repair_visible_waiting_stale": ( + harbor_110_repair_visible_waiting_stale + ), }, "operation_boundaries": { "public_gitea_read_only": True, @@ -571,6 +645,14 @@ def _human_summary(payload: dict[str, Any]) -> str: "HARBOR_110_REPAIR_WAITING_AFTER_CD_HARBOR_BLOCKER=" f"{int(readback['harbor_110_repair_waiting_after_cd_harbor_blocker'])}" ), + ( + "HARBOR_110_REPAIR_JOBS_ALL_SUCCESS=" + f"{int(readback['harbor_110_repair_jobs_all_success'])}" + ), + ( + "HARBOR_110_REPAIR_VISIBLE_WAITING_STALE=" + f"{int(readback['harbor_110_repair_visible_waiting_stale'])}" + ), "WRITE_PERFORMED=false", "TOKEN_COLLECTED=false", ] @@ -601,6 +683,8 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--actions-list-http-status", type=int) parser.add_argument("--cd-run-jobs-json-file", type=Path) parser.add_argument("--cd-run-jobs-http-status", type=int) + parser.add_argument("--harbor-110-repair-jobs-json-file", type=Path) + parser.add_argument("--harbor-110-repair-jobs-http-status", type=int) parser.add_argument("--cd-build-job-log-file", type=Path) parser.add_argument("--cd-build-job-log-http-status", type=int) parser.add_argument("--cd-tests-job-log-file", type=Path) @@ -650,6 +734,42 @@ def main(argv: list[str] | None = None) -> int: cd_jobs_http_status = 0 cd_jobs_payload = {"jobs": [], "total_count": 0} + if args.harbor_110_repair_jobs_json_file: + harbor_110_repair_jobs_http_status = ( + args.harbor_110_repair_jobs_http_status or 0 + ) + harbor_110_repair_jobs_payload = load_json_file( + args.harbor_110_repair_jobs_json_file + ) + else: + visible_runs_for_harbor_jobs = parse_visible_runs(actions_html) + latest_harbor_110_repair_run_for_jobs = next( + ( + run + for run in visible_runs_for_harbor_jobs + if run.get("workflow") == "harbor-110-local-repair.yaml" + ), + {}, + ) + harbor_110_repair_jobs_api_url = derive_jobs_api_url( + args.actions_list_api_url, + latest_harbor_110_repair_run_for_jobs.get("run_id", ""), + ) + if harbor_110_repair_jobs_api_url: + harbor_110_repair_jobs_read = fetch_public_url( + harbor_110_repair_jobs_api_url, + args.timeout_seconds, + ) + harbor_110_repair_jobs_http_status = ( + harbor_110_repair_jobs_read.http_status + ) + harbor_110_repair_jobs_payload = load_json_text( + harbor_110_repair_jobs_read.text + ) + else: + harbor_110_repair_jobs_http_status = 0 + harbor_110_repair_jobs_payload = {"jobs": [], "total_count": 0} + if args.cd_build_job_log_file: cd_build_job_log_http_status = args.cd_build_job_log_http_status or 0 cd_build_job_log_text = _read_text_file(args.cd_build_job_log_file) @@ -716,6 +836,8 @@ def main(argv: list[str] | None = None) -> int: actions_list_payload=actions_list_payload, cd_jobs_http_status=cd_jobs_http_status, cd_jobs_payload=cd_jobs_payload, + harbor_110_repair_jobs_http_status=harbor_110_repair_jobs_http_status, + harbor_110_repair_jobs_payload=harbor_110_repair_jobs_payload, latest_cd_build_log_http_status=cd_build_job_log_http_status, latest_cd_build_log_text=cd_build_job_log_text, latest_cd_tests_log_http_status=cd_tests_job_log_http_status, diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index 14d3600a..a153b6a9 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -164,6 +164,44 @@ def _harbor_blocked_log() -> str: """ +def _harbor_110_repair_success_jobs() -> dict: + return { + "total_count": 3, + "jobs": [ + { + "id": 5821, + "name": "tests", + "status": "completed", + "conclusion": "success", + "labels": ["awoooi-host"], + "runner_name": "wooo-runner", + "run_id": 4060, + "head_sha": "7c8bb3645bdf1fa5ac1aaa7041c237fce8c19c0e", + }, + { + "id": 5822, + "name": "build-and-deploy", + "status": "completed", + "conclusion": "success", + "labels": ["awoooi-host"], + "runner_name": "wooo-runner", + "run_id": 4060, + "head_sha": "7c8bb3645bdf1fa5ac1aaa7041c237fce8c19c0e", + }, + { + "id": 5823, + "name": "post-deploy-checks", + "status": "completed", + "conclusion": "success", + "labels": ["awoooi-host"], + "runner_name": "wooo-runner", + "run_id": 4060, + "head_sha": "7c8bb3645bdf1fa5ac1aaa7041c237fce8c19c0e", + }, + ], + } + + def _host_pressure_waiting_log() -> str: return """ 2026-06-30T11:48:41.7864172Z ⏳ host web/build/smoke pressure detected (attempt 1/60); waiting 10s @@ -250,6 +288,40 @@ def test_build_readback_surfaces_harbor_110_repair_waiting_run() -> None: assert payload["operation_boundaries"]["workflow_dispatch_performed"] is False +def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None: + module = _load_module() + payload = module.build_readback( + actions_html=_actions_html_cd_running_harbor_repair_waiting().replace( + 'data-tooltip-content="Running"', + 'data-tooltip-content="Failure"', + 1, + ), + actions_list_http_status=401, + actions_list_payload={"message": "token is required"}, + cd_jobs_http_status=200, + cd_jobs_payload={"jobs": [], "total_count": 0}, + harbor_110_repair_jobs_http_status=200, + harbor_110_repair_jobs_payload=_harbor_110_repair_success_jobs(), + latest_cd_build_log_http_status=200, + latest_cd_build_log_text=_harbor_blocked_log(), + ) + + assert payload["status"] == ( + "blocked_harbor_public_route_unavailable_after_harbor_110_repair_success" + ) + assert payload["readback"]["latest_visible_harbor_110_repair_run_id"] == "4060" + assert payload["readback"]["harbor_110_repair_jobs_total_count"] == 3 + assert payload["readback"]["harbor_110_repair_jobs_all_success"] is True + assert payload["readback"]["harbor_110_repair_visible_waiting_stale"] is True + assert payload["readback"]["harbor_110_repair_jobs_labels"] == ["awoooi-host"] + assert payload["readback"]["harbor_110_repair_jobs_runner_names"] == [ + "wooo-runner" + ] + assert payload["rollups"]["harbor_110_repair_jobs_all_success"] is True + assert payload["rollups"]["harbor_110_repair_visible_waiting_stale"] is True + assert payload["operation_boundaries"]["host_write_performed"] is False + + def test_build_readback_classifies_harbor_public_route_blocker() -> None: module = _load_module() payload = module.build_readback(