diff --git a/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py b/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py index 913f9c31..9d97d345 100644 --- a/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py +++ b/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py @@ -614,6 +614,14 @@ def _queue_readback_normalizer_contract() -> list[dict[str, Any]]: ], "learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"], }, + { + "field_id": "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", + "purpose": "classify whether 110 accepts the SSH key and then times out during session, PAM, account, or shell setup", + "writes_blockers": [ + "gitea_queue_harbor_110_remote_ssh_server_accepts_key_then_session_timeout", + ], + "learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"], + }, { "field_id": "current_cd_workflow_runner_readiness", "purpose": "classify whether current CD is waiting on a non110 runner label", diff --git a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py index d5fac736..47ae0987 100644 --- a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py +++ b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py @@ -37,7 +37,7 @@ def _assert_log_controlled_writeback_executor(payload: dict): assert payload["rollups"]["current_blocker_control_path_blocked_count"] == 1 assert ( payload["rollups"]["current_blocker_control_path_pressure_blocked_count"] - == 1 + == 0 ) assert payload["rollups"]["current_blocker_local_recovery_package_count"] == 1 assert payload["rollups"]["runtime_dispatch_performed"] is False @@ -75,11 +75,11 @@ def _assert_log_controlled_writeback_executor(payload: dict): "forbidden_runtime_actions" ] assert current_queue["external_control_path_blocker"] == ( - "remote_ssh_publickey_auth_stalled" + "remote_ssh_server_accepts_key_then_session_timeout" ) - assert current_queue["node_load_classifier"] == "high_load" - assert current_queue["node_load_high"] is True - assert current_queue["control_path_pressure_blocker"] == "node_load_high" + assert current_queue["node_load_classifier"] == "load_not_high" + assert current_queue["node_load_high"] is False + assert current_queue["control_path_pressure_blocker"] == "" assert context["raw_payload_required"] is False boundaries = payload["operation_boundaries"] diff --git a/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py b/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py index 78e806fd..6210e8f3 100644 --- a/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py +++ b/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py @@ -67,7 +67,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): assert payload["rollups"]["current_blocker_control_path_blocked_count"] == 1 assert ( payload["rollups"]["current_blocker_control_path_pressure_blocked_count"] - == 1 + == 0 ) assert payload["rollups"]["current_blocker_local_recovery_package_count"] == 1 assert ( @@ -129,8 +129,8 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): "controlled_recovery_packaged_waiting_control_path_readback" ) assert current_queue[0]["runner_label"] == "awoooi-non110-host" - assert current_queue[0]["node_load_classifier"] == "high_load" - assert current_queue[0]["node_load_high"] is True + assert current_queue[0]["node_load_classifier"] == "load_not_high" + assert current_queue[0]["node_load_high"] is False assert current_queue[0]["registry_v2_status"] == 502 assert current_queue[0]["controlled_recovery_package"] == ( "recover-110-control-path-and-harbor-local.sh --check" @@ -187,7 +187,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): ] for item in current_queue[0]["harbor_recovery_receipt_output_contract"] ) - assert current_queue[0]["queue_readback_normalizer_contract_count"] == 7 + assert current_queue[0]["queue_readback_normalizer_contract_count"] == 8 assert [ item["field_id"] for item in current_queue[0]["queue_readback_normalizer_contract"] @@ -197,6 +197,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): "latest_visible_harbor_110_repair_no_matching_runner_label", "latest_visible_harbor_110_repair_remote_control_channel_unavailable", "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled", + "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", "current_cd_workflow_runner_readiness", "controlled_profile_no_matching_runner_labels", ] @@ -237,9 +238,9 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): ) assert current_queue[0]["runtime_apply_required_on_110_local_console"] is True assert current_queue[0]["external_control_path_blocker"] == ( - "remote_ssh_publickey_auth_stalled" + "remote_ssh_server_accepts_key_then_session_timeout" ) - assert current_queue[0]["control_path_pressure_blocker"] == "node_load_high" + assert current_queue[0]["control_path_pressure_blocker"] == "" assert current_queue[0]["control_channel_readback_required"] is True assert set(current_queue[0]["learning_writeback_targets"]) == set(batches) assert len(current_queue[0]["target_batches"]) == 6 diff --git a/apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py b/apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py index 0f3ce68e..276c1486 100644 --- a/apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py +++ b/apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py @@ -167,7 +167,7 @@ def _assert_controlled_writeback_plan(payload: dict, *, public_endpoint: bool = assert { plan["current_blocker_recovery"]["external_control_path_blocker"] for plan in p0_plans - } == {"remote_ssh_publickey_auth_stalled"} + } == {"remote_ssh_server_accepts_key_then_session_timeout"} boundaries = payload["operation_boundaries"] assert boundaries["plan_readback_only"] is True diff --git a/apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py b/apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py index 45d4dbed..853c734e 100644 --- a/apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py +++ b/apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py @@ -107,11 +107,17 @@ def _assert_log_intelligence_payload(payload: dict): "harbor_110_remote_ssh_publickey_auth_stalled" ) assert p0_sample["classification"]["ssh_auth_classification"] == ( - "remote_ssh_publickey_auth_stalled" + "remote_ssh_server_accepts_key_then_session_timeout" ) assert p0_sample["classification"]["remote_control_channel"] == "unavailable" assert p0_sample["classification"]["bounded_ssh_timeout_seen"] is True assert p0_sample["classification"]["remote_ssh_publickey_auth_stalled"] is True + assert ( + p0_sample["classification"][ + "remote_ssh_server_accepts_key_then_session_timeout" + ] + is True + ) assert p0_sample["classification"]["remote_ssh_auth_permission_denied"] is False assert p0_sample["classification"]["remote_ssh_reachable"] is True assert p0_sample["classification"]["controlled_recovery_package"] == ( @@ -131,7 +137,13 @@ def _assert_log_intelligence_payload(payload: dict): "verify_harbor_queue_and_controlled_cd_lane", ] assert recovery_plan[0]["mode"] == "read_only" + assert recovery_plan[0]["expected_result"] == ( + "remote_ssh_server_accepts_key_then_session_timeout_classifier_without_secret_material" + ) assert recovery_plan[2]["mode"] == "controlled_apply" + assert recovery_plan[2]["expected_result"] == ( + "ssh_session_pam_account_or_shell_path_repaired_without_key_material_read" + ) assert recovery_plan[2]["runtime_write_allowed_only_on_110_local_console"] is True assert "read-public-gitea-actions-queue.py --json" in p0_sample[ "classification" diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index 5f6e7660..56bcdef7 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -261,12 +261,10 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu "harbor_110_remote_ssh_publickey_auth_stalled" ) assert state["ai_loop_current_blocker_control_path_blocker"] == ( - "remote_ssh_publickey_auth_stalled" + "remote_ssh_server_accepts_key_then_session_timeout" ) - assert state["ai_loop_current_blocker_control_path_pressure_blocker"] == ( - "node_load_high" - ) - assert state["ai_loop_current_blocker_node_load_classifier"] == "high_load" + assert state["ai_loop_current_blocker_control_path_pressure_blocker"] == "" + assert state["ai_loop_current_blocker_node_load_classifier"] == "load_not_high" assert evidence["ai_loop_current_blocker_execution_queue_count"] == 1 assert evidence["ai_loop_current_blocker_id"] == ( "harbor_110_remote_ssh_publickey_auth_stalled" @@ -283,12 +281,10 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu "controlled_after_110_local_console_preflight" ) assert evidence["ai_loop_current_blocker_control_path_blocker"] == ( - "remote_ssh_publickey_auth_stalled" + "remote_ssh_server_accepts_key_then_session_timeout" ) - assert evidence["ai_loop_current_blocker_control_path_pressure_blocker"] == ( - "node_load_high" - ) - assert evidence["ai_loop_current_blocker_node_load_classifier"] == "high_load" + assert evidence["ai_loop_current_blocker_control_path_pressure_blocker"] == "" + assert evidence["ai_loop_current_blocker_node_load_classifier"] == "load_not_high" assert set(evidence["ai_loop_current_blocker_learning_writeback_targets"]) == { "km", "rag", @@ -369,6 +365,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu "latest_visible_harbor_110_repair_no_matching_runner_label", "latest_visible_harbor_110_repair_remote_control_channel_unavailable", "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled", + "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", "current_cd_workflow_runner_readiness", "controlled_profile_no_matching_runner_labels", ] @@ -394,11 +391,9 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu assert payload["summary"]["ai_loop_current_blocker_id"] == ( "harbor_110_remote_ssh_publickey_auth_stalled" ) - assert payload["summary"]["ai_loop_current_blocker_control_path_pressure_blocker"] == ( - "node_load_high" - ) + assert payload["summary"]["ai_loop_current_blocker_control_path_pressure_blocker"] == "" assert payload["summary"]["ai_loop_current_blocker_node_load_classifier"] == ( - "high_load" + "load_not_high" ) assert payload["summary"]["ai_loop_current_blocker_local_console_phase_count"] == 5 assert payload["summary"][ @@ -429,7 +424,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu payload["summary"][ "ai_loop_current_blocker_queue_readback_normalizer_contract_count" ] - == 7 + == 8 ) assert payload["summary"][ "ai_loop_current_blocker_queue_readback_normalizer_field_ids" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index dcf12dfa..8a0c1b38 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -12,6 +12,20 @@ **邊界**:只改 110 local no-secret metadata checker、receipt parser、tests 與 LOGBOOK;未讀 authorized_keys 內容、secret / token / `.env` / raw sessions / SQLite / auth;未讀 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。 +## 2026-07-01 — 12:46 AI Loop SSH session-path blocker 細分 + +**照主線修正的問題**: +- 最新 110 read-only 診斷顯示 SSH port / banner 可達、`wooo` publickey 已進到 `server_accepts_key_then_timeout`,目前 `NODE_LOAD_CLASSIFIER=load_not_high`;這不是 no-matching runner,也不是當下 CPU pressure,而是 key accepted 後的 session / PAM / account / shell path stall。 +- `ops/runner/read-public-gitea-actions-queue.py` 新增 `remote_ssh_server_accepts_key_then_session_timeout` classifier 與 readback / rollup 欄位,讓 Gitea queue 不再只留下泛稱 `remote_ssh_publickey_auth_stalled`。 +- AI Loop runtime sample / controlled writeback executor / priority work order 同步新子分類:`ssh_auth_classification=remote_ssh_server_accepts_key_then_session_timeout`、`node_load_classifier=load_not_high`、`control_path_pressure_blocker=""`,避免把目前 P0 blocker 錯歸因成 CPU pressure。 +- queue normalizer contract 新增 `latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout`,KM / RAG / PlayBook / MCP / verifier / AI Agent 都會收到這個更精準 blocker。 + +**驗證**: +- `DATABASE_URL=... PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py apps/api/tests/test_ai_agent_log_feedback_receipt_dry_run_api.py apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py apps/api/tests/test_ai_agent_autonomous_runtime_control.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`81 passed`。 +- `python3.11 -m json.tool docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json >/dev/null`、`py_compile`、`python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js`、`git diff --check`:通過。 + +**邊界**:只改 queue classifier、AI Loop metadata/readback/tests 與 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 110 runtime apply。 + ## 2026-07-01 — 12:35 110 CPU / control-plane live readback **照主線釐清的問題**: diff --git a/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json b/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json index 083741b5..c59a7ea8 100644 --- a/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json +++ b/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json @@ -85,11 +85,12 @@ "harbor_110_remote_control_channel_unavailable", "harbor_110_remote_ssh_reachable", "harbor_110_remote_ssh_publickey_auth_stalled", + "harbor_110_remote_ssh_server_accepts_key_then_session_timeout", "harbor_110_remote_ssh_auth_permission_denied_false", "harbor_110_repair_jobs_payload_stale_or_cross_workflow", "bounded_ssh_timeout_seen", "controlled_profile_no_matching_runner_cleared", - "node_load_high", + "node_load_not_high", "controlled_lane_verifier_packaged", "local_recovery_package_packaged" ], @@ -101,6 +102,7 @@ "ssh_auth_classification", "remote_control_channel", "remote_ssh_publickey_auth_stalled", + "remote_ssh_server_accepts_key_then_session_timeout", "remote_ssh_auth_permission_denied", "harbor_110_repair_failure_classifier", "harbor_110_repair_jobs_payload_classifier", @@ -116,15 +118,16 @@ "risk_tier": "high", "runner_label": "awoooi-non110-host", "current_blocker": "harbor_110_remote_ssh_publickey_auth_stalled", - "ssh_auth_classification": "remote_ssh_publickey_auth_stalled", + "ssh_auth_classification": "remote_ssh_server_accepts_key_then_session_timeout", "remote_control_channel": "unavailable", "bounded_ssh_timeout_seen": true, "remote_ssh_reachable": true, "remote_ssh_publickey_auth_stalled": true, + "remote_ssh_server_accepts_key_then_session_timeout": true, "remote_ssh_auth_permission_denied": false, "harbor_110_repair_failure_classifier": "harbor_110_remote_ssh_publickey_auth_stalled", "harbor_110_repair_jobs_payload_stale_or_cross_workflow": true, - "node_load_classifier": "high_load", + "node_load_classifier": "load_not_high", "registry_v2_status": 502, "controlled_recovery_package": "recover-110-control-path-and-harbor-local.sh --check", "post_apply_verifier": "check-awoooi-110-controlled-cd-lane-readiness.sh", @@ -133,7 +136,7 @@ "phase_id": "diagnose_remote_control_channel", "mode": "read_only", "command": "read-public-gitea-actions-queue.py --json", - "expected_result": "remote_ssh_publickey_auth_stalled_classifier_without_secret_material", + "expected_result": "remote_ssh_server_accepts_key_then_session_timeout_classifier_without_secret_material", "runtime_write_allowed_only_on_110_local_console": false }, { @@ -147,7 +150,7 @@ "phase_id": "repair_ssh_service_or_metadata_if_check_confirms_drift", "mode": "controlled_apply", "command": "recover-110-control-path-and-harbor-local.sh --apply-ssh-control-path", - "expected_result": "ssh_control_path_repaired_without_key_material_read", + "expected_result": "ssh_session_pam_account_or_shell_path_repaired_without_key_material_read", "runtime_write_allowed_only_on_110_local_console": true }, { diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index a8cd799f..4ff8fe91 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -655,6 +655,11 @@ def build_readback( "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), + "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": ( + harbor_110_repair_log_classifier[ + "remote_ssh_server_accepts_key_then_session_timeout" + ] + ), "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": ( harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] ), @@ -926,6 +931,11 @@ def build_readback( "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), + "harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": ( + harbor_110_repair_log_classifier[ + "remote_ssh_server_accepts_key_then_session_timeout" + ] + ), "harbor_110_repair_remote_ssh_auth_permission_denied": ( harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] ), @@ -1076,6 +1086,14 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: and remote_ssh_publickey_reply_timeout_seen is True ) ) + remote_ssh_server_accepts_key_then_session_timeout = ( + "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text + or "classification=server_accepts_key_then_timeout" in text + or ( + ("Server accepts key" in text or "server_accepts_key_then_timeout" in text) + and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None + ) + ) remote_ssh_auth_permission_denied = _last_bool_marker( "harbor_110_remote_ssh_auth_permission_denied", text, @@ -1136,6 +1154,9 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: remote_ssh_publickey_reply_timeout_seen ), "remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled, + "remote_ssh_server_accepts_key_then_session_timeout": ( + remote_ssh_server_accepts_key_then_session_timeout + ), "remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied, "local_registry_v2_status": local_status, "public_registry_v2_status": public_status, diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index bfcf5f5b..68daabab 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -296,6 +296,8 @@ harbor_110_remote_ssh_userauth_service_accept_seen=true harbor_110_remote_ssh_publickey_offered=true harbor_110_remote_ssh_publickey_reply_timeout_seen=true harbor_110_remote_ssh_publickey_auth_stalled=true +harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true +SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110 harbor_110_remote_ssh_auth_permission_denied=false harbor_110_remote_ssh_diag_raw_log_printed=false @@ -751,6 +753,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non ] is True ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout" + ] + is True + ) assert ( payload["readback"][ "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen" @@ -773,6 +781,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"] is True ) + assert ( + payload["rollups"][ + "harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout" + ] + is True + ) assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False assert payload["operation_boundaries"]["host_write_performed"] is False