fix(agent): normalize ssh session timeout blocker
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 35s
CD Pipeline / build-and-deploy (push) Failing after 27s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 12:46:19 +08:00
parent 7019d4a005
commit ce5bcab8b5
10 changed files with 101 additions and 33 deletions

View File

@@ -614,6 +614,14 @@ def _queue_readback_normalizer_contract() -> list[dict[str, Any]]:
],
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
},
{
"field_id": "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"purpose": "classify whether 110 accepts the SSH key and then times out during session, PAM, account, or shell setup",
"writes_blockers": [
"gitea_queue_harbor_110_remote_ssh_server_accepts_key_then_session_timeout",
],
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
},
{
"field_id": "current_cd_workflow_runner_readiness",
"purpose": "classify whether current CD is waiting on a non110 runner label",

View File

@@ -37,7 +37,7 @@ def _assert_log_controlled_writeback_executor(payload: dict):
assert payload["rollups"]["current_blocker_control_path_blocked_count"] == 1
assert (
payload["rollups"]["current_blocker_control_path_pressure_blocked_count"]
== 1
== 0
)
assert payload["rollups"]["current_blocker_local_recovery_package_count"] == 1
assert payload["rollups"]["runtime_dispatch_performed"] is False
@@ -75,11 +75,11 @@ def _assert_log_controlled_writeback_executor(payload: dict):
"forbidden_runtime_actions"
]
assert current_queue["external_control_path_blocker"] == (
"remote_ssh_publickey_auth_stalled"
"remote_ssh_server_accepts_key_then_session_timeout"
)
assert current_queue["node_load_classifier"] == "high_load"
assert current_queue["node_load_high"] is True
assert current_queue["control_path_pressure_blocker"] == "node_load_high"
assert current_queue["node_load_classifier"] == "load_not_high"
assert current_queue["node_load_high"] is False
assert current_queue["control_path_pressure_blocker"] == ""
assert context["raw_payload_required"] is False
boundaries = payload["operation_boundaries"]

View File

@@ -67,7 +67,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
assert payload["rollups"]["current_blocker_control_path_blocked_count"] == 1
assert (
payload["rollups"]["current_blocker_control_path_pressure_blocked_count"]
== 1
== 0
)
assert payload["rollups"]["current_blocker_local_recovery_package_count"] == 1
assert (
@@ -129,8 +129,8 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
"controlled_recovery_packaged_waiting_control_path_readback"
)
assert current_queue[0]["runner_label"] == "awoooi-non110-host"
assert current_queue[0]["node_load_classifier"] == "high_load"
assert current_queue[0]["node_load_high"] is True
assert current_queue[0]["node_load_classifier"] == "load_not_high"
assert current_queue[0]["node_load_high"] is False
assert current_queue[0]["registry_v2_status"] == 502
assert current_queue[0]["controlled_recovery_package"] == (
"recover-110-control-path-and-harbor-local.sh --check"
@@ -187,7 +187,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
]
for item in current_queue[0]["harbor_recovery_receipt_output_contract"]
)
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 7
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 8
assert [
item["field_id"]
for item in current_queue[0]["queue_readback_normalizer_contract"]
@@ -197,6 +197,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
"latest_visible_harbor_110_repair_no_matching_runner_label",
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"current_cd_workflow_runner_readiness",
"controlled_profile_no_matching_runner_labels",
]
@@ -237,9 +238,9 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
)
assert current_queue[0]["runtime_apply_required_on_110_local_console"] is True
assert current_queue[0]["external_control_path_blocker"] == (
"remote_ssh_publickey_auth_stalled"
"remote_ssh_server_accepts_key_then_session_timeout"
)
assert current_queue[0]["control_path_pressure_blocker"] == "node_load_high"
assert current_queue[0]["control_path_pressure_blocker"] == ""
assert current_queue[0]["control_channel_readback_required"] is True
assert set(current_queue[0]["learning_writeback_targets"]) == set(batches)
assert len(current_queue[0]["target_batches"]) == 6

View File

@@ -167,7 +167,7 @@ def _assert_controlled_writeback_plan(payload: dict, *, public_endpoint: bool =
assert {
plan["current_blocker_recovery"]["external_control_path_blocker"]
for plan in p0_plans
} == {"remote_ssh_publickey_auth_stalled"}
} == {"remote_ssh_server_accepts_key_then_session_timeout"}
boundaries = payload["operation_boundaries"]
assert boundaries["plan_readback_only"] is True

View File

@@ -107,11 +107,17 @@ def _assert_log_intelligence_payload(payload: dict):
"harbor_110_remote_ssh_publickey_auth_stalled"
)
assert p0_sample["classification"]["ssh_auth_classification"] == (
"remote_ssh_publickey_auth_stalled"
"remote_ssh_server_accepts_key_then_session_timeout"
)
assert p0_sample["classification"]["remote_control_channel"] == "unavailable"
assert p0_sample["classification"]["bounded_ssh_timeout_seen"] is True
assert p0_sample["classification"]["remote_ssh_publickey_auth_stalled"] is True
assert (
p0_sample["classification"][
"remote_ssh_server_accepts_key_then_session_timeout"
]
is True
)
assert p0_sample["classification"]["remote_ssh_auth_permission_denied"] is False
assert p0_sample["classification"]["remote_ssh_reachable"] is True
assert p0_sample["classification"]["controlled_recovery_package"] == (
@@ -131,7 +137,13 @@ def _assert_log_intelligence_payload(payload: dict):
"verify_harbor_queue_and_controlled_cd_lane",
]
assert recovery_plan[0]["mode"] == "read_only"
assert recovery_plan[0]["expected_result"] == (
"remote_ssh_server_accepts_key_then_session_timeout_classifier_without_secret_material"
)
assert recovery_plan[2]["mode"] == "controlled_apply"
assert recovery_plan[2]["expected_result"] == (
"ssh_session_pam_account_or_shell_path_repaired_without_key_material_read"
)
assert recovery_plan[2]["runtime_write_allowed_only_on_110_local_console"] is True
assert "read-public-gitea-actions-queue.py --json" in p0_sample[
"classification"

View File

@@ -261,12 +261,10 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
"harbor_110_remote_ssh_publickey_auth_stalled"
)
assert state["ai_loop_current_blocker_control_path_blocker"] == (
"remote_ssh_publickey_auth_stalled"
"remote_ssh_server_accepts_key_then_session_timeout"
)
assert state["ai_loop_current_blocker_control_path_pressure_blocker"] == (
"node_load_high"
)
assert state["ai_loop_current_blocker_node_load_classifier"] == "high_load"
assert state["ai_loop_current_blocker_control_path_pressure_blocker"] == ""
assert state["ai_loop_current_blocker_node_load_classifier"] == "load_not_high"
assert evidence["ai_loop_current_blocker_execution_queue_count"] == 1
assert evidence["ai_loop_current_blocker_id"] == (
"harbor_110_remote_ssh_publickey_auth_stalled"
@@ -283,12 +281,10 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
"controlled_after_110_local_console_preflight"
)
assert evidence["ai_loop_current_blocker_control_path_blocker"] == (
"remote_ssh_publickey_auth_stalled"
"remote_ssh_server_accepts_key_then_session_timeout"
)
assert evidence["ai_loop_current_blocker_control_path_pressure_blocker"] == (
"node_load_high"
)
assert evidence["ai_loop_current_blocker_node_load_classifier"] == "high_load"
assert evidence["ai_loop_current_blocker_control_path_pressure_blocker"] == ""
assert evidence["ai_loop_current_blocker_node_load_classifier"] == "load_not_high"
assert set(evidence["ai_loop_current_blocker_learning_writeback_targets"]) == {
"km",
"rag",
@@ -369,6 +365,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
"latest_visible_harbor_110_repair_no_matching_runner_label",
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"current_cd_workflow_runner_readiness",
"controlled_profile_no_matching_runner_labels",
]
@@ -394,11 +391,9 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
assert payload["summary"]["ai_loop_current_blocker_id"] == (
"harbor_110_remote_ssh_publickey_auth_stalled"
)
assert payload["summary"]["ai_loop_current_blocker_control_path_pressure_blocker"] == (
"node_load_high"
)
assert payload["summary"]["ai_loop_current_blocker_control_path_pressure_blocker"] == ""
assert payload["summary"]["ai_loop_current_blocker_node_load_classifier"] == (
"high_load"
"load_not_high"
)
assert payload["summary"]["ai_loop_current_blocker_local_console_phase_count"] == 5
assert payload["summary"][
@@ -429,7 +424,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
payload["summary"][
"ai_loop_current_blocker_queue_readback_normalizer_contract_count"
]
== 7
== 8
)
assert payload["summary"][
"ai_loop_current_blocker_queue_readback_normalizer_field_ids"

View File

@@ -12,6 +12,20 @@
**邊界**:只改 110 local no-secret metadata checker、receipt parser、tests 與 LOGBOOK;未讀 authorized_keys 內容、secret / token / `.env` / raw sessions / SQLite / auth;未讀 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。
## 2026-07-01 — 12:46 AI Loop SSH session-path blocker 細分
**照主線修正的問題**
- 最新 110 read-only 診斷顯示 SSH port / banner 可達、`wooo` publickey 已進到 `server_accepts_key_then_timeout`,目前 `NODE_LOAD_CLASSIFIER=load_not_high`;這不是 no-matching runner,也不是當下 CPU pressure,而是 key accepted 後的 session / PAM / account / shell path stall。
- `ops/runner/read-public-gitea-actions-queue.py` 新增 `remote_ssh_server_accepts_key_then_session_timeout` classifier 與 readback / rollup 欄位,讓 Gitea queue 不再只留下泛稱 `remote_ssh_publickey_auth_stalled`。
- AI Loop runtime sample / controlled writeback executor / priority work order 同步新子分類:`ssh_auth_classification=remote_ssh_server_accepts_key_then_session_timeout`、`node_load_classifier=load_not_high`、`control_path_pressure_blocker=""`,避免把目前 P0 blocker 錯歸因成 CPU pressure。
- queue normalizer contract 新增 `latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout`;KM / RAG / PlayBook / MCP / verifier / AI Agent 都會收到這個更精準 blocker。
**驗證**
- `DATABASE_URL=... PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py apps/api/tests/test_ai_agent_log_feedback_receipt_dry_run_api.py apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py apps/api/tests/test_ai_agent_autonomous_runtime_control.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`81 passed`。
- `python3.11 -m json.tool docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json >/dev/null`、`py_compile`、`python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js`、`git diff --check`:通過。
**邊界**:只改 queue classifier、AI Loop metadata/readback/tests 與 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 110 runtime apply。
## 2026-07-01 — 12:35 110 CPU / control-plane live readback
**照主線釐清的問題**

View File

@@ -85,11 +85,12 @@
"harbor_110_remote_control_channel_unavailable",
"harbor_110_remote_ssh_reachable",
"harbor_110_remote_ssh_publickey_auth_stalled",
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout",
"harbor_110_remote_ssh_auth_permission_denied_false",
"harbor_110_repair_jobs_payload_stale_or_cross_workflow",
"bounded_ssh_timeout_seen",
"controlled_profile_no_matching_runner_cleared",
"node_load_high",
"node_load_not_high",
"controlled_lane_verifier_packaged",
"local_recovery_package_packaged"
],
@@ -101,6 +102,7 @@
"ssh_auth_classification",
"remote_control_channel",
"remote_ssh_publickey_auth_stalled",
"remote_ssh_server_accepts_key_then_session_timeout",
"remote_ssh_auth_permission_denied",
"harbor_110_repair_failure_classifier",
"harbor_110_repair_jobs_payload_classifier",
@@ -116,15 +118,16 @@
"risk_tier": "high",
"runner_label": "awoooi-non110-host",
"current_blocker": "harbor_110_remote_ssh_publickey_auth_stalled",
"ssh_auth_classification": "remote_ssh_publickey_auth_stalled",
"ssh_auth_classification": "remote_ssh_server_accepts_key_then_session_timeout",
"remote_control_channel": "unavailable",
"bounded_ssh_timeout_seen": true,
"remote_ssh_reachable": true,
"remote_ssh_publickey_auth_stalled": true,
"remote_ssh_server_accepts_key_then_session_timeout": true,
"remote_ssh_auth_permission_denied": false,
"harbor_110_repair_failure_classifier": "harbor_110_remote_ssh_publickey_auth_stalled",
"harbor_110_repair_jobs_payload_stale_or_cross_workflow": true,
"node_load_classifier": "high_load",
"node_load_classifier": "load_not_high",
"registry_v2_status": 502,
"controlled_recovery_package": "recover-110-control-path-and-harbor-local.sh --check",
"post_apply_verifier": "check-awoooi-110-controlled-cd-lane-readiness.sh",
@@ -133,7 +136,7 @@
"phase_id": "diagnose_remote_control_channel",
"mode": "read_only",
"command": "read-public-gitea-actions-queue.py --json",
"expected_result": "remote_ssh_publickey_auth_stalled_classifier_without_secret_material",
"expected_result": "remote_ssh_server_accepts_key_then_session_timeout_classifier_without_secret_material",
"runtime_write_allowed_only_on_110_local_console": false
},
{
@@ -147,7 +150,7 @@
"phase_id": "repair_ssh_service_or_metadata_if_check_confirms_drift",
"mode": "controlled_apply",
"command": "recover-110-control-path-and-harbor-local.sh --apply-ssh-control-path",
"expected_result": "ssh_control_path_repaired_without_key_material_read",
"expected_result": "ssh_session_pam_account_or_shell_path_repaired_without_key_material_read",
"runtime_write_allowed_only_on_110_local_console": true
},
{

View File

@@ -655,6 +655,11 @@ def build_readback(
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": (
harbor_110_repair_log_classifier[
"remote_ssh_server_accepts_key_then_session_timeout"
]
),
"latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": (
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
),
@@ -926,6 +931,11 @@ def build_readback(
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
"harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": (
harbor_110_repair_log_classifier[
"remote_ssh_server_accepts_key_then_session_timeout"
]
),
"harbor_110_repair_remote_ssh_auth_permission_denied": (
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
),
@@ -1076,6 +1086,14 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
and remote_ssh_publickey_reply_timeout_seen is True
)
)
remote_ssh_server_accepts_key_then_session_timeout = (
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text
or "classification=server_accepts_key_then_timeout" in text
or (
("Server accepts key" in text or "server_accepts_key_then_timeout" in text)
and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None
)
)
remote_ssh_auth_permission_denied = _last_bool_marker(
"harbor_110_remote_ssh_auth_permission_denied",
text,
@@ -1136,6 +1154,9 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
remote_ssh_publickey_reply_timeout_seen
),
"remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled,
"remote_ssh_server_accepts_key_then_session_timeout": (
remote_ssh_server_accepts_key_then_session_timeout
),
"remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied,
"local_registry_v2_status": local_status,
"public_registry_v2_status": public_status,

View File

@@ -296,6 +296,8 @@ harbor_110_remote_ssh_userauth_service_accept_seen=true
harbor_110_remote_ssh_publickey_offered=true
harbor_110_remote_ssh_publickey_reply_timeout_seen=true
harbor_110_remote_ssh_publickey_auth_stalled=true
harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true
SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout
BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110
harbor_110_remote_ssh_auth_permission_denied=false
harbor_110_remote_ssh_diag_raw_log_printed=false
@@ -751,6 +753,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
]
is True
)
assert (
payload["readback"][
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"
]
is True
)
assert (
payload["readback"][
"latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen"
@@ -773,6 +781,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
is True
)
assert (
payload["rollups"][
"harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"
]
is True
)
assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False
assert payload["operation_boundaries"]["host_write_performed"] is False