fix(awooop): cooldown ansible check-mode transport blockers
This commit is contained in:
@@ -639,6 +639,15 @@ class Settings(BaseSettings):
|
||||
le=900,
|
||||
description="Delay before the check-mode worker first tick after API startup.",
|
||||
)
|
||||
AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS: int = Field(
|
||||
default=21_600,
|
||||
ge=300,
|
||||
le=86_400,
|
||||
description=(
|
||||
"Cooldown after transport-level check-mode blockers such as "
|
||||
"forced-command repair SSH denial."
|
||||
),
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# 統帥鐵律:禁止 SQLite (AWOOOI 憲法)
|
||||
|
||||
@@ -31,6 +31,7 @@ _SAFE_HOST_RE = re.compile(r"^[A-Za-z0-9_.-]+$")
|
||||
_PLAYBOOK_PREFIX = Path("infra/ansible/playbooks")
|
||||
_STDOUT_LIMIT = 20_000
|
||||
_STDERR_LIMIT = 12_000
|
||||
FORCED_COMMAND_BLOCKER = "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -80,6 +81,14 @@ def _json_loads(value: Any) -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
|
||||
def detect_ansible_transport_blockers(*values: Any) -> list[str]:
    """Scan check-mode output fragments for known transport-level blockers.

    Every positional value is stringified (falsy values such as ``None`` or
    ``""`` contribute an empty string) and the pieces are joined with single
    spaces before the marker search, so a marker can only match inside one
    fragment's text.

    Returns:
        A new list holding ``FORCED_COMMAND_BLOCKER`` when the
        ``REPAIR_DENIED:invalid_command`` marker is present, otherwise an
        empty list.
    """
    haystack = " ".join(str(value) if value else "" for value in values)
    if "REPAIR_DENIED:invalid_command" not in haystack:
        return []
    return [FORCED_COMMAND_BLOCKER]
|
||||
|
||||
|
||||
def _playbook_roots(module_path: Path | None = None) -> list[Path]:
|
||||
resolved_module_path = (module_path or Path(__file__)).resolve()
|
||||
return [
|
||||
@@ -387,6 +396,44 @@ async def claim_pending_check_modes(
|
||||
return claims
|
||||
|
||||
|
||||
async def recent_ansible_transport_blockers(
    *,
    project_id: str = "awoooi",
    cooldown_seconds: int | None = None,
) -> list[str]:
    """Return transport blockers observed from recent failed check-mode rows.

    Reads up to the 20 newest ``automation_operation_log`` rows with
    ``operation_type = 'ansible_check_mode_executed'`` and
    ``status = 'failed'`` created inside the cooldown window, and runs
    ``detect_ansible_transport_blockers`` over their output, dry-run result,
    error and stderr text.

    Args:
        project_id: Project whose database context is opened.
        cooldown_seconds: Look-back window in seconds. ``None`` uses
            ``settings.AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS``.
            Any explicit value, including ``0``, is clamped to at least 60.

    Returns:
        Sorted, de-duplicated blocker identifiers; empty when none were seen.
    """
    # Fall back to the configured cooldown only when the caller passed nothing.
    # A truthiness check would silently turn an explicit 0 into the (much
    # longer) configured default instead of letting the 60-second floor apply.
    if cooldown_seconds is None:
        cooldown = settings.AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS
    else:
        cooldown = cooldown_seconds

    # NOTE(review): the interval is bound as a string such as "21600 seconds";
    # confirm the configured DB driver accepts a str for a parameter that is
    # CAST to interval.
    window = f"{max(60, cooldown)} seconds"

    async with get_db_context(project_id) as db:
        result = await db.execute(
            text("""
                SELECT
                    coalesce(output::text, '') AS output_text,
                    coalesce(dry_run_result::text, '') AS dry_run_text,
                    coalesce(error, '') AS error_text,
                    coalesce(stderr_feed_back, '') AS stderr_text
                FROM automation_operation_log
                WHERE operation_type = 'ansible_check_mode_executed'
                  AND status = 'failed'
                  AND created_at >= NOW() - CAST(:cooldown AS interval)
                ORDER BY created_at DESC
                LIMIT 20
            """),
            {"cooldown": window},
        )
        # Materialize the rows while the database context is still open.
        rows = result.mappings().all()

    blockers: set[str] = set()
    for row in rows:
        blockers.update(
            detect_ansible_transport_blockers(
                row.get("output_text"),
                row.get("dry_run_text"),
                row.get("error_text"),
                row.get("stderr_text"),
            )
        )
    return sorted(blockers)
|
||||
|
||||
|
||||
async def _insert_skipped_candidate(
|
||||
db: Any,
|
||||
*,
|
||||
@@ -512,6 +559,10 @@ async def run_pending_check_modes_once(
|
||||
if blockers:
|
||||
logger.warning("ansible_check_mode_runtime_blocked", blockers=blockers)
|
||||
return {"claimed": 0, "completed": 0, "failed": 0, "blockers": blockers}
|
||||
transport_blockers = await recent_ansible_transport_blockers(project_id=project_id)
|
||||
if transport_blockers:
|
||||
logger.warning("ansible_check_mode_transport_blocked", blockers=transport_blockers)
|
||||
return {"claimed": 0, "completed": 0, "failed": 0, "blockers": transport_blockers}
|
||||
|
||||
claims = await claim_pending_check_modes(project_id=project_id, limit=limit)
|
||||
completed = 0
|
||||
|
||||
@@ -21,6 +21,7 @@ import structlog
|
||||
from sqlalchemy import text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
from src.services.awooop_ansible_check_mode_service import detect_ansible_transport_blockers
|
||||
from src.services.awooop_ansible_audit_service import build_ansible_truth
|
||||
from src.services.drift_repeat_state import build_drift_repeat_state
|
||||
|
||||
@@ -722,6 +723,28 @@ def _execution_backend_summary(records: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
return summary
|
||||
|
||||
|
||||
def _ansible_observed_runtime_blockers(records: list[dict[str, Any]]) -> list[str]:
    """Collect transport blockers seen in the records' Ansible check-mode rows.

    For each record, walks ``record["execution"]["ansible"]["records"]`` and
    ignores any level that is missing or has an unexpected type. Only dict
    rows whose ``operation_type`` is ``ansible_check_mode_executed`` are
    inspected; their ``error`` plus the ``stdout_tail`` / ``stderr_tail`` of
    ``dry_run_result`` are passed to ``detect_ansible_transport_blockers``.

    Returns:
        Sorted, de-duplicated blocker identifiers.
    """
    observed: set[str] = set()
    for record in records:
        execution = record.get("execution")
        if not isinstance(execution, dict):
            continue
        ansible = execution.get("ansible")
        if not isinstance(ansible, dict):
            continue
        check_rows = ansible.get("records")
        if not isinstance(check_rows, list):
            continue

        for entry in check_rows:
            if not isinstance(entry, dict):
                continue
            operation_type = str(entry.get("operation_type") or "")
            if operation_type != "ansible_check_mode_executed":
                continue

            # A missing or non-dict dry-run payload is treated as empty, so
            # only the row's error text is scanned.
            dry_run = entry.get("dry_run_result")
            if not isinstance(dry_run, dict):
                dry_run = {}

            found = detect_ansible_transport_blockers(
                entry.get("error"),
                dry_run.get("stdout_tail"),
                dry_run.get("stderr_tail"),
            )
            observed.update(found)
    return sorted(observed)
|
||||
|
||||
|
||||
def _ansible_playbook_roots(module_path: Path | None = None) -> list[Path]:
|
||||
resolved_module_path = (module_path or Path(__file__)).resolve()
|
||||
return [
|
||||
@@ -875,6 +898,15 @@ def summarize_automation_quality_records(
|
||||
key=lambda row: (-int(row["total"]), str(row["gate"])),
|
||||
)
|
||||
|
||||
ansible_runtime = _ansible_runtime_readiness()
|
||||
observed_ansible_blockers = _ansible_observed_runtime_blockers(records)
|
||||
if observed_ansible_blockers:
|
||||
ansible_runtime["observed_transport_blockers"] = observed_ansible_blockers
|
||||
ansible_runtime["blockers"] = sorted(
|
||||
set(ansible_runtime.get("blockers") or []) | set(observed_ansible_blockers)
|
||||
)
|
||||
ansible_runtime["can_run_check_mode"] = False
|
||||
|
||||
return {
|
||||
"schema_version": "automation_quality_summary_v1",
|
||||
"project_id": project_id,
|
||||
@@ -888,7 +920,7 @@ def summarize_automation_quality_records(
|
||||
"by_verdict": by_verdict,
|
||||
"gate_failures": failing_gates,
|
||||
"execution_backend_summary": _execution_backend_summary(records),
|
||||
"ansible_runtime": _ansible_runtime_readiness(),
|
||||
"ansible_runtime": ansible_runtime,
|
||||
"examples": examples[:25],
|
||||
"production_claim": {
|
||||
"can_claim_full_auto_repair": evaluated_total > 0 and verified_total == evaluated_total,
|
||||
|
||||
@@ -12,6 +12,7 @@ from src.services.awooop_ansible_audit_service import (
|
||||
from src.services.awooop_ansible_check_mode_service import (
|
||||
build_ansible_check_mode_claim_input,
|
||||
build_ansible_check_mode_command,
|
||||
detect_ansible_transport_blockers,
|
||||
)
|
||||
from src.services.awooop_truth_chain_service import (
|
||||
_ansible_playbook_roots,
|
||||
@@ -976,6 +977,14 @@ def test_ansible_check_mode_command_uses_check_diff_and_repair_ssh(tmp_path: Pat
|
||||
assert "apply" not in " ".join(spec.command)
|
||||
|
||||
|
||||
def test_ansible_transport_blocker_detects_repair_forced_command_denial() -> None:
    """The REPAIR_DENIED marker in output text yields the forced-command blocker."""
    failure_output = "fatal: host unreachable REPAIR_DENIED:invalid_command"
    expected = ["ansible_repair_ssh_forced_command_denies_ansible_bootstrap"]

    detected = detect_ansible_transport_blockers(failure_output)

    assert detected == expected
|
||||
|
||||
|
||||
def test_execution_backend_summary_subtracts_completed_check_mode_parent() -> None:
|
||||
summary = _execution_backend_summary([
|
||||
{
|
||||
@@ -1006,3 +1015,42 @@ def test_execution_backend_summary_subtracts_completed_check_mode_parent() -> No
|
||||
|
||||
assert summary["ansible_check_mode_total"] == 1
|
||||
assert summary["ansible_pending_check_mode_total"] == 0
|
||||
|
||||
|
||||
def test_quality_summary_marks_forced_command_denial_as_runtime_blocker() -> None:
    """A failed check-mode row carrying the denial marker blocks check mode.

    The quality summary must report ``can_run_check_mode`` as False and list
    the forced-command blocker under ``ansible_runtime["blockers"]``.
    """
    # One failed check-mode row whose stdout tail carries the denial marker.
    failed_check_mode_row = {
        "op_id": "check-1",
        "operation_type": "ansible_check_mode_executed",
        "status": "failed",
        "dry_run_result": {
            "stdout_tail": "REPAIR_DENIED:invalid_command",
        },
    }
    ansible_section = {
        "considered": True,
        "candidate_catalog": {"candidates": [{"catalog_id": "ansible:110-devops"}]},
        "records": [failed_check_mode_row],
    }
    record = {
        "incident": {"incident_id": "INC-1", "alertname": "DockerContainerUnhealthy"},
        "truth_status": {},
        "automation_quality": {"applicable": True, "score": 50, "verdict": "observed"},
        "execution": {
            "automation_operation_log": [],
            "auto_repair_executions": [],
            "ansible": ansible_section,
        },
    }

    summary = summarize_automation_quality_records(
        project_id="awoooi",
        window_hours=24,
        limit=20,
        records=[record],
    )

    ansible_runtime = summary["ansible_runtime"]
    assert ansible_runtime["can_run_check_mode"] is False
    assert (
        "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"
        in ansible_runtime["blockers"]
    )
|
||||
|
||||
@@ -111,6 +111,8 @@ spec:
|
||||
value: "180"
|
||||
- name: AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS
|
||||
value: "120"
|
||||
- name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS
|
||||
value: "21600"
|
||||
# 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
|
||||
volumeMounts:
|
||||
- name: repair-ssh-key
|
||||
|
||||
Reference in New Issue
Block a user