# awoooi/apps/api/tests/test_agent_step_timeouts.py

"""
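Agent step timeout split + metric tests
=======================================
# 2026-04-27 Claude Sonnet 4.6: A1 — three-way timeout split + step metric (North Star §1.2 Observable by Default)

Scope:
1. The default timeouts of the three agents are correct (Diagnostician=30 / Solver=20 / Critic=15),
   and so is the agent-debate global timeout (420).
2. Environment overrides take effect (monkeypatch simulates different deployment configurations).
3. The Histogram metric is observed exactly once on the success / timeout paths.
4. The histogram buckets include the three per-agent timeout boundaries.

Note: timeout behaviour is simulated by patching asyncio.wait_for so that it raises
      asyncio.TimeoutError. This is consistent with feedback_no_mock_testing: the tests
      exercise timing behaviour, not LLM inference.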
"""

from __future__ import annotations

import asyncio
import importlib
import sys
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from prometheus_client import CollectorRegistry, Histogram


# =============================================================================
# Section 1: Timeout Default 值正確性
# =============================================================================

class TestTimeoutDefaults:
    """Default timeout values when no environment override is present.

    Covers the three per-agent step timeouts (30 / 20 / 15 seconds), the
    agent-debate global timeout (420 seconds) and the deprecated
    ``PHASE2_STEP_TIMEOUT_SEC`` alias exposed by each agent module.
    """

    @staticmethod
    def _reimport(module_name: str):
        """Import ``module_name`` from scratch and return the new module object.

        The timeout constants are expected to be read from the environment
        while the module body runs, so the cached ``sys.modules`` entry is
        dropped to force a fresh execution. One fresh import is sufficient;
        running the body a second time through ``importlib.reload`` would only
        repeat the same work.

        NOTE(review): the re-imported module stays in ``sys.modules`` after the
        test finishes; nothing here restores the previous module object.
        """
        sys.modules.pop(module_name, None)
        return importlib.import_module(module_name)

    def test_diagnostician_default_timeout_is_30(self, monkeypatch):
        """Diagnostician default timeout is 30.0s (heaviest NIM user, largest budget)."""
        # Remove any value that leaked in from the surrounding environment.
        monkeypatch.delenv("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.diagnostician_agent")

        assert mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC == 30.0, (
            f"Diagnostician default timeout 期望 30.0，實際 {mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC}"
        )

    def test_solver_default_timeout_is_20(self, monkeypatch):
        """Solver default timeout is 20.0s (medium-sized prompt)."""
        monkeypatch.delenv("AGENT_SOLVER_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.solver_agent")

        assert mod.AGENT_SOLVER_TIMEOUT_SEC == 20.0, (
            f"Solver default timeout 期望 20.0，實際 {mod.AGENT_SOLVER_TIMEOUT_SEC}"
        )

    def test_critic_default_timeout_is_15(self, monkeypatch):
        """Critic default timeout is 15.0s (shortest output; leaves budget for the other two)."""
        monkeypatch.delenv("AGENT_CRITIC_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.critic_agent")

        assert mod.AGENT_CRITIC_TIMEOUT_SEC == 15.0, (
            f"Critic default timeout 期望 15.0，實際 {mod.AGENT_CRITIC_TIMEOUT_SEC}"
        )

    def test_agent_debate_global_timeout_default_is_420(self, monkeypatch):
        """Agent debate global timeout defaults to the direct GCP qwen3 budget."""
        monkeypatch.delenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.services.agent_orchestrator")

        assert mod.GLOBAL_TIMEOUT_SEC == 420.0

    def test_deprecated_alias_matches_new_constant_diagnostician(self, monkeypatch):
        """PHASE2_STEP_TIMEOUT_SEC must equal AGENT_DIAGNOSTICIAN_TIMEOUT_SEC (compatibility guarantee)."""
        monkeypatch.delenv("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.diagnostician_agent")

        assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC

    def test_deprecated_alias_matches_new_constant_solver(self, monkeypatch):
        """PHASE2_STEP_TIMEOUT_SEC must equal AGENT_SOLVER_TIMEOUT_SEC (compatibility guarantee)."""
        monkeypatch.delenv("AGENT_SOLVER_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.solver_agent")

        assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_SOLVER_TIMEOUT_SEC

    def test_deprecated_alias_matches_new_constant_critic(self, monkeypatch):
        """PHASE2_STEP_TIMEOUT_SEC must equal AGENT_CRITIC_TIMEOUT_SEC (compatibility guarantee)."""
        monkeypatch.delenv("AGENT_CRITIC_TIMEOUT_SEC", raising=False)

        mod = self._reimport("src.agents.critic_agent")

        assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_CRITIC_TIMEOUT_SEC


# =============================================================================
# Section 2: env override 生效
# =============================================================================

class TestEnvOverride:
    """Environment variables override the defaults.

    Simulates tuning the timeouts through deployment configuration such as a
    K8s ConfigMap: each test sets a variable, re-imports the module and checks
    the resulting module-level constant.
    """

    @staticmethod
    def _reimport(module_name: str):
        """Import ``module_name`` from scratch and return the new module object.

        Dropping the cached ``sys.modules`` entry forces the module body to run
        again, so the constants are recomputed from the environment prepared by
        the test. A single fresh import is enough; no additional
        ``importlib.reload`` is needed.

        NOTE(review): the module imported with the overridden value stays in
        ``sys.modules`` after monkeypatch restores the environment; nothing
        here re-imports it with the original settings.
        """
        sys.modules.pop(module_name, None)
        return importlib.import_module(module_name)

    def test_diagnostician_env_override(self, monkeypatch):
        """AGENT_DIAGNOSTICIAN_TIMEOUT_SEC=45.0 replaces the default 30.0."""
        monkeypatch.setenv("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", "45.0")

        mod = self._reimport("src.agents.diagnostician_agent")

        assert mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC == 45.0, (
            f"env override 期望 45.0，實際 {mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC}"
        )

    def test_solver_env_override(self, monkeypatch):
        """AGENT_SOLVER_TIMEOUT_SEC=25.0 replaces the default 20.0."""
        monkeypatch.setenv("AGENT_SOLVER_TIMEOUT_SEC", "25.0")

        mod = self._reimport("src.agents.solver_agent")

        assert mod.AGENT_SOLVER_TIMEOUT_SEC == 25.0

    def test_critic_env_override(self, monkeypatch):
        """AGENT_CRITIC_TIMEOUT_SEC=10.0 replaces the default 15.0."""
        monkeypatch.setenv("AGENT_CRITIC_TIMEOUT_SEC", "10.0")

        mod = self._reimport("src.agents.critic_agent")

        assert mod.AGENT_CRITIC_TIMEOUT_SEC == 10.0

    def test_env_override_integer_string(self, monkeypatch):
        """An integer-looking value (no decimal point) must still end up as a float."""
        monkeypatch.setenv("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", "60")

        mod = self._reimport("src.agents.diagnostician_agent")

        assert mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC == 60.0
        assert isinstance(mod.AGENT_DIAGNOSTICIAN_TIMEOUT_SEC, float)

    def test_env_override_updates_deprecated_alias(self, monkeypatch):
        """The compatibility alias PHASE2_STEP_TIMEOUT_SEC follows the overridden value."""
        monkeypatch.setenv("AGENT_CRITIC_TIMEOUT_SEC", "8.0")

        mod = self._reimport("src.agents.critic_agent")

        assert mod.PHASE2_STEP_TIMEOUT_SEC == 8.0
        assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_CRITIC_TIMEOUT_SEC

    def test_agent_debate_global_timeout_env_override(self, monkeypatch):
        """AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=300 replaces the default 420.0."""
        monkeypatch.setenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "300")

        mod = self._reimport("src.services.agent_orchestrator")

        assert mod.GLOBAL_TIMEOUT_SEC == 300.0


# =============================================================================
# Section 3: Metric Histogram observe 驗證
# =============================================================================

class TestAgentStepMetrics:
    """
    Checks around the aiops_agent_step_duration_seconds histogram.

    The synchronous tests observe values on a throw-away Histogram bound to its
    own CollectorRegistry, so the global REGISTRY is never touched and no
    "Duplicated timeseries" error can occur across tests; they then read back
    the _count / _sum samples.  The async tests patch observe_agent_step inside
    the agent modules and assert how the agents call it.
    """

    def _make_isolated_histogram(self) -> tuple[Histogram, CollectorRegistry]:
        """Return a labelled Histogram together with the private registry that owns it."""
        private_registry = CollectorRegistry()
        histogram = Histogram(
            "aiops_agent_step_duration_seconds_test",
            "test histogram",
            ["agent", "outcome"],
            registry=private_registry,
            buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0],
        )
        return histogram, private_registry

    def _get_sample_value(
        self,
        registry: CollectorRegistry,
        metric_name: str,
        labels: dict,
        suffix: str = "_count",
    ) -> float:
        """Return the sample ``metric_name + suffix`` carrying exactly ``labels``.

        Falls back to 0.0 when the registry holds no matching sample.
        """
        sample_name = metric_name + suffix
        for family in registry.collect():
            if family.name != metric_name:
                continue
            for sample in family.samples:
                if sample.labels == labels and sample.name == sample_name:
                    return sample.value
        return 0.0

    def test_observe_agent_step_success(self):
        """A single success observation yields _count == 1 and _sum == 1.5."""
        hist, registry = self._make_isolated_histogram()
        metric = "aiops_agent_step_duration_seconds_test"
        labels = {"agent": "diagnostician", "outcome": "success"}

        # Observe on the isolated histogram; the global REGISTRY is not involved.
        hist.labels(**labels).observe(1.5)

        count = self._get_sample_value(registry, metric, labels, "_count")
        total = self._get_sample_value(registry, metric, labels, "_sum")

        assert count == 1.0, f"expect _count=1, got {count}"
        assert total == pytest.approx(1.5), f"expect _sum=1.5, got {total}"

    def test_observe_agent_step_timeout(self):
        """A single timeout observation yields _count == 1."""
        hist, registry = self._make_isolated_histogram()
        labels = {"agent": "solver", "outcome": "timeout"}

        hist.labels(**labels).observe(20.1)

        count = self._get_sample_value(
            registry, "aiops_agent_step_duration_seconds_test", labels, "_count"
        )
        assert count == 1.0, f"expect _count=1 for timeout, got {count}"

    def test_observe_agent_step_error(self):
        """A single error observation yields _count == 1."""
        hist, registry = self._make_isolated_histogram()
        labels = {"agent": "critic", "outcome": "error"}

        hist.labels(**labels).observe(0.05)

        count = self._get_sample_value(
            registry, "aiops_agent_step_duration_seconds_test", labels, "_count"
        )
        assert count == 1.0, f"expect _count=1 for error, got {count}"

    def test_observe_multiple_agents_independent(self):
        """Observations for three agents land in separate label sets and do not interfere."""
        hist, registry = self._make_isolated_histogram()
        metric = "aiops_agent_step_duration_seconds_test"
        observations = [
            ("diagnostician", "success", 2.0),
            ("solver", "success", 3.0),
            ("critic", "timeout", 15.5),
        ]

        for agent_name, outcome, seconds in observations:
            hist.labels(agent=agent_name, outcome=outcome).observe(seconds)

        # Read each label set back in the same order it was recorded.
        diag_count, solver_count, critic_count = (
            self._get_sample_value(
                registry, metric, {"agent": agent_name, "outcome": outcome}, "_count"
            )
            for agent_name, outcome, _ in observations
        )

        assert diag_count == 1.0
        assert solver_count == 1.0
        assert critic_count == 1.0

    @pytest.mark.asyncio
    async def test_observe_called_on_success_via_mock(self):
        """
        The Diagnostician success path reports observe_agent_step("diagnostician", "success", ...).

        Strategy: get_openclaw is patched so that openclaw.call returns a valid
        JSON payload, and observe_agent_step is patched inside the agent module;
        the test then checks that it was called exactly once with
        outcome="success".  Only the transport-level response is faked; the
        agent's own handling of the response is not mocked.
        """
        import src.agents.diagnostician_agent as diag_mod

        fake_response = '{"hypotheses": [{"description": "CPU 高", "confidence": 0.8, "evidence_chain": [], "category": "HostCpuHigh"}]}'
        snapshot = MagicMock(
            snapshot_id="test-snap-001",
            evidence_summary="CPU 95%",
            anomaly_context=None,
        )
        openclaw_stub = MagicMock()
        openclaw_stub.call = AsyncMock(return_value=(fake_response, "nim", True))

        observe_patch = patch("src.agents.diagnostician_agent.observe_agent_step")
        openclaw_patch = patch("src.services.openclaw.get_openclaw")
        with observe_patch as mock_observe, openclaw_patch as mock_get_openclaw:
            mock_get_openclaw.return_value = openclaw_stub
            await diag_mod.DiagnosticianAgent()._analyze(snapshot)

        mock_observe.assert_called_once()
        call_args = mock_observe.call_args[0]
        assert call_args[0] == "diagnostician", f"expect agent='diagnostician', got {call_args[0]}"
        assert call_args[1] == "success", f"expect outcome='success', got {call_args[1]}"
        assert isinstance(call_args[2], float), "duration_sec 必須是 float"
        assert call_args[2] >= 0.0, "duration_sec 不能為負"

    @pytest.mark.asyncio
    async def test_observe_called_on_timeout_via_mock(self):
        """
        The Diagnostician timeout path reports observe_agent_step("diagnostician", "timeout", ...).

        Strategy: asyncio.wait_for, reached through the agent module, is patched
        to raise asyncio.TimeoutError; the test checks that observe_agent_step
        was called once with outcome="timeout" and that the returned report is
        flagged as degraded.
        """
        import src.agents.diagnostician_agent as diag_mod

        snapshot = MagicMock(
            snapshot_id="test-snap-timeout",
            evidence_summary="NIM 無回應",
            anomaly_context=None,
        )

        observe_patch = patch("src.agents.diagnostician_agent.observe_agent_step")
        wait_for_patch = patch(
            "src.agents.diagnostician_agent.asyncio.wait_for",
            side_effect=asyncio.TimeoutError(),
        )
        with observe_patch as mock_observe, wait_for_patch:
            result = await diag_mod.DiagnosticianAgent()._analyze(snapshot)

        mock_observe.assert_called_once()
        call_args = mock_observe.call_args[0]
        assert call_args[0] == "diagnostician"
        assert call_args[1] == "timeout"
        # The agent is expected to hand back a degraded report.
        assert result.degraded is True

    @pytest.mark.asyncio
    async def test_observe_called_on_solver_success(self):
        """The Solver success path reports observe_agent_step("solver", "success", ...)."""
        import src.agents.solver_agent as solver_mod
        from src.agents.protocol import AgentVote, DiagnosisReport, Hypothesis

        hypothesis = Hypothesis(
            description="CPU 高負載",
            confidence=0.85,
            evidence_chain=[],
            category="HostCpuHigh",
        )
        fake_diag = DiagnosisReport(
            hypotheses=[hypothesis],
            evidence_snapshot_id="snap-solver-001",
            latency_ms=0,
            vote=AgentVote.APPROVE,
        )
        fake_response = '{"candidates": [{"action": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod", "blast_radius": 10, "rollback_cost": 5, "confidence": 0.8, "rationale": "重啟清除碎片"}]}'
        openclaw_stub = MagicMock()
        openclaw_stub.call = AsyncMock(return_value=(fake_response, "nim", True))

        observe_patch = patch("src.agents.solver_agent.observe_agent_step")
        openclaw_patch = patch("src.services.openclaw.get_openclaw")
        inventory_patch = patch(
            "src.agents.solver_agent._fetch_k8s_inventory",
            return_value="awoooi-api",
        )
        with observe_patch as mock_observe, openclaw_patch as mock_get_openclaw, inventory_patch:
            mock_get_openclaw.return_value = openclaw_stub
            await solver_mod.SolverAgent()._solve(fake_diag)

        mock_observe.assert_called_once()
        call_args = mock_observe.call_args[0]
        assert call_args[0] == "solver"
        assert call_args[1] == "success"

    @pytest.mark.asyncio
    async def test_observe_called_on_critic_timeout(self):
        """The Critic timeout path reports observe_agent_step("critic", "timeout", ...)."""
        import src.agents.critic_agent as critic_mod
        from src.agents.protocol import (
            ActionPlan, AgentVote, CandidateAction,
            DiagnosisReport, Hypothesis,
        )

        hypothesis = Hypothesis(
            description="Memory Leak",
            confidence=0.75,
            evidence_chain=[],
            category="KubePodOOM",
        )
        fake_diag = DiagnosisReport(
            hypotheses=[hypothesis],
            evidence_snapshot_id="snap-critic-001",
            latency_ms=0,
            vote=AgentVote.APPROVE,
        )
        candidate = CandidateAction(
            action="kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
            blast_radius=10,
            rollback_cost=5,
            confidence=0.8,
            rationale="重啟",
        )
        fake_plan = ActionPlan(
            candidates=[candidate],
            diagnosis_report=fake_diag,
            latency_ms=0,
            vote=AgentVote.APPROVE,
        )

        observe_patch = patch("src.agents.critic_agent.observe_agent_step")
        wait_for_patch = patch(
            "src.agents.critic_agent.asyncio.wait_for",
            side_effect=asyncio.TimeoutError(),
        )
        with observe_patch as mock_observe, wait_for_patch:
            result = await critic_mod.CriticAgent()._critique(fake_diag, fake_plan)

        mock_observe.assert_called_once()
        call_args = mock_observe.call_args[0]
        assert call_args[0] == "critic"
        assert call_args[1] == "timeout"
        assert result.degraded is True


# =============================================================================
# Section 4: Histogram buckets 驗證
# =============================================================================

class TestHistogramBuckets:
    """Bucket layout of aiops_agent_step_duration_seconds.

    The buckets are meant to cover the measured NIM latency distribution; these
    tests pin the edges that coincide with the per-agent timeouts.
    """

    def test_expected_buckets(self):
        """The 15s / 20s / 30s timeout boundaries must each be a bucket edge."""
        from src.observability.agent_step_metrics import _AGENT_STEP_BUCKETS

        assert 15.0 in _AGENT_STEP_BUCKETS, "15s bucket 必須存在（Critic timeout 邊界）"
        assert 20.0 in _AGENT_STEP_BUCKETS, "20s bucket 必須存在（Solver timeout 邊界）"
        assert 30.0 in _AGENT_STEP_BUCKETS, "30s bucket 必須存在（Diagnostician timeout 邊界）"

    def test_buckets_are_sorted_ascending(self):
        """Buckets must be in ascending order (required by prometheus_client)."""
        from src.observability.agent_step_metrics import _AGENT_STEP_BUCKETS

        # sorted() always returns a list, and a tuple never compares equal to a
        # list; normalise first so the check is about ordering, not container type.
        assert list(_AGENT_STEP_BUCKETS) == sorted(_AGENT_STEP_BUCKETS), (
            f"buckets 必須升序：{_AGENT_STEP_BUCKETS}"
        )