ewoooc/services/elephant_alpha_autonomous_engine.py

"""
elephant_alpha_autonomous_engine.py

AI 3.0 Autonomous Operations:
- Self-learning from outcomes
- Predictive decision making
- Autonomous resource optimization
- Continuous improvement loop

ADR-012 Compliance:
  §③ 單一 audit trail — 所有基行完畢後必發 triaged_alert Telegram
  §⑤ 雙寫強制 — ai_insights (由 orchestrator._log_decision) + Telegram
ADR-013 Compliance:
  resource_optimization trigger → deterministic resource-pressure telemetry + controlled throttling
"""

import asyncio
import inspect
import json
import logging
import os
import re
import sqlite3
import threading
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
from html import escape
from typing import Dict, List, Any, Optional

from sqlalchemy import text
from services.logger_manager import SystemLogger
from services.elephant_alpha_orchestrator import elephant_orchestrator, StrategicDecision
from database.manager import get_db_manager, get_session

logger = SystemLogger("ElephantAlphaEngine").get_logger()

# ---- Configuration ----
SSH_JUMP_HOST = os.getenv("ELEPHANT_ALPHA_JUMP_HOST", "192.168.0.110")
SSH_JUMP_USER = os.getenv("ELEPHANT_ALPHA_JUMP_USER", "wooo")
SSH_KEY_PATH = os.getenv("ELEPHANT_ALPHA_SSH_KEY_PATH", os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
SSH_PORT = int(os.getenv("ELEPHANT_ALPHA_SSH_PORT", "22"))
SSH_CONNECT_TIMEOUT = int(os.getenv("ELEPHANT_ALPHA_SSH_CONNECT_TIMEOUT", "10"))
SSH_COMMAND_TIMEOUT = int(os.getenv("ELEPHANT_ALPHA_SSH_COMMAND_TIMEOUT", "60"))

CACHE_DB_PATH = os.getenv("ELEPHANT_ALPHA_CACHE_DB", ":memory:")
ESCALATION_COOLDOWN_MIN = int(os.getenv("ELEPHANT_ALPHA_ESCALATION_COOLDOWN_MIN", "30"))
CONFIDENCE_THRESHOLD = float(os.getenv("ELEPHANT_ALPHA_CONFIDENCE_THRESHOLD", "0.7"))
MAX_AUTONOMOUS_DECISIONS_PER_HOUR = int(os.getenv("ELEPHANT_ALPHA_MAX_AUTONOMOUS_DECISIONS_PER_HOUR", "10"))
RESOURCE_QUEUE_THRESHOLD = int(os.getenv("ELEPHANT_ALPHA_RESOURCE_QUEUE_THRESHOLD", "10"))
RESOURCE_LOAD_THRESHOLD_PCT = float(os.getenv("ELEPHANT_ALPHA_RESOURCE_LOAD_THRESHOLD_PCT", "80"))
RESOURCE_HIGH_PRIORITY_THRESHOLD = int(os.getenv("ELEPHANT_ALPHA_RESOURCE_HIGH_PRIORITY_THRESHOLD", "5"))
RESOURCE_STALE_THRESHOLD = int(os.getenv("ELEPHANT_ALPHA_RESOURCE_STALE_THRESHOLD", "5"))
RESOURCE_STALE_HOURS = int(os.getenv("ELEPHANT_ALPHA_RESOURCE_STALE_HOURS", "24"))
RESOURCE_HYGIENE_ENABLED = os.getenv("ELEPHANT_ALPHA_RESOURCE_HYGIENE_ENABLED", "true").lower() in {"1", "true", "yes", "on"}
HERMES_LLM_PREFETCH_ENABLED = os.getenv("ELEPHANT_ALPHA_HERMES_LLM_PREFETCH_ENABLED", "false").lower() in {"1", "true", "yes", "on"}

# ---- Constants ----
_ALLOWED_ACTION_TYPES = frozenset({
    "DOCKER_RESTART",
    "WAIT_RETRY",
    "ALERT_ONLY",
    "SSH_CMD",
    "CODE_FIX",
})

_TRIGGER_TO_DECISION_TYPE = {}


class DecisionType(Enum):
    PRICE_OPTIMIZATION = "price_optimization"
    THREAT_RESPONSE = "threat_response"
    MARKET_OPPORTUNITY = "market_opportunity"
    RESOURCE_ALLOCATION = "resource_allocation"
    STRATEGIC_PLANNING = "strategic_planning"


_TRIGGER_TO_DECISION_TYPE = {
    "price_drop_alert": DecisionType.PRICE_OPTIMIZATION,
    "market_opportunity": DecisionType.MARKET_OPPORTUNITY,
    "threat_escalation": DecisionType.THREAT_RESPONSE,
    "resource_optimization": DecisionType.RESOURCE_ALLOCATION,
    "code_exception": DecisionType.RESOURCE_ALLOCATION,  # mapped for handling
}

_TRIGGER_ZH = {
    "price_drop_alert": "價格下滑警報",
    "market_opportunity": "市場機會偵測",
    "threat_escalation": "威脅升級通報",
    "resource_optimization": "資源調配優化",
    "code_exception": "程式碼異常偵測",
    "weekly_insight": "全景電商洞察分析",
}

# Agent 名稱保留英文，僅補上角色說明（禁止音譯）
_AGENT_LABEL = {
    "hermes":    "Hermes",
    "nemotron":  "NemoTron",
    "openclaw":  "OpenClaw",
    "elephant_alpha": "Elephant Alpha",
    "scheduler": "Scheduler",
}

_ACTION_ZH = {
    "analyze_price_competition":   "競品價格分析",
    "dispatch_alert":              "派送告警通知",
    "dispatch_price_updates":      "派送定價更新",
    "dispatch_price_update":       "派送定價更新",
    "generate_strategic_analysis": "產出策略分析",
    "generate_market_analysis":    "市場分析",
    "generate_pricing_strategy":   "定價策略建議",
    "execute_price_adjustment":    "價格調整覆核",
    "adjust_price":                "調整定價",
    "send_alert":                  "發送告警",
}

_PRICE_ADJUSTMENT_REVIEW_ACTIONS = frozenset({
    "execute_price_adjustment",
    "adjust_price",
    "apply_price_change",
    "update_price",
    "dispatch_price_update",
    "dispatch_price_updates",
})

# A' 軌：價格相關觸發類型，HITL 前需 pre-fetch Hermes 具體威脅清單
# 取代 Gemini plan 階段的元流程文字（「步驟 1:[OpenClaw] 生成策略」這類）
_PRICE_RELATED_TRIGGERS = frozenset({
    "price_drop_alert",
    "market_opportunity",
    "threat_escalation",
})

# 這些低信心觸發若沒有具體實證，不應升級打擾人工。
_NO_CONCRETE_ESCALATION_SUPPRESSED_TRIGGERS = frozenset({
    "resource_optimization",
})


def _zh_trigger(trigger_type: str) -> str:
    return _TRIGGER_ZH.get(trigger_type, trigger_type)


def _zh_step(step: dict) -> str:
    agent_key = step.get("agent", "?").lower()
    agent = _AGENT_LABEL.get(agent_key, step.get("agent", "?"))
    action = _ACTION_ZH.get(step.get("action", ""), step.get("action", ""))
    desc = step.get("description", "")
    # 優先用 description（Gemini 生成的繁中說明），其次用 action 中文對照
    detail = desc if desc else action
    return f"[{agent}] {detail}"


@dataclass
class DecisionOutcome:
    decision_id: str
    decision_type: DecisionType
    prediction: Dict[str, Any]
    actual_outcome: Dict[str, Any]
    accuracy_score: float
    business_impact: float
    timestamp: datetime
    lessons_learned: List[str]


class AutonomousTrigger:
    def __init__(
        self,
        trigger_type: str,
        conditions: Dict[str, Any],
        threshold: float,
        enabled: bool,
    ):
        self.trigger_type = trigger_type
        self.conditions = conditions
        self.threshold = threshold
        self.enabled = enabled
        self.last_triggered: Optional[datetime] = None
        self._temp_error_msg: Optional[str] = None
        self._temp_target_file: Optional[str] = None

    @property
    def temp_error_msg(self) -> Optional[str]:
        return self._temp_error_msg

    @temp_error_msg.setter
    def temp_error_msg(self, value: Optional[str]) -> None:
        self._temp_error_msg = value

    @property
    def temp_target_file(self) -> Optional[str]:
        return self._temp_target_file

    @temp_target_file.setter
    def temp_target_file(self, value: Optional[str]) -> None:
        self._temp_target_file = value


class ElephantAlphaAutonomousEngine:
    """
    Elephant Alpha Autonomous Decision Engine

    Features:
    - Persistent escalation dedup via SQLite
    - Secure SSH key handling with enforced file permissions
    - Timeouts and retries for external calls
    - Circuit-breaker for repeated failures
    """

    def __init__(self):
        self._log = logger
        self._init_cache_db()
        self._init_ssh_key_once()
        self.decision_history: List[DecisionOutcome] = []
        self.triggers: List[AutonomousTrigger] = []
        self.confidence_threshold = CONFIDENCE_THRESHOLD
        self.max_autonomous_decisions_per_hour = MAX_AUTONOMOUS_DECISIONS_PER_HOUR
        self._initialize_triggers()
        self._circuit_breaker_state = {"failures": 0, "last_failure": None}
        self._cb_threshold = 5
        self._cb_reset_after = timedelta(minutes=5)

    # ---- DB ----
    def _init_cache_db(self) -> None:
        self._db_lock = threading.Lock()
        self._conn = sqlite3.connect(CACHE_DB_PATH, check_same_thread=False)
        self._conn.execute("""
            CREATE TABLE IF NOT EXISTS escalation_dedup (
                trigger_type TEXT PRIMARY KEY,
                last_triggered INTEGER NOT NULL
            )
        """)
        self._conn.commit()

    def _store_escalation(self, trigger_type: str) -> None:
        with self._db_lock:
            self._conn.execute(
                "INSERT OR REPLACE INTO escalation_dedup (trigger_type, last_triggered) VALUES (?, ?)",
                (trigger_type, int(datetime.now().timestamp())),
            )
            self._conn.commit()

    def _load_escalation(self, trigger_type: str) -> Optional[int]:
        with self._db_lock:
            row = self._conn.execute(
                "SELECT last_triggered FROM escalation_dedup WHERE trigger_type = ?",
                (trigger_type,),
            ).fetchone()
        return row[0] if row else None

    # ---- SSH ----
    def _init_ssh_key_once(self) -> None:
        key_path = os.path.expanduser(SSH_KEY_PATH)
        if not os.path.exists(key_path):
            self._log.warning("SSH key not found at %s; some operations may fail.", key_path)
            return
        try:
            os.chmod(key_path, 0o600)
        except Exception as e:
            self._log.warning("Failed to secure SSH key permissions: %s", e)

    # ---- Triggers ----
    def _initialize_triggers(self) -> None:
        self.triggers = [
            AutonomousTrigger(
                trigger_type="price_drop_alert",
                conditions={"competitor_price_drop_pct": 15, "sales_velocity": "decreasing"},
                threshold=0.8,
                enabled=True,
            ),
            AutonomousTrigger(
                trigger_type="market_opportunity",
                conditions={"competitor_price_premium": ">5%", "our_stock": "available"},
                threshold=0.7,
                enabled=True,
            ),
            AutonomousTrigger(
                trigger_type="threat_escalation",
                conditions={"threat_score": 0.9, "trend": "worsening"},
                threshold=0.85,
                enabled=True,
            ),
            AutonomousTrigger(
                trigger_type="resource_optimization",
                conditions={
                    "queue_threshold": RESOURCE_QUEUE_THRESHOLD,
                    "load_threshold_pct": RESOURCE_LOAD_THRESHOLD_PCT,
                    "high_priority_threshold": RESOURCE_HIGH_PRIORITY_THRESHOLD,
                    "stale_threshold": RESOURCE_STALE_THRESHOLD,
                    "stale_hours": RESOURCE_STALE_HOURS,
                },
                threshold=0.6,
                enabled=True,
            ),
            AutonomousTrigger(
                trigger_type="code_exception",
                conditions={
                    "scan_containers": ["momo-pro-system", "momo-scheduler"],
                    "error_patterns": ["Traceback", "ImportError", "RuntimeError", "ModuleNotFoundError"],
                },
                threshold=1.0,
                enabled=True,
            ),
            AutonomousTrigger(
                trigger_type="weekly_insight",
                conditions={"min_new_insights": 5, "cooldown_hours": 6},
                threshold=0.7,
                # weekly_strategy 由 run_scheduler.py 週一 06:00 統一發送，EA 不再 6h 觸發（防 35+/週重複）
                enabled=False,
            ),
        ]

    # ---- Main loop ----
    async def start_autonomous_monitoring(self) -> None:
        self._log.info("Starting autonomous monitoring engine")
        while True:
            try:
                await self._check_triggers()
                await self._continuous_learning()
                await self._optimize_resources()
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                raise
            except Exception as e:
                self._log.exception("Autonomous monitoring error: %s", e)
                await asyncio.sleep(30)

    # ---- Trigger evaluation ----
    async def _check_triggers(self) -> None:
        if self._is_circuit_open():
            self._log.warning("Circuit breaker open; skipping trigger checks")
            return

        for trigger in self.triggers:
            if not trigger.enabled:
                continue
            cooldown_min = self._get_cooldown(trigger.trigger_type)
            last = trigger.last_triggered
            if last and (datetime.now() - last).total_seconds() / 60 < cooldown_min:
                continue
            if await self._evaluate_trigger(trigger):
                await self._execute_autonomous_decision(trigger)
                trigger.last_triggered = datetime.now()

    def _get_cooldown(self, trigger_type: str) -> int:
        return self._get_cooldown_min(trigger_type)

    def _get_cooldown_min(self, trigger_type: str) -> int:
        return ESCALATION_COOLDOWN_MIN

    async def _evaluate_trigger(self, trigger: AutonomousTrigger) -> bool:
        try:
            if trigger.trigger_type == "price_drop_alert":
                return await self._check_price_drop_trigger(trigger)
            if trigger.trigger_type == "market_opportunity":
                return await self._check_market_opportunity_trigger(trigger)
            if trigger.trigger_type == "threat_escalation":
                return await self._check_threat_escalation_trigger(trigger)
            if trigger.trigger_type == "resource_optimization":
                return await self._check_resource_optimization_trigger(trigger)
            if trigger.trigger_type == "code_exception":
                return await self._check_code_exception_trigger(trigger)
            if trigger.trigger_type == "weekly_insight":
                return await self._check_weekly_insight_trigger(trigger)
        except Exception as e:
            self._log.exception("Trigger evaluation error: %s", e)
        return False

    # ---- Individual trigger checkers ----
    async def _check_price_drop_trigger(self, trigger: AutonomousTrigger) -> bool:
        session = get_session()
        try:
            rows = session.execute(
                text("""
                    SELECT p.i_code AS sku, p.name, p.category,
                           cp.price AS competitor_price, pr.price AS momo_price,
                           ((pr.price - cp.price) / NULLIF(pr.price, 0) * 100) AS price_gap_pct,
                           cp.competitor_product_id,
                           cp.competitor_product_name,
                           cp.crawled_at
                    FROM products p
                    JOIN (
                        SELECT DISTINCT ON (product_id) product_id, price
                        FROM price_records
                        ORDER BY product_id, timestamp DESC
                    ) pr ON pr.product_id = p.id
                    JOIN competitor_prices cp ON cp.sku = p.i_code
                    WHERE cp.expires_at > NOW()
                      AND COALESCE(cp.match_score, 0) >= 0.76
                      AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                      AND cp.price < pr.price * 0.85
                      AND cp.crawled_at >= NOW() - INTERVAL '2 hours'
                    LIMIT 10
                """)
            ).mappings().fetchall()
            trigger.conditions = dict(trigger.conditions or {})
            trigger.conditions["_db_evidence_actions"] = self._format_competitor_evidence_actions(
                rows,
                top_n=5,
                trigger_type=trigger.trigger_type,
            )
            trigger.conditions["db_evidence_count"] = len(rows)
            return len(rows) >= 3
        finally:
            session.close()

    async def _check_market_opportunity_trigger(self, trigger: AutonomousTrigger) -> bool:
        session = get_session()
        try:
            rows = session.execute(
                text("""
                    SELECT p.i_code AS sku, p.name, p.category,
                           cp.price AS competitor_price, pr.price AS momo_price,
                           ((pr.price - cp.price) / NULLIF(pr.price, 0) * 100) AS price_gap_pct,
                           cp.competitor_product_id,
                           cp.competitor_product_name,
                           cp.crawled_at
                    FROM products p
                    JOIN (
                        SELECT DISTINCT ON (product_id) product_id, price
                        FROM price_records
                        ORDER BY product_id, timestamp DESC
                    ) pr ON pr.product_id = p.id
                    JOIN competitor_prices cp ON cp.sku = p.i_code
                    WHERE cp.expires_at > NOW()
                      AND COALESCE(cp.match_score, 0) >= 0.76
                      AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                      AND cp.price > pr.price * 1.05
                      AND cp.crawled_at >= NOW() - INTERVAL '1 hour'
                    LIMIT 5
                """)
            ).mappings().fetchall()
            trigger.conditions = dict(trigger.conditions or {})
            trigger.conditions["_db_evidence_actions"] = self._format_competitor_evidence_actions(
                rows,
                top_n=5,
                trigger_type=trigger.trigger_type,
            )
            trigger.conditions["db_evidence_count"] = len(rows)
            return bool(rows)
        finally:
            session.close()

    async def _check_threat_escalation_trigger(self, trigger: AutonomousTrigger) -> bool:
        session = get_session()
        try:
            rows = session.execute(
                text("""
                    SELECT product_sku AS sku, confidence, content, created_at
                    FROM ai_insights
                    WHERE insight_type = 'price_alert'
                      AND confidence >= 0.9
                      AND created_at >= NOW() - INTERVAL '30 minutes'
                      AND metadata_json LIKE '%worsening%'
                    LIMIT 5
                """)
            ).fetchall()
            return len(rows) >= 2
        finally:
            session.close()

    async def _check_resource_optimization_trigger(self, trigger: AutonomousTrigger) -> bool:
        metrics = self._collect_resource_pressure_metrics()
        trigger.conditions = dict(trigger.conditions or {})
        trigger.conditions["_resource_metrics"] = metrics
        trigger.conditions["resource_pressure_level"] = metrics.get("pressure_level", "normal")
        trigger.conditions["resource_evidence"] = metrics.get("evidence", [])
        return bool(metrics.get("should_alert"))

    async def _check_code_exception_trigger(self, trigger: AutonomousTrigger) -> bool:
        containers = trigger.conditions.get("scan_containers", ["momo-pro-system", "momo-scheduler"])
        error_ptns = trigger.conditions.get("error_patterns", ["Traceback", "ImportError", "RuntimeError", "ModuleNotFoundError"])

        key_path = os.path.expanduser(SSH_KEY_PATH)
        if not os.path.exists(key_path):
            self._log.warning("SSH key missing for code exception scan: %s", key_path)
            return False

        has_error = False
        error_context = []
        target_file = ""

        for c in containers:
            try:
                rc, out, err = self._ssh_exec(
                    "192.168.0.188",
                    "ollama",
                    ["docker", "logs", "--since", "5m", c]
                )
                if rc != 0:
                    self._log.debug("Failed to fetch logs for %s via SSH", c)
                    continue
                combined = out + "\n" + err
                if "Traceback (most recent call last):" in combined:
                    lines = combined.splitlines()
                    for i, line in enumerate(lines):
                        if "Traceback" in line:
                            block = lines[i:i + 15]
                            blk_str = "\n".join(block)
                            m = re.search(r'File "([^"]*/(services|routes|database)/[^"]+\.py)"', blk_str)
                            if m:
                                tf = m.group(1)
                                if "/app/" in tf:
                                    tf = tf.split("/app/")[1]
                                elif "momo-pro-system/" in tf:
                                    tf = tf.split("momo-pro-system/")[1]
                                target_file = tf
                            error_context.append(f"[{c}] {blk_str}")
                            has_error = True
                            break
            except Exception as e:
                self._log.debug("Error scanning container %s: %s", c, e)

        if has_error and error_context:
            trigger.temp_error_msg = "\n".join(error_context)
            trigger.temp_target_file = target_file
            return True
        return False

    async def _check_weekly_insight_trigger(self, trigger: AutonomousTrigger) -> bool:
        """
        每 6 小時累積 ≥ 5 筆新 ai_insights 時觸發 OpenClaw 全景分析。
        cooldown 機制防止重複觸發。
        """
        min_new = trigger.conditions.get("min_new_insights", 5)
        session = get_session()
        try:
            row = session.execute(text("""
                SELECT COUNT(*) FROM ai_insights
                WHERE created_at >= NOW() - INTERVAL '6 hours'
                  AND insight_type IN ('price_alert', 'recommendation', 'relearn_event')
                  AND created_by != 'openclaw'
            """)).fetchone()
            count = int(row[0]) if row else 0
            return count >= min_new
        except Exception as e:
            self._log.debug("weekly_insight trigger check failed: %s", e)
            return False
        finally:
            session.close()

    # ---- Decision execution ----
    async def _build_trigger_context(self, trigger: AutonomousTrigger) -> Dict[str, Any]:
        """Build the business context passed to Elephant Alpha."""
        context = {
            "trigger_type": trigger.trigger_type,
            "trigger_label": _zh_trigger(trigger.trigger_type),
            "conditions": trigger.conditions,
            "threshold": trigger.threshold,
            "enabled": trigger.enabled,
            "last_triggered": trigger.last_triggered.isoformat() if trigger.last_triggered else None,
            "generated_at": datetime.now().isoformat(),
            "system_state": {
                "action_queue_size": self._safe_metric(self._get_action_queue_size, default=0),
                "system_load_pct": self._safe_metric(self._get_system_load_percentage, default=0.0),
                "circuit_breaker_failures": self._circuit_breaker_state.get("failures", 0),
                "confidence_threshold": self.confidence_threshold,
                "max_autonomous_decisions_per_hour": self.max_autonomous_decisions_per_hour,
            },
            "objectives": [
                "維持 MOMO 商品監控與業績分析服務可用性",
                "降低重複告警與人工排查成本",
                "在低風險範圍內產生可稽核的自動化行動建議",
            ],
            "constraints": [
                "不得使用 docker compose down 或 --remove-orphans",
                "不得操作 momo-db 容器生命週期",
                "P1/P2 行動需保留 audit trail 與 Telegram 稽核通知",
            ],
        }

        if trigger.temp_error_msg:
            context["error_context"] = trigger.temp_error_msg[:4000]
        if trigger.temp_target_file:
            context["target_file"] = trigger.temp_target_file
        return context

    @staticmethod
    def _safe_metric(fn, default):
        try:
            return fn()
        except Exception:
            return default

    async def _execute_autonomous_decision(self, trigger: AutonomousTrigger) -> None:
        if trigger.trigger_type == "resource_optimization":
            await self._handle_resource_optimization_trigger(trigger)
            return

        # ─── Operation Ollama-First v5.0 修補（2026-05-03）───
        # 統帥反饋：每個 EA trigger 都先跑 Gemini orchestrate（燒錢）才 prefetch Hermes，
        # 結果 Hermes 0 threats 時送出空泛幻覺訊息 + Gemini 帳單照付。
        # 修法：價格類 trigger Hermes-first short-circuit
        #   1. 先跑 Hermes（5s timeout，免費 Ollama）
        #   2. 0 threats → 直接 return（**不燒 Gemini**）
        #   3. 有 threats → 才走原 Gemini orchestration
        # 預期：>70% 的 trigger 在 Hermes 階段就被攔截，月省 Gemini ~3-5M tokens
        if trigger.trigger_type in _PRICE_RELATED_TRIGGERS:
            try:
                hermes_threats = (
                    self._get_trigger_db_concrete_actions(trigger)
                    or await self._fetch_hermes_threats_summary(top_n=5)
                )
            except Exception as e:
                self._log.warning("Hermes pre-flight check 失敗 (non-blocking): %s", e)
                hermes_threats = None

            if not hermes_threats:
                # 0 threats 或 prefetch timeout → 直接 drop trigger，不燒 Gemini
                self._log.info(
                    "EA short-circuit: trigger=%s 無 Hermes 實證 threats，"
                    "跳過 Gemini orchestration 省成本",
                    trigger.trigger_type
                )
                # 紀錄 short-circuit 事件供 token report 統計（不影響主流程）
                try:
                    from services.ai_call_logger import log_ai_call
                    with log_ai_call(
                        caller='ea_engine',
                        provider='gcp_ollama',  # 只跑了 Hermes，沒燒 Gemini
                        model='hermes3:latest',
                        meta={'short_circuit': True, 'trigger': trigger.trigger_type,
                              'reason': 'no_hermes_threats'},
                    ) as ctx:
                        ctx.set_tokens(input=0, output=0)  # Hermes 已自己記
                        ctx.status = 'cache_only'  # 不算 ok 也不算 error
                except Exception:
                    self._log.warning(
                        "EA short-circuit telemetry failed; trigger=%s",
                        trigger.trigger_type,
                        exc_info=True,
                    )
                return

            # Hermes 有實證 threats → 把它存進 trigger.conditions 給 orchestrator 用
            # 避免 orchestrator 又自己編 SKU 數字
            trigger.conditions = trigger.conditions or {}
            trigger.conditions['_prefetched_hermes_threats'] = hermes_threats

        context = await self._build_trigger_context(trigger)
        try:
            decision = await self._run_with_timeout(
                elephant_orchestrator.analyze_and_coordinate,
                context,
                timeout=SSH_COMMAND_TIMEOUT,
            )
        except Exception as e:
            self._log.exception("Orchestrator analysis failed: %s", e)
            self._handle_failure(trigger, "analysis_failure")
            return

        if decision.confidence >= (0.85 if trigger.trigger_type in {"price_drop_alert", "market_opportunity"} else self.confidence_threshold):
            try:
                await self._run_with_timeout(
                    self._execute_decision,
                    decision,
                    timeout=SSH_COMMAND_TIMEOUT,
                )
                self._store_escalation(trigger.trigger_type)
                self._log.info("Autonomous decision executed: %s", trigger.trigger_type)
                await self._notify_telegram_executed(decision, trigger)
                self._circuit_reset()
            except Exception as e:
                self._log.exception("Decision execution failed: %s", e)
                self._handle_failure(trigger, "execution_failure")
        else:
            self._log.warning("Low confidence decision; escalating: %s", trigger.trigger_type)
            await self._escalate_to_human(decision, trigger)

    async def _execute_decision(self, decision: StrategicDecision) -> None:
        for step in decision.execution_plan:
            try:
                await self._execute_step(step)
                self._log.info("Step done: %s -> %s", step.get("agent"), step.get("action"))
            except Exception as e:
                self._log.error("Step failed: %s", e)
                raise

    async def _execute_step(self, step: Dict[str, Any]) -> None:
        agent_type = step.get("agent", "").lower()
        action = step.get("action", "")
        params = step.get("parameters") or step.get("params") or {}

        if agent_type == "hermes" and action == "analyze_price_competition":
            return await self._run_with_timeout(
                self._hermes_analyze,
                timeout=SSH_COMMAND_TIMEOUT,
            )

        if agent_type == "nemotron" and action == "dispatch_alert":
            raw = params.get("threats", [])
            threats = [self._parse_threat(t) for t in raw if isinstance(t, dict)]
            if threats:
                return await self._run_with_timeout(
                    self._dispatch_alerts,
                    threats,
                    timeout=SSH_COMMAND_TIMEOUT,
                )
            return

        # NOTE: openclaw weekly_strategy / meta_analysis / strategic_analysis /
        # market_analysis / pricing_strategy dispatch removed — weekly_strategy 由
        # run_scheduler.py 週一 06:00 統一發送（防 EA 6h 重複觸發 35+/週）。
        # 2026-05-05: orchestrator prompt once allowed openclaw generate_* actions.
        # Treat those legacy strategy steps as advisory no-op so they do not trip
        # the autonomous engine circuit breaker.
        if agent_type == "openclaw" and action in {
            "generate_strategic_analysis",
            "generate_market_analysis",
            "generate_pricing_strategy",
            "generate_resource_optimization_strategy",
            "weekly_strategy",
            "meta_analysis",
            "strategic_analysis",
            "market_analysis",
            "pricing_strategy",
        }:
            self._log.warning("OpenClaw advisory step skipped: action=%s", action)
            return

        if action in {"auto_heal", "resource_optimization", "optimize_resources", "handle_exception"}:
            return await self._run_with_timeout(
                self._run_auto_heal,
                "scheduler_task_failure",
                params,
                timeout=SSH_COMMAND_TIMEOUT,
            )

        if action in {"code_fix", "fix_code_exception", "handle_code_exception"}:
            return await self._run_with_timeout(
                self._run_auto_heal,
                "python_exception",
                params,
                timeout=SSH_COMMAND_TIMEOUT,
            )

        if action in _PRICE_ADJUSTMENT_REVIEW_ACTIONS:
            return await self._run_with_timeout(
                self._record_price_adjustment_review,
                step,
                timeout=SSH_COMMAND_TIMEOUT,
            )

        raise ValueError(f"Unrecognized step: agent={agent_type} action={action}")

    # ---- Deterministic resource pressure handling ----
    def _collect_resource_pressure_metrics(self) -> Dict[str, Any]:
        system_load_pct = float(self._safe_metric(self._get_system_load_percentage, default=0.0) or 0.0)
        rows: List[Any] = []
        query_error = None

        try:
            session = get_session()
            try:
                rows = session.execute(text("""
                    SELECT status, priority, created_at, action_type, created_by, plan_type
                    FROM action_plans
                    WHERE status IN ('pending', 'auto_pending', 'pending_review')
                """)).fetchall()
            finally:
                session.close()
        except Exception as exc:
            query_error = f"{type(exc).__name__}: {str(exc)[:160]}"
            rows = []

        now = datetime.now()
        status_breakdown: Dict[str, int] = {}
        type_breakdown: Dict[str, int] = {}
        stale_count = 0
        high_priority_count = 0
        human_review_count = 0
        oldest_pending_age_hours = 0.0

        for row in rows:
            data = row._mapping if hasattr(row, "_mapping") else row
            status = self._row_get(data, "status")
            priority = self._row_get(data, "priority")
            action_type = (
                self._row_get(data, "action_type")
                or self._row_get(data, "plan_type")
                or self._row_get(data, "created_by")
                or "unknown"
            )
            created_at = self._coerce_datetime(self._row_get(data, "created_at"))

            status_breakdown[str(status or "unknown")] = status_breakdown.get(str(status or "unknown"), 0) + 1
            type_breakdown[str(action_type)] = type_breakdown.get(str(action_type), 0) + 1

            if str(status) == "pending_review":
                human_review_count += 1
            try:
                if priority is not None and int(priority) <= 2:
                    high_priority_count += 1
            except (TypeError, ValueError):
                pass
            if created_at:
                age_hours = max(0.0, (now - created_at).total_seconds() / 3600.0)
                oldest_pending_age_hours = max(oldest_pending_age_hours, age_hours)
                if age_hours >= RESOURCE_STALE_HOURS:
                    stale_count += 1

        queue_size = len(rows)
        if query_error:
            queue_size = int(self._safe_metric(self._get_action_queue_size, default=0) or 0)

        metrics = {
            "action_queue_size": queue_size,
            "high_priority_count": high_priority_count,
            "human_review_count": human_review_count,
            "stale_count": stale_count,
            "oldest_pending_age_hours": round(oldest_pending_age_hours, 1),
            "system_load_pct": round(system_load_pct, 1),
            "queue_threshold": RESOURCE_QUEUE_THRESHOLD,
            "load_threshold_pct": RESOURCE_LOAD_THRESHOLD_PCT,
            "high_priority_threshold": RESOURCE_HIGH_PRIORITY_THRESHOLD,
            "stale_threshold": RESOURCE_STALE_THRESHOLD,
            "stale_hours": RESOURCE_STALE_HOURS,
            "status_breakdown": status_breakdown,
            "type_breakdown": dict(sorted(type_breakdown.items(), key=lambda item: item[1], reverse=True)[:6]),
        }
        if query_error:
            metrics["query_error"] = query_error
        return self._classify_resource_pressure(metrics)

    @staticmethod
    def _row_get(row: Any, key: str) -> Any:
        if isinstance(row, dict):
            return row.get(key)
        try:
            return row[key]
        except Exception:
            return getattr(row, key, None)

    @staticmethod
    def _coerce_datetime(value: Any) -> Optional[datetime]:
        if isinstance(value, datetime):
            return value.replace(tzinfo=None)
        if isinstance(value, str) and value.strip():
            try:
                return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None)
            except ValueError:
                return None
        return None

    @staticmethod
    def _classify_resource_pressure(metrics: Dict[str, Any]) -> Dict[str, Any]:
        classified = dict(metrics or {})
        queue_size = int(classified.get("action_queue_size") or 0)
        high_priority_count = int(classified.get("high_priority_count") or 0)
        human_review_count = int(classified.get("human_review_count") or 0)
        stale_count = int(classified.get("stale_count") or 0)
        system_load_pct = float(classified.get("system_load_pct") or 0.0)
        queue_threshold = int(classified.get("queue_threshold") or RESOURCE_QUEUE_THRESHOLD)
        load_threshold_pct = float(classified.get("load_threshold_pct") or RESOURCE_LOAD_THRESHOLD_PCT)
        high_priority_threshold = int(classified.get("high_priority_threshold") or RESOURCE_HIGH_PRIORITY_THRESHOLD)
        stale_threshold = int(classified.get("stale_threshold") or RESOURCE_STALE_THRESHOLD)

        queue_pressure = queue_size > queue_threshold
        load_pressure = system_load_pct >= load_threshold_pct
        high_priority_pressure = high_priority_count >= high_priority_threshold
        stale_pressure = stale_count >= stale_threshold

        if load_pressure and (queue_pressure or high_priority_count > 0 or stale_count > 0):
            pressure_level = "critical"
        elif load_pressure or high_priority_pressure or stale_pressure:
            pressure_level = "warning"
        elif queue_pressure:
            pressure_level = "backlog_only"
        else:
            pressure_level = "normal"

        evidence = [
            f"action_queue={queue_size}/{queue_threshold}",
            f"system_load={system_load_pct:.1f}%/{load_threshold_pct:.0f}%",
            f"high_priority={high_priority_count}/{high_priority_threshold}",
            f"stale={stale_count}/{stale_threshold}",
        ]
        if human_review_count:
            evidence.append(f"pending_review={human_review_count}")

        classified.update({
            "pressure_level": pressure_level,
            "should_alert": pressure_level in {"critical", "warning"},
            "queue_pressure": queue_pressure,
            "load_pressure": load_pressure,
            "high_priority_pressure": high_priority_pressure,
            "stale_pressure": stale_pressure,
            "evidence": evidence,
            "confidence": 0.95 if pressure_level == "critical" else 0.86 if pressure_level == "warning" else 0.68,
        })
        return classified

    async def _handle_resource_optimization_trigger(self, trigger: AutonomousTrigger) -> None:
        metrics = (trigger.conditions or {}).get("_resource_metrics")
        if not isinstance(metrics, dict):
            metrics = self._collect_resource_pressure_metrics()
        else:
            metrics = self._classify_resource_pressure(metrics)

        pre_hygiene_metrics = dict(metrics)
        hygiene_result = None
        if metrics.get("should_alert") and RESOURCE_HYGIENE_ENABLED:
            hygiene_result = self._run_action_plan_hygiene()
            if hygiene_result and int(hygiene_result.get("updated_count") or 0) > 0:
                metrics = self._collect_resource_pressure_metrics()
                metrics["pre_hygiene"] = pre_hygiene_metrics
                metrics["hygiene_result"] = hygiene_result

        trigger.conditions = dict(trigger.conditions or {})
        trigger.conditions["_resource_metrics"] = metrics
        trigger.conditions["resource_pressure_level"] = metrics.get("pressure_level")
        trigger.conditions["resource_evidence"] = metrics.get("evidence", [])

        if not metrics.get("should_alert") and not hygiene_result:
            self._store_escalation(trigger.trigger_type)
            self._record_resource_optimization_suppressed(trigger, metrics, "below_actionable_threshold")
            return

        previous_limit = self.max_autonomous_decisions_per_hour
        new_limit = previous_limit
        if metrics.get("load_pressure") or metrics.get("pressure_level") == "critical":
            new_limit = min(previous_limit, 5)
        elif metrics.get("high_priority_pressure") or metrics.get("stale_pressure"):
            new_limit = min(previous_limit, 8)
        self.max_autonomous_decisions_per_hour = new_limit

        insight_id = None
        try:
            insight_id = self._record_resource_pressure_insight(metrics, previous_limit, new_limit)
        except Exception as exc:
            self._log.error("Resource pressure insight write failed (non-blocking): %s", exc)
        await self._send_resource_pressure_telegram(metrics, insight_id, previous_limit, new_limit)
        self._store_escalation(trigger.trigger_type)
        self._circuit_reset()

    def _run_action_plan_hygiene(self) -> Optional[Dict[str, Any]]:
        try:
            from services.action_plan_hygiene import run_action_plan_hygiene

            return run_action_plan_hygiene()
        except Exception as exc:
            self._log.error("Action plan hygiene failed (non-blocking): %s", exc)
            return {"updated_count": 0, "error": f"{type(exc).__name__}: {str(exc)[:200]}"}

    def _record_resource_pressure_insight(
        self,
        metrics: Dict[str, Any],
        previous_limit: int,
        new_limit: int,
    ) -> Optional[int]:
        content = self._format_resource_pressure_content(metrics, previous_limit, new_limit)
        session = get_session()
        try:
            row = session.execute(
                text("""
                    INSERT INTO ai_insights
                        (insight_type, content, confidence, created_by, status, metadata_json)
                    VALUES (:type, :content, :confidence, :created_by, :status, :metadata)
                    RETURNING id
                """),
                {
                    "type": "resource_pressure",
                    "content": content,
                    "confidence": float(metrics.get("confidence") or 0.86),
                    "created_by": "elephant_alpha",
                    "status": "pending",
                    "metadata": json.dumps({
                        "source": "elephant_alpha_resource_pressure",
                        "metrics": metrics,
                        "previous_limit": previous_limit,
                        "new_limit": new_limit,
                    }, ensure_ascii=False),
                },
            ).fetchone()
            session.commit()
            insight_id = row[0] if row else None
            if insight_id:
                try:
                    from services.openclaw_learning_service import enqueue_insight_embedding
                    enqueue_insight_embedding(insight_id, "resource_pressure", content)
                except Exception as embed_err:
                    self._log.warning("Embedding enqueue failed for resource_pressure: %s", embed_err)
            return insight_id
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def _record_resource_optimization_suppressed(
        self,
        trigger: AutonomousTrigger,
        metrics: Dict[str, Any],
        reason: str,
    ) -> None:
        self._log.info(
            "EA resource optimization suppressed: reason=%s level=%s metrics=%s",
            reason,
            metrics.get("pressure_level"),
            metrics,
        )
        try:
            from services.ai_call_logger import log_ai_call

            with log_ai_call(
                caller="ea_engine",
                provider="gcp_ollama",
                model="hermes3:latest",
                meta={
                    "suppressed_resource_optimization": True,
                    "trigger": trigger.trigger_type,
                    "reason": reason,
                    "metrics": metrics,
                },
            ) as ctx:
                ctx.set_tokens(input=0, output=0)
                ctx.status = "cache_only"
        except Exception:
            self._log.warning(
                "EA resource suppression telemetry failed; trigger=%s",
                trigger.trigger_type,
                exc_info=True,
            )

    @staticmethod
    def _format_resource_pressure_content(
        metrics: Dict[str, Any],
        previous_limit: int,
        new_limit: int,
    ) -> str:
        level = metrics.get("pressure_level", "unknown")
        load_text = (
            "主機負載達告警門檻"
            if metrics.get("load_pressure")
            else "主機負載未達告警門檻，壓力主要來自工作隊列"
        )
        return (
            f"[Elephant Alpha 資源壓力] level={level}；"
            f"action_queue={metrics.get('action_queue_size', 0)}，"
            f"P1/P2={metrics.get('high_priority_count', 0)}，"
            f"pending_review={metrics.get('human_review_count', 0)}，"
            f"stale={metrics.get('stale_count', 0)}，"
            f"system_load={float(metrics.get('system_load_pct') or 0):.1f}%。"
            f"{load_text}；"
            f"auto_closed={int((metrics.get('hygiene_result') or {}).get('updated_count') or 0)}；"
            f"autonomous_limit={previous_limit}->{new_limit}。"
        )

    @staticmethod
    def _build_resource_pressure_actions(metrics: Dict[str, Any]) -> List[str]:
        actions = []
        if metrics.get("high_priority_pressure"):
            actions.append("先處理 priority <= 2 的 action_plans，避免高風險項目被一般建議淹沒。")
        if metrics.get("stale_pressure"):
            actions.append(
                f"清理或關閉超過 {metrics.get('stale_hours', RESOURCE_STALE_HOURS)} 小時仍未處理的 pending / pending_review 項目。"
            )
        if metrics.get("load_pressure"):
            actions.append("暫停非必要背景任務，優先保留匯入、Dashboard、Telegram 與 AutoHeal 路徑。")
        if not actions:
            actions.append("僅保留觀測，不派發 Hermes/NemoTron/價格分析，避免把一般 backlog 誤報成資源事件。")
        actions.append("確認 action_plans 來源是否持續產生重複建議；若是報表型建議，應改為摘要消化而非逐筆告警。")
        return actions

    @staticmethod
    def _build_resource_pressure_telegram_message(
        metrics: Dict[str, Any],
        insight_id: Optional[int],
        previous_limit: int,
        new_limit: int,
    ) -> str:
        level = str(metrics.get("pressure_level", "unknown"))
        level_label = {
            "critical": "P1 critical",
            "warning": "P2 warning",
            "backlog_only": "P3 backlog",
            "normal": "P4 normal",
        }.get(level, level)
        load_pct = float(metrics.get("system_load_pct") or 0.0)
        pre_hygiene = metrics.get("pre_hygiene") if isinstance(metrics.get("pre_hygiene"), dict) else None
        hygiene = metrics.get("hygiene_result") if isinstance(metrics.get("hygiene_result"), dict) else None
        hygiene_count = int((hygiene or {}).get("updated_count") or 0)
        if hygiene_count > 0 and not metrics.get("should_alert"):
            level_label = "P4 resolved"
        load_judgement = (
            "主機 CPU 已達高負載門檻。"
            if metrics.get("load_pressure")
            else "主機 CPU 未達高負載門檻，這不是主機資源耗盡，而是工作隊列/人工審核積壓。"
        )
        executed = [
            f"已寫入 ai_insights(resource_pressure) #{insight_id}。"
            if insight_id
            else "ai_insights(resource_pressure) 寫入未取得 id；請查看 scheduler log。"
        ]
        if new_limit != previous_limit:
            executed.append(f"已將 ElephantAlpha 自主決策上限由 {previous_limit} 調整為 {new_limit} 次/小時。")
        if hygiene_count > 0:
            by_source = hygiene.get("by_source") or {}
            source_text = "、".join(f"{key} {value}" for key, value in by_source.items()) or f"{hygiene_count} 筆"
            executed.insert(
                1,
                f"已自動關閉過期 action_plans {hygiene_count} 筆（{source_text}）；只改 status/metadata，不刪除資料。",
            )
        executed.append("未執行外部修復、未啟動 Hermes/NemoTron 價格分析、未宣稱效益預測。")

        lines = [
            "<b>Elephant Alpha · 資源壓力告警</b>",
            f"事件：<code>resource_optimization</code>",
            f"等級：<b>{escape(level_label)}</b>",
            f"時間：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "<b>量測指標</b>",
        ]
        if pre_hygiene:
            lines.append(
                "• 清理前 Action queue："
                f"{int(pre_hygiene.get('action_queue_size') or 0)}，"
                f"P1/P2：{int(pre_hygiene.get('high_priority_count') or 0)}，"
                f"逾時：{int(pre_hygiene.get('stale_count') or 0)}"
            )
            lines.append("<b>清理後</b>")
        lines += [
            f"• Action queue：{int(metrics.get('action_queue_size') or 0)} / {int(metrics.get('queue_threshold') or RESOURCE_QUEUE_THRESHOLD)}",
            f"• P1/P2 待處理：{int(metrics.get('high_priority_count') or 0)} / {int(metrics.get('high_priority_threshold') or RESOURCE_HIGH_PRIORITY_THRESHOLD)}",
            f"• Pending review：{int(metrics.get('human_review_count') or 0)}",
            f"• 逾時未處理：{int(metrics.get('stale_count') or 0)} / {int(metrics.get('stale_threshold') or RESOURCE_STALE_THRESHOLD)}",
            f"• CPU load：{load_pct:.1f}% / {float(metrics.get('load_threshold_pct') or RESOURCE_LOAD_THRESHOLD_PCT):.0f}%",
            "",
            "<b>判讀</b>",
            f"• {escape(load_judgement)}",
            "• 這則告警只採用 action_plans 與 CPU 實測值，不採用 LLM 生成的 48 小時效益預測。",
            "",
            "<b>已執行</b>",
            *[f"• {escape(item)}" for item in executed],
            "",
            "<b>建議下一步</b>",
            *[f"• {escape(item)}" for item in ElephantAlphaAutonomousEngine._build_resource_pressure_actions(metrics)],
        ]
        return "\n".join(lines)

    async def _send_resource_pressure_telegram(
        self,
        metrics: Dict[str, Any],
        insight_id: Optional[int],
        previous_limit: int,
        new_limit: int,
    ) -> None:
        try:
            from services.telegram_templates import _send_telegram_raw

            msg = self._build_resource_pressure_telegram_message(
                metrics,
                insight_id,
                previous_limit,
                new_limit,
            )
            await self._run_with_timeout(_send_telegram_raw, msg, timeout=10)
            self._log.info("Resource pressure Telegram sent: level=%s", metrics.get("pressure_level"))
        except Exception as e:
            self._log.error("Resource pressure Telegram failed (non-blocking): %s", e)

    # ---- Sub-services ----
    @classmethod
    def _format_competitor_evidence_actions(
        cls,
        rows: Any,
        *,
        top_n: int = 5,
        trigger_type: str = "",
    ) -> List[str]:
        actions: List[str] = []
        for row in list(rows or [])[:top_n]:
            sku = str(cls._row_get(row, "sku") or "").strip()
            name = str(cls._row_get(row, "name") or "")[:24]
            momo = cls._to_float(cls._row_get(row, "momo_price")) or 0.0
            pchome = cls._to_float(cls._row_get(row, "competitor_price")) or 0.0
            if not sku or momo <= 0 or pchome <= 0:
                continue
            gap_pct = cls._to_float(cls._row_get(row, "price_gap_pct"))
            if gap_pct is None:
                gap_pct = (momo - pchome) / momo * 100 if momo else 0.0
            gap_amount = abs(momo - pchome)
            if gap_amount <= 0:
                continue

            competitor_id = str(cls._row_get(row, "competitor_product_id") or "").strip()
            comparison = f"MOMO ${momo:,.0f} vs PChome ${pchome:,.0f} ({gap_pct:+.1f}%)"
            if trigger_type == "market_opportunity" or pchome > momo:
                action = "建議加強曝光或列入 AI 挑品，不需降價"
                impact = f"MOMO 每件價格優勢 NT$ {gap_amount:,.0f}"
            else:
                action = "建議人工確認 PChome identity_v2 後評估跟價或促銷"
                impact = f"每件價差 NT$ {gap_amount:,.0f}"

            parts = [f"[{sku}] {name}", comparison, impact, action]
            if competitor_id:
                parts.append(f"PChome {competitor_id}")
            actions.append("｜".join(parts))
        return actions

    @staticmethod
    def _to_float(value: Any) -> Optional[float]:
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _fetch_recent_competitor_evidence_actions(self, top_n: int = 5) -> Optional[List[str]]:
        """用最新 DB 價差產生 EA HITL 實證，不啟動完整 Hermes LLM。"""
        session = get_session()
        try:
            rows = session.execute(
                text("""
                    WITH latest_momo AS (
                        SELECT DISTINCT ON (p.i_code)
                               p.i_code AS sku,
                               p.name,
                               p.category,
                               pr.price AS momo_price,
                               pr.timestamp
                        FROM products p
                        JOIN price_records pr ON pr.product_id = p.id
                        WHERE p.status = 'ACTIVE'
                          AND pr.price IS NOT NULL
                          AND pr.price > 0
                        ORDER BY p.i_code, pr.timestamp DESC, pr.id DESC
                    ),
                    latest_competitor AS (
                        SELECT DISTINCT ON (cp.sku)
                               cp.sku,
                               cp.price AS competitor_price,
                               cp.competitor_product_id,
                               cp.competitor_product_name,
                               cp.crawled_at
                        FROM competitor_prices cp
                        WHERE cp.source = 'pchome'
                          AND (cp.expires_at IS NULL OR cp.expires_at > NOW())
                          AND cp.price IS NOT NULL
                          AND cp.price > 0
                          AND COALESCE(cp.match_score, 0) >= 0.76
                          AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                          AND cp.crawled_at >= NOW() - INTERVAL '2 hours'
                        ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
                    )
                    SELECT lm.sku, lm.name, lm.category,
                           lm.momo_price,
                           lc.competitor_price,
                           ((lm.momo_price - lc.competitor_price) / NULLIF(lm.momo_price, 0) * 100) AS price_gap_pct,
                           lc.competitor_product_id,
                           lc.competitor_product_name,
                           lc.crawled_at
                    FROM latest_momo lm
                    JOIN latest_competitor lc ON lc.sku = lm.sku
                    WHERE lc.competitor_price < lm.momo_price * 0.85
                       OR lc.competitor_price > lm.momo_price * 1.05
                    ORDER BY ABS((lm.momo_price - lc.competitor_price) / NULLIF(lm.momo_price, 0)) DESC NULLS LAST,
                             lc.crawled_at DESC NULLS LAST
                    LIMIT :limit
                """),
                {"limit": int(top_n or 5)},
            ).mappings().fetchall()
            actions = self._format_competitor_evidence_actions(rows, top_n=top_n)
            return actions or None
        except Exception as exc:
            self._log.warning("EA DB evidence prefetch failed (non-blocking): %s", exc)
            return None
        finally:
            session.close()

    async def _hermes_analyze(self) -> Any:
        from services.hermes_analyst_service import HermesAnalystService
        return await self._run_with_timeout(
            HermesAnalystService(engine=get_db_manager().engine).run,
            timeout=SSH_COMMAND_TIMEOUT,
        )

    async def _fetch_hermes_threats_summary(self, top_n: int = 5) -> Optional[List[str]]:
        """A' 軌：HITL escalation 前 pre-fetch Hermes 具體威脅清單，
        將「步驟 1: [OpenClaw] 生成策略」這類元流程文字換成
        「[SKU] 商品｜MOMO $X / PChome $Y｜流失 NT$ Z｜建議 NT$ W」具體可決策行動。

        失敗回 None，由呼叫端 fallback 至既有 execution_plan 文字。
        本方法為 best-effort：任何例外都不阻斷 escalation 主流程。

        Critic High-1 fix: 加 5 秒短超時防止阻塞 escalation cooldown 視窗
          （Hermes 完整 run 可能 30-60s，HITL 訊息應快速送出）
        Critic High-2 fix: 若每筆都缺 loss/rec_price，視同無料、return None 觸發 fallback
        """
        db_actions = self._fetch_recent_competitor_evidence_actions(top_n=top_n)
        if db_actions:
            self._log.info(
                "EA prefetch DB evidence produced %d concrete actions",
                len(db_actions),
            )
            return db_actions

        if not HERMES_LLM_PREFETCH_ENABLED:
            self._log.info(
                "EA Hermes LLM prefetch disabled; no recent DB evidence actions"
            )
            return None

        # 使用 5s 短超時：Hermes 熱駐留時實測 < 10s，但若需冷啟動會拖到 30s+
        # HITL 訊息延遲不可大於 10s（影響統帥決策時效性），寧可 fallback 到原 plan 文字
        try:
            result = await asyncio.wait_for(self._hermes_analyze(), timeout=5)
        except asyncio.TimeoutError:
            self._log.warning("Pre-fetch Hermes 5s timeout; falling back to plan text")
            return None
        except Exception as e:
            self._log.warning("Pre-fetch Hermes threats failed (non-blocking): %s", e)
            return None

        threats = getattr(result, "threats", None) or []
        if not threats:
            self._log.info("Pre-fetch Hermes returned 0 threats; falling back to plan text")
            return None

        # 模組頂部 import 較乾淨，但這裡保留 lazy import 避免兩服務循環依賴
        # （nemoton 也需 hermes_analyst 的 PriceThreat dataclass）
        try:
            from services.nemoton_dispatcher_service import _compute_business_impact
        except Exception as imp_err:
            self._log.error("import _compute_business_impact failed: %s", imp_err)
            _compute_business_impact = None

        lines: List[str] = []
        any_concrete = False  # Critic High-2: 至少一筆有金額才算具體
        for t in threats[:top_n]:
            sku = getattr(t, "sku", "?")
            name = getattr(t, "name", "")[:24]
            momo = float(getattr(t, "momo_price", 0) or 0)
            pchome = float(getattr(t, "pchome_price", 0) or 0)
            gap_pct = float(getattr(t, "gap_pct", 0) or 0)

            impact = _compute_business_impact(t) if _compute_business_impact else {
                "revenue_loss_7d": 0.0, "recommended_price": None,
            }
            loss = impact.get("revenue_loss_7d", 0.0) or 0.0
            rec_price = impact.get("recommended_price")

            parts = [
                f"[{sku}] {name}",
                f"MOMO ${momo:,.0f} vs PChome ${pchome:,.0f} ({gap_pct:+.1f}%)",
            ]
            if loss > 0:
                parts.append(f"近 7 日流失 NT$ {loss:,.0f}")
                any_concrete = True
            if rec_price is not None and rec_price > 0:
                parts.append(f"建議跟進 NT$ {rec_price:,.0f}")
                any_concrete = True
            lines.append("｜".join(parts))

        if not any_concrete:
            # Critic High-2: 全部都只有「MOMO $X vs PChome $Y」乾巴巴兩行，
            # 比原本「步驟 1:OpenClaw 生成策略」更空泛。返回 None 觸發 plan fallback
            self._log.info("Pre-fetch threats lacked impact figures on all rows; falling back")
            return None

        self._log.info("Pre-fetch Hermes threats produced %d concrete actions", len(lines))
        return lines

    async def _dispatch_alerts(self, threats: List[Any]) -> Any:
        from services.nemoton_dispatcher_service import NemotronDispatcher
        return await self._run_with_timeout(
            NemotronDispatcher().dispatch,
            threats,
            timeout=SSH_COMMAND_TIMEOUT,
        )

    async def _generate_strategy_report(self) -> Any:
        # 深層保險：即便未來新增 caller，此 method 仍立即崩，避免 EA 再度繞過 dedupe
        # 重複發送 weekly_strategy。weekly_strategy 路徑唯一擁有者：run_scheduler.py 週一 06:00。
        raise RuntimeError(
            "EA autonomous engine no longer generates weekly/meta reports — "
            "owned by run_scheduler.py"
        )

    async def _generate_meta_report(self) -> Any:
        # 同 _generate_strategy_report：meta_analysis 不再由 EA 觸發。
        raise RuntimeError(
            "EA autonomous engine no longer generates weekly/meta reports — "
            "owned by run_scheduler.py"
        )

    def _run_auto_heal(self, error_type: str, context: Dict[str, Any]) -> Any:
        from services.auto_heal_service import auto_heal_service
        payload = dict(context or {})
        payload.setdefault("source", "ElephantAlphaAutonomousEngine")
        payload.setdefault("error_type", error_type)
        return auto_heal_service.handle_exception(error_type=error_type, context=payload)

    def _record_price_adjustment_review(self, step: Dict[str, Any]) -> Dict[str, Any]:
        """
        Price changes are business-critical. Elephant Alpha may recommend them,
        but this system records the proposal for HITL review instead of applying it.
        """
        params = step.get("parameters") or step.get("params") or {}
        sku = (
            params.get("sku")
            or params.get("product_sku")
            or params.get("i_code")
            or params.get("item_id")
            or "unknown"
        )
        action = step.get("action", "price_adjustment")
        content = (
            f"[Elephant Alpha 價格調整覆核] AI 建議執行 {action}，"
            f"商品 {sku} 已攔截直接執行並轉入人工審核。"
        )

        session = get_session()
        try:
            row = session.execute(
                text("""
                    INSERT INTO ai_insights
                        (insight_type, content, confidence, created_by, status, metadata_json)
                    VALUES (:type, :content, :confidence, :created_by, :status, :metadata)
                    RETURNING id
                """),
                {
                    "type": "human_review",
                    "content": content,
                    "confidence": 0.8,
                    "created_by": "elephant_alpha",
                    "status": "pending",
                    "metadata": json.dumps({
                        "source": "price_adjustment_review",
                        "step": step,
                        "sku": sku,
                        "reason": "price_adjustment_requires_human_approval",
                    }, ensure_ascii=False),
                },
            ).fetchone()
            session.commit()
            insight_id = row[0] if row else None
            if insight_id:
                try:
                    from services.openclaw_learning_service import enqueue_insight_embedding
                    enqueue_insight_embedding(insight_id, "human_review", content)
                except Exception as embed_err:
                    self._log.warning("Embedding enqueue failed for price adjustment review: %s", embed_err)
            self._log.warning("Price adjustment intercepted for HITL review: action=%s sku=%s", action, sku)
            return {"status": "pending_review", "insight_id": insight_id, "sku": sku, "action": action}
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    # ---- Notification ----
    async def _notify_telegram_executed(
        self,
        decision: StrategicDecision,
        trigger: AutonomousTrigger,
    ) -> None:
        try:
            from services.telegram_templates import _send_telegram_raw

            trigger_zh = _zh_trigger(trigger.trigger_type)
            steps = [_zh_step(s) for s in decision.execution_plan[:5]] or ["（無執行步驟）"]
            steps_text = "\n".join(f"  • {s}" for s in steps)

            # reasoning 必須含數據；若只是空泛摘要則標記為「待補充」
            reasoning = (decision.reasoning or "").strip()
            if len(reasoning) < 30:
                reasoning = "（AI 推理未提供足夠細節）"

            msg = (
                f"<b>⚡ 🐘 Elephant Alpha · 自主執行 · {trigger.trigger_type}</b>\n"
                f"📌 <b>{trigger_zh}</b>\n\n"
                f"🔍 <b>預期效益：</b>{(decision.expected_outcome or '').strip()}\n\n"
                f"🧠 <b>決策依據：</b>{reasoning[:400]}\n\n"
                f"✅ <b>已執行（信心 {decision.confidence:.0%}）：</b>\n{steps_text}"
            )
            await self._run_with_timeout(_send_telegram_raw, msg, timeout=10)
            self._log.info("Telegram audit sent: %s", trigger.trigger_type)
        except Exception as e:
            self._log.error("Telegram audit failed (non-blocking): %s", e)

    @staticmethod
    def _get_prefetched_concrete_actions(trigger: AutonomousTrigger) -> Optional[List[str]]:
        actions = (trigger.conditions or {}).get("_prefetched_hermes_threats")
        if not isinstance(actions, list):
            return None
        cleaned = [str(action).strip() for action in actions if str(action).strip()]
        return cleaned[:5] or None

    @staticmethod
    def _get_trigger_db_concrete_actions(trigger: AutonomousTrigger) -> Optional[List[str]]:
        actions = (trigger.conditions or {}).get("_db_evidence_actions")
        if not isinstance(actions, list):
            return None
        cleaned = [str(action).strip() for action in actions if str(action).strip()]
        return cleaned[:5] or None

    @staticmethod
    def _should_suppress_no_concrete_escalation(trigger: AutonomousTrigger) -> bool:
        return (
            trigger.trigger_type in _PRICE_RELATED_TRIGGERS
            or trigger.trigger_type in _NO_CONCRETE_ESCALATION_SUPPRESSED_TRIGGERS
        )

    def _record_suppressed_escalation(
        self,
        decision: StrategicDecision,
        trigger: AutonomousTrigger,
        reason: str,
    ) -> None:
        self._log.warning(
            "EA escalation suppressed: trigger=%s reason=%s confidence=%.2f conditions=%s",
            trigger.trigger_type,
            reason,
            decision.confidence,
            trigger.conditions,
        )
        try:
            from services.ai_call_logger import log_ai_call

            with log_ai_call(
                caller="ea_engine",
                provider="gcp_ollama",
                model="hermes3:latest",
                meta={
                    "suppressed_escalation": True,
                    "trigger": trigger.trigger_type,
                    "reason": reason,
                    "confidence": decision.confidence,
                    "conditions": trigger.conditions,
                },
            ) as ctx:
                ctx.set_tokens(input=0, output=0)
                ctx.status = "cache_only"
        except Exception:
            self._log.warning(
                "EA suppressed escalation telemetry failed; trigger=%s",
                trigger.trigger_type,
                exc_info=True,
            )

    async def _escalate_to_human(self, decision: StrategicDecision, trigger: AutonomousTrigger) -> None:
        self._log.warning("Escalating to human: %s", trigger.trigger_type)
        concrete_actions = (
            self._get_prefetched_concrete_actions(trigger)
            or self._get_trigger_db_concrete_actions(trigger)
        )
        if not concrete_actions and trigger.trigger_type in _PRICE_RELATED_TRIGGERS:
            try:
                concrete_actions = await self._fetch_hermes_threats_summary(top_n=5)
            except Exception as e:
                self._log.warning("Pre-fetch threats raised (non-blocking): %s", e)
                concrete_actions = None

        if not concrete_actions and self._should_suppress_no_concrete_escalation(trigger):
            self._store_escalation(trigger.trigger_type)
            self._record_suppressed_escalation(decision, trigger, "no_concrete_evidence")
            return

        session = get_session()
        try:
            row = session.execute(
                text("""
                    INSERT INTO ai_insights
                        (insight_type, content, confidence, created_by, status, metadata_json)
                    VALUES (:type, :content, :conf, :by, :status, :meta)
                    RETURNING id
                """),
                {
                    "type": "human_review",
                    "content": (
                        f"[Elephant Alpha 升級審核] {trigger.trigger_type} "
                        f"信心度僅 {decision.confidence:.2f}，建議人工介入。"
                    ),
                    "conf": decision.confidence,
                    "by": "elephant_alpha",
                    "status": "pending",
                    "meta": json.dumps({
                        "decision": asdict(decision),
                        "trigger": trigger.trigger_type,
                        "reason": "low_confidence"
                    }),
                },
            ).fetchone()
            session.commit()
            if row:
                try:
                    from services.openclaw_learning_service import enqueue_insight_embedding
                    enqueue_insight_embedding(
                        row[0],
                        "human_review",
                        f"[Elephant Alpha 升級審核] {trigger.trigger_type} 信心度僅 {decision.confidence:.2f}",
                    )
                except Exception as embed_err:
                    self._log.warning("Embedding enqueue failed for human_review: %s", embed_err)
        except Exception as e:
            self._log.error("DB escalation write failed: %s", e)
            session.rollback()
        finally:
            session.close()

        dedup_ts = self._load_escalation(trigger.trigger_type)
        cooldown_min = self._get_cooldown_min(trigger.trigger_type)
        if not dedup_ts or (datetime.now().timestamp() - dedup_ts) / 60 >= cooldown_min:
            self._store_escalation(trigger.trigger_type)

            # ─── Operation Ollama-First v5.0 修補：消除空泛幻覺訊息 ───
            # 統帥反饋（2026-05-03）：fallback 路徑帶 OpenClaw Gemini plan 文字 +
            # decision.reasoning 全是「312 SKU / 23% / 14 項任務」幻覺數字，無 DB 鉤住，
            # 嚴重誤導決策。修法：concrete=Hermes 實證 vs concrete=None 兩條路徑徹底分離。
            #   - 有實證 → 完整訊息（含 SKU 流失金額）
            #   - 無實證 → 極簡訊息「Hermes 即時數據不可用」+ 不再灌 LLM 幻覺

            from services.telegram_templates import triaged_alert, _send_telegram_raw

            if concrete_actions:
                # 有實證數據路徑：保留完整訊息
                ai_actions_payload = concrete_actions
                ai_summary_text = (decision.reasoning or "")[:300]
                ai_cause_text = (
                    f"觸發類型：{_zh_trigger(trigger.trigger_type)} | "
                    f"信心度：{decision.confidence:.2f} | "
                    f"參與模組：{', '.join(_AGENT_LABEL.get(a.lower(), a) for a in decision.agents_required)}"
                )
            else:
                # 非價格類且允許升級的低信心事件，不能套 Hermes/SKU 模板。
                self._log.warning(
                    "EA escalation 落入 no-concrete-data fallback (trigger=%s)；"
                    "送非價格類診斷訊息，避免 LLM 幻覺數字誤導統帥",
                    trigger.trigger_type
                )
                ai_actions_payload = [
                    f"檢查觸發條件：{json.dumps(trigger.conditions, ensure_ascii=False)[:300]}",
                    "不執行自動動作；請先在觀測台確認對應資料來源與最近錯誤紀錄。",
                ]
                ai_summary_text = (
                    f"低信心且缺少可格式化的具體行動；已隱藏 LLM plan 文字，避免把推測當成事實。"
                )
                ai_cause_text = (
                    f"觸發類型：{_zh_trigger(trigger.trigger_type)} | "
                    f"信心度：{decision.confidence:.2f} | "
                    f"缺少可直接審核的實證資料"
                )

            try:
                msg, keyboard = triaged_alert(
                    base_event={
                        "event_type": "ea_escalation",
                        "title": f"🐘 EA 升級審核 · {_zh_trigger(trigger.trigger_type)}",
                        "summary": (
                            f"自主決策信心度 {decision.confidence:.2f} 低於門檻，需人工批准"
                            + ("" if concrete_actions else "（⚠️ 無實證數據）")
                        ),
                        "id": f"ea_review_{int(datetime.now().timestamp())}",
                    },
                    tier_label="🐘 Elephant Alpha · L3 HITL",
                    ai_summary=ai_summary_text,
                    ai_cause=ai_cause_text,
                    ai_actions=ai_actions_payload,
                )
                await self._run_with_timeout(_send_telegram_raw, msg, timeout=10, reply_markup=keyboard)
                self._log.info(
                    "Human escalation Telegram sent: %s (concrete=%s)",
                    trigger.trigger_type, bool(concrete_actions),
                )
            except Exception as e:
                self._log.error("Telegram escalation failed (non-blocking): %s", e)

    # ---- Resource Optimization ----
    def _ssh_exec(self, host: str, user: str, cmd: Any, timeout: int = 120) -> tuple:
        """Execute command via SSH jump host to target host."""
        import subprocess
        remote_cmd = [str(part) for part in cmd] if isinstance(cmd, list) else [str(cmd)]
        full_cmd = [
            "ssh",
            "-p", str(SSH_PORT),
            "-i", SSH_KEY_PATH,
            "-o", "StrictHostKeyChecking=no",
            "-o", "ConnectTimeout=10",
            "-J", f"{SSH_JUMP_USER}@{SSH_JUMP_HOST}",
            f"{user}@{host}",
            "--",
            *remote_cmd,
        ]
        try:
            res = subprocess.run(
                full_cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            return res.returncode, res.stdout, res.stderr
        except Exception as e:
            return -1, "", str(e)

    def _is_circuit_open(self) -> bool:
        cb = self._circuit_breaker_state
        if cb["failures"] >= self._cb_threshold:
            if not cb["last_failure"]:
                cb["last_failure"] = datetime.now()
            if datetime.now() - cb["last_failure"] > self._cb_reset_after:
                cb["failures"] = 0
                cb["last_failure"] = None
                return False
            return True
        return False

    def _circuit_reset(self) -> None:
        self._circuit_breaker_state["failures"] = 0
        self._circuit_breaker_state["last_failure"] = None

    def _handle_failure(self, trigger: AutonomousTrigger, reason: str) -> None:
        self._circuit_breaker_state["failures"] += 1
        self._circuit_breaker_state["last_failure"] = datetime.now()
        self._log.warning("Failure reason=%s trigger=%s", reason, trigger.trigger_type)
        self._store_escalation(trigger.trigger_type)

    def _get_action_queue_size(self) -> int:
        session = get_session()
        try:
            row = session.execute(
                text("SELECT COUNT(*) AS count FROM action_plans WHERE status = 'pending'")
            ).fetchone()
            return row.count if row else 0
        finally:
            session.close()

    def _get_system_load_percentage(self) -> float:
        # ADR-019 Phase 2 / critic post-review BLOCKER #2：
        # production 必裝 psutil（已加入 requirements.txt）；ImportError fallback 僅作
        # defensive，使用 queue size 估算（pending=14→70%、pending≥18→飽和 90%）。
        # 該 fallback 與真實 CPU 無關，僅避免 dev 環境炸開；若 prod 觸發即代表
        # requirements.txt 與容器映像同步漏裝，需立刻補裝 psutil。
        try:
            import psutil
            return float(psutil.cpu_percent(interval=0.1))
        except ImportError:
            return min(90.0, float(self._get_action_queue_size() * 5.0))

    @staticmethod
    async def _run_with_timeout(coro, *args, timeout: int = 30, **kwargs):
        try:
            if inspect.iscoroutinefunction(coro):
                awaitable = coro(*args, **kwargs)
            else:
                async def _call_sync():
                    result = await asyncio.to_thread(coro, *args, **kwargs)
                    if inspect.isawaitable(result):
                        return await result
                    return result
                awaitable = _call_sync()
            return await asyncio.wait_for(awaitable, timeout=timeout)
        except asyncio.TimeoutError:
            raise TimeoutError(f"Operation timed out after {timeout}s")

    @staticmethod
    def _parse_threat(raw: Any) -> Dict[str, Any]:
        if isinstance(raw, dict):
            return raw
        return {"sku": "", "name": "", "confidence": 0.5}

    # ---- Internal workers ----
    async def _continuous_learning(self) -> None:
        recent = [
            o for o in self.decision_history
            if o.timestamp >= datetime.now() - timedelta(hours=24)
        ]
        if len(recent) >= 5:
            acc = [o.accuracy_score for o in recent if o.accuracy_score > 0]
            if acc:
                avg = sum(acc) / len(acc)
                if avg > 0.8:
                    self.confidence_threshold = min(0.9, self.confidence_threshold + 0.05)
                elif avg < 0.6:
                    self.confidence_threshold = max(0.5, self.confidence_threshold - 0.05)
                self._log.info(
                    "Learning: accuracy=%.2f threshold=%.2f",
                    avg,
                    self.confidence_threshold,
                )

    async def _optimize_resources(self) -> None:
        q = self._get_action_queue_size()
        load = self._get_system_load_percentage()
        if load > 90:
            self.max_autonomous_decisions_per_hour = 5
        elif load < 50 and q < 5:
            self.max_autonomous_decisions_per_hour = 15

# Singleton instance
autonomous_engine = ElephantAlphaAutonomousEngine()

__all__ = [
    "ElephantAlphaAutonomousEngine",
    "autonomous_engine",
    "AutonomousTrigger",
    "DecisionType",
    "DecisionOutcome",
]