Some checks are pending
CD Pipeline / deploy (push) Has started running
Operation Ollama-First v5.0 / Phase 11 / RAG 自主學習迴圈 services/rag_service.py (532 行) - RAGService.query() — bge-m3 embed + cosine 0.85 threshold + top_k=5 - get_embedding_signature() — v5.0 護欄 #3 一致性檢查 (SHA1[:12]) - fire-and-forget rag_query_log INSERT (不阻塞主流程) - feedback() — Telegram 👍/👎 寫回 feedback_score - RAG_ENABLED 預設 OFF(戰前行為不變) services/learning_pipeline.py (750 行) - Distiller — 純 Hermes 規則引擎,零 LLM 成本 Quality 規則:MCP >200 字 0.8 / LLM JSON ok 0.9 / TextRank 0.6 / 👍 1.0 / 👎 0.0 - PromotionGate — Owen v5.0 護欄 #1 鐵律 Stage 1: quality_score >= 0.7 Stage 2: 無幻覺檢測(規則引擎,零 LLM) Stage 3: 與既有 insight 相似度 < 0.95(Stage 3 在 episode embed 後啟用) Stage 4: weight >= 0.8 必經 Telegram 👍/👎 - expire_stale_reviews() — 24h 無回應自動降級 weight=0.5 - hash_human_approver — Telegram username SHA1[:8] PII 保護 services/hermes_analyst_service.py — 新增 analyze() RAG-first - RAG hit → return synthesize(不燒 LLM) - RAG miss → 既有 LLM 路徑 + enqueue learning_episodes services/openclaw_strategist_service.py — Q&A 入口接 RAG-first - 不動週/月/年報(敘事報告 RAG hit 機率低) services/telegram_templates.py - rag_feedback_keyboard() — 👍/👎 inline keyboard - promotion_review_keyboard() — Stage 4 人工驗收按鈕 routes/openclaw_bot_routes.py — 3 組 callback handler - rag_fb:{id}:{score} → rag_service.feedback() - pg_ok:{episode_id} → PromotionGate.promote() - pg_no:{episode_id} → PromotionGate.reject() 70 unit tests 全綠 + 全戰役 196 tests zero regression(4:17 跑完) 剩餘 limitations(Phase 12+ 補): 1. learning_episodes.embedding 寫入路徑(Stage 3 dedup 暫 skip) 2. PromotionGate worker cron 未掛 3. Telegram awaiting_review 推播未接(callback handler 已就位) 灰度開啟條件(建議 1 週後): - ANTHROPIC_API_KEY 設定 + RAG_ENABLED=true + threshold=0.90 保守 - feedback_score >= 4 比率 > 70% → threshold 降至 0.85 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
391 lines
17 KiB
Python
391 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
tests/test_promotion_gate.py
Operation Ollama-First v5.0 / Phase 11 — PromotionGate 4 階段晉升閘單元測試

涵蓋:
  Stage 1: quality_score < 0.7 → rejected_quality
  Stage 2: 規則引擎幻覺檢測(hedge words / 矛盾)
  Stage 3: cosine similarity >= 0.95 → rejected_duplicate
  Stage 4: weight >= 0.8 強制 awaiting_review(不能跳)
  promote() / reject() / mark_awaiting_review() DB 操作
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from unittest.mock import MagicMock
|
||
|
||
import pytest
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 共用工具
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def _fake_episode(
    id_=1,
    episode_type='llm_response',
    distilled_text='內容',
    quality_score=0.8,
    weight=0.5,
    embedding=None,
    status='pending',
):
    """Build an in-memory episode row for the tests in this module.

    Every argument has a default, so a test only spells out the fields it
    cares about. Note that ``id_`` is stored under the ``'id'`` key and
    ``status`` under the ``'promotion_status'`` key; all other arguments
    are stored under their own names.

    Returns:
        A fresh ``dict`` holding the seven episode fields.
    """
    row = dict(
        id=id_,
        episode_type=episode_type,
        distilled_text=distilled_text,
        quality_score=quality_score,
        weight=weight,
        embedding=embedding,
        promotion_status=status,
    )
    return row
|
||
|
||
|
||
def _patch_load_episode(monkeypatch, episode):
    """Replace ``PromotionGate._load_episode`` with a stub that skips the DB.

    The stub ignores the requested id and hands back ``episode``. When
    ``episode`` is falsy (for example ``None``) the stub returns ``None``,
    which the tests in this module use to simulate a missing episode.
    """
    from services.learning_pipeline import PromotionGate

    def _stub_loader(episode_id):
        if episode:
            return episode
        return None

    monkeypatch.setattr(PromotionGate, '_load_episode', staticmethod(_stub_loader))
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Stage 1: quality_score
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestStage1Quality:
    """Stage 1 checks: the quality_score gate and the missing-episode path."""

    def test_low_quality_rejected(self, monkeypatch):
        """quality_score=0.5 is refused as 'rejected_quality'."""
        from services.learning_pipeline import PromotionGate

        _patch_load_episode(monkeypatch, _fake_episode(quality_score=0.5))

        verdict = PromotionGate().can_promote(1)

        assert verdict.can_promote is False
        assert verdict.reason == 'rejected_quality'
        # The detail text is expected to contain the score as '0.500'.
        assert '0.500' in (verdict.detail or '')

    def test_quality_at_threshold_passes(self, monkeypatch):
        """A score exactly at STAGE_1_AUTO_QUALITY is accepted."""
        from services.learning_pipeline import PromotionGate, STAGE_1_AUTO_QUALITY

        boundary_episode = _fake_episode(
            quality_score=STAGE_1_AUTO_QUALITY,
            weight=0.5,
        )
        _patch_load_episode(monkeypatch, boundary_episode)

        verdict = PromotionGate().can_promote(1)

        # Clears stages 1, 2, 3 and 4, so the episode is auto-promoted.
        assert verdict.can_promote is True
        assert verdict.reason == 'approved'

    def test_episode_not_found(self, monkeypatch):
        """A missing episode is reported as 'rejected_quality' with a 'not found' detail."""
        from services.learning_pipeline import PromotionGate

        _patch_load_episode(monkeypatch, None)

        verdict = PromotionGate().can_promote(99999)

        assert verdict.can_promote is False
        assert verdict.reason == 'rejected_quality'
        assert 'not found' in (verdict.detail or '')
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Stage 2: 幻覺檢測
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestStage2Hallucination:
    """Stage 2 checks: rule-based hallucination detection (hedge words, contradictions)."""

    def test_hedge_words_without_numbers_rejected(self, monkeypatch):
        """Hedging text with no concrete figures is refused as a hallucination."""
        from services.learning_pipeline import PromotionGate

        vague_episode = _fake_episode(
            quality_score=0.8,
            distilled_text='我猜本週業績可能會有點成長吧,也許不錯。',
        )
        _patch_load_episode(monkeypatch, vague_episode)

        verdict = PromotionGate().can_promote(1)

        assert verdict.can_promote is False
        assert verdict.reason == 'rejected_hallucination'

    def test_hedge_words_with_numbers_passes(self, monkeypatch):
        """Hedging text backed by concrete numbers is still promotable."""
        from services.learning_pipeline import PromotionGate

        grounded_episode = _fake_episode(
            quality_score=0.8,
            weight=0.5,
            distilled_text='我猜本週業績會漲 5.2%,根據過去 30 天平均。',
        )
        _patch_load_episode(monkeypatch, grounded_episode)

        verdict = PromotionGate().can_promote(1)

        # Passes Stage 2 (concrete numbers present), reaches Stage 4, approved.
        assert verdict.can_promote is True

    def test_contradiction_rejected(self, monkeypatch):
        """Self-contradicting statements are refused as a hallucination."""
        from services.learning_pipeline import PromotionGate

        contradictory_episode = _fake_episode(
            quality_score=0.8,
            distilled_text='A是黑色。A是白色。',
        )
        _patch_load_episode(monkeypatch, contradictory_episode)

        verdict = PromotionGate().can_promote(1)

        assert verdict.can_promote is False
        assert verdict.reason == 'rejected_hallucination'
        assert '自相矛盾' in (verdict.detail or '')

    def test_clean_text_passes(self, monkeypatch):
        """Plain factual text with figures is approved."""
        from services.learning_pipeline import PromotionGate

        clean_episode = _fake_episode(
            quality_score=0.8,
            weight=0.5,
            distilled_text='本週業績漲 5.2%,建議聚焦保濕品類。',
        )
        _patch_load_episode(monkeypatch, clean_episode)

        verdict = PromotionGate().can_promote(1)

        assert verdict.can_promote is True
        assert verdict.reason == 'approved'
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Stage 3: 去重
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestStage3Dedup:
    """Stage 3 checks: de-duplication against existing insights by similarity.

    ``database.manager.get_session`` is replaced with a mock session in each
    test so no real database is touched.
    """

    def test_high_similarity_rejected(self, monkeypatch):
        """A nearest insight at or above the dedup threshold blocks promotion."""
        from services.learning_pipeline import PromotionGate, STAGE_3_DEDUP_THRESHOLD

        ep = _fake_episode(
            quality_score=0.8, weight=0.5,
            distilled_text='本週業績漲 5%。',
            embedding=[0.1] * 1024,  # simulated non-empty embedding
        )
        _patch_load_episode(monkeypatch, ep)

        # Simulate the DB returning a near-duplicate row. The similarity is
        # derived from the threshold constant (0.96 unless the threshold is
        # raised above that) so this test keeps exercising the "duplicate"
        # branch even if the threshold is tuned later.
        fake_row = MagicMock()
        fake_row.id = 999
        fake_row.similarity = max(0.96, STAGE_3_DEDUP_THRESHOLD)
        fake_session = MagicMock()
        fake_session.execute.return_value.fetchone.return_value = fake_row
        monkeypatch.setattr('database.manager.get_session', lambda: fake_session)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is False
        assert decision.reason == 'rejected_duplicate'
        assert decision.similar_insight_id == 999

    def test_low_similarity_passes(self, monkeypatch):
        """A nearest insight with similarity 0.5 does not block promotion."""
        from services.learning_pipeline import PromotionGate

        ep = _fake_episode(
            quality_score=0.8, weight=0.5,
            distilled_text='全新內容', embedding=[0.1] * 1024,
        )
        _patch_load_episode(monkeypatch, ep)

        fake_row = MagicMock()
        fake_row.id = 999
        fake_row.similarity = 0.5
        fake_session = MagicMock()
        fake_session.execute.return_value.fetchone.return_value = fake_row
        monkeypatch.setattr('database.manager.get_session', lambda: fake_session)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is True

    def test_null_embedding_skips_dedup(self, monkeypatch):
        """No embedding at distillation time: Stage 3 is skipped, promotion is not blocked."""
        from services.learning_pipeline import PromotionGate

        ep = _fake_episode(quality_score=0.8, weight=0.5, embedding=None)
        _patch_load_episode(monkeypatch, ep)

        # The DB must not be consulted at all; count session requests to prove it.
        called = {'count': 0}

        def _spy_session():
            called['count'] += 1
            return MagicMock()

        monkeypatch.setattr('database.manager.get_session', _spy_session)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is True
        assert called['count'] == 0

    def test_dedup_query_failure_passes(self, monkeypatch):
        """A failing dedup query counts as a pass, so a DB outage cannot block promotion."""
        from services.learning_pipeline import PromotionGate

        ep = _fake_episode(
            quality_score=0.8, weight=0.5,
            embedding=[0.1] * 1024,
        )
        _patch_load_episode(monkeypatch, ep)

        fake_session = MagicMock()
        fake_session.execute.side_effect = RuntimeError("db down")
        monkeypatch.setattr('database.manager.get_session', lambda: fake_session)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is True
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Stage 4: 強制人工驗收(v5.0 護欄 #1 核心)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestStage4HumanReview:
    """Stage 4 checks: mandatory human review for high-weight episodes (v5.0 guardrail #1).

    Every "forces review" test asserts both the reason and that
    ``can_promote`` is False, so a regression that reports
    'awaiting_review' while still allowing promotion is caught.
    """

    def test_high_weight_forces_awaiting_review(self, monkeypatch):
        """weight=0.85 (>= 0.8) must go through human review; Stage 4 cannot be skipped."""
        from services.learning_pipeline import PromotionGate, STAGE_4_HUMAN_REVIEW_WEIGHT

        ep = _fake_episode(quality_score=0.9, weight=0.85)
        _patch_load_episode(monkeypatch, ep)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is False
        assert decision.reason == 'awaiting_review'
        # The detail text is expected to mention the threshold value.
        assert str(STAGE_4_HUMAN_REVIEW_WEIGHT) in (decision.detail or '')

    def test_high_weight_at_threshold_forces_review(self, monkeypatch):
        """A weight exactly at the threshold also needs human review (>=, not >)."""
        from services.learning_pipeline import PromotionGate, STAGE_4_HUMAN_REVIEW_WEIGHT

        # Use the constant rather than a literal so the boundary case keeps
        # sitting exactly on the threshold if the threshold is retuned.
        ep = _fake_episode(quality_score=0.9, weight=STAGE_4_HUMAN_REVIEW_WEIGHT)
        _patch_load_episode(monkeypatch, ep)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is False
        assert decision.reason == 'awaiting_review'

    def test_low_weight_auto_promoted(self, monkeypatch):
        """weight=0.79 is auto-promoted without review."""
        from services.learning_pipeline import PromotionGate

        ep = _fake_episode(quality_score=0.9, weight=0.79)
        _patch_load_episode(monkeypatch, ep)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is True
        assert decision.reason == 'approved'

    def test_high_weight_user_feedback_forces_review(self, monkeypatch):
        """A user_feedback episode (default weight 0.9) must go through human review."""
        from services.learning_pipeline import PromotionGate

        ep = _fake_episode(
            episode_type='user_feedback',
            quality_score=1.0, weight=0.9,
        )
        _patch_load_episode(monkeypatch, ep)

        gate = PromotionGate()
        decision = gate.can_promote(1)

        assert decision.can_promote is False
        assert decision.reason == 'awaiting_review'
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# promote() DB 操作
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestPromote:
    """DB-facing behaviour of ``PromotionGate.promote()`` against a mocked session."""

    def test_promote_inserts_ai_insights_and_updates_episode(self, monkeypatch):
        """promote() returns the new insight id and commits exactly once."""
        from services.learning_pipeline import PromotionGate

        episode = _fake_episode(
            id_=1,
            episode_type='llm_response',
            quality_score=0.9,
            weight=0.5,
            distilled_text='本週業績漲 5%',
        )
        _patch_load_episode(monkeypatch, episode)

        # Simulate "INSERT ... RETURNING id" yielding 555.
        returned_row = MagicMock()
        returned_row.__getitem__.return_value = 555
        session = MagicMock()
        session.execute.return_value.fetchone.return_value = returned_row
        monkeypatch.setattr('database.manager.get_session', lambda: session)

        new_insight_id = PromotionGate().promote(1)

        assert new_insight_id == 555
        # INSERT and UPDATE should both have run, so at least two execute calls.
        assert session.execute.call_count >= 2
        session.commit.assert_called_once()

    def test_promote_episode_not_found_returns_none(self, monkeypatch):
        """promote() returns None when the episode cannot be loaded."""
        from services.learning_pipeline import PromotionGate

        _patch_load_episode(monkeypatch, None)

        assert PromotionGate().promote(99999) is None

    def test_promote_db_failure_returns_none(self, monkeypatch):
        """promote() returns None when the session raises on execute."""
        from services.learning_pipeline import PromotionGate

        _patch_load_episode(monkeypatch, _fake_episode(quality_score=0.9, weight=0.5))

        broken_session = MagicMock()
        broken_session.execute.side_effect = RuntimeError("db down")
        monkeypatch.setattr('database.manager.get_session', lambda: broken_session)

        assert PromotionGate().promote(1) is None
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# reject() / mark_awaiting_review()
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestRejectAndMark:
    """Behaviour of ``reject()`` and ``mark_awaiting_review()`` against a mocked session."""

    def test_reject_valid_reason(self, monkeypatch):
        """reject() with a recognised reason returns True and commits once."""
        from services.learning_pipeline import PromotionGate

        session = MagicMock()
        monkeypatch.setattr('database.manager.get_session', lambda: session)

        succeeded = PromotionGate().reject(
            1, 'rejected_quality', detail='quality 0.3 < 0.7',
        )

        assert succeeded is True
        session.commit.assert_called_once()

    def test_reject_invalid_reason_returns_false(self):
        """reject() with an unrecognised reason returns False."""
        from services.learning_pipeline import PromotionGate

        assert PromotionGate().reject(1, 'invalid_reason') is False

    def test_reject_db_failure_returns_false(self, monkeypatch):
        """reject() returns False when the session raises on execute."""
        from services.learning_pipeline import PromotionGate

        broken_session = MagicMock()
        broken_session.execute.side_effect = RuntimeError("db down")
        monkeypatch.setattr('database.manager.get_session', lambda: broken_session)

        assert PromotionGate().reject(1, 'rejected_quality') is False

    def test_mark_awaiting_review_runs_update(self, monkeypatch):
        """mark_awaiting_review() issues one statement, commits once and returns True."""
        from services.learning_pipeline import PromotionGate

        session = MagicMock()
        monkeypatch.setattr('database.manager.get_session', lambda: session)

        succeeded = PromotionGate().mark_awaiting_review(1)

        assert succeeded is True
        session.execute.assert_called_once()
        session.commit.assert_called_once()
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 完整 4 階段流程串接(高 weight 必經人工)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
class TestEndToEndFlow:
    """Full four-stage walk-through: a high-weight episode must reach human review."""

    def test_user_feedback_high_quality_must_await_review(self, monkeypatch):
        """Core case for v5.0 guardrail #1.

        A user_feedback episode with weight=0.9 is forced into
        awaiting_review; even with quality=1.0, no hallucination and no
        duplicate it must not be promoted directly.
        """
        from services.learning_pipeline import PromotionGate

        high_weight_episode = _fake_episode(
            episode_type='user_feedback',
            # Stage 1 passes.
            quality_score=1.0,
            # Contains concrete numbers, so Stage 2 passes.
            distilled_text='2026-04-29 業績漲 12%,廣告 ROI 4.2 倍。',
            # Stage 3 is skipped (no embedding).
            embedding=None,
            # >= 0.8, so Stage 4 is mandatory.
            weight=0.9,
        )
        _patch_load_episode(monkeypatch, high_weight_episode)

        verdict = PromotionGate().can_promote(1)

        # Hard rule: high weight always goes through a human.
        assert verdict.can_promote is False
        assert verdict.reason == 'awaiting_review'
|