Files
ewoooc/tests/test_learning_pipeline.py
OoO c7d6db31f2
Some checks are pending
CD Pipeline / deploy (push) Has started running
feat(p11): RAG 自主學習 + Promotion Gate 4 階段護欄(feature flag OFF)
Operation Ollama-First v5.0 / Phase 11 / RAG 自主學習迴圈

services/rag_service.py (532 行)
- RAGService.query() — bge-m3 embed + cosine 0.85 threshold + top_k=5
- get_embedding_signature() — v5.0 護欄 #3 一致性檢查 (SHA1[:12])
- fire-and-forget rag_query_log INSERT (不阻塞主流程)
- feedback() — Telegram 👍/👎 寫回 feedback_score
- RAG_ENABLED 預設 OFF(戰前行為不變)

services/learning_pipeline.py (750 行)
- Distiller — 純 Hermes 規則引擎,零 LLM 成本
  Quality 規則:MCP >200 字 0.8 / LLM JSON ok 0.9 / TextRank 0.6 / 👍 1.0 / 👎 0.0
- PromotionGate — Owen v5.0 護欄 #1 鐵律
  Stage 1: quality_score >= 0.7
  Stage 2: 無幻覺檢測(規則引擎,零 LLM)
  Stage 3: 與既有 insight 相似度 < 0.95(Stage 3 在 episode embed 後啟用)
  Stage 4: weight >= 0.8 必經 Telegram 👍/👎
- expire_stale_reviews() — 24h 無回應自動降級 weight=0.5
- hash_human_approver — Telegram username SHA1[:8] PII 保護

services/hermes_analyst_service.py — 新增 analyze() RAG-first
- RAG hit → return synthesize(不燒 LLM)
- RAG miss → 既有 LLM 路徑 + enqueue learning_episodes

services/openclaw_strategist_service.py — Q&A 入口接 RAG-first
- 不動週/月/年報(敘事報告 RAG hit 機率低)

services/telegram_templates.py
- rag_feedback_keyboard() — 👍/👎 inline keyboard
- promotion_review_keyboard() — Stage 4 人工驗收按鈕

routes/openclaw_bot_routes.py — 3 組 callback handler
- rag_fb:{id}:{score} → rag_service.feedback()
- pg_ok:{episode_id} → PromotionGate.promote()
- pg_no:{episode_id} → PromotionGate.reject()

70 unit tests 全綠 + 全戰役 196 tests zero regression(4:17 跑完)

剩餘 limitations(Phase 12+ 補):
1. learning_episodes.embedding 寫入路徑(Stage 3 dedup 暫 skip)
2. PromotionGate worker cron 未掛
3. Telegram awaiting_review 推播未接(callback handler 已就位)

灰度開啟條件(建議 1 週後):
- ANTHROPIC_API_KEY 設定 + RAG_ENABLED=true + threshold=0.90 保守
- feedback_score >= 4 比率 > 70% → threshold 降至 0.85

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:56:12 +08:00

275 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/test_learning_pipeline.py
Operation Ollama-First v5.0 / Phase 11 — Distiller + LearningPipeline 單元測試
涵蓋:
- Distiller 各 quality_score 規則mcp / llm_response / user_feedback / manual_curated
- LearningPipeline.enqueue() DB 寫入路徑
- expire_stale_reviews() 24h 自動降級
- hash_human_approver() PII 保護
"""
from __future__ import annotations
import json
from unittest.mock import MagicMock, patch
import pytest
# ─────────────────────────────────────────────────────────────────────────────
# Distiller 各規則
# ─────────────────────────────────────────────────────────────────────────────
class TestDistillerMcpResult:
    """Quality-score expectations for ``Distiller.distill`` on ``mcp_result`` episodes.

    Scores are floats, so they are compared with ``pytest.approx`` rather than
    bit-for-bit equality; each test also asserts that a result was produced
    before reading its attributes, so a missing result fails with a clear
    assertion instead of an ``AttributeError`` on ``None``.
    """

    def test_long_with_keywords_high_quality(self):
        """Long content (> 200 characters) that carries keywords scores 0.8."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "本週業績分析顯示,建議聚焦保濕品類。" + "詳細說明 " * 80  # > 200 characters
        result = d.distill(episode_type='mcp_result', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.8)
        assert result.episode_type == 'mcp_result'

    def test_long_no_keywords_medium_quality(self):
        """Long content (> 200 characters) without keywords scores 0.65."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "啦啦啦" * 100  # > 200 characters, but no keywords
        result = d.distill(episode_type='mcp_result', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.65)

    def test_short_low_quality(self):
        """Short content scores 0.5."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "短內容"
        result = d.distill(episode_type='mcp_result', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.5)

    def test_empty_returns_none(self):
        """Empty or whitespace-only content produces no result at all."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        assert d.distill(episode_type='mcp_result', raw_content='') is None
        assert d.distill(episode_type='mcp_result', raw_content=' ') is None
class TestDistillerLlmResponse:
    """Quality-score expectations for ``Distiller.distill`` on ``llm_response`` episodes.

    Float scores are compared with ``pytest.approx`` instead of exact equality,
    and every test checks that a result exists before reading its attributes so
    that a missing result fails with a clear assertion message.
    """

    def test_json_structured_high_quality(self):
        """A JSON object with a status field scores 0.9."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = json.dumps({"status": "ok", "summary": "本週重點"})
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.9)

    def test_json_array_non_empty_high(self):
        """A non-empty JSON array scores 0.9."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = json.dumps([{"sku": "A001", "risk": "HIGH"}])
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.9)

    def test_json_dict_no_status_lower(self):
        """A non-empty JSON object without a status field still scores 0.9."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = json.dumps({"some_field": "value"})
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        # Non-empty dict -> 0.9 (per the original note, the status_ok
        # condition includes "len(obj) > 0"), despite "lower" in the test name.
        assert result.quality_score == pytest.approx(0.9)

    def test_free_text_long_with_numbers(self):
        """Long free text (> 500 characters) that contains numbers scores 0.65."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "本週業績漲了 15.3%" + "詳細說明 " * 100  # > 500 characters, with numbers
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.65)

    def test_free_text_long_no_numbers(self):
        """Long free text (> 500 characters) without numbers scores 0.55."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "本週業績趨勢上升。" + "詳細說明 " * 100  # > 500 characters, no numbers
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        assert result.quality_score == pytest.approx(0.55)

    def test_free_text_short_below_quality_gate(self):
        """Short free text scores 0.4."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        text = "本週業績有變化"  # short text
        result = d.distill(episode_type='llm_response', raw_content=text)
        assert result is not None
        # 0.4 -> would be rejected by Stage 1 (per the original note).
        assert result.quality_score == pytest.approx(0.4)
class TestDistillerUserFeedback:
    """Quality-score expectations for ``Distiller.distill`` on ``user_feedback`` episodes.

    The expected scores are documented here as the result of arithmetic on the
    feedback score, so they are compared with ``pytest.approx`` rather than
    exact float equality. Each test also asserts a result exists before
    reading its attributes.
    """

    def test_score_5_high_quality(self):
        """A feedback score of 5 gives quality 1.0 and weight 0.9."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        result = d.distill(
            episode_type='user_feedback',
            raw_content='這個建議幫我增加了 8% 銷量',
            user_feedback_score=5,
        )
        assert result is not None
        assert result.quality_score == pytest.approx(1.0)
        # High weight -> goes through Stage 4 human review (per the original note).
        assert result.weight == pytest.approx(0.9)

    def test_score_1_negative_sample(self):
        """A feedback score of 1 gives quality 0.0."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        result = d.distill(
            episode_type='user_feedback',
            raw_content='完全沒幫助',
            user_feedback_score=1,
        )
        assert result is not None
        # 0.0 -> rejected at Stage 1 (per the original note).
        assert result.quality_score == pytest.approx(0.0)

    def test_default_score_3_mid(self):
        """A missing feedback score gives quality 0.5."""
        from services.learning_pipeline import Distiller

        d = Distiller()
        result = d.distill(
            episode_type='user_feedback',
            raw_content='普通',
            user_feedback_score=None,
        )
        assert result is not None
        # Defaults to 3 -> (3 - 1) / 4 = 0.5 (per the original note).
        assert result.quality_score == pytest.approx(0.5)
class TestDistillerManualCurated:
    """Expectations for ``Distiller.distill`` on ``manual_curated`` episodes."""

    def test_max_quality_and_weight(self):
        """Manually curated content gets quality_score 1.0 and weight 1.0."""
        from services.learning_pipeline import Distiller

        outcome = Distiller().distill(
            raw_content='手動入庫',
            episode_type='manual_curated',
        )
        assert outcome.quality_score == 1.0
        assert outcome.weight == 1.0
class TestDistillerInvalidType:
    """Behaviour of ``Distiller.distill`` for an unrecognised episode type."""

    def test_unknown_type_returns_none(self):
        """An episode type of 'garbage' yields ``None``."""
        from services.learning_pipeline import Distiller

        distiller = Distiller()
        outcome = distiller.distill(
            raw_content='whatever',
            episode_type='garbage',
        )
        assert outcome is None
class TestDistillerLengthGuard:
    """Size cap on the distilled text produced by ``Distiller.distill``."""

    def test_distilled_text_truncated_to_16kb(self):
        """Oversized input is cut down to at most DISTILLED_TEXT_MAX_BYTES UTF-8 bytes."""
        from services.learning_pipeline import DISTILLED_TEXT_MAX_BYTES, Distiller

        # 5000 repetitions of a 13-byte unit (65,000 UTF-8 bytes) — far beyond
        # the 16KB limit named by this test.
        oversized = '建議分析 ' * 5000
        outcome = Distiller().distill(episode_type='mcp_result', raw_content=oversized)
        byte_length = len(outcome.distilled_text.encode('utf-8'))
        assert byte_length <= DISTILLED_TEXT_MAX_BYTES
# ─────────────────────────────────────────────────────────────────────────────
# LearningPipeline.enqueue
# ─────────────────────────────────────────────────────────────────────────────
class TestLearningPipelineEnqueue:
    """DB write path of ``learning_pipeline.enqueue`` against a mocked session."""

    @staticmethod
    def _use_session(monkeypatch, session):
        """Make ``database.manager.get_session`` hand back ``session``."""
        monkeypatch.setattr('database.manager.get_session', lambda: session)

    def test_enqueue_returns_id_on_success(self, monkeypatch):
        """The value at the fetched row is returned and the session is committed once."""
        from services.learning_pipeline import learning_pipeline

        # Any index into the fetched row yields 42.
        row = MagicMock()
        row.__getitem__.return_value = 42
        session = MagicMock()
        session.execute.return_value.fetchone.return_value = row
        self._use_session(monkeypatch, session)

        episode_id = learning_pipeline.enqueue(
            raw_content='手動入庫測試內容',
            episode_type='manual_curated',
        )

        assert episode_id == 42
        session.commit.assert_called_once()

    def test_enqueue_returns_none_when_distill_fails(self):
        """Empty content -> distill returns None -> enqueue returns None."""
        from services.learning_pipeline import learning_pipeline

        outcome = learning_pipeline.enqueue(raw_content='', episode_type='mcp_result')
        assert outcome is None

    def test_enqueue_db_failure_returns_none(self, monkeypatch):
        """An exception raised by ``session.execute`` results in ``None``."""
        from services.learning_pipeline import learning_pipeline

        session = MagicMock()
        session.execute.side_effect = RuntimeError("db down")
        self._use_session(monkeypatch, session)

        outcome = learning_pipeline.enqueue(
            raw_content='測試內容',
            episode_type='manual_curated',
        )
        assert outcome is None
# ─────────────────────────────────────────────────────────────────────────────
# expire_stale_reviews
# ─────────────────────────────────────────────────────────────────────────────
class TestExpireStaleReviews:
    """Return value and commit behaviour of ``expire_stale_reviews`` with a mocked session."""

    @staticmethod
    def _use_session(monkeypatch, session):
        """Make ``database.manager.get_session`` hand back ``session``."""
        monkeypatch.setattr('database.manager.get_session', lambda: session)

    def test_expire_uses_correct_sql(self, monkeypatch):
        """The execute result's rowcount is returned and the session is committed once.

        NOTE(review): despite the name, the SQL text passed to ``execute`` is
        not inspected by this test.
        """
        from services.learning_pipeline import expire_stale_reviews

        execute_result = MagicMock()
        execute_result.rowcount = 3
        session = MagicMock()
        session.execute.return_value = execute_result
        self._use_session(monkeypatch, session)

        expired = expire_stale_reviews(hours=24)

        assert expired == 3
        # Confirm the commit actually ran.
        session.commit.assert_called_once()

    def test_expire_db_failure_returns_zero(self, monkeypatch):
        """An exception raised by ``session.execute`` results in a count of 0."""
        from services.learning_pipeline import expire_stale_reviews

        session = MagicMock()
        session.execute.side_effect = RuntimeError("db down")
        self._use_session(monkeypatch, session)

        expired = expire_stale_reviews(hours=24)
        assert expired == 0
# ─────────────────────────────────────────────────────────────────────────────
# hash_human_approver
# ─────────────────────────────────────────────────────────────────────────────
class TestHashHumanApprover:
    """Output format and determinism of ``hash_human_approver``."""

    def test_returns_8_char_hex(self):
        """The hash is 8 characters long, all lowercase hexadecimal digits."""
        from services.learning_pipeline import hash_human_approver

        hex_alphabet = '0123456789abcdef'
        digest = hash_human_approver('owen.tsai')
        assert len(digest) == 8
        assert all(ch in hex_alphabet for ch in digest)

    def test_empty_returns_empty(self):
        """Both an empty string and ``None`` map to an empty string."""
        from services.learning_pipeline import hash_human_approver

        for blank in ('', None):
            assert hash_human_approver(blank) == ''  # type: ignore

    def test_deterministic(self):
        """The same name hashes identically; a different name hashes differently."""
        from services.learning_pipeline import hash_human_approver

        first, repeat, other = (
            hash_human_approver(name) for name in ('alice', 'alice', 'bob')
        )
        assert first == repeat
        assert first != other
# ─────────────────────────────────────────────────────────────────────────────
# Utility function: _detect_simple_contradiction
# ─────────────────────────────────────────────────────────────────────────────
class TestContradictionDetector:
    """Checks for the ``_detect_simple_contradiction`` helper."""

    def test_no_contradiction_returns_none(self):
        """Two different subjects with one statement each produce ``None``."""
        from services.learning_pipeline import _detect_simple_contradiction

        # Subject 業績 -> 上升 and subject 市場 -> 競爭: no conflict between them.
        consistent = "業績是上升。市場是競爭。"
        assert _detect_simple_contradiction(consistent) is None

    def test_contradiction_detected(self):
        """One subject given two different values is reported, and the report mentions it."""
        from services.learning_pipeline import _detect_simple_contradiction

        finding = _detect_simple_contradiction("A是黑色。A是白色。")
        assert finding is not None
        assert 'A' in finding