Files
ewoooc/tests/test_feedback_quality_tracker.py
OoO bd32e04dad
All checks were successful
CD Pipeline / deploy (push) Successful in 3m2s
feat(p25): 反饋環深化 — caller-level quality 趨勢追蹤 + ROI 月報整合
Operation Ollama-First v5.0 / Phase 25 — 反饋環自主學習深化

services/feedback_quality_tracker.py (180+ 行)
- 純 SQL 統計,零 LLM 成本
- 4 個閾值常數(demote 👎×5/avg<2.5 / promote 👍×10/avg>=4.5)
- compute_caller_quality_trend(days=7) — 取近 N 日各 caller 反饋
- get_caller_recommendations() — 給 token 日報/ROI 月報用
  • 規則 1: 👎 ≥ 5 次 → review
  • 規則 2: avg < 2.5 + 樣本足 → review
  • 規則 3: 👍 ≥ 10 + avg ≥ 4.5 → promote(建議關閉 Gemini fallback)
- should_demote_caller(caller) — 自動降權判斷(戰役預設不啟用)
- render_quality_summary() — 給訊息用 emoji 摘要

ROI 月報整合(services/roi_report_service.py):
- 加 Section 「💬 Caller 反饋趨勢(30 日)」TOP 10 by 最低 avg
- 加 Section 「🔮 智能建議」最多 3 條(review / promote)
- 失敗 swallow 不影響月報主流程

訊息範例:
  💬 Caller 反饋趨勢(30 日)
    ⚠️ openclaw_qa: avg 1.85/5 (👍2 👎8 n=12)
     hermes_analyst: avg 3.10/5 (👍5 👎3 n=10)
     ppt_gemini: avg 4.75/5 (👍12 👎0 n=15)
  🔮 智能建議
    ⚠️ openclaw_qa: 近 30 日 👎 反饋 8 次 (avg 1.85/5) — 建議統帥檢視 prompt 或切換 model
     ppt_gemini: 近 30 日 👍 反饋 12 次 — 可考慮關閉 Gemini fallback 純走 Ollama

tests/test_feedback_quality_tracker.py (10 tests 全綠)
- 4 閾值常數 / DB fail 安全 / 空 trends 容錯
- demote 規則(👎 多次)/ promote 規則(👍 多次)/ neutral 不觸發
- should_demote_caller 樣本不足保護
- trend 分類(positive/negative/neutral/no_data)正確

依 ADR-032 RAG 自主學習迴圈 + ADR-033 護欄 #1
不直接改 caller 行為(避循環自動修正失控),只產出建議給統帥審視。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 11:12:52 +08:00

167 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
tests/test_feedback_quality_tracker.py
─────────────────────────────────────────────────────────────────
Operation Ollama-First v5.0 / Phase 25 — 反饋環深化驗證
"""
from datetime import datetime, timedelta
from unittest.mock import MagicMock
import pytest
def test_constants_defined():
    """The four demote/promote threshold constants exist with their expected values."""
    from services.feedback_quality_tracker import (
        DEMOTE_AVG_SCORE_THRESHOLD as demote_avg,
        DEMOTE_THUMBS_DOWN_THRESHOLD as demote_down,
        PROMOTE_AVG_SCORE_THRESHOLD as promote_avg,
        PROMOTE_THUMBS_UP_THRESHOLD as promote_up,
    )

    # Demotion pair: thumbs-down count and average-score floor.
    assert (demote_down, demote_avg) == (5, 2.5)
    # Promotion pair: thumbs-up count and average-score bar.
    assert (promote_up, promote_avg) == (10, 4.5)
def test_compute_trend_db_fail_returns_empty(monkeypatch):
    """A session whose execute() raises must produce {} rather than an exception."""
    from services.feedback_quality_tracker import compute_caller_quality_trend

    class _BrokenSession:
        # Stand-in session: every query fails, close() is a no-op.
        def execute(self, *args, **kwargs):
            raise RuntimeError('rag_query_log not exist')

        def close(self):
            return None

    def _make_broken_session():
        return _BrokenSession()

    # Replace the session factory so the tracker receives the failing session.
    monkeypatch.setattr('database.manager.get_session', _make_broken_session)

    outcome = compute_caller_quality_trend(days=7)
    assert outcome == {}
def test_render_summary_empty():
    """An empty trends mapping renders the 'no feedback data' placeholder text."""
    from services.feedback_quality_tracker import render_quality_summary

    rendered = render_quality_summary({})
    assert '無反饋資料' in rendered
def test_render_summary_with_trends():
    """Both callers are rendered, with the negative one ahead of the neutral one."""
    from services.feedback_quality_tracker import render_quality_summary

    neutral_stats = {
        'total_feedback': 20, 'thumbs_up': 15, 'thumbs_down': 2,
        'avg_score': 4.2, 'trend': 'neutral',
    }
    negative_stats = {
        'total_feedback': 8, 'thumbs_up': 1, 'thumbs_down': 6,
        'avg_score': 1.8, 'trend': 'negative',
    }
    trends = {
        'openclaw_qa': neutral_stats,
        'hermes_analyst': negative_stats,
    }

    out = render_quality_summary(trends)

    for caller in ('openclaw_qa', 'hermes_analyst'):
        assert caller in out

    # Negative callers are listed first (ascending avg_score).
    negative_pos = out.index('hermes_analyst')
    neutral_pos = out.index('openclaw_qa')
    assert negative_pos < neutral_pos

    # Emoji marker for the negative trend.
    assert '⚠️' in out
    # Emoji marker for the neutral trend.
    # NOTE(review): the literal below is empty in this view, so the check
    # always passes; the neutral emoji may have been lost — confirm against
    # the repository copy and restore it if so.
    assert '' in out
def test_get_recommendations_demote_on_thumbs_down(monkeypatch):
    """Five or more thumbs-down should yield a single 'review' recommendation."""
    import services.feedback_quality_tracker as fqt
    from services.feedback_quality_tracker import get_caller_recommendations

    def _stub_trend(days):
        # One caller with 6 thumbs-down; a fresh dict is built on every call.
        return {
            'bad_caller': {
                'total_feedback': 8, 'thumbs_up': 1, 'thumbs_down': 6,
                'avg_score': 1.8, 'trend': 'negative',
            },
        }

    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', _stub_trend)

    recs = get_caller_recommendations(days=7)
    assert len(recs) == 1

    first = recs[0]
    assert first['caller'] == 'bad_caller'
    assert first['action'] == 'review'
    # The reason text should mention the thumbs-down count (6).
    assert '6' in first['reason']
def test_get_recommendations_promote_on_thumbs_up(monkeypatch):
    """Thumbs-up >= 10 together with avg >= 4.5 should yield a 'promote' recommendation."""
    import services.feedback_quality_tracker as fqt
    from services.feedback_quality_tracker import get_caller_recommendations

    def _stub_trend(days):
        # One caller with 12 thumbs-up and a 4.8 average.
        return {
            'great_caller': {
                'total_feedback': 15, 'thumbs_up': 12, 'thumbs_down': 0,
                'avg_score': 4.8, 'trend': 'positive',
            },
        }

    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', _stub_trend)

    recs = get_caller_recommendations(days=7)
    assert len(recs) == 1

    only_rec = recs[0]
    assert only_rec['action'] == 'promote'
    assert '可考慮關閉 Gemini fallback' in only_rec['reason']
def test_get_recommendations_neutral_no_action(monkeypatch):
    """A middling sample should not trigger any recommendation."""
    import services.feedback_quality_tracker as fqt
    from services.feedback_quality_tracker import get_caller_recommendations

    def _stub_trend(days):
        # One caller with a 3.5 average and few votes either way.
        return {
            'avg_caller': {
                'total_feedback': 5, 'thumbs_up': 2, 'thumbs_down': 1,
                'avg_score': 3.5, 'trend': 'neutral',
            },
        }

    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', _stub_trend)

    suggestions = get_caller_recommendations(days=7)
    assert suggestions == []
def test_should_demote_caller_with_low_avg(monkeypatch):
    """True for the stubbed low-scoring caller; False for a caller absent from the trends."""
    import services.feedback_quality_tracker as fqt
    from services.feedback_quality_tracker import should_demote_caller

    def _stub_trend(days):
        # One caller with 8 thumbs-down out of 10 and a 1.5 average.
        return {
            'troubled_caller': {
                'total_feedback': 10, 'thumbs_up': 0, 'thumbs_down': 8,
                'avg_score': 1.5, 'trend': 'negative',
            },
        }

    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', _stub_trend)

    known_verdict = should_demote_caller('troubled_caller', days=7)
    assert known_verdict is True

    unknown_verdict = should_demote_caller('not_in_trends', days=7)
    assert unknown_verdict is False
def test_should_demote_caller_insufficient_feedback(monkeypatch):
    """Fewer than 5 samples must not demote (avoids misjudging on a few negatives)."""
    import services.feedback_quality_tracker as fqt
    from services.feedback_quality_tracker import should_demote_caller

    def _stub_trend(days):
        # Low average, but only 3 feedback entries in total.
        return {
            'new_caller': {
                'total_feedback': 3, 'thumbs_up': 0, 'thumbs_down': 2,
                'avg_score': 1.5, 'trend': 'negative',
            },
        }

    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', _stub_trend)

    verdict = should_demote_caller('new_caller', days=7)
    assert verdict is False
def test_compute_trend_classifies_correctly(monkeypatch):
    """Feed simulated SQL rows through the tracker and check each trend label."""
    from services.feedback_quality_tracker import compute_caller_quality_trend

    # Presumably (caller, total_feedback, thumbs_up, thumbs_down, avg_score)
    # per row — TODO confirm against the tracker's query.
    rows = [
        ('caller_positive', 12, 10, 0, 4.8),
        ('caller_negative', 8, 0, 6, 1.5),
        ('caller_neutral', 6, 3, 2, 3.2),
        ('caller_no_data', 2, 1, 0, 4.0),
    ]

    fake_session = MagicMock()
    fake_session.execute.return_value.fetchall.return_value = rows
    monkeypatch.setattr('database.manager.get_session', lambda: fake_session)

    trends = compute_caller_quality_trend(days=7)

    expected_labels = (
        ('caller_positive', 'positive'),
        ('caller_negative', 'negative'),
        ('caller_neutral', 'neutral'),
        ('caller_no_data', 'no_data'),  # n < 3
    )
    for caller, label in expected_labels:
        assert trends[caller]['trend'] == label