From bd32e04dad102d563fb806b56d926cbd3b8b0f6e Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 4 May 2026 11:12:52 +0800 Subject: [PATCH] =?UTF-8?q?feat(p25):=20=E5=8F=8D=E9=A5=8B=E7=92=B0?= =?UTF-8?q?=E6=B7=B1=E5=8C=96=20=E2=80=94=20caller-level=20quality=20?= =?UTF-8?q?=E8=B6=A8=E5=8B=A2=E8=BF=BD=E8=B9=A4=20+=20ROI=20=E6=9C=88?= =?UTF-8?q?=E5=A0=B1=E6=95=B4=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operation Ollama-First v5.0 / Phase 25 — 反饋環自主學習深化 services/feedback_quality_tracker.py (180+ 行) - 純 SQL 統計,零 LLM 成本 - 4 個閾值常數(demote 👎×5/avg<2.5 / promote 👍×10/avg>=4.5) - compute_caller_quality_trend(days=7) — 取近 N 日各 caller 反饋 - get_caller_recommendations() — 給 token 日報/ROI 月報用 • 規則 1: 👎 ≥ 5 次 → review • 規則 2: avg < 2.5 + 樣本足 → review • 規則 3: 👍 ≥ 10 + avg ≥ 4.5 → promote(建議關閉 Gemini fallback) - should_demote_caller(caller) — 自動降權判斷(戰役預設不啟用) - render_quality_summary() — 給訊息用 emoji 摘要 ROI 月報整合(services/roi_report_service.py): - 加 Section 「💬 Caller 反饋趨勢(30 日)」TOP 10 by 最低 avg - 加 Section 「🔮 智能建議」最多 3 條(review / promote) - 失敗 swallow 不影響月報主流程 訊息範例: 💬 Caller 反饋趨勢(30 日) ⚠️ openclaw_qa: avg 1.85/5 (👍2 👎8 n=12) ➖ hermes_analyst: avg 3.10/5 (👍5 👎3 n=10) ✅ ppt_gemini: avg 4.75/5 (👍12 👎0 n=15) 🔮 智能建議 ⚠️ openclaw_qa: 近 30 日 👎 反饋 8 次 (avg 1.85/5) — 建議統帥檢視 prompt 或切換 model ✅ ppt_gemini: 近 30 日 👍 反饋 12 次 — 可考慮關閉 Gemini fallback 純走 Ollama tests/test_feedback_quality_tracker.py (10 tests 全綠) - 4 閾值常數 / DB fail 安全 / 空 trends 容錯 - demote 規則(👎 多次)/ promote 規則(👍 多次)/ neutral 不觸發 - should_demote_caller 樣本不足保護 - trend 分類(positive/negative/neutral/no_data)正確 依 ADR-032 RAG 自主學習迴圈 + ADR-033 護欄 #1 不直接改 caller 行為(避循環自動修正失控),只產出建議給統帥審視。 Co-Authored-By: Claude Opus 4.7 (1M context) --- services/feedback_quality_tracker.py | 205 +++++++++++++++++++++++++ services/roi_report_service.py | 22 +++ tests/test_feedback_quality_tracker.py | 166 ++++++++++++++++++++ 3 files changed, 393 insertions(+) create mode 100644 
services/feedback_quality_tracker.py create mode 100644 tests/test_feedback_quality_tracker.py diff --git a/services/feedback_quality_tracker.py b/services/feedback_quality_tracker.py new file mode 100644 index 0000000..2d33b90 --- /dev/null +++ b/services/feedback_quality_tracker.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +services/feedback_quality_tracker.py +Operation Ollama-First v5.0 / Phase 25 — 反饋環深化(caller-level quality 趨勢) + +設計原則: +- 純 SQL 統計,零 LLM 成本(不跑 LLM 評估反饋) +- 從 rag_query_log.feedback_score (1-5) + learning_episodes.promotion_status + 推算每個 caller 的近 7/30 日反饋趨勢 +- 持續 👎 ≥ N 次 → 標記 caller 為「需檢視」(給 token 日報 + ROI 月報用) +- 持續 👍 → 提升 caller 信任分 +- 不直接改 caller 行為(避免循環自動修正失控);只產出「建議」給統帥 + +Public API: +- compute_caller_quality_trend() — 取近 N 日各 caller 反饋摘要 +- get_caller_recommendations() — 給 token 日報 Section 6 用 +- should_demote_caller(caller) — 判斷是否該降權(持續 👎) +""" + +from __future__ import annotations +import logging +from datetime import datetime, timedelta +from typing import Dict, Any, List, Optional + +logger = logging.getLogger(__name__) + + +# ───────────────────────────────────────────────────────────────────────────── +# 反饋規則閾值(純 Python 常數,可調) +# ───────────────────────────────────────────────────────────────────────────── +DEMOTE_THUMBS_DOWN_THRESHOLD = 5 # 7 日內 👎 (score=1) ≥ 5 次 → 建議檢視 +DEMOTE_AVG_SCORE_THRESHOLD = 2.5 # 7 日 avg score < 2.5 → 建議檢視 +PROMOTE_THUMBS_UP_THRESHOLD = 10 # 7 日內 👍 (score=5) ≥ 10 次 → 信任分提升 +PROMOTE_AVG_SCORE_THRESHOLD = 4.5 # 7 日 avg score >= 4.5 → 信任分提升 +TREND_DAYS = 7 # 趨勢預設窗格 + + +# ───────────────────────────────────────────────────────────────────────────── +# Public API +# ───────────────────────────────────────────────────────────────────────────── +def compute_caller_quality_trend( + days: int = TREND_DAYS, +) -> Dict[str, Dict[str, Any]]: + """取近 N 日各 caller 的反饋趨勢摘要。 + + Returns: + { + 'caller_name': { + 'total_feedback': N, + 'thumbs_up': X, # score = 5 + 'thumbs_down': Y, # score 
= 1 + 'avg_score': float, + 'trend': 'positive' | 'neutral' | 'negative' | 'no_data', + } + } + """ + try: + from sqlalchemy import text as sa_text + from database.manager import get_session + except Exception as exc: + logger.warning('[FeedbackTracker] DB import failed: %s', exc) + return {} + + since = datetime.now() - timedelta(days=days) + session = get_session() + try: + rows = session.execute( + sa_text(""" + SELECT + caller, + COUNT(*) FILTER (WHERE feedback_score IS NOT NULL) AS total_feedback, + COUNT(*) FILTER (WHERE feedback_score = 5) AS thumbs_up, + COUNT(*) FILTER (WHERE feedback_score = 1) AS thumbs_down, + AVG(feedback_score) FILTER (WHERE feedback_score IS NOT NULL) AS avg_score + FROM rag_query_log + WHERE queried_at >= :since + GROUP BY caller + HAVING COUNT(*) FILTER (WHERE feedback_score IS NOT NULL) > 0 + """), + {'since': since}, + ).fetchall() + except Exception as exc: + logger.warning('[FeedbackTracker] SQL failed (rag_query_log 可能未建): %s', exc) + session.close() + return {} + finally: + session.close() + + result: Dict[str, Dict[str, Any]] = {} + for r in rows: + caller = r[0] + total = int(r[1] or 0) + thumbs_up = int(r[2] or 0) + thumbs_down = int(r[3] or 0) + avg_score = float(r[4] or 0) + + # 推估趨勢 + if total < 3: + trend = 'no_data' + elif avg_score >= PROMOTE_AVG_SCORE_THRESHOLD: + trend = 'positive' + elif avg_score < DEMOTE_AVG_SCORE_THRESHOLD: + trend = 'negative' + else: + trend = 'neutral' + + result[caller] = { + 'total_feedback': total, + 'thumbs_up': thumbs_up, + 'thumbs_down': thumbs_down, + 'avg_score': round(avg_score, 2), + 'trend': trend, + } + return result + + +def get_caller_recommendations(days: int = TREND_DAYS) -> List[Dict[str, Any]]: + """給 token 日報 / ROI 月報 Section 6 用。 + + Returns: + [ + {'caller': '...', 'action': 'review' | 'promote', 'reason': '...'}, + ... 
+ ] + """ + trends = compute_caller_quality_trend(days=days) + recommendations = [] + + for caller, info in trends.items(): + # 規則 1:連續 👎 ≥ 5 → 建議檢視 + if info['thumbs_down'] >= DEMOTE_THUMBS_DOWN_THRESHOLD: + recommendations.append({ + 'caller': caller, + 'action': 'review', + 'reason': ( + f'近 {days} 日 👎 反饋 {info["thumbs_down"]} 次 ' + f'(avg {info["avg_score"]}/5) — 建議統帥檢視 prompt 或切換 model' + ), + }) + continue + + # 規則 2:avg < 2.5 + 樣本足 → 建議檢視 + if info['trend'] == 'negative' and info['total_feedback'] >= 5: + recommendations.append({ + 'caller': caller, + 'action': 'review', + 'reason': f'近 {days} 日 avg={info["avg_score"]}/5 過低 (n={info["total_feedback"]})', + }) + continue + + # 規則 3:👍 ≥ 10 + avg ≥ 4.5 → 建議升權 + if (info['thumbs_up'] >= PROMOTE_THUMBS_UP_THRESHOLD + and info['trend'] == 'positive'): + recommendations.append({ + 'caller': caller, + 'action': 'promote', + 'reason': ( + f'近 {days} 日 👍 反饋 {info["thumbs_up"]} 次 ' + f'(avg {info["avg_score"]}/5) — 可考慮關閉 Gemini fallback 純走 Ollama' + ), + }) + + return recommendations + + +def should_demote_caller(caller: str, days: int = TREND_DAYS) -> bool: + """判斷某 caller 是否應該降權(持續 👎)。 + + 用於 ai_call_logger 啟動 / RAG worker 動態調整 — 但戰役預設不啟用。 + """ + trends = compute_caller_quality_trend(days=days) + info = trends.get(caller) + if not info or info['total_feedback'] < 5: + return False + return (info['thumbs_down'] >= DEMOTE_THUMBS_DOWN_THRESHOLD + or info['avg_score'] < DEMOTE_AVG_SCORE_THRESHOLD) + + +def render_quality_summary(trends: Dict[str, Dict[str, Any]]) -> str: + """渲染反饋趨勢摘要(給 token 日報 / ROI 月報用)""" + if not trends: + return '(無反饋資料)' + + lines = [] + sorted_callers = sorted(trends.items(), key=lambda kv: kv[1]['avg_score']) + for caller, info in sorted_callers[:10]: # TOP 10 by lowest avg + emoji = {'positive': '✅', 'negative': '⚠️', 'neutral': '➖', 'no_data': '❓'}[info['trend']] + lines.append( + f" {emoji} {caller}: avg {info['avg_score']:.2f}/5 " + f"(👍{info['thumbs_up']} 👎{info['thumbs_down']} 
n={info['total_feedback']})" + ) + return '\n'.join(lines) + + +__all__ = [ + 'compute_caller_quality_trend', + 'get_caller_recommendations', + 'should_demote_caller', + 'render_quality_summary', + 'DEMOTE_THUMBS_DOWN_THRESHOLD', + 'DEMOTE_AVG_SCORE_THRESHOLD', + 'PROMOTE_THUMBS_UP_THRESHOLD', + 'PROMOTE_AVG_SCORE_THRESHOLD', +] diff --git a/services/roi_report_service.py b/services/roi_report_service.py index 63be39a..b52c88a 100644 --- a/services/roi_report_service.py +++ b/services/roi_report_service.py @@ -155,6 +155,26 @@ def render_roi_report(stats: Dict[str, Any]) -> str: if BASELINE['gemini_monthly_tokens'] else 0 ) + # Phase 25 整合:caller-level feedback 趨勢 + feedback_summary = '' + recommendations_block = '' + try: + from services.feedback_quality_tracker import ( + compute_caller_quality_trend, get_caller_recommendations, + render_quality_summary, + ) + trends = compute_caller_quality_trend(days=30) # 月報用 30 日窗格 + if trends: + feedback_summary = '\n💬 Caller 反饋趨勢(30 日)\n' + render_quality_summary(trends) + recs = get_caller_recommendations(days=30) + if recs: + recommendations_block = '\n🔮 智能建議\n' + for r in recs[:3]: # 最多 3 條 + action_emoji = '⚠️' if r['action'] == 'review' else '✅' + recommendations_block += f" {action_emoji} {r['caller']}: {r['reason']}\n" + except Exception: + pass # 反饋查詢失敗不影響月報主流程 + return ( f"📊 ROI 月報 {period}\n" f"━━━━━━━━━━━━━━━━━━━━\n" @@ -177,6 +197,8 @@ def render_roi_report(stats: Dict[str, Any]) -> str: f"🔧 MCP + Cache\n" f" MCP 呼叫: {stats['mcp_total']:,}\n" f" Cache 命中: {stats['cache_hit_calls']:,} ai_calls + {stats['mcp_cache_hits']:,} mcp_calls\n" + f"{feedback_summary}" + f"{recommendations_block}" f"\n" f"📈 戰役 v5.0 KPI\n" f" Gemini -23.5% 目標:{'✅ 達標' if saved_pct >= 23 else f'⚠️ {saved_pct:.1f}%'}\n" diff --git a/tests/test_feedback_quality_tracker.py b/tests/test_feedback_quality_tracker.py new file mode 100644 index 0000000..e5838a2 --- /dev/null +++ b/tests/test_feedback_quality_tracker.py @@ -0,0 +1,166 @@ +""" 
"""
tests/test_feedback_quality_tracker.py
─────────────────────────────────────────────────────────────────
Operation Ollama-First v5.0 / Phase 25 — feedback loop verification
"""

from datetime import datetime, timedelta
from unittest.mock import MagicMock

import pytest


def _tracker():
    """Import the module under test lazily, at test run time."""
    import services.feedback_quality_tracker as fqt
    return fqt


def _entry(total, up, down, avg, trend):
    """Build one per-caller trend record in the shape the tracker produces."""
    return {
        'total_feedback': total, 'thumbs_up': up, 'thumbs_down': down,
        'avg_score': avg, 'trend': trend,
    }


def _stub_trends(monkeypatch, snapshot):
    """Replace compute_caller_quality_trend with a stub returning ``snapshot``."""
    fqt = _tracker()
    monkeypatch.setattr(fqt, 'compute_caller_quality_trend', lambda days: snapshot)
    return fqt


def test_constants_defined():
    """The four threshold constants must exist with their agreed values."""
    fqt = _tracker()
    expected = {
        'DEMOTE_THUMBS_DOWN_THRESHOLD': 5,
        'DEMOTE_AVG_SCORE_THRESHOLD': 2.5,
        'PROMOTE_THUMBS_UP_THRESHOLD': 10,
        'PROMOTE_AVG_SCORE_THRESHOLD': 4.5,
    }
    for name, value in expected.items():
        assert getattr(fqt, name) == value


def test_compute_trend_db_fail_returns_empty(monkeypatch):
    """A failing query must yield {} instead of raising."""
    broken_session = MagicMock()
    broken_session.execute.side_effect = RuntimeError('rag_query_log not exist')
    monkeypatch.setattr('database.manager.get_session', lambda: broken_session)

    assert _tracker().compute_caller_quality_trend(days=7) == {}


def test_render_summary_empty():
    """An empty snapshot renders the placeholder text."""
    rendered = _tracker().render_quality_summary({})
    assert '無反饋資料' in rendered


def test_render_summary_with_trends():
    """Callers are listed lowest average first, each with its trend emoji."""
    snapshot = {
        'openclaw_qa': _entry(20, 15, 2, 4.2, 'neutral'),
        'hermes_analyst': _entry(8, 1, 6, 1.8, 'negative'),
    }
    out = _tracker().render_quality_summary(snapshot)

    for caller in snapshot:
        assert caller in out
    # The negative caller comes first (ascending avg_score).
    assert out.index('hermes_analyst') < out.index('openclaw_qa')
    assert '⚠️' in out  # negative emoji
    assert '➖' in out  # neutral emoji


def test_get_recommendations_demote_on_thumbs_down(monkeypatch):
    """Thumbs-down >= 5 produces a review recommendation."""
    fqt = _stub_trends(monkeypatch, {
        'bad_caller': _entry(8, 1, 6, 1.8, 'negative'),
    })

    recs = fqt.get_caller_recommendations(days=7)
    assert len(recs) == 1
    only = recs[0]
    assert only['caller'] == 'bad_caller'
    assert only['action'] == 'review'
    assert '6' in only['reason']  # six thumbs-down


def test_get_recommendations_promote_on_thumbs_up(monkeypatch):
    """Thumbs-up >= 10 with avg >= 4.5 produces a promote recommendation."""
    fqt = _stub_trends(monkeypatch, {
        'great_caller': _entry(15, 12, 0, 4.8, 'positive'),
    })

    recs = fqt.get_caller_recommendations(days=7)
    assert len(recs) == 1
    assert recs[0]['action'] == 'promote'
    assert '可考慮關閉 Gemini fallback' in recs[0]['reason']


def test_get_recommendations_neutral_no_action(monkeypatch):
    """A middling caller must not trigger any recommendation."""
    fqt = _stub_trends(monkeypatch, {
        'avg_caller': _entry(5, 2, 1, 3.5, 'neutral'),
    })

    assert fqt.get_caller_recommendations(days=7) == []


def test_should_demote_caller_with_low_avg(monkeypatch):
    """A well-sampled caller with a low average is demoted; unknown callers are not."""
    fqt = _stub_trends(monkeypatch, {
        'troubled_caller': _entry(10, 0, 8, 1.5, 'negative'),
    })

    assert fqt.should_demote_caller('troubled_caller', days=7) is True
    assert fqt.should_demote_caller('not_in_trends', days=7) is False


def test_should_demote_caller_insufficient_feedback(monkeypatch):
    """Fewer than 5 samples must not demote (avoids verdicts on a few negatives)."""
    fqt = _stub_trends(monkeypatch, {
        'new_caller': _entry(3, 0, 2, 1.5, 'negative'),
    })

    assert fqt.should_demote_caller('new_caller', days=7) is False


def test_compute_trend_classifies_correctly(monkeypatch):
    """Feed simulated SQL rows and check the trend label of each caller."""
    sql_rows = [
        ('caller_positive', 12, 10, 0, 4.8),
        ('caller_negative', 8, 0, 6, 1.5),
        ('caller_neutral', 6, 3, 2, 3.2),
        ('caller_no_data', 2, 1, 0, 4.0),
    ]
    fake_session = MagicMock()
    fake_session.execute.return_value.fetchall.return_value = sql_rows
    monkeypatch.setattr('database.manager.get_session', lambda: fake_session)

    trends = _tracker().compute_caller_quality_trend(days=7)
    expected_labels = {
        'caller_positive': 'positive',
        'caller_negative': 'negative',
        'caller_neutral': 'neutral',
        'caller_no_data': 'no_data',  # n < 3
    }
    for caller, label in expected_labels.items():
        assert trends[caller]['trend'] == label