202 lines
8.0 KiB
Python
202 lines
8.0 KiB
Python
"""
Phase 12.1: Tool Calling 優化 - 行動解析測試
============================================
#60 OpenClaw 行動解析測試
#61 Tool Calling 準確度基線建立

測試 parse_operation_from_action() 的準確度
"""
|
||
|
||
import pytest
|
||
|
||
from src.services.executor import OperationType
|
||
from src.services.operation_parser import parse_operation_from_action
|
||
|
||
# =============================================================================
# 測試案例定義
# =============================================================================

# 格式: (action_string, expected_operation, expected_resource, expected_namespace)
|
||
|
||
# English-language cases. Each entry is
# (action_string, expected_operation, expected_resource, expected_namespace).
ENGLISH_TEST_CASES = [
    # Standard kubectl syntax
    (
        "kubectl delete pod nginx-frontend-7d4b8c9f5-xk2m3 -n production",
        OperationType.DELETE_POD,
        "nginx-frontend-7d4b8c9f5-xk2m3",
        "production",
    ),
    (
        "kubectl rollout restart deployment/api-backend -n default",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "default",
    ),
    (
        "kubectl rollout restart statefulset/postgres-primary -n awoooi-prod",
        OperationType.RESTART_STATEFULSET,
        "postgres-primary",
        "awoooi-prod",
    ),
    (
        "kubectl rollout restart daemonset/node-exporter -n monitoring",
        OperationType.RESTART_DAEMONSET,
        "node-exporter",
        "monitoring",
    ),
    (
        "kubectl delete pod awoooi-worker-0 -n default",
        OperationType.DELETE_POD,
        "awoooi-worker-0",
        "default",
    ),
    # Natural-language English (no explicit -n; default_namespace=awoooi-prod is used)
    (
        "restart deployment api-backend",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "Restart deployment: web-frontend",
        OperationType.RESTART_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    (
        "delete pod nginx-ingress-abc123",
        OperationType.DELETE_POD,
        "nginx-ingress-abc123",
        "awoooi-prod",
    ),
    (
        "scale deployment web-frontend to 5 replicas",
        OperationType.SCALE_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    (
        "Scale deployment api-backend -n staging",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "staging",
    ),
]
|
||
|
||
# Chinese-language cases. Each entry is
# (action_string, expected_operation, expected_resource, expected_namespace).
CHINESE_TEST_CASES = [
    # Standard Chinese phrasing (no explicit -n; default_namespace=awoooi-prod is used)
    (
        "重新啟動 api-backend 服務",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "重新啟動 awoooi-worker 服務",
        OperationType.RESTART_DEPLOYMENT,
        "awoooi-worker",
        "awoooi-prod",
    ),
    (
        "重新啟動 postgres-primary-0",
        OperationType.DELETE_POD,
        "postgres-primary-0",
        "awoooi-prod",
    ),  # StatefulSet Pod
    (
        "擴容 api-backend",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "擴展 web-frontend 副本數到 5",
        OperationType.SCALE_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    (
        "擴展 api-backend-deployment 副本數至 10",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),  # the "-deployment" suffix is expected to be stripped
    # Delete Pod
    (
        "刪除 Pod nginx-ingress-7d6f8c9b5-abc12",
        OperationType.DELETE_POD,
        "nginx-ingress-7d6f8c9b5-abc12",
        "awoooi-prod",
    ),
]
|
||
|
||
# Mixed-language cases. Each entry is
# (action_string, expected_operation, expected_resource, expected_namespace).
MIXED_TEST_CASES = [
    # Mixed Chinese and English
    (
        "kubectl delete pod api-backend-0 -n default",
        OperationType.DELETE_POD,
        "api-backend-0",
        "default",
    ),
    (
        "重新啟動 deployment api-backend",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    # Common format generated by OpenClaw
    (
        "建議行動: kubectl rollout restart deployment/awoooi-api -n default",
        OperationType.RESTART_DEPLOYMENT,
        "awoooi-api",
        "default",
    ),
]
|
||
|
||
# Inputs for which no operation and no resource are expected. Each entry is
# (action_string, expected_operation, expected_resource, expected_namespace).
EDGE_CASES = [
    # Cases that should fail to parse
    (
        "這是一段普通文字,沒有任何操作",
        None,
        None,
        "awoooi-prod",
    ),
    (
        "SELECT * FROM users",
        None,
        None,
        "awoooi-prod",
    ),
    (
        "",
        None,
        None,
        "awoooi-prod",
    ),
    # Boundary cases
    (
        "restart",
        None,
        None,
        "awoooi-prod",
    ),  # missing target
    (
        "delete",
        None,
        None,
        "awoooi-prod",
    ),  # missing target
]
|
||
|
||
|
||
# =============================================================================
# 測試函數
# =============================================================================
|
||
|
||
class TestEnglishActionParsing:
    """Parsing tests for English-language action strings."""

    @pytest.mark.parametrize(
        "action,expected_op,expected_resource,expected_ns",
        ENGLISH_TEST_CASES,
    )
    def test_english_actions(self, action, expected_op, expected_resource, expected_ns):
        """Check the parsed operation, resource and namespace for one case."""
        parsed_op, parsed_resource, parsed_ns = parse_operation_from_action(action)

        assert parsed_op == expected_op, (
            f"Operation mismatch for '{action}': got {parsed_op}, expected {expected_op}"
        )
        assert parsed_resource == expected_resource, (
            f"Resource mismatch for '{action}': got {parsed_resource}, expected {expected_resource}"
        )
        assert parsed_ns == expected_ns, (
            f"Namespace mismatch for '{action}': got {parsed_ns}, expected {expected_ns}"
        )
|
||
|
||
|
||
class TestChineseActionParsing:
    """Parsing tests for Chinese-language action strings."""

    @pytest.mark.parametrize(
        "action,expected_op,expected_resource,expected_ns",
        CHINESE_TEST_CASES,
    )
    def test_chinese_actions(self, action, expected_op, expected_resource, expected_ns):
        """Check the parsed operation, resource and namespace for one case."""
        parsed_op, parsed_resource, parsed_ns = parse_operation_from_action(action)

        assert parsed_op == expected_op, (
            f"Operation mismatch for '{action}': got {parsed_op}, expected {expected_op}"
        )
        assert parsed_resource == expected_resource, (
            f"Resource mismatch for '{action}': got {parsed_resource}, expected {expected_resource}"
        )
        assert parsed_ns == expected_ns, (
            f"Namespace mismatch for '{action}': got {parsed_ns}, expected {expected_ns}"
        )
|
||
|
||
|
||
class TestMixedActionParsing:
    """Parsing tests for action strings that mix Chinese and English."""

    @pytest.mark.parametrize(
        "action,expected_op,expected_resource,expected_ns",
        MIXED_TEST_CASES,
    )
    def test_mixed_actions(self, action, expected_op, expected_resource, expected_ns):
        """Check the parsed operation, resource and namespace for one case."""
        parsed_op, parsed_resource, parsed_ns = parse_operation_from_action(action)

        assert parsed_op == expected_op, (
            f"Operation mismatch for '{action}': got {parsed_op}, expected {expected_op}"
        )
        assert parsed_resource == expected_resource, (
            f"Resource mismatch for '{action}': got {parsed_resource}, expected {expected_resource}"
        )
        assert parsed_ns == expected_ns, (
            f"Namespace mismatch for '{action}': got {parsed_ns}, expected {expected_ns}"
        )
|
||
|
||
|
||
class TestEdgeCases:
    """Tests for edge-case inputs (unparseable or incomplete actions)."""

    @pytest.mark.parametrize(
        "action,expected_op,expected_resource,expected_ns",
        EDGE_CASES,
    )
    def test_edge_cases(self, action, expected_op, expected_resource, expected_ns):
        """Check the parsed operation and resource for one edge case.

        ``expected_ns`` is supplied by the parametrization but is not
        asserted; only the operation and resource are compared.
        """
        # NOTE(review): the namespace is unpacked but never checked -- confirm
        # whether that is intentional for inputs that fail to parse.
        parsed_op, parsed_resource, _parsed_ns = parse_operation_from_action(action)

        assert parsed_op == expected_op, (
            f"Operation mismatch for '{action}': got {parsed_op}, expected {expected_op}"
        )
        assert parsed_resource == expected_resource, (
            f"Resource mismatch for '{action}': got {parsed_resource}, expected {expected_resource}"
        )
|
||
|
||
|
||
# =============================================================================
# 準確度報告
# =============================================================================
|
||
|
||
def test_accuracy_report():
    """Print an accuracy baseline report and enforce the 70% threshold.

    Every case in ENGLISH_TEST_CASES, CHINESE_TEST_CASES and
    MIXED_TEST_CASES is passed through ``parse_operation_from_action``.
    A case counts as passed only when operation, resource and namespace
    all match. EDGE_CASES are not part of this report.
    """
    cases = ENGLISH_TEST_CASES + CHINESE_TEST_CASES + MIXED_TEST_CASES

    # Collect every mismatch; the pass/fail counts are derived from this list.
    failures = []
    for action, expected_op, expected_resource, expected_ns in cases:
        expected = (expected_op, expected_resource, expected_ns)
        op, resource, ns = parse_operation_from_action(action)
        got = (op, resource, ns)
        if got != expected:
            failures.append({
                "action": action,
                "expected": expected,
                "got": got,
            })

    total = len(cases)
    failed = len(failures)
    passed = total - failed
    # Guard against an empty case list to avoid dividing by zero.
    accuracy = (passed / total * 100) if total > 0 else 0

    rule = "=" * 60
    print("\n" + rule)
    print("Phase 12.1: 行動解析準確度基線報告")
    print(rule)
    print(f"總測試案例: {total}")
    print(f"通過: {passed}")
    print(f"失敗: {failed}")
    print(f"準確率: {accuracy:.1f}%")
    print(rule)

    if failures:
        print("\n失敗案例:")
        for failure in failures:
            print(f" - '{failure['action']}'")
            print(f" 期望: {failure['expected']}")
            print(f" 實際: {failure['got']}")

    # Current baseline threshold; to be raised after later improvements.
    assert accuracy >= 70, f"Accuracy {accuracy}% is below baseline 70%"
|
||
|
||
|
||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    pytest.main(args=[__file__, "-v", "--tb=short"])
|