"""
Phase 12.1: Tool Calling optimization - action parsing tests
============================================================
#60 OpenClaw action parsing tests
#61 Tool Calling accuracy baseline

Exercises parse_operation_from_action() against English, Chinese and mixed
action strings and reports the overall parsing accuracy.
"""

import pytest

from src.services.executor import OperationType
from src.services.operation_parser import parse_operation_from_action

# =============================================================================
# Test case definitions
# =============================================================================

# Row format:
#   (action_string, expected_operation, expected_resource, expected_namespace)

# Shared argnames string for every parametrized test below.
PARAM_NAMES = "action,expected_op,expected_resource,expected_ns"

# Minimum accuracy (in percent) required by test_accuracy_report.
# This is the current baseline; raise it as the parser improves.
ACCURACY_BASELINE_PERCENT = 70

ENGLISH_TEST_CASES = [
    # Standard kubectl syntax
    (
        "kubectl delete pod nginx-frontend-7d4b8c9f5-xk2m3 -n production",
        OperationType.DELETE_POD,
        "nginx-frontend-7d4b8c9f5-xk2m3",
        "production",
    ),
    (
        "kubectl rollout restart deployment/api-backend -n default",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "default",
    ),
    (
        "kubectl rollout restart statefulset/postgres-primary -n awoooi-prod",
        OperationType.RESTART_STATEFULSET,
        "postgres-primary",
        "awoooi-prod",
    ),
    (
        "kubectl rollout restart daemonset/node-exporter -n monitoring",
        OperationType.RESTART_DAEMONSET,
        "node-exporter",
        "monitoring",
    ),
    (
        "kubectl delete pod awoooi-worker-0 -n default",
        OperationType.DELETE_POD,
        "awoooi-worker-0",
        "default",
    ),
    # Natural-language English (no explicit -n, so the expected namespace is
    # default_namespace=awoooi-prod)
    (
        "restart deployment api-backend",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "Restart deployment: web-frontend",
        OperationType.RESTART_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    (
        "delete pod nginx-ingress-abc123",
        OperationType.DELETE_POD,
        "nginx-ingress-abc123",
        "awoooi-prod",
    ),
    (
        "scale deployment web-frontend to 5 replicas",
        OperationType.SCALE_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    (
        "Scale deployment api-backend -n staging",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "staging",
    ),
]

CHINESE_TEST_CASES = [
    # Standard Chinese phrasing (no explicit -n, so the expected namespace is
    # default_namespace=awoooi-prod)
    (
        "重新啟動 api-backend 服務",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "重新啟動 awoooi-worker 服務",
        OperationType.RESTART_DEPLOYMENT,
        "awoooi-worker",
        "awoooi-prod",
    ),
    # StatefulSet pod: a restart request is expected to map to DELETE_POD
    (
        "重新啟動 postgres-primary-0",
        OperationType.DELETE_POD,
        "postgres-primary-0",
        "awoooi-prod",
    ),
    (
        "擴容 api-backend",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    (
        "擴展 web-frontend 副本數到 5",
        OperationType.SCALE_DEPLOYMENT,
        "web-frontend",
        "awoooi-prod",
    ),
    # The "-deployment" suffix is expected to be stripped from the resource
    (
        "擴展 api-backend-deployment 副本數至 10",
        OperationType.SCALE_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    # Delete pod
    (
        "刪除 Pod nginx-ingress-7d6f8c9b5-abc12",
        OperationType.DELETE_POD,
        "nginx-ingress-7d6f8c9b5-abc12",
        "awoooi-prod",
    ),
]

MIXED_TEST_CASES = [
    # Mixed Chinese / English
    (
        "kubectl delete pod api-backend-0 -n default",
        OperationType.DELETE_POD,
        "api-backend-0",
        "default",
    ),
    (
        "重新啟動 deployment api-backend",
        OperationType.RESTART_DEPLOYMENT,
        "api-backend",
        "awoooi-prod",
    ),
    # Common format produced by OpenClaw
    (
        "建議行動: kubectl rollout restart deployment/awoooi-api -n default",
        OperationType.RESTART_DEPLOYMENT,
        "awoooi-api",
        "default",
    ),
]

EDGE_CASES = [
    # Inputs that should fail to parse
    ("這是一段普通文字,沒有任何操作", None, None, "awoooi-prod"),
    ("SELECT * FROM users", None, None, "awoooi-prod"),
    ("", None, None, "awoooi-prod"),
    # Boundary cases: a verb with no target
    ("restart", None, None, "awoooi-prod"),
    ("delete", None, None, "awoooi-prod"),
]


# =============================================================================
# Test functions
# =============================================================================

def _assert_parsed(action, expected_op, expected_resource, expected_ns,
                   *, check_namespace=True):
    """Parse *action* and assert that the result matches the expectation.

    Args:
        action: Action string handed to parse_operation_from_action().
        expected_op: Expected operation (an OperationType member or None).
        expected_resource: Expected resource name, or None.
        expected_ns: Expected namespace.
        check_namespace: When False, the parsed namespace is not compared
            against *expected_ns*.

    Raises:
        AssertionError: If any compared field differs from its expectation.
    """
    op, resource, ns = parse_operation_from_action(action)
    assert op == expected_op, f"Operation mismatch for '{action}': got {op}, expected {expected_op}"
    assert resource == expected_resource, f"Resource mismatch for '{action}': got {resource}, expected {expected_resource}"
    if check_namespace:
        assert ns == expected_ns, f"Namespace mismatch for '{action}': got {ns}, expected {expected_ns}"


class TestEnglishActionParsing:
    """Parsing of English action strings."""

    @pytest.mark.parametrize(PARAM_NAMES, ENGLISH_TEST_CASES)
    def test_english_actions(self, action, expected_op, expected_resource, expected_ns):
        _assert_parsed(action, expected_op, expected_resource, expected_ns)


class TestChineseActionParsing:
    """Parsing of Chinese action strings."""

    @pytest.mark.parametrize(PARAM_NAMES, CHINESE_TEST_CASES)
    def test_chinese_actions(self, action, expected_op, expected_resource, expected_ns):
        _assert_parsed(action, expected_op, expected_resource, expected_ns)


class TestMixedActionParsing:
    """Parsing of mixed-language action strings."""

    @pytest.mark.parametrize(PARAM_NAMES, MIXED_TEST_CASES)
    def test_mixed_actions(self, action, expected_op, expected_resource, expected_ns):
        _assert_parsed(action, expected_op, expected_resource, expected_ns)


class TestEdgeCases:
    """Edge cases: unparseable or incomplete action strings."""

    @pytest.mark.parametrize(PARAM_NAMES, EDGE_CASES)
    def test_edge_cases(self, action, expected_op, expected_resource, expected_ns):
        # Only the operation and resource are compared here; the namespace is
        # deliberately left unchecked, as in the original test.
        # NOTE(review): EDGE_CASES carries an expected namespace that is never
        # asserted - confirm what the parser returns on failure before
        # enabling the namespace check.
        _assert_parsed(
            action, expected_op, expected_resource, expected_ns,
            check_namespace=False,
        )


# =============================================================================
# Accuracy report
# =============================================================================

def test_accuracy_report():
    """Print an accuracy report and enforce the baseline accuracy.

    Runs every English, Chinese and mixed case (edge cases are excluded)
    through parse_operation_from_action(), prints a summary plus the details
    of each mismatch, and fails when the pass rate is below
    ACCURACY_BASELINE_PERCENT.
    """
    all_cases = ENGLISH_TEST_CASES + CHINESE_TEST_CASES + MIXED_TEST_CASES

    failures = []
    for action, expected_op, expected_resource, expected_ns in all_cases:
        expected = (expected_op, expected_resource, expected_ns)
        got = tuple(parse_operation_from_action(action))
        if got != expected:
            failures.append({"action": action, "expected": expected, "got": got})

    total = len(all_cases)
    failed = len(failures)
    passed = total - failed
    # Guard against an empty case list to avoid dividing by zero.
    accuracy = (passed / total * 100) if total > 0 else 0

    print("\n" + "=" * 60)
    print("Phase 12.1: 行動解析準確度基線報告")
    print("=" * 60)
    print(f"總測試案例: {total}")
    print(f"通過: {passed}")
    print(f"失敗: {failed}")
    print(f"準確率: {accuracy:.1f}%")
    print("=" * 60)

    if failures:
        print("\n失敗案例:")
        for failure in failures:
            print(f" - '{failure['action']}'")
            print(f" 期望: {failure['expected']}")
            print(f" 實際: {failure['got']}")

    # Enforce the current baseline (to be raised after later improvements).
    assert accuracy >= ACCURACY_BASELINE_PERCENT, f"Accuracy {accuracy}% is below baseline {ACCURACY_BASELINE_PERCENT}%"


if __name__ == "__main__":
    # Propagate pytest's exit code so a failing run is visible to the shell.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))