問題: LLM 測試因模型波動導致 CI 失敗 解決方案: 三層測試策略 - Tier 1 (CI): Schema 驗證 + Golden Responses - Tier 2 (Nightly): 屬性測試 + Live LLM - Tier 3 (Weekly): 語意相似度測試 新增檔案: - ADR-018-llm-testing-strategy.md - tests/llm_testing/ 框架 - schema_validators.py: Pydantic Schema 驗證 - property_validators.py: kubectl/風險等級驗證 - golden_responses.py: 預錄回應管理 - tests/test_llm_tier1_schema.py: 35 個 Tier 1 測試 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
155 lines
3.7 KiB
Python
155 lines
3.7 KiB
Python
"""
|
|
Schema Validators - Tier 1 tests
================================
Validate that LLM output conforms to the predefined schemas.

Version: v1.0
Created: 2026-03-26 (Taipei time zone)
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from typing import Any, Literal
|
|
|
|
from pydantic import BaseModel, Field, ValidationError
|
|
|
|
|
|
class LLMProposalOutput(BaseModel):
    """Schema for a proposal emitted by the LLM.

    OpenClaw must output JSON in this format.
    """

    # Risk tier; only these four upper-case labels are accepted.
    risk_level: Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"] = Field(description="風險等級")
    # Optional kubectl command; None when the key is omitted.
    kubectl_command: str | None = Field(default=None, description="kubectl 命令 (可選)")
    # Required description of the proposed action.
    action_description: str = Field(description="行動描述")
    # Required reasoning text.
    reasoning: str = Field(description="推理過程")
    # Confidence, constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(description="信心度 0-1", ge=0.0, le=1.0)
|
|
|
|
|
|
class LLMAnalysisOutput(BaseModel):
    """Schema for an analysis emitted by the LLM.

    Used for alert-analysis responses.
    """

    # Required root-cause text.
    root_cause: str = Field(description="根因分析")
    # Severity label; only P0 through P3 are accepted.
    severity: Literal["P0", "P1", "P2", "P3"] = Field(description="嚴重度")
    # Both list fields fall back to a fresh empty list when the key is absent.
    affected_services: list[str] = Field(description="受影響服務", default_factory=list)
    recommended_actions: list[str] = Field(description="建議行動", default_factory=list)
|
|
|
|
|
|
def validate_proposal_schema(response: str) -> tuple[bool, str, LLMProposalOutput | None]:
    """Check whether an LLM response conforms to the proposal schema.

    Args:
        response: Raw LLM response text.

    Returns:
        A ``(is_valid, error_message, parsed_output)`` tuple. On success the
        message is an empty string and ``parsed_output`` is the validated
        model; on any failure ``parsed_output`` is ``None`` and the message
        names the stage that failed (extraction, JSON parsing, or schema
        validation).
    """
    # Stage 1: pull the JSON text out of the raw response.
    raw_json = extract_json_from_response(response)
    if not raw_json:
        return False, "無法從回應中提取 JSON", None

    # Stage 2: decode the JSON text.
    try:
        payload = json.loads(raw_json)
    except json.JSONDecodeError as exc:
        return False, f"JSON 解析失敗: {exc}", None

    # Stage 3: validate the decoded payload against the schema.
    try:
        proposal = LLMProposalOutput.model_validate(payload)
    except ValidationError as exc:
        # One "<loc>: <msg>" entry per reported error, joined with "; ".
        details = "; ".join(f"{issue['loc']}: {issue['msg']}" for issue in exc.errors())
        return False, f"Schema 驗證失敗: {details}", None

    return True, "", proposal
|
|
|
|
|
|
def validate_analysis_schema(response: str) -> tuple[bool, str, LLMAnalysisOutput | None]:
    """Check whether an LLM response conforms to the analysis schema.

    Args:
        response: Raw LLM response text.

    Returns:
        A ``(is_valid, error_message, parsed_output)`` tuple. On success the
        message is an empty string and ``parsed_output`` is the validated
        model; on any failure ``parsed_output`` is ``None`` and the message
        names the stage that failed (extraction, JSON parsing, or schema
        validation).
    """
    # Stage 1: pull the JSON text out of the raw response.
    raw_json = extract_json_from_response(response)
    if not raw_json:
        return False, "無法從回應中提取 JSON", None

    # Stage 2: decode the JSON text.
    try:
        payload = json.loads(raw_json)
    except json.JSONDecodeError as exc:
        return False, f"JSON 解析失敗: {exc}", None

    # Stage 3: validate the decoded payload against the schema.
    try:
        analysis = LLMAnalysisOutput.model_validate(payload)
    except ValidationError as exc:
        # One "<loc>: <msg>" entry per reported error, joined with "; ".
        details = "; ".join(f"{issue['loc']}: {issue['msg']}" for issue in exc.errors())
        return False, f"Schema 驗證失敗: {details}", None

    return True, "", analysis
|
|
|
|
|
|
def extract_json_from_response(response: str) -> str | None:
|
|
"""
|
|
從 LLM 回應中提取 JSON
|
|
|
|
支援:
|
|
- 純 JSON
|
|
- ```json ... ``` 包裹
|
|
- 混合文字中的 JSON
|
|
"""
|
|
if not response:
|
|
return None
|
|
|
|
response = response.strip()
|
|
|
|
# Case 1: 純 JSON
|
|
if response.startswith("{") and response.endswith("}"):
|
|
return response
|
|
|
|
# Case 2: ```json 包裹
|
|
json_block_pattern = r"```(?:json)?\s*([\s\S]*?)```"
|
|
match = re.search(json_block_pattern, response)
|
|
if match:
|
|
return match.group(1).strip()
|
|
|
|
# Case 3: 尋找 { ... } 區塊
|
|
brace_pattern = r"\{[\s\S]*\}"
|
|
match = re.search(brace_pattern, response)
|
|
if match:
|
|
return match.group(0)
|
|
|
|
return None
|