問題: LLM 測試因模型波動導致 CI 失敗 解決方案: 三層測試策略 - Tier 1 (CI): Schema 驗證 + Golden Responses - Tier 2 (Nightly): 屬性測試 + Live LLM - Tier 3 (Weekly): 語意相似度測試 新增檔案: - ADR-018-llm-testing-strategy.md - tests/llm_testing/ 框架 - schema_validators.py: Pydantic Schema 驗證 - property_validators.py: kubectl/風險等級驗證 - golden_responses.py: 預錄回應管理 - tests/test_llm_tier1_schema.py: 35 個 Tier 1 測試 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
273 lines
6.5 KiB
Python
273 lines
6.5 KiB
Python
"""
Property Validators - Tier 2 測試
==================================
驗證 LLM 輸出的不變量屬性

版本: v1.0
建立: 2026-03-26 (台北時區)
"""
|
||
|
||
import re
|
||
import shlex
|
||
from dataclasses import dataclass
|
||
|
||
|
||
@dataclass
|
||
class ValidationResult:
|
||
"""驗證結果"""
|
||
is_valid: bool
|
||
message: str
|
||
details: dict | None = None
|
||
|
||
|
||
# Sub-commands this validator accepts as the kubectl verb.
_KUBECTL_VALID_VERBS = frozenset({
    "get", "describe", "create", "apply", "delete",
    "edit", "scale", "rollout", "logs", "exec",
    "port-forward", "top", "patch", "label", "annotate",
    "cordon", "uncordon", "drain", "taint",
})

# Global kubectl flags whose value is the NEXT token when written without "="
# (e.g. "-n prod"). Needed so the value is not mistaken for the verb.
_KUBECTL_VALUE_FLAGS = frozenset({
    "-n", "--namespace", "--context", "--cluster", "--user",
    "--kubeconfig", "-s", "--server", "--token",
    "--as", "--as-group", "--as-uid", "--request-timeout",
    "--cache-dir", "--certificate-authority", "--client-certificate",
    "--client-key", "--tls-server-name", "-v", "--v",
})


def _find_kubectl_verb(tokens: list[str]) -> str | None:
    """Return the first non-flag token after ``kubectl``, or None if absent."""
    position = 1
    while position < len(tokens):
        token = tokens[position]
        if not token.startswith("-"):
            return token
        # "--flag=value" is a single token; "--flag value" occupies two.
        position += 2 if token in _KUBECTL_VALUE_FLAGS else 1
    return None


def validate_kubectl_syntax(command: str) -> ValidationResult:
    """
    Check that a string is a syntactically plausible kubectl command.

    Checks, in order:
    1. The command can be tokenised with shell quoting rules (``shlex``).
    2. The first token is exactly ``kubectl``.
    3. The first non-flag token after ``kubectl`` is an accepted verb.
       Global flags placed before the verb (``kubectl -n prod get pods``,
       ``kubectl --context=dev get pods``) are skipped.

    Args:
        command: The kubectl command string; a leading "$ " prompt is ignored.

    Returns:
        ValidationResult. On success ``details`` holds ``verb`` and
        ``tokens``; on an unknown verb it holds ``valid_verbs`` as a sorted
        list so the output is stable between runs.
    """
    if not command:
        return ValidationResult(False, "命令為空")

    # Drop surrounding whitespace, then an optional "$ " shell prompt.
    command = command.strip().removeprefix("$ ")

    # shlex raises ValueError on unbalanced quotes / dangling escapes.
    try:
        tokens = shlex.split(command)
    except ValueError as e:
        return ValidationResult(False, f"命令無法解析: {e}")

    if not tokens:
        return ValidationResult(False, "解析後無 token")

    if tokens[0] != "kubectl":
        return ValidationResult(
            False,
            f"第一個 token 應為 kubectl,實際為 {tokens[0]}"
        )

    if len(tokens) < 2:
        return ValidationResult(False, "缺少 kubectl 動詞")

    verb = _find_kubectl_verb(tokens)
    if verb is None:
        # Only flags followed "kubectl" (e.g. "kubectl --help").
        return ValidationResult(False, "缺少 kubectl 動詞")

    if verb not in _KUBECTL_VALID_VERBS:
        return ValidationResult(
            False,
            f"無效的 kubectl 動詞: {verb}",
            # Sorted: iterating a set directly gives a hash-dependent order.
            details={"valid_verbs": sorted(_KUBECTL_VALID_VERBS)}
        )

    return ValidationResult(
        True,
        "kubectl 語法有效",
        details={"verb": verb, "tokens": tokens}
    )
|
||
|
||
|
||
def validate_risk_level(risk_level: str) -> ValidationResult:
    """
    Check whether a string names a recognised risk level.

    The value is accepted when, after trimming and upper-casing, it equals
    one of LOW / MEDIUM / HIGH / CRITICAL, or when it contains one of a small
    set of Chinese keywords that map onto those levels.

    Args:
        risk_level: The risk level string to check.

    Returns:
        ValidationResult. On success ``details`` holds ``original`` and
        ``normalized``; on failure it holds ``valid_levels`` in ascending
        severity order.
    """
    # A tuple (not a set) so the order reported in ``details`` is stable.
    valid_levels = ("LOW", "MEDIUM", "HIGH", "CRITICAL")

    if not risk_level:
        return ValidationResult(False, "風險等級為空")

    normalized = risk_level.strip().upper()

    if normalized in valid_levels:
        return ValidationResult(
            True,
            f"有效風險等級: {normalized}",
            details={"original": risk_level, "normalized": normalized}
        )

    # Fuzzy match on Chinese keywords. Entries are tried in insertion order,
    # so the first keyword found in the input decides the level.
    fuzzy_mappings = {
        "低": "LOW",
        "中": "MEDIUM",
        "高": "HIGH",
        "危": "CRITICAL",
        "嚴重": "CRITICAL",
    }

    for key, value in fuzzy_mappings.items():
        if key in risk_level:
            return ValidationResult(
                True,
                f"模糊匹配: {risk_level} -> {value}",
                details={"original": risk_level, "normalized": value}
            )

    return ValidationResult(
        False,
        f"無效風險等級: {risk_level}",
        details={"valid_levels": list(valid_levels)}
    )
|
||
|
||
|
||
def validate_chinese_ratio(text: str, min_ratio: float = 0.3) -> ValidationResult:
    """
    Check that enough of the text consists of Chinese characters.

    The ratio is (characters in U+4E00..U+9FFF) / (non-whitespace characters).
    The character range covers CJK ideographs in general; this check cannot
    tell Traditional from Simplified Chinese.

    Args:
        text: The text to check.
        min_ratio: Minimum required ratio (default 0.3, i.e. 30%).

    Returns:
        ValidationResult whose ``details`` holds ``ratio`` and
        ``chinese_count`` whenever a ratio could be computed.
    """
    if not text:
        return ValidationResult(False, "文字為空")

    chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))

    # Exclude ALL whitespace from the denominator: str.isspace() also covers
    # tabs, carriage returns and the full-width space U+3000.
    non_space_chars = sum(1 for ch in text if not ch.isspace())
    if non_space_chars == 0:
        return ValidationResult(False, "無有效字元")

    ratio = chinese_count / non_space_chars
    details = {"ratio": ratio, "chinese_count": chinese_count}

    if ratio >= min_ratio:
        return ValidationResult(
            True,
            f"中文比例 {ratio:.1%} >= {min_ratio:.0%}",
            details=details
        )

    return ValidationResult(
        False,
        f"中文比例 {ratio:.1%} < {min_ratio:.0%}",
        details=details
    )
|
||
|
||
|
||
def validate_response_length(
    text: str,
    min_length: int = 10,
    max_length: int = 500,
) -> ValidationResult:
    """
    Check that the trimmed text length lies within [min_length, max_length].

    Args:
        text: The text to check.
        min_length: Smallest accepted length (inclusive).
        max_length: Largest accepted length (inclusive).

    Returns:
        ValidationResult; ``details`` holds the measured ``length`` except
        when the text is empty.
    """
    if not text:
        return ValidationResult(False, "文字為空")

    # Length is measured after removing leading/trailing whitespace.
    length = len(text.strip())

    if length < min_length:
        passed = False
        note = f"回應過短: {length} < {min_length}"
    elif length > max_length:
        passed = False
        note = f"回應過長: {length} > {max_length}"
    else:
        passed = True
        note = f"回應長度 {length} 在 [{min_length}, {max_length}] 範圍內"

    return ValidationResult(passed, note, details={"length": length})
|
||
|
||
|
||
def extract_kubectl_from_text(text: str) -> str | None:
|
||
"""
|
||
從文字中提取 kubectl 命令
|
||
|
||
Args:
|
||
text: 包含 kubectl 的文字
|
||
|
||
Returns:
|
||
提取的 kubectl 命令或 None
|
||
"""
|
||
if not text:
|
||
return None
|
||
|
||
# 匹配 kubectl 開頭的整行
|
||
patterns = [
|
||
r"```(?:bash|shell|sh)?\s*(kubectl[^\n`]+)", # code block
|
||
r"^\$?\s*(kubectl[^\n]+)", # 以 $ 或直接開頭
|
||
r"(kubectl\s+\S+(?:\s+\S+)*)", # 一般匹配
|
||
]
|
||
|
||
for pattern in patterns:
|
||
match = re.search(pattern, text, re.MULTILINE | re.IGNORECASE)
|
||
if match:
|
||
return match.group(1).strip()
|
||
|
||
return None
|
||
|
||
|
||
def extract_risk_level_from_text(text: str) -> str | None:
|
||
"""
|
||
從文字中提取風險等級
|
||
|
||
Args:
|
||
text: 包含風險等級的文字
|
||
|
||
Returns:
|
||
提取的風險等級或 None
|
||
"""
|
||
if not text:
|
||
return None
|
||
|
||
# 直接匹配
|
||
for level in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]:
|
||
if level in text.upper():
|
||
return level
|
||
|
||
# 中文匹配
|
||
chinese_mappings = [
|
||
(r"極高|危險|嚴重|緊急", "CRITICAL"),
|
||
(r"高風險|高度", "HIGH"),
|
||
(r"中等|中度|一般", "MEDIUM"),
|
||
(r"低風險|低度|輕微", "LOW"),
|
||
]
|
||
|
||
for pattern, level in chinese_mappings:
|
||
if re.search(pattern, text):
|
||
return level
|
||
|
||
return None
|