fix(aiops): route backup failures rule-first
All checks were successful
CD Pipeline / tests (push) Successful in 1m51s
Code Review / ai-code-review (push) Successful in 30s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 42s
CD Pipeline / build-and-deploy (push) Successful in 8m21s
CD Pipeline / post-deploy-checks (push) Successful in 4m18s

This commit is contained in:
Your Name
2026-05-01 10:11:10 +08:00
parent 3e0ab0f8c6
commit ca22ec2fd2
4 changed files with 40 additions and 6 deletions

View File

@@ -101,13 +101,13 @@ def _should_bypass_alertmanager_llm(
rule_response: dict | None,
alert_category: str,
) -> bool:
- """Host 類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。"""
+ """主機/備份類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。"""
return (
rule_response is not None
and rule_response.get("suggested_action") == "NO_ACTION"
and not str(rule_response.get("kubectl_command", "")).strip()
and rule_response.get("rule_id", "") not in ("generic_fallback", "")
- and alert_category == "host_resource"
+ and alert_category in {"host_resource", "backup_failure"}
)
@@ -115,9 +115,9 @@ def _should_use_alertmanager_rule_first(
rule_response: dict | None,
alert_category: str,
) -> bool:
- """Host 類告警命中權威 YAML 規則時,避免再被 LLM 污染成 K8s 動作。"""
+ """主機/備份類告警命中權威 YAML 規則時,避免再被 LLM 污染成 K8s 動作。"""
- if not rule_response or alert_category != "host_resource":
+ if not rule_response or alert_category not in {"host_resource", "backup_failure"}:
return False
if rule_response.get("rule_id", "") in ("generic_fallback", ""):
return False
@@ -1363,7 +1363,7 @@ async def _process_new_alert_background(
alertname=alertname,
rule_id=rule_response.get("rule_id", ""),
alert_category=alert_category,
- reason="host_resource YAML 權威規則命中,跳過 LLM 避免產生錯誤 K8s 動作",
+ reason="host/backup YAML 權威規則命中,跳過 LLM 避免產生錯誤 K8s 動作",
)
risk_mapping = {
"low": RiskLevel.LOW,