fix(alertmanager): keep auto repair moving on ai fallback
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m10s
CD Pipeline / build-and-deploy (push) Successful in 3m25s
CD Pipeline / post-deploy-checks (push) Successful in 1m30s

This commit is contained in:
Your Name
2026-05-14 00:06:34 +08:00
parent 39581ab824
commit d835b666cf
2 changed files with 125 additions and 3 deletions

View File

@@ -139,6 +139,38 @@ def _should_use_alertmanager_rule_first(
)
async def _analyze_alertmanager_with_timeout(
    openclaw,
    alert_context: dict,
    *,
    alert_id: str,
    alertname: str,
) -> tuple:
    """Run Alertmanager AI analysis without letting it block the workflow forever.

    Awaits ``openclaw.analyze_alert(alert_context)`` under
    ``asyncio.wait_for`` using ``ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS``
    as the deadline. A timeout, or any other ``Exception`` raised by the
    analyzer, is logged as a warning and converted into a fallback tuple
    instead of propagating to the caller.

    Args:
        openclaw: Object exposing an awaitable ``analyze_alert(alert_context)``.
        alert_context: Alert payload handed to the analyzer unchanged.
        alert_id: Only used as a field in the warning log entries.
        alertname: Only used as a field in the warning log entries.

    Returns:
        Whatever ``openclaw.analyze_alert`` returns on success. On failure, the
        7-tuple ``(None, provider, "", None, "", 0, 0.0)`` where ``provider``
        is ``"fallback_timeout"`` or ``"fallback_error"``. Call sites unpack
        the seven slots as ``analysis_result, ai_provider, raw_response,
        signoz_metrics, signoz_trace_url, ai_tokens, ai_cost``.
    """
    try:
        return await asyncio.wait_for(
            openclaw.analyze_alert(alert_context),
            timeout=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
        )
    except asyncio.TimeoutError:
        # Catch asyncio's own name for the timeout: before Python 3.11 it is a
        # distinct class from the builtin TimeoutError, so `except TimeoutError`
        # would miss it and the timeout would be reported as "fallback_error".
        # On 3.11+ both names refer to the same class, so behavior is unchanged.
        logger.warning(
            "alertmanager_openclaw_timeout_fallback",
            alert_id=alert_id,
            alertname=alertname,
            timeout_sec=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
        )
        return None, "fallback_timeout", "", None, "", 0, 0.0
    except Exception as exc:
        # Deliberate catch-all boundary: an analyzer failure must degrade to
        # the fallback path rather than abort alert processing.
        logger.warning(
            "alertmanager_openclaw_failed_fallback",
            alert_id=alert_id,
            alertname=alertname,
            error=str(exc),
        )
        return None, "fallback_error", "", None, "", 0, 0.0
async def _escalate_auto_repair_unavailable(
*,
incident_id: str,
@@ -796,6 +828,7 @@ async def verify_webhook_signature(
# Strategy B: sliding time window (ADR-073: raised from 5 minutes to 30 minutes to stop the same problem from repeatedly re-creating an Incident; 2026-04-12 ogt)
DEBOUNCE_WINDOW_MINUTES = 30
# Deadline, in seconds, that _analyze_alertmanager_with_timeout passes to asyncio.wait_for around openclaw.analyze_alert.
ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0
# =============================================================================
@@ -1109,7 +1142,12 @@ async def receive_alert(
# 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
openclaw = get_openclaw()
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
openclaw,
alert_context,
alert_id=alert_id,
alertname=alert.alert_type,
)
if analysis_result:
# LLM 分析成功
@@ -1815,7 +1853,12 @@ async def _process_new_alert_background(
record_alert_chain_success("alertmanager")
return
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
openclaw,
alert_context,
alert_id=alert_id,
alertname=alertname,
)
if analysis_result:
risk_mapping = {
@@ -2115,11 +2158,17 @@ async def _process_new_alert_background(
else:
# LLM 失敗 - 使用預設值
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg讓 extra_metadata 可觀測
_matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
rule_id=str(rule_response.get("rule_id", "")),
alertname=alertname,
affected_services=[target_resource] if target_resource else [],
severity="medium",
)
_approval_metadata_cs4 = {
"source": "fallback",
"confidence_score": None,
"is_rule_based": False,
"playbook_id": None,
"playbook_id": _matched_playbook_id_cs4,
}
fallback_create = ApprovalRequestCreate(
action="OBSERVE",
@@ -2134,6 +2183,7 @@ async def _process_new_alert_background(
dry_run_checks=[],
requested_by="OpenClaw (fallback)",
metadata=_approval_metadata_cs4,
matched_playbook_id=_matched_playbook_id_cs4,
)
approval = await service.create_approval_with_fingerprint(
@@ -2205,6 +2255,37 @@ async def _process_new_alert_background(
annotations=alert_context.get("annotations", {}),
)
_is_heartbeat = is_heartbeat_alertname(alertname)
if can_auto_repair and not _is_heartbeat:
await _try_auto_repair_background(
incident_id=fallback_incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
elif not can_auto_repair and not _is_heartbeat:
from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
_op_log_fallback = get_alert_operation_log_repository()
await _op_log_fallback.append(
"GUARDRAIL_BLOCKED",
incident_id=fallback_incident_id,
approval_id=str(approval.id),
actor="prometheus-rule",
action_detail=f"Prometheus rule 設定 auto_repair=falsefallback 轉人工: {alertname}",
success=False,
context={"alertname": alertname, "auto_repair_flag": False},
)
await _escalate_auto_repair_unavailable(
incident_id=fallback_incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason="Prometheus rule auto_repair=falsefallback 未進入自動修復評估",
attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
)
await _push_to_telegram_background(
approval_id=str(approval.id),
risk_level="medium",

View File

@@ -1,6 +1,10 @@
import asyncio
from datetime import datetime
import pytest
from src.api.v1.webhooks import (
_analyze_alertmanager_with_timeout,
_should_bypass_alertmanager_llm,
_should_use_alertmanager_rule_first,
)
@@ -111,6 +115,43 @@ def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
@pytest.mark.asyncio
async def test_alertmanager_analysis_timeout_returns_fallback(monkeypatch):
    """An analyzer slower than the deadline yields the timeout fallback tuple."""
    from src.api.v1 import webhooks as webhooks_module

    class _NeverInTimeAnalyzer:
        """Stub whose analyze_alert sleeps far longer than the patched deadline."""

        async def analyze_alert(self, alert_context):
            await asyncio.sleep(1)
            return "unexpected"

    # Shrink the module-level deadline so the 1-second sleep always exceeds it.
    monkeypatch.setattr(
        webhooks_module,
        "ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS",
        0.01,
    )

    canary_name = "AwoooPTimeoutCanary"
    expected_fallback = (None, "fallback_timeout", "", None, "", 0, 0.0)

    outcome = await _analyze_alertmanager_with_timeout(
        _NeverInTimeAnalyzer(),
        {"alertname": canary_name},
        alert_id="alert-timeout",
        alertname=canary_name,
    )

    assert outcome == expected_fallback
@pytest.mark.asyncio
async def test_alertmanager_analysis_error_returns_fallback():
    """An analyzer that raises yields the error fallback tuple instead of propagating."""

    class _AlwaysFailingAnalyzer:
        """Stub whose analyze_alert raises immediately."""

        async def analyze_alert(self, alert_context):
            raise RuntimeError("provider chain failed")

    canary_name = "AwoooPErrorCanary"
    expected_fallback = (None, "fallback_error", "", None, "", 0, 0.0)

    outcome = await _analyze_alertmanager_with_timeout(
        _AlwaysFailingAnalyzer(),
        {"alertname": canary_name},
        alert_id="alert-error",
        alertname=canary_name,
    )

    assert outcome == expected_fallback
# Passing None to _format_resolved_guard_stamp must return exactly the bare
# resolved label asserted below (presumably None means "no timestamp", per the
# test name — confirm against the helper).
def test_resolved_guard_stamp_without_timestamp_is_clean():
assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"