fix(alertmanager): keep auto repair moving on ai fallback
This commit is contained in:
@@ -139,6 +139,38 @@ def _should_use_alertmanager_rule_first(
|
||||
)
|
||||
|
||||
|
||||
async def _analyze_alertmanager_with_timeout(
    openclaw,
    alert_context: dict,
    *,
    alert_id: str,
    alertname: str,
) -> tuple:
    """Run Alertmanager AI analysis without letting it block the workflow forever.

    Awaits ``openclaw.analyze_alert(alert_context)`` under
    ``asyncio.wait_for`` bounded by
    ``ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS``. A timeout or any other
    ``Exception`` is logged as a warning and converted into a fallback tuple
    instead of being propagated.

    Args:
        openclaw: Object exposing an awaitable ``analyze_alert(alert_context)``.
        alert_context: Alert payload forwarded unchanged to ``analyze_alert``.
        alert_id: Alert identifier, used only for log correlation.
        alertname: Alert name, used only for log correlation.

    Returns:
        The value returned by ``analyze_alert`` on success. On failure, a
        7-tuple ``(None, provider, "", None, "", 0, 0.0)`` where ``provider``
        is ``"fallback_timeout"`` or ``"fallback_error"``; the leading
        ``None`` makes callers take their "analysis failed" branch.
    """
    # Read the module-level timeout once so the value logged on timeout is
    # exactly the value that was applied to the wait.
    timeout_sec = ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS
    try:
        return await asyncio.wait_for(
            openclaw.analyze_alert(alert_context),
            timeout=timeout_sec,
        )
    except (asyncio.TimeoutError, TimeoutError):
        # Before Python 3.11, asyncio.TimeoutError is NOT the builtin
        # TimeoutError, so both are listed; on 3.11+ they are the same class.
        logger.warning(
            "alertmanager_openclaw_timeout_fallback",
            alert_id=alert_id,
            alertname=alertname,
            timeout_sec=timeout_sec,
        )
        return None, "fallback_timeout", "", None, "", 0, 0.0
    except Exception as exc:
        # Deliberate best-effort boundary: any provider failure degrades to
        # the fallback path. Task cancellation is a BaseException and is
        # therefore not swallowed here.
        logger.warning(
            "alertmanager_openclaw_failed_fallback",
            alert_id=alert_id,
            alertname=alertname,
            error=str(exc),
            # str(exc) is empty for exceptions raised without a message.
            error_type=type(exc).__name__,
        )
        return None, "fallback_error", "", None, "", 0, 0.0
|
||||
|
||||
|
||||
async def _escalate_auto_repair_unavailable(
|
||||
*,
|
||||
incident_id: str,
|
||||
@@ -796,6 +828,7 @@ async def verify_webhook_signature(
|
||||
|
||||
# Strategy B: sliding time window (ADR-073: raised from 5 to 30 minutes so the same problem does not keep re-creating Incidents, 2026-04-12 ogt)
|
||||
DEBOUNCE_WINDOW_MINUTES = 30
|
||||
ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -1109,7 +1142,12 @@ async def receive_alert(
|
||||
# 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
|
||||
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||||
openclaw = get_openclaw()
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
|
||||
openclaw,
|
||||
alert_context,
|
||||
alert_id=alert_id,
|
||||
alertname=alert.alert_type,
|
||||
)
|
||||
|
||||
if analysis_result:
|
||||
# LLM 分析成功
|
||||
@@ -1815,7 +1853,12 @@ async def _process_new_alert_background(
|
||||
record_alert_chain_success("alertmanager")
|
||||
return
|
||||
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
|
||||
openclaw,
|
||||
alert_context,
|
||||
alert_id=alert_id,
|
||||
alertname=alertname,
|
||||
)
|
||||
|
||||
if analysis_result:
|
||||
risk_mapping = {
|
||||
@@ -2115,11 +2158,17 @@ async def _process_new_alert_background(
|
||||
else:
|
||||
# LLM 失敗 - 使用預設值
|
||||
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
||||
_matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
|
||||
rule_id=str(rule_response.get("rule_id", "")),
|
||||
alertname=alertname,
|
||||
affected_services=[target_resource] if target_resource else [],
|
||||
severity="medium",
|
||||
)
|
||||
_approval_metadata_cs4 = {
|
||||
"source": "fallback",
|
||||
"confidence_score": None,
|
||||
"is_rule_based": False,
|
||||
"playbook_id": None,
|
||||
"playbook_id": _matched_playbook_id_cs4,
|
||||
}
|
||||
fallback_create = ApprovalRequestCreate(
|
||||
action="OBSERVE",
|
||||
@@ -2134,6 +2183,7 @@ async def _process_new_alert_background(
|
||||
dry_run_checks=[],
|
||||
requested_by="OpenClaw (fallback)",
|
||||
metadata=_approval_metadata_cs4,
|
||||
matched_playbook_id=_matched_playbook_id_cs4,
|
||||
)
|
||||
|
||||
approval = await service.create_approval_with_fingerprint(
|
||||
@@ -2205,6 +2255,37 @@ async def _process_new_alert_background(
|
||||
annotations=alert_context.get("annotations", {}),
|
||||
)
|
||||
|
||||
_is_heartbeat = is_heartbeat_alertname(alertname)
|
||||
if can_auto_repair and not _is_heartbeat:
|
||||
await _try_auto_repair_background(
|
||||
incident_id=fallback_incident_id,
|
||||
approval_id=str(approval.id),
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
)
|
||||
elif not can_auto_repair and not _is_heartbeat:
|
||||
from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
|
||||
_op_log_fallback = get_alert_operation_log_repository()
|
||||
await _op_log_fallback.append(
|
||||
"GUARDRAIL_BLOCKED",
|
||||
incident_id=fallback_incident_id,
|
||||
approval_id=str(approval.id),
|
||||
actor="prometheus-rule",
|
||||
action_detail=f"Prometheus rule 設定 auto_repair=false,fallback 轉人工: {alertname}",
|
||||
success=False,
|
||||
context={"alertname": alertname, "auto_repair_flag": False},
|
||||
)
|
||||
await _escalate_auto_repair_unavailable(
|
||||
incident_id=fallback_incident_id,
|
||||
approval_id=str(approval.id),
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
failure_reason="Prometheus rule auto_repair=false,fallback 未進入自動修復評估",
|
||||
attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
|
||||
)
|
||||
|
||||
await _push_to_telegram_background(
|
||||
approval_id=str(approval.id),
|
||||
risk_level="medium",
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from src.api.v1.webhooks import (
|
||||
_analyze_alertmanager_with_timeout,
|
||||
_should_bypass_alertmanager_llm,
|
||||
_should_use_alertmanager_rule_first,
|
||||
)
|
||||
@@ -111,6 +115,43 @@ def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
|
||||
assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alertmanager_analysis_timeout_returns_fallback(monkeypatch):
    """A too-slow analyze_alert call yields the timeout fallback tuple."""
    from src.api.v1 import webhooks as webhooks_module

    class _OutlastsTheTimeout:
        async def analyze_alert(self, alert_context):
            # Sleeps far longer than the patched timeout below.
            await asyncio.sleep(1)
            return "unexpected"

    # Shrink the module-level timeout so the test finishes quickly.
    monkeypatch.setattr(webhooks_module, "ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS", 0.01)

    canary_name = "AwoooPTimeoutCanary"
    outcome = await _analyze_alertmanager_with_timeout(
        _OutlastsTheTimeout(),
        {"alertname": canary_name},
        alert_id="alert-timeout",
        alertname=canary_name,
    )

    expected_fallback = (None, "fallback_timeout", "", None, "", 0, 0.0)
    assert outcome == expected_fallback
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alertmanager_analysis_error_returns_fallback():
    """An exception raised by analyze_alert yields the error fallback tuple."""

    class _AlwaysRaises:
        async def analyze_alert(self, alert_context):
            raise RuntimeError("provider chain failed")

    canary_name = "AwoooPErrorCanary"
    outcome = await _analyze_alertmanager_with_timeout(
        _AlwaysRaises(),
        {"alertname": canary_name},
        alert_id="alert-error",
        alertname=canary_name,
    )

    expected_fallback = (None, "fallback_error", "", None, "", 0, 0.0)
    assert outcome == expected_fallback
|
||||
|
||||
|
||||
def test_resolved_guard_stamp_without_timestamp_is_clean():
    """Passing None as the timestamp produces exactly the bare resolved stamp."""
    stamp = _format_resolved_guard_stamp(None)
    assert stamp == "✅ 此事件已解決"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user