fix(aiops): correct host alert fallback and resolved stamp
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 8m54s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 8m54s
This commit is contained in:
@@ -33,7 +33,7 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
|
||||
from src.services.alert_rule_engine import get_incident_type
|
||||
from src.services.alert_rule_engine import get_incident_type, match_rule
|
||||
from src.core.logging import get_logger
|
||||
from src.core.metrics import record_alert_chain_success
|
||||
|
||||
@@ -94,6 +94,20 @@ logger = get_logger("awoooi.webhooks")
|
||||
# 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _should_bypass_alertmanager_llm(
|
||||
rule_response: dict | None,
|
||||
alert_category: str,
|
||||
) -> bool:
|
||||
"""Host 類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。"""
|
||||
return (
|
||||
rule_response is not None
|
||||
and rule_response.get("suggested_action") == "NO_ACTION"
|
||||
and not str(rule_response.get("kubectl_command", "")).strip()
|
||||
and rule_response.get("rule_id", "") not in ("generic_fallback", "")
|
||||
and alert_category == "host_resource"
|
||||
)
|
||||
|
||||
async def _try_auto_repair_background(
|
||||
incident_id: str,
|
||||
approval_id: str,
|
||||
@@ -1127,6 +1141,129 @@ async def _process_new_alert_background(
|
||||
service = get_approval_service()
|
||||
openclaw = get_openclaw()
|
||||
|
||||
rule_response = match_rule(alert_context)
|
||||
should_bypass_llm = _should_bypass_alertmanager_llm(rule_response, alert_category)
|
||||
|
||||
if should_bypass_llm:
|
||||
logger.info(
|
||||
"alertmanager_rule_bypass_llm",
|
||||
alert_id=alert_id,
|
||||
alertname=alertname,
|
||||
rule_id=rule_response.get("rule_id", ""),
|
||||
alert_category=alert_category,
|
||||
reason="host_resource YAML NO_ACTION 規則命中,跳過 LLM 產生人工排查卡片",
|
||||
)
|
||||
risk_mapping = {
|
||||
"low": RiskLevel.LOW,
|
||||
"medium": RiskLevel.MEDIUM,
|
||||
"critical": RiskLevel.CRITICAL,
|
||||
}
|
||||
rule_risk = risk_mapping.get(
|
||||
str(rule_response.get("risk_level", "low")).lower(),
|
||||
RiskLevel.LOW,
|
||||
)
|
||||
blast = rule_response.get("blast_radius", {}) or {}
|
||||
impact_mapping = {
|
||||
"NONE": DataImpact.NONE,
|
||||
"READ_ONLY": DataImpact.READ_ONLY,
|
||||
"WRITE": DataImpact.WRITE,
|
||||
"DESTRUCTIVE": DataImpact.DESTRUCTIVE,
|
||||
}
|
||||
data_impact = impact_mapping.get(
|
||||
str(blast.get("data_impact", "NONE")).upper(),
|
||||
DataImpact.NONE,
|
||||
)
|
||||
rule_action_title = str(rule_response.get("action_title", "人工排查主機告警"))
|
||||
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
|
||||
rule_description = str(rule_response.get("description", message))
|
||||
rule_action = (
|
||||
f"{rule_action_title} | {rule_kubectl}"
|
||||
if rule_kubectl else
|
||||
f"NO_ACTION - {rule_description[:120]}"
|
||||
)
|
||||
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=rule_action,
|
||||
description=f"[Rule: {rule_response.get('rule_id', 'unknown')}] {rule_description}",
|
||||
risk_level=rule_risk,
|
||||
blast_radius=BlastRadius(
|
||||
affected_pods=int(blast.get("affected_pods", 1) or 1),
|
||||
estimated_downtime=str(blast.get("estimated_downtime", "N/A")),
|
||||
related_services=list(
|
||||
set(
|
||||
(blast.get("related_services") or [])
|
||||
+ [str(rule_response.get("target_resource", target_resource))]
|
||||
)
|
||||
),
|
||||
data_impact=data_impact,
|
||||
),
|
||||
dry_run_checks=[
|
||||
DryRunCheck(
|
||||
name="規則命中",
|
||||
passed=True,
|
||||
message=str(rule_response.get("rule_id", "unknown")),
|
||||
),
|
||||
DryRunCheck(
|
||||
name="來源",
|
||||
passed=True,
|
||||
message="alertmanager",
|
||||
),
|
||||
],
|
||||
requested_by="OpenClaw (rule-engine)",
|
||||
)
|
||||
|
||||
approval = await service.create_approval_with_fingerprint(
|
||||
request=approval_create,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
|
||||
incident_id = await create_incident_for_approval(
|
||||
approval_id=str(approval.id),
|
||||
risk_level=rule_risk.value,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
alert_type=alert_type,
|
||||
message=message,
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert_labels,
|
||||
notification_type=notification_type,
|
||||
alert_category=alert_category,
|
||||
)
|
||||
|
||||
try:
|
||||
await service.update_incident_id(approval.id, incident_id)
|
||||
approval.incident_id = incident_id
|
||||
except Exception as _meta_err:
|
||||
logger.warning(
|
||||
"rule_bypass_approval_incident_id_update_failed",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
error=str(_meta_err),
|
||||
)
|
||||
|
||||
await _push_to_telegram_background(
|
||||
approval_id=str(approval.id),
|
||||
risk_level=rule_risk.value,
|
||||
resource_name=target_resource,
|
||||
root_cause=rule_description,
|
||||
suggested_action=rule_action,
|
||||
estimated_downtime=str(blast.get("estimated_downtime", "N/A")),
|
||||
hit_count=1,
|
||||
primary_responsibility=str(
|
||||
rule_response.get("primary_responsibility", "INFRA")
|
||||
),
|
||||
confidence=float(rule_response.get("confidence", 0.0) or 0.0),
|
||||
namespace=namespace,
|
||||
incident_id=incident_id,
|
||||
notification_type=notification_type,
|
||||
alert_category=alert_category,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
|
||||
record_alert_chain_success("alertmanager")
|
||||
return
|
||||
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
|
||||
|
||||
if analysis_result:
|
||||
|
||||
@@ -262,6 +262,7 @@ class IncidentDBRepository(IIncidentRepository):
|
||||
incident_id: str,
|
||||
status: str,
|
||||
updated_at: datetime | None = None,
|
||||
resolved_at: datetime | None = None,
|
||||
) -> bool:
|
||||
"""更新 Incident 的狀態 (for debug)"""
|
||||
async with get_db_context() as db:
|
||||
@@ -277,6 +278,8 @@ class IncidentDBRepository(IIncidentRepository):
|
||||
|
||||
record.status = status
|
||||
record.updated_at = updated_at or datetime.now(UTC)
|
||||
if resolved_at is not None:
|
||||
record.resolved_at = resolved_at
|
||||
await db.commit()
|
||||
|
||||
logger.debug(
|
||||
|
||||
@@ -1008,7 +1008,8 @@ class IncidentService:
|
||||
await repo.update_status(
|
||||
incident_id=incident_id,
|
||||
status="resolved",
|
||||
updated_at=now_taipei(),
|
||||
updated_at=incident.updated_at,
|
||||
resolved_at=incident.resolved_at,
|
||||
)
|
||||
logger.info("resolve_db_updated", incident_id=incident_id)
|
||||
except Exception as e:
|
||||
|
||||
@@ -90,6 +90,13 @@ def _smart_truncate(text: str, limit: int, suffix: str = "…[截斷]") -> str:
|
||||
return text[:limit] + suffix
|
||||
|
||||
|
||||
def _format_resolved_guard_stamp(resolved_at: datetime | None) -> str:
|
||||
"""格式化 ADR-071-D 已解決狀態守衛文案。"""
|
||||
if resolved_at is None:
|
||||
return "✅ 此事件已解決"
|
||||
return f"✅ 此事件已於 {resolved_at.strftime('%Y-%m-%d %H:%M')} 解決"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Long Polling 配置 (Phase 5 內網修復)
|
||||
# =============================================================================
|
||||
@@ -3568,7 +3575,6 @@ class TelegramGateway:
|
||||
return None
|
||||
|
||||
if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
|
||||
resolved_at = incident.resolved_at.strftime("%Y-%m-%d %H:%M") if incident.resolved_at else "未知時間"
|
||||
await self._answer_callback(
|
||||
callback_query_id,
|
||||
"blocked",
|
||||
@@ -3577,7 +3583,7 @@ class TelegramGateway:
|
||||
try:
|
||||
separator = "──────────────"
|
||||
safe_original = html.escape(original_text) if original_text else ""
|
||||
stamp = f"✅ 此事件已於 {resolved_at} 解決"
|
||||
stamp = _format_resolved_guard_stamp(incident.resolved_at)
|
||||
await self._send_request("editMessageText", {
|
||||
"chat_id": self.chat_id,
|
||||
"message_id": message_id,
|
||||
|
||||
47
apps/api/tests/test_alertmanager_rule_bypass.py
Normal file
47
apps/api/tests/test_alertmanager_rule_bypass.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from datetime import datetime
|
||||
|
||||
from src.api.v1.webhooks import _should_bypass_alertmanager_llm
|
||||
from src.services.telegram_gateway import _format_resolved_guard_stamp
|
||||
|
||||
|
||||
def test_host_resource_yaml_no_action_bypasses_llm():
    """A named NO_ACTION rule with no kubectl command bypasses the LLM for host_resource alerts."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )

    bypass = _should_bypass_alertmanager_llm(matched_rule, "host_resource")

    assert bypass is True
|
||||
|
||||
|
||||
def test_generic_fallback_does_not_bypass_llm():
    """The generic fallback rule never bypasses the LLM, even for host_resource alerts."""
    fallback_rule = dict(
        rule_id="generic_fallback",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )

    bypass = _should_bypass_alertmanager_llm(fallback_rule, "host_resource")

    assert bypass is False
|
||||
|
||||
|
||||
def test_non_host_category_does_not_bypass_llm():
    """An otherwise qualifying rule does not bypass the LLM for a non-host category."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )

    bypass = _should_bypass_alertmanager_llm(matched_rule, "kubernetes")

    assert bypass is False
|
||||
|
||||
|
||||
def test_resolved_guard_stamp_without_timestamp_is_clean():
    """Without a timestamp the stamp is the plain resolved message."""
    stamp = _format_resolved_guard_stamp(None)

    assert stamp == "✅ 此事件已解決"
|
||||
|
||||
|
||||
def test_resolved_guard_stamp_with_timestamp_formats_time():
    """With a timestamp the stamp embeds it as YYYY-MM-DD HH:MM."""
    expected_stamp = "✅ 此事件已於 2026-04-25 00:02 解決"

    stamp = _format_resolved_guard_stamp(datetime(2026, 4, 25, 0, 2))

    assert stamp == expected_stamp
|
||||
@@ -6,6 +6,17 @@
|
||||
|
||||
---
|
||||
|
||||
## 📍 2026-04-25 — Host 告警錯誤診斷與 resolved_at 缺漏修復
|
||||
|
||||
### 本次修復
|
||||
- **Incident resolve DB 同步補洞**:`IncidentService.resolve_incident()` 現在會把 `resolved_at` 一起傳給 `IncidentRepository.update_status()`,修正 Incident 狀態已是 `RESOLVED` 但 DB `resolved_at = NULL` 的斷鏈
|
||||
- **Telegram 已解決文案保底**:狀態守衛改為 `resolved_at` 缺漏時顯示 `✅ 此事件已解決`,不再出現 `此事件已於 未知時間 解決`
|
||||
- **Host 告警規則前置短路**:`/api/v1/webhooks/alertmanager` 背景流程新增 `host_resource + YAML NO_ACTION` 前置門,主機資源告警命中規則後直接生成人工排查卡片,跳過 LLM,避免產生「重啟 AWOOOI deployment」這種錯誤 K8s 建議
|
||||
|
||||
### 根因
|
||||
- `resolved_at` 只寫入 Redis / Working Memory,Repository `update_status()` 沒有同步回 PostgreSQL,造成 Telegram 狀態守衛讀到 `RESOLVED + NULL resolved_at`
|
||||
- Alertmanager 背景流程先跑 `openclaw.analyze_alert()`,沒有比照 Phase 2 的 YAML `NO_ACTION` 優先門,導致 `HostHighCpuLoad` 這類主機告警先被 LLM 汙染卡片內容,後續防護只能阻擋執行、不能修正已發出的錯誤建議
|
||||
|
||||
## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強
|
||||
|
||||
### 本次修復
|
||||
|
||||
Reference in New Issue
Block a user