fix(aiops): correct host alert fallback and resolved stamp
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 8m54s

This commit is contained in:
Your Name
2026-04-25 00:13:53 +08:00
parent 6df631c895
commit 55f111e0e3
6 changed files with 209 additions and 4 deletions

View File

@@ -33,7 +33,7 @@ from pydantic import BaseModel, Field
from src.core.config import settings
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
from src.services.alert_rule_engine import get_incident_type
from src.services.alert_rule_engine import get_incident_type, match_rule
from src.core.logging import get_logger
from src.core.metrics import record_alert_chain_success
@@ -94,6 +94,20 @@ logger = get_logger("awoooi.webhooks")
# 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環)
# =============================================================================
def _should_bypass_alertmanager_llm(
rule_response: dict | None,
alert_category: str,
) -> bool:
"""Host 類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。"""
return (
rule_response is not None
and rule_response.get("suggested_action") == "NO_ACTION"
and not str(rule_response.get("kubectl_command", "")).strip()
and rule_response.get("rule_id", "") not in ("generic_fallback", "")
and alert_category == "host_resource"
)
async def _try_auto_repair_background(
incident_id: str,
approval_id: str,
@@ -1127,6 +1141,129 @@ async def _process_new_alert_background(
service = get_approval_service()
openclaw = get_openclaw()
rule_response = match_rule(alert_context)
should_bypass_llm = _should_bypass_alertmanager_llm(rule_response, alert_category)
if should_bypass_llm:
logger.info(
"alertmanager_rule_bypass_llm",
alert_id=alert_id,
alertname=alertname,
rule_id=rule_response.get("rule_id", ""),
alert_category=alert_category,
reason="host_resource YAML NO_ACTION 規則命中,跳過 LLM 產生人工排查卡片",
)
risk_mapping = {
"low": RiskLevel.LOW,
"medium": RiskLevel.MEDIUM,
"critical": RiskLevel.CRITICAL,
}
rule_risk = risk_mapping.get(
str(rule_response.get("risk_level", "low")).lower(),
RiskLevel.LOW,
)
blast = rule_response.get("blast_radius", {}) or {}
impact_mapping = {
"NONE": DataImpact.NONE,
"READ_ONLY": DataImpact.READ_ONLY,
"WRITE": DataImpact.WRITE,
"DESTRUCTIVE": DataImpact.DESTRUCTIVE,
}
data_impact = impact_mapping.get(
str(blast.get("data_impact", "NONE")).upper(),
DataImpact.NONE,
)
rule_action_title = str(rule_response.get("action_title", "人工排查主機告警"))
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
rule_description = str(rule_response.get("description", message))
rule_action = (
f"{rule_action_title} | {rule_kubectl}"
if rule_kubectl else
f"NO_ACTION - {rule_description[:120]}"
)
approval_create = ApprovalRequestCreate(
action=rule_action,
description=f"[Rule: {rule_response.get('rule_id', 'unknown')}] {rule_description}",
risk_level=rule_risk,
blast_radius=BlastRadius(
affected_pods=int(blast.get("affected_pods", 1) or 1),
estimated_downtime=str(blast.get("estimated_downtime", "N/A")),
related_services=list(
set(
(blast.get("related_services") or [])
+ [str(rule_response.get("target_resource", target_resource))]
)
),
data_impact=data_impact,
),
dry_run_checks=[
DryRunCheck(
name="規則命中",
passed=True,
message=str(rule_response.get("rule_id", "unknown")),
),
DryRunCheck(
name="來源",
passed=True,
message="alertmanager",
),
],
requested_by="OpenClaw (rule-engine)",
)
approval = await service.create_approval_with_fingerprint(
request=approval_create,
fingerprint=fingerprint,
)
incident_id = await create_incident_for_approval(
approval_id=str(approval.id),
risk_level=rule_risk.value,
target_resource=target_resource,
namespace=namespace,
alert_type=alert_type,
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=alert_labels,
notification_type=notification_type,
alert_category=alert_category,
)
try:
await service.update_incident_id(approval.id, incident_id)
approval.incident_id = incident_id
except Exception as _meta_err:
logger.warning(
"rule_bypass_approval_incident_id_update_failed",
approval_id=str(approval.id),
incident_id=incident_id,
error=str(_meta_err),
)
await _push_to_telegram_background(
approval_id=str(approval.id),
risk_level=rule_risk.value,
resource_name=target_resource,
root_cause=rule_description,
suggested_action=rule_action,
estimated_downtime=str(blast.get("estimated_downtime", "N/A")),
hit_count=1,
primary_responsibility=str(
rule_response.get("primary_responsibility", "INFRA")
),
confidence=float(rule_response.get("confidence", 0.0) or 0.0),
namespace=namespace,
incident_id=incident_id,
notification_type=notification_type,
alert_category=alert_category,
fingerprint=fingerprint,
)
record_alert_chain_success("alertmanager")
return
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
if analysis_result:

View File

@@ -262,6 +262,7 @@ class IncidentDBRepository(IIncidentRepository):
incident_id: str,
status: str,
updated_at: datetime | None = None,
resolved_at: datetime | None = None,
) -> bool:
"""更新 Incident 的狀態 (for debug)"""
async with get_db_context() as db:
@@ -277,6 +278,8 @@ class IncidentDBRepository(IIncidentRepository):
record.status = status
record.updated_at = updated_at or datetime.now(UTC)
if resolved_at is not None:
record.resolved_at = resolved_at
await db.commit()
logger.debug(

View File

@@ -1008,7 +1008,8 @@ class IncidentService:
await repo.update_status(
incident_id=incident_id,
status="resolved",
updated_at=now_taipei(),
updated_at=incident.updated_at,
resolved_at=incident.resolved_at,
)
logger.info("resolve_db_updated", incident_id=incident_id)
except Exception as e:

View File

@@ -90,6 +90,13 @@ def _smart_truncate(text: str, limit: int, suffix: str = "…[截斷]") -> str:
return text[:limit] + suffix
def _format_resolved_guard_stamp(resolved_at: datetime | None) -> str:
"""格式化 ADR-071-D 已解決狀態守衛文案。"""
if resolved_at is None:
return "✅ 此事件已解決"
return f"✅ 此事件已於 {resolved_at.strftime('%Y-%m-%d %H:%M')} 解決"
# =============================================================================
# Long Polling 配置 (Phase 5 內網修復)
# =============================================================================
@@ -3568,7 +3575,6 @@ class TelegramGateway:
return None
if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
resolved_at = incident.resolved_at.strftime("%Y-%m-%d %H:%M") if incident.resolved_at else "未知時間"
await self._answer_callback(
callback_query_id,
"blocked",
@@ -3577,7 +3583,7 @@ class TelegramGateway:
try:
separator = "──────────────"
safe_original = html.escape(original_text) if original_text else ""
stamp = f"✅ 此事件已於 {resolved_at} 解決"
stamp = _format_resolved_guard_stamp(incident.resolved_at)
await self._send_request("editMessageText", {
"chat_id": self.chat_id,
"message_id": message_id,

View File

@@ -0,0 +1,47 @@
from datetime import datetime
from src.api.v1.webhooks import _should_bypass_alertmanager_llm
from src.services.telegram_gateway import _format_resolved_guard_stamp
def test_host_resource_yaml_no_action_bypasses_llm():
    """A named NO_ACTION rule with no command bypasses the LLM for host alerts."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    should_bypass = _should_bypass_alertmanager_llm(matched_rule, "host_resource")
    assert should_bypass is True
def test_generic_fallback_does_not_bypass_llm():
    """The ``generic_fallback`` rule id never triggers the LLM bypass."""
    matched_rule = dict(
        rule_id="generic_fallback",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    should_bypass = _should_bypass_alertmanager_llm(matched_rule, "host_resource")
    assert should_bypass is False
def test_non_host_category_does_not_bypass_llm():
    """A category other than ``host_resource`` does not bypass the LLM."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    should_bypass = _should_bypass_alertmanager_llm(matched_rule, "kubernetes")
    assert should_bypass is False
def test_resolved_guard_stamp_without_timestamp_is_clean():
    """With no timestamp the stamp carries no time placeholder."""
    stamp = _format_resolved_guard_stamp(None)
    assert stamp == "✅ 此事件已解決"
def test_resolved_guard_stamp_with_timestamp_formats_time():
    """A given timestamp is rendered as ``YYYY-MM-DD HH:MM`` in the stamp."""
    moment = datetime(year=2026, month=4, day=25, hour=0, minute=2)
    expected = "✅ 此事件已於 2026-04-25 00:02 解決"
    assert _format_resolved_guard_stamp(moment) == expected

View File

@@ -6,6 +6,17 @@
---
## 📍 2026-04-25 — Host 告警錯誤診斷與 resolved_at 缺漏修復
### 本次修復
- **Incident resolve DB 同步補洞**：`IncidentService.resolve_incident()` 現在會把 `resolved_at` 一起傳給 `IncidentRepository.update_status()`，修正 Incident 狀態已是 `RESOLVED` 但 DB `resolved_at = NULL` 的斷鏈
- **Telegram 已解決文案保底**:狀態守衛改為 `resolved_at` 缺漏時顯示 `✅ 此事件已解決`,不再出現 `此事件已於 未知時間 解決`
- **Host 告警規則前置短路**：`/api/v1/webhooks/alertmanager` 背景流程新增 `host_resource + YAML NO_ACTION` 前置門，主機資源告警命中規則後直接生成人工排查卡片，跳過 LLM，避免產生「重啟 AWOOOI deployment」這種錯誤 K8s 建議
### 根因
- `resolved_at` 只寫入 Redis / Working Memory，Repository `update_status()` 沒有同步回 PostgreSQL，造成 Telegram 狀態守衛讀到 `RESOLVED + NULL resolved_at`
- Alertmanager 背景流程先跑 `openclaw.analyze_alert()`,沒有比照 Phase 2 的 YAML `NO_ACTION` 優先門,導致 `HostHighCpuLoad` 這類主機告警先被 LLM 汙染卡片內容,後續防護只能阻擋執行、不能修正已發出的錯誤建議
## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強
### 本次修復