diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 2b93172d..8fc48416 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -33,7 +33,7 @@ from pydantic import BaseModel, Field from src.core.config import settings from src.core.constants import is_cicd_alertname, is_heartbeat_alertname -from src.services.alert_rule_engine import get_incident_type +from src.services.alert_rule_engine import get_incident_type, match_rule from src.core.logging import get_logger from src.core.metrics import record_alert_chain_success @@ -94,6 +94,20 @@ logger = get_logger("awoooi.webhooks") # 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環) # ============================================================================= + +def _should_bypass_alertmanager_llm( + rule_response: dict | None, + alert_category: str, +) -> bool: + """Host 類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。""" + return ( + rule_response is not None + and rule_response.get("suggested_action") == "NO_ACTION" + and not str(rule_response.get("kubectl_command", "")).strip() + and rule_response.get("rule_id", "") not in ("generic_fallback", "") + and alert_category == "host_resource" + ) + async def _try_auto_repair_background( incident_id: str, approval_id: str, @@ -1127,6 +1141,129 @@ async def _process_new_alert_background( service = get_approval_service() openclaw = get_openclaw() + rule_response = match_rule(alert_context) + should_bypass_llm = _should_bypass_alertmanager_llm(rule_response, alert_category) + + if should_bypass_llm: + logger.info( + "alertmanager_rule_bypass_llm", + alert_id=alert_id, + alertname=alertname, + rule_id=rule_response.get("rule_id", ""), + alert_category=alert_category, + reason="host_resource YAML NO_ACTION 規則命中,跳過 LLM 產生人工排查卡片", + ) + risk_mapping = { + "low": RiskLevel.LOW, + "medium": RiskLevel.MEDIUM, + "critical": RiskLevel.CRITICAL, + } + rule_risk = risk_mapping.get( + str(rule_response.get("risk_level", "low")).lower(), + RiskLevel.LOW, 
+ ) + blast = rule_response.get("blast_radius", {}) or {} + impact_mapping = { + "NONE": DataImpact.NONE, + "READ_ONLY": DataImpact.READ_ONLY, + "WRITE": DataImpact.WRITE, + "DESTRUCTIVE": DataImpact.DESTRUCTIVE, + } + data_impact = impact_mapping.get( + str(blast.get("data_impact", "NONE")).upper(), + DataImpact.NONE, + ) + rule_action_title = str(rule_response.get("action_title", "人工排查主機告警")) + rule_kubectl = str(rule_response.get("kubectl_command", "")).strip() + rule_description = str(rule_response.get("description", message)) + rule_action = ( + f"{rule_action_title} | {rule_kubectl}" + if rule_kubectl else + f"NO_ACTION - {rule_description[:120]}" + ) + + approval_create = ApprovalRequestCreate( + action=rule_action, + description=f"[Rule: {rule_response.get('rule_id', 'unknown')}] {rule_description}", + risk_level=rule_risk, + blast_radius=BlastRadius( + affected_pods=int(blast.get("affected_pods", 1) or 1), + estimated_downtime=str(blast.get("estimated_downtime", "N/A")), + related_services=list( + set( + (blast.get("related_services") or []) + + [str(rule_response.get("target_resource", target_resource))] + ) + ), + data_impact=data_impact, + ), + dry_run_checks=[ + DryRunCheck( + name="規則命中", + passed=True, + message=str(rule_response.get("rule_id", "unknown")), + ), + DryRunCheck( + name="來源", + passed=True, + message="alertmanager", + ), + ], + requested_by="OpenClaw (rule-engine)", + ) + + approval = await service.create_approval_with_fingerprint( + request=approval_create, + fingerprint=fingerprint, + ) + + incident_id = await create_incident_for_approval( + approval_id=str(approval.id), + risk_level=rule_risk.value, + target_resource=target_resource, + namespace=namespace, + alert_type=alert_type, + message=message, + source="alertmanager", + alertname=alertname, + alert_labels=alert_labels, + notification_type=notification_type, + alert_category=alert_category, + ) + + try: + await service.update_incident_id(approval.id, incident_id) + 
approval.incident_id = incident_id + except Exception as _meta_err: + logger.warning( + "rule_bypass_approval_incident_id_update_failed", + approval_id=str(approval.id), + incident_id=incident_id, + error=str(_meta_err), + ) + + await _push_to_telegram_background( + approval_id=str(approval.id), + risk_level=rule_risk.value, + resource_name=target_resource, + root_cause=rule_description, + suggested_action=rule_action, + estimated_downtime=str(blast.get("estimated_downtime", "N/A")), + hit_count=1, + primary_responsibility=str( + rule_response.get("primary_responsibility", "INFRA") + ), + confidence=float(rule_response.get("confidence", 0.0) or 0.0), + namespace=namespace, + incident_id=incident_id, + notification_type=notification_type, + alert_category=alert_category, + fingerprint=fingerprint, + ) + + record_alert_chain_success("alertmanager") + return + analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context) if analysis_result: diff --git a/apps/api/src/repositories/incident_repository.py b/apps/api/src/repositories/incident_repository.py index f84d4748..adae6668 100644 --- a/apps/api/src/repositories/incident_repository.py +++ b/apps/api/src/repositories/incident_repository.py @@ -262,6 +262,7 @@ class IncidentDBRepository(IIncidentRepository): incident_id: str, status: str, updated_at: datetime | None = None, + resolved_at: datetime | None = None, ) -> bool: """更新 Incident 的狀態 (for debug)""" async with get_db_context() as db: @@ -277,6 +278,8 @@ class IncidentDBRepository(IIncidentRepository): record.status = status record.updated_at = updated_at or datetime.now(UTC) + if resolved_at is not None: + record.resolved_at = resolved_at await db.commit() logger.debug( diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index bf19fa5a..7f0364fb 100644 --- a/apps/api/src/services/incident_service.py +++ 
b/apps/api/src/services/incident_service.py @@ -1008,7 +1008,8 @@ class IncidentService: await repo.update_status( incident_id=incident_id, status="resolved", - updated_at=now_taipei(), + updated_at=incident.updated_at, + resolved_at=incident.resolved_at, ) logger.info("resolve_db_updated", incident_id=incident_id) except Exception as e: diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index c9ee7d2d..71978de0 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -90,6 +90,13 @@ def _smart_truncate(text: str, limit: int, suffix: str = "…[截斷]") -> str: return text[:limit] + suffix +def _format_resolved_guard_stamp(resolved_at: datetime | None) -> str: + """格式化 ADR-071-D 已解決狀態守衛文案。""" + if resolved_at is None: + return "✅ 此事件已解決" + return f"✅ 此事件已於 {resolved_at.strftime('%Y-%m-%d %H:%M')} 解決" + + # ============================================================================= # Long Polling 配置 (Phase 5 內網修復) # ============================================================================= @@ -3568,7 +3575,6 @@ class TelegramGateway: return None if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED): - resolved_at = incident.resolved_at.strftime("%Y-%m-%d %H:%M") if incident.resolved_at else "未知時間" await self._answer_callback( callback_query_id, "blocked", @@ -3577,7 +3583,7 @@ class TelegramGateway: try: separator = "──────────────" safe_original = html.escape(original_text) if original_text else "" - stamp = f"✅ 此事件已於 {resolved_at} 解決" + stamp = _format_resolved_guard_stamp(incident.resolved_at) await self._send_request("editMessageText", { "chat_id": self.chat_id, "message_id": message_id, diff --git a/apps/api/tests/test_alertmanager_rule_bypass.py b/apps/api/tests/test_alertmanager_rule_bypass.py new file mode 100644 index 00000000..4f227ee1 --- /dev/null +++ b/apps/api/tests/test_alertmanager_rule_bypass.py @@ -0,0 +1,47 @@ +from datetime import 
def test_host_resource_yaml_no_action_bypasses_llm():
    """A named host rule with NO_ACTION and no command opens the bypass gate."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    outcome = _should_bypass_alertmanager_llm(matched_rule, "host_resource")
    assert outcome is True


def test_generic_fallback_does_not_bypass_llm():
    """The catch-all fallback rule never opens the bypass gate."""
    fallback_rule = dict(
        rule_id="generic_fallback",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    outcome = _should_bypass_alertmanager_llm(fallback_rule, "host_resource")
    assert outcome is False


def test_non_host_category_does_not_bypass_llm():
    """The same rule hit under a non-host category keeps the LLM path."""
    matched_rule = dict(
        rule_id="host_resource_alert",
        suggested_action="NO_ACTION",
        kubectl_command="",
    )
    outcome = _should_bypass_alertmanager_llm(matched_rule, "kubernetes")
    assert outcome is False


def test_resolved_guard_stamp_without_timestamp_is_clean():
    """With no resolution time the banner omits any time wording."""
    banner = _format_resolved_guard_stamp(None)
    assert banner == "✅ 此事件已解決"


def test_resolved_guard_stamp_with_timestamp_formats_time():
    """A known resolution time is rendered to minute precision."""
    moment = datetime(2026, 4, 25, 0, 2)
    banner = _format_resolved_guard_stamp(moment)
    assert banner == "✅ 此事件已於 2026-04-25 00:02 解決"
Telegram 狀態守衛讀到 `RESOLVED + NULL resolved_at` +- Alertmanager 背景流程先跑 `openclaw.analyze_alert()`,沒有比照 Phase 2 的 YAML `NO_ACTION` 優先門,導致 `HostHighCpuLoad` 這類主機告警先被 LLM 汙染卡片內容,後續防護只能阻擋執行、不能修正已發出的錯誤建議 + ## 📍 2026-04-24 — Telegram「AI 分析超時」止血 + incident_id 單一真相補強 ### 本次修復