Files
awoooi/apps/api/src/services/incident_service.py
Your Name 92316dda04
All checks were successful
CD Pipeline / tests (push) Successful in 1m33s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 4m54s
CD Pipeline / post-deploy-checks (push) Successful in 2m8s
fix(api): resolve db-only stale incidents
2026-05-29 11:15:46 +08:00

1545 lines
57 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Incident Service - Phase 6.2 雙層記憶寫入
==========================================
功能:
- Working Memory (Redis): 活躍事件7 天 TTL
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
設計原則:
- 先寫 Redis (快),再寫 PostgreSQL (持久)
- 兩者都成功才算完成
- 失敗時記錄日誌但不中斷主流程
統帥鐵律:
- 禁止硬編碼 IP 或密碼,嚴格讀取 .env
- 所有寫入操作都必須有結構化日誌
C2 修正 (首席架構師審查 2026-04-10 Claude Sonnet 4.6 Asia/Taipei):
create_incident_for_approval + _extract_affected_services 從 Router 層移入此 Service 層
原違規: 業務邏輯 (Severity 映射, Signal 建立, Incident 建立) 放在 api/v1/webhooks.py
"""
import json
from datetime import UTC, datetime
from typing import Any, Literal
from uuid import UUID
import structlog
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import (
Incident,
IncidentFrequencyStats,
IncidentStatus,
Severity,
Signal,
)
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# =============================================================================
# C2 修正: 從 webhooks.py 遷入的業務邏輯
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
# =============================================================================
# Approval risk level -> incident severity (originally lived in webhooks.py).
_RISK_TO_SEVERITY = dict(
    critical=Severity.P0,
    high=Severity.P1,
    medium=Severity.P2,
    low=Severity.P3,
)
# I1 修正: 提升為 module-level frozenset避免每次呼叫重建 (原在 webhooks.py 函數體內)
_INFRA_JOB_NAMES: frozenset[str] = frozenset(
j.lower().replace("-", "").replace("_", "")
for j in {"node", "node-exporter", "pushgateway", "blackbox",
"prometheus", "alertmanager", "cadvisor"}
)
def extract_affected_services(labels: dict, target_resource: str) -> list[str]:
    """
    Derive the real service name from alert labels.

    Keeps IP addresses and alert names out of ``affected_services``.

    Resolution order (first match wins):
        1. ``component`` label.
        2. ``job`` label, unless its normalized form is in ``_INFRA_JOB_NAMES``.
        3. ``pod`` label: the deployment name when the last two dash-separated
           segments look like ``<9-10 chars>-<5 chars>``; otherwise the pod
           name without its last dash-separated segment.
        4. ``target_resource``, only if it is non-empty, has no ":", does not
           start with a digit and differs from the ``alertname`` label.
        5. An empty list.

    Pure function: performs no I/O.
    """
    component = labels.get("component")
    if component:
        return [component]

    job = labels.get("job")
    if job and job.lower().replace("-", "").replace("_", "") not in _INFRA_JOB_NAMES:
        return [job]

    pod = labels.get("pod")
    if pod:
        pieces = pod.rsplit("-", 2)
        has_hash_suffix = (
            len(pieces) >= 3
            and len(pieces[-1]) == 5
            and len(pieces[-2]) in (9, 10)
        )
        if has_hash_suffix:
            return [pieces[0]]
        if len(pieces) >= 2:
            return ["-".join(pieces[:-1])]

    # Fall back to target_resource only when it looks like a service name.
    if not target_resource:
        return []
    if ":" in target_resource:
        return []
    if target_resource == labels.get("alertname", ""):
        return []
    if target_resource[0].isdigit():
        return []
    return [target_resource]
def classify_alert_early(
alertname: str,
severity: str,
labels: dict | None = None,
age_hours: float = 0.0,
) -> tuple[str, str]:
"""
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。
防止 HostBackupFailed 等告警被誤路由到 K8s executor。
規則優先順序(由高到低):
1. ConfigurationDrift / KubeConfigDrift → TYPE-4D (Config Drift 卡片)
2. severity=info/none → TYPE-1 (純資訊,無按鈕)
3. backup/heartbeat 關鍵字 → TYPE-1但 backup failure age > 24h → TYPE-3見下
4. Docker/Host/Systemd runner 前綴 → infrastructure/host_resource TYPE-3
5. Kube/Pod/Deploy/Node/Velero/ArgoCD 前綴 → kubernetes TYPE-3
6. Postgres/Redis 前綴 → database TYPE-3
7. 預設 → general TYPE-3
2026-04-25 ogt + Claude Sonnet 4.6 (P0 備份告警升級修復):
- age_hours > 24HostBackupFailed/HostBackupStale/HostBackupMissing 升級為 TYPE-3
原因:備份 25h 未成功是 P0 故障,不是「純資訊」
此時應觸發 LLM 分析 + 自動修復建議,而非靜默發純文字通知
C3 修正 (首席架構師 CR 2026-04-13): 從 Router 層 (webhooks.py) 移入 Service 層
原違規: 業務邏輯函數定義在 api/v1/webhooks.py
Args:
alertname: Alertmanager alert name
severity: 告警嚴重度critical/warning/info/none
labels: Alertmanager labels dict
age_hours: 告警持續時數(由 startsAt 計算0.0 = 未知)
Returns:
tuple[str, str]: (alert_category, notification_type)
"""
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
# 統帥決議kubernetes非 k8s_workload、host_resource 從 infrastructure 分離。
alertname_lower = alertname.lower()
# 1. Config Drift最高優先類型特殊不受 severity 影響)
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
return "config_drift", "TYPE-4D"
# 2. 告警鏈路健康meta-monitoring優先於 severity 判斷)
# 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager
if alertname in (
"AlertChainBroken_Alertmanager",
"AlertChainBroken_Sentry",
"NoAlertsReceived",
"NoAlertsReceived2Hours",
"AlertChainUnhealthy",
"PrometheusNotConnectedToAlertmanager",
):
return "alertchain_health", "TYPE-8M"
# 3. 資安告警(高優先,防止被 severity/prefix 規則覆蓋)
# ADR-075 TYPE-5S (2026-04-12 ogt)
if any(alertname.startswith(p) for p in (
"UnauthorizedSSH", "KubeAudit", "CVECritical", "WAFAttack",
"PodAbnormal", "SecurityBreach",
)):
return "secops", "TYPE-5S"
# 4. 飛輪/AI 系統健康(優先於 severity 判斷)
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or any(
alertname.startswith(p) for p in ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
):
return "flywheel_health", "TYPE-8M"
# 4a. 業務/FinOps 告警ADR-075 TYPE-6B
if any(alertname.startswith(p) for p in (
"AITokenCost", "GeminiAPIError", "SLOBurn", "APIErrorBudget",
"MomoScraper", "ScraperSuccess",
)):
return "business", "TYPE-6B"
# 5. 純資訊
if severity in ("info", "none"):
return "info", "TYPE-1"
# 2026-05-05 ogt + Codex: self-hosted runners are host-level systemd services.
# Must run before the generic "watchdog" heartbeat rule because
# SystemdRunnerWatchdogEnabled contains "Watchdog" but is not a heartbeat.
if alertname.startswith("SystemdRunner"):
return "host_resource", "TYPE-3"
# 5. Backup / Heartbeat — 純資訊,不進 LLM
# HostBackupFailed 必須在 Host prefix 前攔截,否則被歸 host_resource/TYPE-3
# 2026-04-12 ogt: 只針對已知主機備份監控 alertname不用寬泛關鍵字
# BackupJobFailed severity=warning 仍走 TYPE-3見測試 test_backup_keyword_warning_not_type1
_BACKUP_TYPE1_NAMES = {
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
"BackupRestoreTestFailed", "BackupRestoreTestStale",
}
# 2026-04-25 ogt + Claude Sonnet 4.6 (P0 備份告警升級修復):
# 備份失敗 > 24h 不是「純資訊」,是 P0 故障,必須走 TYPE-3 觸發 LLM 分析 + 自動修復
# BackupRestoreTestFailed 屬測試驗證類,不受 age 升級影響(仍 TYPE-1
_BACKUP_AGE_UPGRADE_NAMES = {
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
}
_BACKUP_AGE_THRESHOLD_HOURS = 24.0
if alertname in _BACKUP_AGE_UPGRADE_NAMES and age_hours > _BACKUP_AGE_THRESHOLD_HOURS:
return "backup_failure", "TYPE-3"
# 2026-04-12 ogt: 補入 DeadMansSwitchHEARTBEAT_ALERT_NAMES 中但之前漏掉)
if (
"watchdog" in alertname_lower
or "deadmansswitch" in alertname_lower
or alertname == "Heartbeat"
or alertname in _BACKUP_TYPE1_NAMES
or alertname.startswith("HostBackup")
):
return "backup", "TYPE-1"
# 2026-04-18 ogt + Claude Opus 4.7: 擴規則降 general 兜底MASTER §7.1 #7 <10%
# 根據 7d 實測 general 17 種 alertname 整理:
#
# 5.1 測試告警攔截(避免污染生產指標)
# TestAlert / FingerprintTest / E2ETestAlert / ADR089Test / L4ClosureLoop
# FP[A-Z]... / *FreshUniq* → test category (TYPE-1 純通知)
if (
alertname.startswith(("Test", "FingerprintTest", "ADR089", "L4Closure", "FPTest"))
or "FreshUniq" in alertname
or alertname in ("E2ETestAlert",)
or alertname.startswith("FP") and alertname[2:3].isupper() # FPTestB, FPTestA
):
return "test", "TYPE-1"
# 5.2 HighCPU / HighMemory / 其他 High* 主機資源類
if alertname.startswith(("HighCPU", "HighMemory", "HighMem", "HighDisk", "HighLoad")):
return "host_resource", "TYPE-3"
# 5.3 TLS / SSL / ProbeFailure → ssl_cert 或 external_site
if (
alertname.startswith(("TLS", "SSL", "Certificate"))
or "ProbeFailure" in alertname
or alertname in ("TestConnectivity",) # ProbeFailure 同義
):
return "ssl_cert", "TYPE-3"
# 5.4 PostgreSQL 詳盡(補 PostgreSQL* 變體,原 rule 用 startswith("Postgres")
# 按理涵蓋 PostgreSQLDiskGrowthRate 但實測落 general → 加保險規則)
if (
alertname.startswith(("PostgreSQL", "MySQL", "MongoDB"))
or "DiskGrowthRate" in alertname
):
return "database", "TYPE-3"
# 6. 主機資源(從 infrastructure 分離ADR-075 統帥決議)
if alertname.startswith("Host"):
return "host_resource", "TYPE-3"
# 7. Docker 容器
if alertname.startswith("Docker"):
return "infrastructure", "TYPE-3"
# 8. K8s統帥決議統一用 kubernetes不用 k8s_workload
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
return "kubernetes", "TYPE-3"
# 9. 資料庫
if alertname.startswith(("Postgres", "Redis")):
return "database", "TYPE-3"
# 10. 物件儲存
if alertname == "MinIODown":
return "storage", "TYPE-3"
# 11. DevOps 工具ADR-075 修正:從 general 分離)
if alertname in (
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
):
return "devops_tool", "TYPE-3"
# 12. 外部網站ADR-075 修正:從 general 分離)
if alertname in (
"MoWoooWorkDown", "TsenyangWebsiteDown",
"StockWoooWorkDown", "BitanWoooWorkDown",
):
return "external_site", "TYPE-3"
# 13. SSL 憑證ADR-075 修正:從 general 分離)
# ≥14 天→TYPE-1提醒無需審核<14 天→TYPE-3緊急審核
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
days = int((labels or {}).get("days_remaining", 0)) if labels else 0
return "ssl_cert", ("TYPE-1" if days >= 14 else "TYPE-3")
# 14. cAdvisor 監控工具P0.5 2026-04-24 ogt: 從 general 分離,避免監控工具誤入 general
if alertname.startswith(("Cadvisor", "cadvisor", "CAdvisor")):
return "infrastructure", "TYPE-2"
# 15. CoreDNSP0.5 2026-04-24 ogt: 從 general 分離)
if alertname.startswith(("CoreDNS", "CoreDns", "Coredns")):
return "kubernetes", "TYPE-2"
return "general", "TYPE-3"
async def create_incident_for_approval(
    approval_id: str,
    risk_level: str,
    target_resource: str,
    namespace: str,
    alert_type: str,
    message: str,
    source: str = "alertmanager",
    alertname: str | None = None,
    alert_labels: dict | None = None,
    notification_type: str | None = None,  # ADR-073 Phase 2-2
    alert_category: str | None = None,  # ADR-073 Phase 2-2
) -> str:
    """
    Create the Incident that accompanies an Approval.

    The incident is written to Working Memory (Redis) and then to Episodic
    Memory (PostgreSQL), and a "webhook" timeline event is recorded. Failures
    of the frequency snapshot, the PostgreSQL write and the timeline write
    are logged and do not abort the call.

    Args:
        approval_id: Approval identifier; parsed with ``UUID(approval_id)``,
            so it must be a valid UUID string.
        risk_level: Mapped through ``_RISK_TO_SEVERITY`` (case-insensitive);
            unknown values fall back to ``Severity.P2``.
        target_resource: Resource the alert refers to.
        namespace: Namespace recorded in the signal labels.
        alert_type: Used as the alert name when ``alertname`` is not given.
        message: Stored as the signal's ``message`` annotation.
        source: Signal source.
        alertname: Explicit alert name, preferred over ``alert_type``.
        alert_labels: Extra labels; they override the derived
            namespace/resource/alertname labels on key collision.
        notification_type: Early-triage notification type.
        alert_category: Early-triage alert category.

    Returns:
        str: The new incident's ID.
    """
    incident_service = get_incident_service()
    severity = _RISK_TO_SEVERITY.get(risk_level.lower(), Severity.P2)
    effective_alertname = alertname or alert_type

    _labels: dict = {
        "namespace": namespace,
        "resource": target_resource,
        "alertname": effective_alertname,
        **(alert_labels or {}),
    }
    signal = Signal(
        alert_name=effective_alertname,
        severity=severity,
        source=source,
        fired_at=now_taipei(),
        labels=_labels,
        annotations={"message": message},
    )
    _affected_services = extract_affected_services(_labels, target_resource)

    # Record the anomaly first so the incident carries a frequency snapshot
    # taken at creation time (otherwise frequency_stats would stay empty).
    _freq_stats = None
    try:
        from src.services.anomaly_counter import get_anomaly_counter

        _anomaly_sig = {
            "alert_name": effective_alertname,
            "service": (alert_labels or {}).get("service", target_resource),
            "namespace": namespace,
            "error_type": (alert_labels or {}).get("error_type", alert_type),
        }
        _freq = await get_anomaly_counter().record_anomaly(_anomaly_sig)
        if _freq:
            _freq_stats = _freq.to_dict()
    except Exception as _freq_err:
        logger.warning("incident_frequency_stats_failed", error=str(_freq_err))

    _freq_model = None
    if _freq_stats:
        try:
            _freq_model = IncidentFrequencyStats(
                anomaly_key=_freq_stats.get("anomaly_key", "unknown"),
                count_1h=_freq_stats.get("count_1h", 0),
                count_24h=_freq_stats.get("count_24h", 0),
                count_7d=_freq_stats.get("count_7d", 0),
                count_30d=_freq_stats.get("count_30d", 0),
                escalation_level=_freq_stats.get("escalation_level"),
                auto_repair_count=_freq_stats.get("auto_repair_count", 0),
            )
        except Exception as _freq_model_err:
            # Still best-effort, but no longer silent: an invalid snapshot
            # would otherwise vanish without any trace in the logs.
            logger.warning(
                "incident_frequency_stats_invalid",
                approval_id=approval_id,
                error=str(_freq_model_err),
            )

    incident = Incident(
        status=IncidentStatus.INVESTIGATING,
        severity=severity,
        signals=[signal],
        affected_services=_affected_services,
        proposal_ids=[UUID(approval_id)],
        notification_type=notification_type,  # ADR-073 Phase 2-2
        alert_category=alert_category,  # ADR-073 Phase 2-2
        frequency_stats=_freq_model,
    )

    await incident_service.save_to_working_memory(incident)
    try:
        await incident_service.save_to_episodic_memory(incident)
    except Exception as _pg_err:
        logger.warning(
            "incident_episodic_memory_failed",
            incident_id=incident.incident_id,
            error=str(_pg_err),
        )

    try:
        from src.services.approval_db import get_timeline_service

        await get_timeline_service().add_event(
            event_type="webhook",
            status="success",
            title=f"Webhook alert received: {alertname or alert_type}",
            description=message,
            actor=source,
            actor_role="webhook",
            risk_level=risk_level,
            approval_id=approval_id,
            incident_id=incident.incident_id,
        )
    except Exception as _timeline_err:
        logger.warning(
            "incident_timeline_webhook_event_failed",
            incident_id=incident.incident_id,
            approval_id=approval_id,
            error=str(_timeline_err),
        )

    logger.info(
        "incident_created_for_approval",
        incident_id=incident.incident_id,
        approval_id=approval_id,
        severity=severity.value,
        target=target_resource,
    )
    return incident.incident_id
# =============================================================================
# Legacy Value Normalization (方案 C - 代碼相容舊格式)
# =============================================================================
# 問題: Redis 有舊 Enum 值 (status='open', severity='critical')
# 解法: 解析時正規化,不動 Redis 資料
# 回滾: git revert (秒級恢復)
# =============================================================================
def normalize_status(value: str | IncidentStatus) -> str:
    """
    Normalize a possibly-legacy IncidentStatus value to its current string.

    Resolution order:
        - an ``IncidentStatus`` member -> its ``.value``
        - an exact member *name* -> that member's ``.value``
        - a stripped, lower-cased match of a current value -> that value
        - legacy ``'open'`` -> ``'investigating'``
        - anything else -> the input's ``str()`` form, unchanged
    """
    if isinstance(value, IncidentStatus):
        return value.value

    text = str(value)
    by_name = IncidentStatus.__members__.get(text)
    if by_name is not None:
        return by_name.value

    lowered = text.strip().lower()
    if any(lowered == status.value for status in IncidentStatus):
        return lowered

    legacy_aliases = {"open": "investigating"}
    if lowered in legacy_aliases:
        return legacy_aliases[lowered]
    return text
def normalize_severity(value: str | Severity) -> str:
    """
    Normalize a possibly-legacy Severity value to its current string.

    Resolution order:
        - a ``Severity`` member -> its ``.value``
        - an exact member *name* -> that member's ``.value``
        - a legacy label (stripped, case-insensitive):
            'critical' -> 'P0', 'high' -> 'P1',
            'warning' / 'medium' -> 'P2',
            'info' / 'low' / 'none' -> 'P3'
        - a stripped, case-insensitive match of a current ``Severity`` value
          -> that value (mirrors what ``normalize_status`` does, so inputs
          that differ only in case or surrounding whitespace still resolve)
        - anything else -> the input's ``str()`` form, unchanged
    """
    if isinstance(value, Severity):
        return value.value

    raw = str(value)
    if raw in Severity.__members__:
        return Severity[raw].value

    normalized = raw.strip().lower()
    legacy_map = {
        "critical": "P0",
        "high": "P1",
        "warning": "P2",
        "medium": "P2",
        "info": "P3",
        "low": "P3",
        "none": "P3",
    }
    if normalized in legacy_map:
        return legacy_map[normalized]

    # Fallback added after the legacy map so every previously-mapped input
    # keeps its old result; only inputs that used to come back unchanged can
    # now resolve to a valid value.
    for level in Severity:
        if str(level.value).lower() == normalized:
            return level.value
    return raw
def parse_decision_chain(value: Any, incident_id: str | None = None):
"""Best-effort restore of legacy decision_chain payloads from PostgreSQL."""
if not value:
return None
from src.models.incident import AIDecisionChain
if not isinstance(value, dict):
logger.warning(
"legacy_decision_chain_skipped",
incident_id=incident_id,
value_type=type(value).__name__,
)
return None
try:
return AIDecisionChain(**value)
except Exception as exc:
logger.warning(
"decision_chain_parse_failed",
incident_id=incident_id,
error=str(exc),
)
return None
def parse_incident_outcome(value: Any, incident_id: str | None = None):
"""Best-effort restore of legacy outcome payloads from PostgreSQL."""
if not value:
return None
from src.models.incident import IncidentOutcome
if not isinstance(value, dict):
logger.warning(
"legacy_incident_outcome_skipped",
incident_id=incident_id,
value_type=type(value).__name__,
)
return None
try:
return IncidentOutcome(**value)
except Exception as exc:
logger.warning(
"incident_outcome_parse_failed",
incident_id=incident_id,
error=str(exc),
)
return None
# =============================================================================
# Constants
# =============================================================================
# Prefix of the Redis key under which each incident document is stored.
INCIDENT_KEY_PREFIX = "incident:"
# Working Memory TTL: 7 days expressed in seconds (= 604800).
WORKING_MEMORY_TTL = 7 * 24 * 60 * 60
# =============================================================================
# Incident Service
# =============================================================================
class IncidentService:
"""
雙層記憶服務
職責:
1. Working Memory (Redis): 活躍事件快取
2. Episodic Memory (PostgreSQL): 歷史事件持久化
使用方式:
service = IncidentService()
incident = await service.create_incident_from_signal(signal_data)
"""
# =========================================================================
# Working Memory (Redis)
# =========================================================================
async def save_to_working_memory(self, incident: Incident) -> bool:
    """
    Write an Incident to Working Memory (Redis).

    The incident is serialized to JSON and stored with a plain SET under
    ``incident:{incident_id}`` with a TTL of ``WORKING_MEMORY_TTL`` seconds.

    Returns:
        bool: True when the write succeeded, False when any exception
        occurred (the exception is logged, not raised).
    """
    redis_client = get_redis()
    key = f"{INCIDENT_KEY_PREFIX}{incident.incident_id}"
    try:
        payload = incident.model_dump_json()
        await redis_client.set(key, payload, ex=WORKING_MEMORY_TTL)
        logger.info(
            "working_memory_saved",
            incident_id=incident.incident_id,
            key=key,
            ttl_seconds=WORKING_MEMORY_TTL,
        )
    except Exception as e:
        logger.exception(
            "working_memory_save_error",
            incident_id=incident.incident_id,
            error=str(e),
        )
        return False
    return True
async def get_from_working_memory(self, incident_id: str) -> Incident | None:
    """
    Read an Incident from Working Memory (Redis).

    Legacy enum values (top-level ``status`` / ``severity`` and each signal's
    ``severity``) are normalized before validation.

    Returns:
        Incident | None: The incident, or None when the key does not exist or
        any error occurs while reading/parsing (the error is logged).
    """
    redis_client = get_redis()
    key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
    try:
        raw = await redis_client.get(key)
        if raw is None:
            return None

        payload = json.loads(raw)
        # Normalize legacy enum spellings in place before model validation.
        for field_name, normalize in (
            ("status", normalize_status),
            ("severity", normalize_severity),
        ):
            if field_name in payload:
                payload[field_name] = normalize(payload[field_name])
        for signal_entry in payload.get("signals", []):
            if "severity" in signal_entry:
                signal_entry["severity"] = normalize_severity(signal_entry["severity"])

        return Incident.model_validate(payload)
    except Exception as e:
        logger.exception(
            "working_memory_get_error",
            incident_id=incident_id,
            error=str(e),
        )
        return None
async def get_active_incidents(self) -> list[Incident]:
    """
    List active Incidents from Working Memory (Redis).

    Scans every ``incident:*`` key (skipping keys containing ``:idx:``),
    normalizes legacy enum values, and keeps incidents whose status is
    INVESTIGATING or MITIGATING. Entries that fail to parse are logged and
    skipped.

    Returns:
        list[Incident]: Active incidents; an empty list if the scan itself
        fails (the error is logged).
    """
    redis_client = get_redis()
    active: list[Incident] = []
    try:
        pattern = f"{INCIDENT_KEY_PREFIX}*"
        async for key in redis_client.scan_iter(match=pattern, count=100):
            # Index keys share the prefix but do not hold incident documents.
            if ":idx:" in key:
                continue

            raw = await redis_client.get(key)
            if raw is None:
                continue

            try:
                payload = json.loads(raw)
                for field_name, normalize in (
                    ("status", normalize_status),
                    ("severity", normalize_severity),
                ):
                    if field_name in payload:
                        payload[field_name] = normalize(payload[field_name])
                for signal_entry in payload.get("signals", []):
                    if "severity" in signal_entry:
                        signal_entry["severity"] = normalize_severity(
                            signal_entry["severity"]
                        )

                candidate = Incident.model_validate(payload)
                is_active = candidate.status in (
                    IncidentStatus.INVESTIGATING,
                    IncidentStatus.MITIGATING,
                )
                if is_active:
                    active.append(candidate)
            except Exception as e:
                logger.warning(
                    "incident_parse_error",
                    key=key,
                    error=str(e),
                )
                continue

        logger.info(
            "get_active_incidents",
            count=len(active),
        )
        return active
    except Exception as e:
        logger.exception(
            "get_active_incidents_error",
            error=str(e),
        )
        return []
# =========================================================================
# Episodic Memory (PostgreSQL)
# =========================================================================
async def save_to_episodic_memory(self, incident: Incident) -> bool:
    """
    Write an Incident to Episodic Memory (PostgreSQL).

    Builds an ``IncidentRecord`` from the incident and adds it to the session
    provided by ``get_db_context()``.

    Returns:
        bool: True on success, False when any exception occurred (the
        exception is logged, not raised).
    """
    try:
        async with get_db_context() as db:
            # incidents.alertname column (ADR-073 Phase 2): prefer the first
            # signal's "alertname" label, then its alert_name.
            if incident.signals:
                first_signal = incident.signals[0]
                _alertname = (
                    first_signal.labels.get("alertname") or first_signal.alert_name
                )
            else:
                _alertname = None

            # model_dump(mode="json") keeps datetimes JSON-serializable.
            chain_json = (
                incident.decision_chain.model_dump(mode="json")
                if incident.decision_chain
                else None
            )
            outcome_json = (
                incident.outcome.model_dump(mode="json")
                if incident.outcome
                else None
            )

            record = IncidentRecord(
                incident_id=incident.incident_id,
                status=incident.status.value,
                severity=incident.severity.value,
                signals=[s.model_dump(mode="json") for s in incident.signals],
                affected_services=incident.affected_services,
                decision_chain=chain_json,
                proposal_ids=[str(pid) for pid in incident.proposal_ids],
                outcome=outcome_json,
                created_at=incident.created_at,
                updated_at=incident.updated_at,
                resolved_at=incident.resolved_at,
                closed_at=incident.closed_at,
                ttl_days=incident.ttl_days,
                vectorized=incident.vectorized,
                # ADR-073 Phase 2-2: the three classification columns.
                alertname=_alertname,
                notification_type=incident.notification_type,
                alert_category=incident.alert_category,
            )
            db.add(record)
            # The commit is handled by get_db_context.

        logger.info(
            "episodic_memory_saved",
            incident_id=incident.incident_id,
            table="incidents",
        )
        return True
    except Exception as e:
        logger.exception(
            "episodic_memory_save_error",
            incident_id=incident.incident_id,
            error=str(e),
        )
        return False
async def get_from_episodic_memory(self, incident_id: str) -> Incident | None:
    """
    Read an Incident from Episodic Memory (PostgreSQL).

    Returns:
        Incident | None: The incident converted back to the Pydantic model,
        or None when no row matches or any error occurs (the error is logged).
    """
    try:
        async with get_db_context() as db:
            from sqlalchemy import select

            query = select(IncidentRecord).where(
                IncidentRecord.incident_id == incident_id
            )
            row = (await db.execute(query)).scalar_one_or_none()
            if row is not None:
                # Convert the SQLAlchemy row back into the Pydantic model.
                return self._record_to_incident(row)
            return None
    except Exception as e:
        logger.exception(
            "episodic_memory_get_error",
            incident_id=incident_id,
            error=str(e),
        )
        return None
def _record_to_incident(self, record: IncidentRecord) -> Incident:
    """
    Convert a SQLAlchemy ``IncidentRecord`` into a Pydantic ``Incident``.

    Legacy enum values are normalized on the way: each stored signal's
    ``severity`` plus the record's own ``status`` and ``severity``.
    ``decision_chain`` and ``outcome`` are restored best-effort.
    """
    restored_signals = []
    for stored in record.signals or []:
        fields = stored.copy()
        if "severity" in fields:
            fields["severity"] = normalize_severity(fields["severity"])
        restored_signals.append(Signal(**fields))

    restored_chain = parse_decision_chain(
        record.decision_chain,
        incident_id=record.incident_id,
    )
    restored_outcome = parse_incident_outcome(
        record.outcome,
        incident_id=record.incident_id,
    )

    status_value = normalize_status(record.status)
    severity_value = normalize_severity(record.severity)

    incident_fields = {
        "incident_id": record.incident_id,
        "status": IncidentStatus(status_value),
        "severity": Severity(severity_value),
        "signals": restored_signals,
        "affected_services": record.affected_services or [],
        "decision_chain": restored_chain,
        "proposal_ids": record.proposal_ids or [],
        "outcome": restored_outcome,
        "created_at": record.created_at,
        "updated_at": record.updated_at,
        "resolved_at": record.resolved_at,
        "closed_at": record.closed_at,
        "ttl_days": record.ttl_days,
        # Read back from PostgreSQL, so this is always True.
        "persisted_to_pg": True,
        "vectorized": record.vectorized,
        # ADR-073: classification fields must be restored from the DB,
        # otherwise downstream KM writes see them as "unknown".
        "notification_type": record.notification_type,
        "alert_category": record.alert_category,
    }
    return Incident(**incident_fields)
# =========================================================================
# 雙層寫入核心邏輯
# =========================================================================
async def create_incident_from_signal(
    self,
    signal_data: dict[str, Any],
    frequency_stats: dict[str, Any] | None = None,
) -> Incident | None:
    """
    Build an Incident from a signal payload and write it to both memory layers.

    Steps (Phase 6.2):
        0. Debounce on ``signal_data["fingerprint"]`` (180 s Redis key).
        1. Build the ``Signal``.
        2. Build the ``Incident``, including frequency stats when supplied
           (Phase 21 / ADR-037).
        3. Write to Working Memory (Redis).
        4. Write to Episodic Memory (PostgreSQL).
        5. On PostgreSQL success set ``persisted_to_pg = True`` and, if the
           first Redis write succeeded, re-save to Redis with the new flag.
        6. Log the dual-write result and record a timeline event.

    A failed Redis or PostgreSQL write does not abort the call; the incident
    is still returned.

    Args:
        signal_data: Signal payload received from the Redis Stream.
        frequency_stats: ADR-037 anomaly frequency statistics (optional).

    Returns:
        Incident | None: The incident; None when the signal was debounced or
        an unexpected exception occurred (logged).
    """
    try:
        # 0. Debounce - guards against alert storms.
        fingerprint = signal_data.get("fingerprint")
        if fingerprint:
            try:
                redis_client = get_redis()
                debounce_key = f"debounce:{fingerprint}"
                # SET NX succeeds only for a new fingerprint; the key expires
                # after 3 minutes (180 s).
                is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True)
                if not is_new:
                    logger.info(
                        "incident_debounced",
                        fingerprint=fingerprint,
                        reason="Duplicate signal within 3 minutes",
                    )
                    return None
            except Exception as e:
                # A Redis error during debounce is logged and the signal is
                # processed as if it were new.
                logger.warning("incident_debounce_redis_error", error=str(e))

        # 1. Parse the Signal.
        signal = Signal(
            alert_name=signal_data.get("alert_name", "unknown"),
            severity=self._parse_severity(signal_data.get("severity", "warning")),
            source=self._parse_source(signal_data.get("source", "manual")),
            fired_at=datetime.now(UTC),
            labels=self._parse_dict(signal_data.get("labels", "{}")),
            annotations=self._parse_dict(signal_data.get("annotations", "{}")),
            fingerprint=signal_data.get("fingerprint"),
        )

        # 2. Build the Incident (with frequency stats when provided).
        # ADR-037: recurring anomalies need a permanent fix, not just restarts.
        freq_stats = None
        if frequency_stats:
            freq_stats = IncidentFrequencyStats(
                anomaly_key=frequency_stats.get("anomaly_key", "unknown"),
                count_1h=frequency_stats.get("count_1h", 0),
                count_24h=frequency_stats.get("count_24h", 0),
                count_7d=frequency_stats.get("count_7d", 0),
                count_30d=frequency_stats.get("count_30d", 0),
                escalation_level=frequency_stats.get("escalation_level"),
                auto_repair_count=frequency_stats.get("auto_repair_count", 0),
            )
        incident = Incident(
            severity=signal.severity,
            signals=[signal],
            affected_services=[signal_data.get("target", "unknown")],
            frequency_stats=freq_stats,
        )
        logger.info(
            "incident_created",
            incident_id=incident.incident_id,
            severity=incident.severity.value,
            signal_count=len(incident.signals),
        )

        # 3. Write to Working Memory (Redis).
        redis_success = await self.save_to_working_memory(incident)

        # 4. Write to Episodic Memory (PostgreSQL).
        pg_success = await self.save_to_episodic_memory(incident)

        # 5. Update the persistence flag.
        if pg_success:
            incident.persisted_to_pg = True
            # Refresh the copy in Redis so it carries the updated flag.
            if redis_success:
                await self.save_to_working_memory(incident)

        # 6. Log the outcome of the dual-layer write.
        logger.info(
            "dual_layer_memory_result",
            incident_id=incident.incident_id,
            redis_success=redis_success,
            pg_success=pg_success,
            persisted_to_pg=incident.persisted_to_pg,
        )

        # Timeline event is best-effort: failures are logged and ignored.
        try:
            from src.services.approval_db import get_timeline_service
            await get_timeline_service().add_event(
                event_type="webhook",
                status="success" if redis_success or pg_success else "warning",
                title=f"Signal received: {signal.alert_name}",
                description=(
                    signal.annotations.get("message")
                    or signal.annotations.get("description")
                    or signal.annotations.get("summary")
                ),
                actor=signal.source,
                actor_role="webhook",
                risk_level=incident.severity.value,
                incident_id=incident.incident_id,
            )
        except Exception as timeline_error:
            logger.warning(
                "incident_timeline_signal_event_failed",
                incident_id=incident.incident_id,
                error=str(timeline_error),
            )
        return incident
    except Exception as e:
        logger.exception(
            "create_incident_error",
            error=str(e),
        )
        return None
def _parse_source(
self,
source_str: str,
) -> Literal["prometheus", "signoz", "alertmanager", "manual", "telegram"]:
"""
解析來源字串,映射到 Signal 允許的 Literal 值
不在白名單中的來源一律映射為 'manual'
"""
valid_sources = {"prometheus", "signoz", "alertmanager", "manual", "telegram"}
if source_str.lower() in valid_sources:
return source_str.lower() # type: ignore
return "manual"
def _parse_severity(self, severity_str: str) -> Severity:
    """
    Translate a severity label (case-insensitive) into a ``Severity``.

    critical -> P0, high -> P1, warning/medium -> P2, low/info -> P3;
    any other label falls back to P2.
    """
    label = severity_str.lower()
    if label == "critical":
        return Severity.P0
    if label == "high":
        return Severity.P1
    if label in ("low", "info"):
        return Severity.P3
    # "warning", "medium" and every unrecognized label.
    return Severity.P2
def _parse_dict(self, value: str | dict) -> dict[str, str]:
"""解析字典字串或字典"""
if isinstance(value, dict):
return {str(k): str(v) for k, v in value.items()}
if isinstance(value, str):
try:
# 嘗試解析 JSON
parsed = json.loads(value.replace("'", '"'))
return {str(k): str(v) for k, v in parsed.items()}
except (json.JSONDecodeError, TypeError):
return {}
return {}
# =========================================================================
# Phase 17 P0: Router 層違規修復 - 新增方法
# =========================================================================
async def update_outcome(
    self,
    incident_id: str,
    effectiveness_score: int | None = None,
    human_feedback: str | None = None,
    learning_notes: str | None = None,
    should_remember: bool = True,
) -> Incident | None:
    """
    Update an Incident's outcome (human feedback).

    Phase 17: moved from the Router layer into the Service layer.

    The incident is read from Working Memory only. The updated incident is
    written back to Redis first; a Redis failure aborts the call. The
    PostgreSQL update is best-effort: a failure is logged and the updated
    incident is still returned.

    Args:
        incident_id: Incident ID.
        effectiveness_score: Effectiveness rating (1-5); unchanged when None.
        human_feedback: Free-text feedback; unchanged when None.
        learning_notes: Learning notes; unchanged when None.
        should_remember: Whether to keep the incident in long-term memory;
            always written, including its default.

    Returns:
        Incident | None: The updated incident; None when it is not found in
        Working Memory or the Redis write fails.
    """
    from src.models.incident import IncidentOutcome
    from src.repositories.incident_repository import get_incident_repository
    from src.utils.timezone import now_taipei

    # 1. Read from Working Memory.
    incident = await self.get_from_working_memory(incident_id)
    if incident is None:
        logger.warning("incident_not_found_for_outcome", incident_id=incident_id)
        return None

    # 2. Apply the outcome fields that were supplied.
    if incident.outcome is None:
        incident.outcome = IncidentOutcome()
    if effectiveness_score is not None:
        incident.outcome.effectiveness_score = effectiveness_score
    if human_feedback is not None:
        incident.outcome.human_feedback = human_feedback
    if learning_notes is not None:
        incident.outcome.learning_notes = learning_notes
    incident.outcome.should_remember = should_remember
    incident.updated_at = now_taipei()

    # 3. Write to Working Memory.
    redis_success = await self.save_to_working_memory(incident)
    if not redis_success:
        logger.error("outcome_redis_write_failed", incident_id=incident_id)
        return None

    # 4. Sync to Episodic Memory (PostgreSQL).
    try:
        repo = get_incident_repository()
        await repo.update_outcome(
            incident_id=incident_id,
            outcome=incident.outcome.model_dump(mode="json"),
            # Reuse the timestamp already stored in Redis so both layers
            # agree, instead of taking a second, slightly later reading.
            updated_at=incident.updated_at,
        )
        logger.info("outcome_db_updated", incident_id=incident_id)
    except Exception as e:
        # A DB failure does not affect the main flow.
        logger.warning(
            "outcome_db_update_failed",
            incident_id=incident_id,
            error=str(e),
        )
    return incident
async def resolve_incident(
self,
incident_id: str,
resolution_type: str = "manual",
emit_postmortem: bool = True,
) -> Incident | None:
"""
將 Incident 狀態更新為 RESOLVED
Phase 17: 從 Router 層遷移至 Service 層
Args:
incident_id: 事件 ID
resolution_type: "manual"(預設)| "timeout"Approval 48h 逾期自動結案)
emit_postmortem: 是否送出使用者可見 Postmortem。批次歷史 reconciler
會關閉此開關,避免一次補關大量舊 incident 時洗版 Telegram。
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
新增 resolution_type="timeout" 路徑 — Approval EXPIRED 時由
approval_timeout_resolver 呼叫,記錄 "timeout_ignored" disposition
而非 "manual_resolved",確保 EWMA 採樣正確區分人工結案與逾期拋棄。
Returns:
Incident | None: 更新後的事件,失敗返回 None
"""
from src.repositories.incident_repository import get_incident_repository
from src.utils.timezone import now_taipei
# 1. 從 Working Memory 讀取Redis TTL 過期時退回 PostgreSQL。
hydrated_from_episodic = False
incident = await self.get_from_working_memory(incident_id)
if incident is None:
incident = await self.get_from_episodic_memory(incident_id)
if incident is None:
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
return None
hydrated_from_episodic = True
logger.info(
"incident_resolve_hydrated_from_episodic_memory",
incident_id=incident_id,
resolution_type=resolution_type,
)
# 1.5 F2 (2026-05-07 ogt + Codex + Claude Sonnet 4.6) — 冪等保護:
# 已經 RESOLVED/CLOSED 的 incident 直接 return existing避免後續所有副作用
# 重複觸發postmortem / KB extract / KM convert / disposition / Telegram
# F2 NO_ACTION 路徑會頻繁呼叫 resolve_incident必須擋在 status mutation 之前。
if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
logger.info(
"incident_resolve_skipped_already_resolved",
incident_id=incident_id,
resolution_type=resolution_type,
)
return incident
# 2. 更新狀態
incident.status = IncidentStatus.RESOLVED
incident.resolved_at = now_taipei()
incident.updated_at = now_taipei()
# 3. 寫入 Working Memory。Redis TTL 已過的歷史 DB-only 事件只更新 DB
# 不重新灌回 Redis working memory避免舊事件回流成 active workload。
if hydrated_from_episodic:
logger.info("resolve_db_only_incident", incident_id=incident_id)
else:
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
# 4. 同步到 Episodic Memory
try:
repo = get_incident_repository()
await repo.update_status(
incident_id=incident_id,
status="resolved",
updated_at=incident.updated_at,
resolved_at=incident.resolved_at,
)
logger.info("resolve_db_updated", incident_id=incident_id)
except Exception as e:
logger.warning(
"resolve_db_update_failed",
incident_id=incident_id,
error=str(e),
)
# KB Phase 2-A: 自動萃取 KB 草稿 (fire-and-forget, 2026-04-03 ogt)
try:
import asyncio
from src.services.knowledge_extractor_service import get_knowledge_extractor
asyncio.create_task(
get_knowledge_extractor().extract_from_incident(incident)
)
except Exception:
logger.exception("kb_extract_task_create_failed", incident_id=incident_id)
# M2 修復 2026-04-28 ogt + Claude Sonnet 4.6: 改走 KMWriter 統一契約
# 原路徑自製 if/else + create_task沒有 retry / DLQ / 冪等。
# 現在:用 km_write_with_flag 的 DLQ 包裝,在呼叫點加 retry + DLQ 保護。
# km_conversion_service 內部邏輯不改(保留 notification_type 分類等複雜決策),
# 改為在呼叫點加統一契約保護(指數退避 3 次 + DLQ 失敗回收)。
try:
import asyncio
from src.core.config import settings
from src.services.km_conversion_service import get_km_conversion_service
from src.services.km_writer import (
_RETRY_BASE_DELAY,
_RETRY_MAX,
KMWritePayload,
_is_retriable,
_write_to_dlq,
)
_conversion_svc = get_km_conversion_service()
_effective_timeout = settings.KM_WRITE_TIMEOUT_SECONDS
_last_exc: Exception | None = None
for _attempt in range(1, _RETRY_MAX + 1):
try:
await asyncio.wait_for(
_conversion_svc.convert(incident),
timeout=_effective_timeout,
)
break # 成功,離開 retry loop
except TimeoutError:
logger.warning(
"km_conversion_timeout",
incident_id=incident_id,
timeout_sec=_effective_timeout,
attempt=_attempt,
)
# Timeout 不重試(重試也會 timeout
await _write_to_dlq(
KMWritePayload(
path_type="incident_resolve",
incident_id=incident_id,
entry_create_kwargs={"title": f"[DLQ] resolve {incident_id}"},
),
f"km_conversion_timeout_{_effective_timeout}s",
)
break
except Exception as _exc:
_last_exc = _exc
if _attempt < _RETRY_MAX and _is_retriable(_exc):
_delay = _RETRY_BASE_DELAY * (2 ** (_attempt - 1))
logger.warning(
"km_conversion_retry",
incident_id=incident_id,
attempt=_attempt,
delay_sec=_delay,
error=str(_exc),
)
await asyncio.sleep(_delay)
else:
logger.error(
"km_conversion_failed_all_retries",
incident_id=incident_id,
error=str(_exc),
)
await _write_to_dlq(
KMWritePayload(
path_type="incident_resolve",
incident_id=incident_id,
entry_create_kwargs={"title": f"[DLQ] resolve {incident_id}"},
),
str(_exc),
)
break
except Exception:
logger.exception("km_conversion_task_create_failed", incident_id=incident_id)
# 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷
# I1+S1 Fix: 委託 derive_key_from_incident() 統一推導
try:
from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter
counter = get_anomaly_counter()
anomaly_key = AnomalyCounter.derive_key_from_incident(incident)
if anomaly_key:
disposition = await counter.get_disposition_stats(anomaly_key)
has_system_resolution = (
disposition["auto_repair"] > 0
or disposition["human_approved"] > 0
or disposition["cold_start_trust"] > 0
)
if not has_system_resolution:
disp = "timeout_ignored" if resolution_type == "timeout" else "manual_resolved"
await counter.record_disposition(anomaly_key, disp)
except Exception as _disp_e:
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
if emit_postmortem:
# MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝
# Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成
# 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑
try:
import asyncio
from src.services.report_generation_service import (
get_report_generation_service,
)
alertname = (
incident.signals[0].labels.get("alertname", "UnknownAlert")
if incident.signals else "UnknownAlert"
)
title = f"{alertname}{', '.join(incident.affected_services or ['N/A'])}"
root_cause = None
resolution_action = None
ai_provider = None
auto_repaired = False
if incident.decision_chain:
root_cause = incident.decision_chain.hypothesis
ai_provider = incident.decision_chain.model_used
if incident.outcome:
resolution_action = (incident.outcome.learning_notes or None)
auto_repaired = bool(incident.outcome.execution_success)
asyncio.create_task(
get_report_generation_service().trigger_postmortem(
incident_id=incident.incident_id,
title=title,
created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at,
resolved_at=incident.resolved_at,
root_cause=root_cause,
resolution_action=resolution_action,
ai_provider=ai_provider,
auto_repaired=auto_repaired,
)
)
except Exception as _pm_e:
logger.exception("postmortem_trigger_failed",
incident_id=incident_id, error=str(_pm_e))
else:
logger.info(
"postmortem_suppressed_for_batch_reconcile",
incident_id=incident_id,
resolution_type=resolution_type,
)
return incident
async def find_by_proposal_id(self, proposal_id: str) -> Incident | None:
    """
    Find the incident that references the given proposal.

    Phase 17: moved from the Router layer into the Service layer.

    Scans every Working Memory (Redis) key matching
    ``{INCIDENT_KEY_PREFIX}INC-*``, deserialises each payload and returns
    the first incident whose ``proposal_ids`` contains the proposal UUID.
    Payloads that fail to parse are logged and skipped; any other failure
    (including a ``proposal_id`` that is not a valid UUID) is logged and
    results in ``None``.

    Args:
        proposal_id: Proposal ID (UUID string)

    Returns:
        Incident | None: the matching incident, or None when nothing matches
    """
    from uuid import UUID

    redis_client = get_redis()
    try:
        wanted = UUID(proposal_id)
        key_pattern = f"{INCIDENT_KEY_PREFIX}INC-*"
        async for key in redis_client.scan_iter(match=key_pattern, count=100):
            raw = await redis_client.get(key)
            if raw is None:
                # Key disappeared between SCAN and GET; nothing to inspect.
                continue
            try:
                payload = json.loads(raw)
                # Plan C: normalise legacy-format enum values before validation.
                for field, normalise in (
                    ("status", normalize_status),
                    ("severity", normalize_severity),
                ):
                    if field in payload:
                        payload[field] = normalise(payload[field])
                for signal_dict in payload.get("signals", []):
                    if "severity" in signal_dict:
                        signal_dict["severity"] = normalize_severity(
                            signal_dict["severity"]
                        )
                candidate = Incident.model_validate(payload)
                if wanted in candidate.proposal_ids:
                    return candidate
            except Exception as parse_err:
                # One unreadable record must not abort the whole scan.
                logger.warning(
                    "incident_parse_error_in_find",
                    key=key,
                    error=str(parse_err),
                )
    except Exception as scan_err:
        logger.exception(
            "find_by_proposal_id_error",
            proposal_id=proposal_id,
            error=str(scan_err),
        )
    return None
async def trigger_reanalysis(self, incident_id: str) -> dict:
    """
    Trigger a re-diagnosis of an incident (ADR-050 P2: reanalyze button).

    Dedup protection: a Redis key ``reanalyze_dedup:<incident_id>`` is set
    with NX and a 10-minute TTL, so the same incident can only be triggered
    once within that window.

    After a successful trigger the incident status is switched to
    ``IncidentStatus.ANALYZING`` (falling back to ``INVESTIGATING`` when the
    enum has no ``ANALYZING`` member) and written to Working Memory.

    The dedup key is released whenever the trigger did not actually go
    through (incident not found, Working Memory write failed, unexpected
    error after the key was acquired) so the caller can retry right away
    instead of being locked out for the whole TTL.

    Args:
        incident_id: Incident ID

    Returns:
        dict: {
            "triggered": bool,
            "message": str,
            "already_analyzing": bool,
        }

    2026-04-01 Claude Code (ADR-050 P2): reanalyze button handler
    """
    REANALYZE_TTL_SECONDS = 600  # 10-minute dedup TTL (ADR-050)
    dedup_key = f"reanalyze_dedup:{incident_id}"
    redis_client = None
    # True only while THIS call owns the dedup key; guards the cleanup in
    # the except-branch so we never delete a key set by another request.
    dedup_acquired = False
    try:
        redis_client = get_redis()
        # Dedup check (SET NX: only the first caller succeeds).
        is_new = await redis_client.set(
            dedup_key, "1", ex=REANALYZE_TTL_SECONDS, nx=True
        )
        if not is_new:
            logger.info(
                "reanalyze_deduplicated",
                incident_id=incident_id,
                reason="Already triggered within 10 minutes",
            )
            return {
                "triggered": False,
                "message": "重診已在進行中,請 10 分鐘後再試",
                "already_analyzing": True,
            }
        dedup_acquired = True

        # Load from Working Memory, falling back to Episodic Memory.
        incident = await self.get_from_working_memory(incident_id)
        if not incident:
            incident = await self.get_from_episodic_memory(incident_id)
        if not incident:
            # Drop the dedup key we just set so the next attempt can retry.
            await redis_client.delete(dedup_key)
            dedup_acquired = False
            logger.warning("reanalyze_incident_not_found", incident_id=incident_id)
            return {
                "triggered": False,
                "message": f"找不到事件 {incident_id}",
                "already_analyzing": False,
            }

        # Mark the status as analyzing so the AI engine can pick it up.
        # Local import kept from the original implementation.
        from src.models.incident import IncidentStatus

        # Fall back to INVESTIGATING when ANALYZING does not exist.
        analyzing_status = getattr(IncidentStatus, "ANALYZING", None) or getattr(
            IncidentStatus, "INVESTIGATING", None
        )
        if analyzing_status:
            incident.status = analyzing_status
            # save_to_working_memory reports success via its return value
            # (resolve_incident treats a falsy result as a failed write);
            # do not claim the trigger happened when nothing was persisted.
            saved = await self.save_to_working_memory(incident)
            if not saved:
                await redis_client.delete(dedup_key)
                dedup_acquired = False
                logger.error(
                    "reanalyze_redis_write_failed", incident_id=incident_id
                )
                return {
                    "triggered": False,
                    "message": "重診觸發失敗: 無法寫入 Working Memory",
                    "already_analyzing": False,
                }

        logger.info(
            "reanalyze_triggered",
            incident_id=incident_id,
            severity=incident.severity.value,
        )
        return {
            "triggered": True,
            "message": "重診已排程AI 正在分析中",
            "already_analyzing": False,
        }
    except Exception as e:
        logger.exception("reanalyze_failed", incident_id=incident_id, error=str(e))
        if dedup_acquired and redis_client is not None:
            # Best-effort release: a failed trigger must not block retries
            # for the full TTL. Cleanup errors are logged, never raised.
            try:
                await redis_client.delete(dedup_key)
            except Exception as cleanup_err:
                logger.warning(
                    "reanalyze_dedup_release_failed",
                    incident_id=incident_id,
                    error=str(cleanup_err),
                )
        return {
            "triggered": False,
            "message": f"重診觸發失敗: {str(e)[:80]}",
            "already_analyzing": False,
        }
# =============================================================================
# Singleton
# =============================================================================
# Module-level cache for the shared IncidentService; filled lazily by
# get_incident_service().
_incident_service: IncidentService | None = None


def get_incident_service() -> IncidentService:
    """Return the shared IncidentService instance, creating it on first use."""
    global _incident_service
    if _incident_service is not None:
        return _incident_service
    _incident_service = IncidentService()
    return _incident_service