Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
incident_service.resolve_incident() 結尾 fire-and-forget 呼叫 report_generation_service.trigger_postmortem(),補完孤兒服務的觸發路徑。 觸發條件(由 trigger_postmortem 內部判斷): - duration > POSTMORTEM_MIN_DURATION_MINUTES (10min) - 含 AI root_cause / resolution_action / provider / auto_repaired 背景: - report_generation_service.py 539 行服務於先前 session 建立 - main.py:322 已啟動 run_daily_report_loop(Task 4.1 ✅) - trigger_postmortem 在 src/ 下無呼叫方 → 本 commit 補上 MASTER 藍圖 Phase 4 至此完整收官: ✅ Task 4.1 日度巡檢報告(08:00 台北排程,生產環境已跑) ✅ Task 4.2 Postmortem 自動組裝(本 commit 接上 resolve hook) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
1205 lines
43 KiB
Python
1205 lines
43 KiB
Python
"""
|
||
Incident Service - Phase 6.2 雙層記憶寫入
|
||
==========================================
|
||
|
||
功能:
|
||
- Working Memory (Redis): 活躍事件,7 天 TTL
|
||
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
|
||
|
||
設計原則:
|
||
- 先寫 Redis (快),再寫 PostgreSQL (持久)
|
||
- 兩者都成功才算完成
|
||
- 失敗時記錄日誌但不中斷主流程
|
||
|
||
統帥鐵律:
|
||
- 禁止硬編碼 IP 或密碼,嚴格讀取 .env
|
||
- 所有寫入操作都必須有結構化日誌
|
||
|
||
C2 修正 (首席架構師審查 2026-04-10 Claude Sonnet 4.6 Asia/Taipei):
|
||
create_incident_for_approval + _extract_affected_services 從 Router 層移入此 Service 層
|
||
原違規: 業務邏輯 (Severity 映射, Signal 建立, Incident 建立) 放在 api/v1/webhooks.py
|
||
"""
|
||
|
||
import json
|
||
from datetime import UTC, datetime
|
||
from typing import Any, Literal
|
||
from uuid import UUID
|
||
|
||
import structlog
|
||
|
||
from src.core.redis_client import get_redis
|
||
from src.db.base import get_db_context
|
||
from src.db.models import IncidentRecord
|
||
from src.models.incident import (
|
||
Incident,
|
||
IncidentStatus,
|
||
Severity,
|
||
Signal,
|
||
)
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# =============================================================================
|
||
# C2 修正: 從 webhooks.py 遷入的業務邏輯
|
||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||
# =============================================================================
|
||
|
||
# 風險等級 → 事件嚴重度映射 (原在 webhooks.py)
|
||
_RISK_TO_SEVERITY = {
|
||
"critical": Severity.P0,
|
||
"high": Severity.P1,
|
||
"medium": Severity.P2,
|
||
"low": Severity.P3,
|
||
}
|
||
|
||
# I1 修正: 提升為 module-level frozenset,避免每次呼叫重建 (原在 webhooks.py 函數體內)
|
||
_INFRA_JOB_NAMES: frozenset[str] = frozenset(
|
||
j.lower().replace("-", "").replace("_", "")
|
||
for j in {"node", "node-exporter", "pushgateway", "blackbox",
|
||
"prometheus", "alertmanager", "cadvisor"}
|
||
)
|
||
|
||
|
||
def extract_affected_services(labels: dict, target_resource: str) -> list[str]:
|
||
"""
|
||
從告警 labels 提取真實服務名,防止 IP 或 alertname 污染 affected_services。
|
||
|
||
優先序:
|
||
1. component label(Docker-compose 層告警最可靠)
|
||
2. job label(排除 node-exporter / pushgateway 等基礎設施 job)
|
||
3. pod label(取 deployment name,去掉 hash suffix)
|
||
4. target_resource(不含冒號、不等於 alertname 時才採用)
|
||
5. 空列表(讓通用型 Playbook 透過空集合豁免規則匹配)
|
||
|
||
Phase 1 飛輪修復 — 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||
C2 修正: 從 api/v1/webhooks.py 移入 Service 層(純業務邏輯,無 I/O)
|
||
"""
|
||
alertname = labels.get("alertname", "")
|
||
|
||
if comp := labels.get("component"):
|
||
return [comp]
|
||
|
||
if job := labels.get("job"):
|
||
normalized = job.lower().replace("-", "").replace("_", "")
|
||
if normalized not in _INFRA_JOB_NAMES:
|
||
return [job]
|
||
|
||
if pod := labels.get("pod"):
|
||
parts = pod.rsplit("-", 2)
|
||
if len(parts) >= 3 and len(parts[-1]) == 5 and len(parts[-2]) in (9, 10):
|
||
return [parts[0]]
|
||
elif len(parts) >= 2:
|
||
return ["-".join(parts[:-1])]
|
||
|
||
if (target_resource
|
||
and ":" not in target_resource
|
||
and target_resource != alertname
|
||
and not target_resource[0].isdigit()):
|
||
return [target_resource]
|
||
|
||
return []
|
||
|
||
|
||
def classify_alert_early(alertname: str, severity: str, labels: dict | None = None) -> tuple[str, str]:
|
||
"""
|
||
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。
|
||
防止 HostBackupFailed 等告警被誤路由到 K8s executor。
|
||
|
||
規則優先順序(由高到低):
|
||
1. ConfigurationDrift / KubeConfigDrift → TYPE-4D (Config Drift 卡片)
|
||
2. severity=info/none → TYPE-1 (純資訊,無按鈕)
|
||
3. backup/heartbeat 關鍵字 → TYPE-1
|
||
4. Docker/Host 前綴 → infrastructure TYPE-3
|
||
5. Kube/Pod/Deploy/Node/Velero/ArgoCD 前綴 → kubernetes TYPE-3
|
||
6. Postgres/Redis 前綴 → database TYPE-3
|
||
7. 預設 → general TYPE-3
|
||
|
||
C3 修正 (首席架構師 CR 2026-04-13): 從 Router 層 (webhooks.py) 移入 Service 層
|
||
原違規: 業務邏輯函數定義在 api/v1/webhooks.py
|
||
|
||
Returns:
|
||
tuple[str, str]: (alert_category, notification_type)
|
||
"""
|
||
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
|
||
# 統帥決議:kubernetes(非 k8s_workload)、host_resource 從 infrastructure 分離。
|
||
alertname_lower = alertname.lower()
|
||
|
||
# 1. Config Drift(最高優先,類型特殊不受 severity 影響)
|
||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||
return "config_drift", "TYPE-4D"
|
||
|
||
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
|
||
# 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager
|
||
if alertname in (
|
||
"AlertChainBroken_Alertmanager",
|
||
"AlertChainBroken_Sentry",
|
||
"NoAlertsReceived",
|
||
"NoAlertsReceived2Hours",
|
||
"AlertChainUnhealthy",
|
||
"PrometheusNotConnectedToAlertmanager",
|
||
):
|
||
return "alertchain_health", "TYPE-8M"
|
||
|
||
# 3. 資安告警(高優先,防止被 severity/prefix 規則覆蓋)
|
||
# ADR-075 TYPE-5S (2026-04-12 ogt)
|
||
if any(alertname.startswith(p) for p in (
|
||
"UnauthorizedSSH", "KubeAudit", "CVECritical", "WAFAttack",
|
||
"PodAbnormal", "SecurityBreach",
|
||
)):
|
||
return "secops", "TYPE-5S"
|
||
|
||
# 4. 飛輪/AI 系統健康(優先於 severity 判斷)
|
||
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or any(
|
||
alertname.startswith(p) for p in ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
|
||
):
|
||
return "flywheel_health", "TYPE-8M"
|
||
|
||
# 4a. 業務/FinOps 告警(ADR-075 TYPE-6B)
|
||
if any(alertname.startswith(p) for p in (
|
||
"AITokenCost", "GeminiAPIError", "SLOBurn", "APIErrorBudget",
|
||
"MomoScraper", "ScraperSuccess",
|
||
)):
|
||
return "business", "TYPE-6B"
|
||
|
||
# 5. 純資訊
|
||
if severity in ("info", "none"):
|
||
return "info", "TYPE-1"
|
||
|
||
# 5. Backup / Heartbeat — 純資訊,不進 LLM
|
||
# HostBackupFailed 必須在 Host prefix 前攔截,否則被歸 host_resource/TYPE-3
|
||
# 2026-04-12 ogt: 只針對已知主機備份監控 alertname,不用寬泛關鍵字
|
||
# BackupJobFailed severity=warning 仍走 TYPE-3(見測試 test_backup_keyword_warning_not_type1)
|
||
_BACKUP_TYPE1_NAMES = {
|
||
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
|
||
"BackupRestoreTestFailed", "BackupRestoreTestStale",
|
||
}
|
||
# 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉)
|
||
if (
|
||
"watchdog" in alertname_lower
|
||
or "deadmansswitch" in alertname_lower
|
||
or alertname == "Heartbeat"
|
||
or alertname in _BACKUP_TYPE1_NAMES
|
||
or alertname.startswith("HostBackup")
|
||
):
|
||
return "backup", "TYPE-1"
|
||
|
||
# 6. 主機資源(從 infrastructure 分離,ADR-075 統帥決議)
|
||
if alertname.startswith("Host"):
|
||
return "host_resource", "TYPE-3"
|
||
|
||
# 7. Docker 容器
|
||
if alertname.startswith("Docker"):
|
||
return "infrastructure", "TYPE-3"
|
||
|
||
# 8. K8s(統帥決議:統一用 kubernetes,不用 k8s_workload)
|
||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||
return "kubernetes", "TYPE-3"
|
||
|
||
# 9. 資料庫
|
||
if alertname.startswith(("Postgres", "Redis")):
|
||
return "database", "TYPE-3"
|
||
|
||
# 10. 物件儲存
|
||
if alertname == "MinIODown":
|
||
return "storage", "TYPE-3"
|
||
|
||
# 11. DevOps 工具(ADR-075 修正:從 general 分離)
|
||
if alertname in (
|
||
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
|
||
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
|
||
):
|
||
return "devops_tool", "TYPE-3"
|
||
|
||
# 12. 外部網站(ADR-075 修正:從 general 分離)
|
||
if alertname in (
|
||
"MoWoooWorkDown", "TsenyangWebsiteDown",
|
||
"StockWoooWorkDown", "BitanWoooWorkDown",
|
||
):
|
||
return "external_site", "TYPE-3"
|
||
|
||
# 13. SSL 憑證(ADR-075 修正:從 general 分離)
|
||
# ≥14 天→TYPE-1(提醒,無需審核);<14 天→TYPE-3(緊急審核)
|
||
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
|
||
days = int((labels or {}).get("days_remaining", 0)) if labels else 0
|
||
return "ssl_cert", ("TYPE-1" if days >= 14 else "TYPE-3")
|
||
|
||
return "general", "TYPE-3"
|
||
|
||
|
||
async def create_incident_for_approval(
|
||
approval_id: str,
|
||
risk_level: str,
|
||
target_resource: str,
|
||
namespace: str,
|
||
alert_type: str,
|
||
message: str,
|
||
source: str = "alertmanager",
|
||
alertname: str | None = None,
|
||
alert_labels: dict | None = None,
|
||
notification_type: str | None = None, # ADR-073 Phase 2-2
|
||
alert_category: str | None = None, # ADR-073 Phase 2-2
|
||
) -> str:
|
||
"""
|
||
為 Approval 創建對應的 Incident (活躍事件同步)。
|
||
|
||
設計原則:
|
||
- Approval 和 Incident 必須同時存在
|
||
- Incident 存入 Redis (Working Memory) + PostgreSQL (Episodic Memory)
|
||
- 7 天 TTL 自動過期
|
||
|
||
C2 修正: 從 api/v1/webhooks.py 移入 Service 層(業務邏輯不屬 Router 層)
|
||
|
||
Returns:
|
||
str: Incident ID
|
||
"""
|
||
incident_service = get_incident_service()
|
||
severity = _RISK_TO_SEVERITY.get(risk_level.lower(), Severity.P2)
|
||
|
||
_labels: dict = {
|
||
"namespace": namespace,
|
||
"resource": target_resource,
|
||
"alertname": alertname or alert_type,
|
||
**(alert_labels or {}),
|
||
}
|
||
signal = Signal(
|
||
alert_name=alertname or alert_type,
|
||
severity=severity,
|
||
source=source,
|
||
fired_at=now_taipei(),
|
||
labels=_labels,
|
||
annotations={"message": message},
|
||
)
|
||
|
||
_affected_services = extract_affected_services(_labels, target_resource)
|
||
|
||
incident = Incident(
|
||
status=IncidentStatus.INVESTIGATING,
|
||
severity=severity,
|
||
signals=[signal],
|
||
affected_services=_affected_services,
|
||
proposal_ids=[UUID(approval_id)],
|
||
notification_type=notification_type, # ADR-073 Phase 2-2
|
||
alert_category=alert_category, # ADR-073 Phase 2-2
|
||
)
|
||
|
||
await incident_service.save_to_working_memory(incident)
|
||
|
||
try:
|
||
await incident_service.save_to_episodic_memory(incident)
|
||
except Exception as _pg_err:
|
||
logger.warning(
|
||
"incident_episodic_memory_failed",
|
||
incident_id=incident.incident_id,
|
||
error=str(_pg_err),
|
||
)
|
||
|
||
logger.info(
|
||
"incident_created_for_approval",
|
||
incident_id=incident.incident_id,
|
||
approval_id=approval_id,
|
||
severity=severity.value,
|
||
target=target_resource,
|
||
)
|
||
|
||
return incident.incident_id
|
||
|
||
|
||
# =============================================================================
|
||
# Legacy Value Normalization (方案 C - 代碼相容舊格式)
|
||
# =============================================================================
|
||
# 問題: Redis 有舊 Enum 值 (status='open', severity='critical')
|
||
# 解法: 解析時正規化,不動 Redis 資料
|
||
# 回滾: git revert (秒級恢復)
|
||
# =============================================================================
|
||
|
||
def normalize_status(value: str) -> str:
|
||
"""
|
||
正規化 IncidentStatus 舊格式值
|
||
|
||
舊值 → 新值:
|
||
- 'open' → 'investigating'
|
||
"""
|
||
legacy_map = {
|
||
"open": "investigating",
|
||
}
|
||
return legacy_map.get(value, value)
|
||
|
||
|
||
def normalize_severity(value: str) -> str:
|
||
"""
|
||
正規化 Severity 舊格式值
|
||
|
||
舊值 → 新值:
|
||
- 'critical' → 'P0'
|
||
- 'high' → 'P1'
|
||
- 'warning' → 'P2'
|
||
- 'medium' → 'P2'
|
||
- 'info' → 'P3'
|
||
- 'low' → 'P3'
|
||
- 'none' → 'P3'
|
||
"""
|
||
legacy_map = {
|
||
"critical": "P0",
|
||
"high": "P1",
|
||
"warning": "P2",
|
||
"medium": "P2",
|
||
"info": "P3",
|
||
"low": "P3",
|
||
"none": "P3",
|
||
}
|
||
return legacy_map.get(value, value)
|
||
|
||
|
||
# =============================================================================
|
||
# Constants
|
||
# =============================================================================
|
||
|
||
# Redis Key Prefix
|
||
INCIDENT_KEY_PREFIX = "incident:"
|
||
# Working Memory TTL: 7 天 = 604800 秒
|
||
WORKING_MEMORY_TTL = 604800
|
||
|
||
|
||
# =============================================================================
|
||
# Incident Service
|
||
# =============================================================================
|
||
|
||
class IncidentService:
|
||
"""
|
||
雙層記憶服務
|
||
|
||
職責:
|
||
1. Working Memory (Redis): 活躍事件快取
|
||
2. Episodic Memory (PostgreSQL): 歷史事件持久化
|
||
|
||
使用方式:
|
||
service = IncidentService()
|
||
incident = await service.create_incident_from_signal(signal_data)
|
||
"""
|
||
|
||
# =========================================================================
|
||
# Working Memory (Redis)
|
||
# =========================================================================
|
||
|
||
async def save_to_working_memory(self, incident: Incident) -> bool:
|
||
"""
|
||
將 Incident 寫入 Working Memory (Redis)
|
||
|
||
使用 Redis Hash 儲存,Key 格式: incident:{incident_id}
|
||
TTL: 7 天 (604800 秒)
|
||
|
||
Returns:
|
||
bool: 是否成功寫入
|
||
"""
|
||
redis_client = get_redis()
|
||
key = f"{INCIDENT_KEY_PREFIX}{incident.incident_id}"
|
||
|
||
try:
|
||
# 序列化為 JSON
|
||
incident_json = incident.model_dump_json()
|
||
|
||
# SET with TTL
|
||
await redis_client.set(
|
||
key,
|
||
incident_json,
|
||
ex=WORKING_MEMORY_TTL,
|
||
)
|
||
|
||
logger.info(
|
||
"working_memory_saved",
|
||
incident_id=incident.incident_id,
|
||
key=key,
|
||
ttl_seconds=WORKING_MEMORY_TTL,
|
||
)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"working_memory_save_error",
|
||
incident_id=incident.incident_id,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
async def get_from_working_memory(self, incident_id: str) -> Incident | None:
|
||
"""
|
||
從 Working Memory 讀取 Incident
|
||
|
||
方案 C: 解析時正規化舊格式 Enum 值
|
||
|
||
Returns:
|
||
Incident | None: 事件資料,若不存在則返回 None
|
||
"""
|
||
redis_client = get_redis()
|
||
key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
|
||
|
||
try:
|
||
data = await redis_client.get(key)
|
||
if data is None:
|
||
return None
|
||
|
||
# 方案 C: 正規化舊格式 Enum 值
|
||
incident_dict = json.loads(data)
|
||
if "status" in incident_dict:
|
||
incident_dict["status"] = normalize_status(incident_dict["status"])
|
||
if "severity" in incident_dict:
|
||
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
|
||
|
||
# 同時正規化 signals 內的 severity
|
||
for signal in incident_dict.get("signals", []):
|
||
if "severity" in signal:
|
||
signal["severity"] = normalize_severity(signal["severity"])
|
||
|
||
return Incident.model_validate(incident_dict)
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"working_memory_get_error",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
return None
|
||
|
||
async def get_active_incidents(self) -> list[Incident]:
|
||
"""
|
||
列出所有活躍的 Incidents (從 Working Memory)
|
||
|
||
方案 C: 解析時正規化舊格式 Enum 值
|
||
|
||
Returns:
|
||
list[Incident]: 活躍事件列表 (investigating 或 mitigating)
|
||
"""
|
||
redis_client = get_redis()
|
||
incidents: list[Incident] = []
|
||
|
||
try:
|
||
# SCAN 所有 incident:* keys
|
||
async for key in redis_client.scan_iter(
|
||
match=f"{INCIDENT_KEY_PREFIX}*",
|
||
count=100,
|
||
):
|
||
# 排除索引 keys
|
||
if ":idx:" in key:
|
||
continue
|
||
|
||
data = await redis_client.get(key)
|
||
if data is None:
|
||
continue
|
||
|
||
try:
|
||
# 方案 C: 正規化舊格式 Enum 值
|
||
incident_dict = json.loads(data)
|
||
if "status" in incident_dict:
|
||
incident_dict["status"] = normalize_status(incident_dict["status"])
|
||
if "severity" in incident_dict:
|
||
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
|
||
|
||
# 正規化 signals 內的 severity
|
||
for signal in incident_dict.get("signals", []):
|
||
if "severity" in signal:
|
||
signal["severity"] = normalize_severity(signal["severity"])
|
||
|
||
incident = Incident.model_validate(incident_dict)
|
||
|
||
# 只返回活躍狀態的 Incident
|
||
if incident.status in (
|
||
IncidentStatus.INVESTIGATING,
|
||
IncidentStatus.MITIGATING,
|
||
):
|
||
incidents.append(incident)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"incident_parse_error",
|
||
key=key,
|
||
error=str(e),
|
||
)
|
||
continue
|
||
|
||
logger.info(
|
||
"get_active_incidents",
|
||
count=len(incidents),
|
||
)
|
||
return incidents
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"get_active_incidents_error",
|
||
error=str(e),
|
||
)
|
||
return []
|
||
|
||
# =========================================================================
|
||
# Episodic Memory (PostgreSQL)
|
||
# =========================================================================
|
||
|
||
async def save_to_episodic_memory(self, incident: Incident) -> bool:
|
||
"""
|
||
將 Incident 寫入 Episodic Memory (PostgreSQL)
|
||
|
||
使用 SQLAlchemy async session 寫入 incidents 表。
|
||
|
||
Returns:
|
||
bool: 是否成功寫入
|
||
"""
|
||
try:
|
||
async with get_db_context() as db:
|
||
# 轉換為 SQLAlchemy model
|
||
# 使用 model_dump(mode="json") 確保 datetime 正確序列化
|
||
# 從 signals 提取 alertname(ADR-073 Phase 2: incidents.alertname 欄位)
|
||
_alertname = (
|
||
incident.signals[0].labels.get("alertname")
|
||
or incident.signals[0].alert_name
|
||
if incident.signals
|
||
else None
|
||
)
|
||
record = IncidentRecord(
|
||
incident_id=incident.incident_id,
|
||
status=incident.status.value,
|
||
severity=incident.severity.value,
|
||
signals=[
|
||
s.model_dump(mode="json") for s in incident.signals
|
||
],
|
||
affected_services=incident.affected_services,
|
||
decision_chain=(
|
||
incident.decision_chain.model_dump(mode="json")
|
||
if incident.decision_chain
|
||
else None
|
||
),
|
||
proposal_ids=[str(pid) for pid in incident.proposal_ids],
|
||
outcome=(
|
||
incident.outcome.model_dump(mode="json")
|
||
if incident.outcome
|
||
else None
|
||
),
|
||
created_at=incident.created_at,
|
||
updated_at=incident.updated_at,
|
||
resolved_at=incident.resolved_at,
|
||
closed_at=incident.closed_at,
|
||
ttl_days=incident.ttl_days,
|
||
vectorized=incident.vectorized,
|
||
# ADR-073 Phase 2-2: 三個分類欄位(之前遺漏未寫入 DB)
|
||
alertname=_alertname,
|
||
notification_type=incident.notification_type,
|
||
alert_category=incident.alert_category,
|
||
)
|
||
|
||
db.add(record)
|
||
# commit 由 get_db_context 自動處理
|
||
|
||
logger.info(
|
||
"episodic_memory_saved",
|
||
incident_id=incident.incident_id,
|
||
table="incidents",
|
||
)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"episodic_memory_save_error",
|
||
incident_id=incident.incident_id,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
async def get_from_episodic_memory(self, incident_id: str) -> Incident | None:
|
||
"""
|
||
從 Episodic Memory 讀取 Incident
|
||
|
||
Returns:
|
||
Incident | None: 事件資料,若不存在則返回 None
|
||
"""
|
||
try:
|
||
async with get_db_context() as db:
|
||
from sqlalchemy import select
|
||
|
||
stmt = select(IncidentRecord).where(
|
||
IncidentRecord.incident_id == incident_id
|
||
)
|
||
result = await db.execute(stmt)
|
||
record = result.scalar_one_or_none()
|
||
|
||
if record is None:
|
||
return None
|
||
|
||
# 轉換回 Pydantic model
|
||
return self._record_to_incident(record)
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"episodic_memory_get_error",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
return None
|
||
|
||
def _record_to_incident(self, record: IncidentRecord) -> Incident:
|
||
"""
|
||
將 SQLAlchemy record 轉換為 Pydantic Incident
|
||
|
||
方案 C: 解析時正規化舊格式 Enum 值
|
||
"""
|
||
from src.models.incident import AIDecisionChain, IncidentOutcome
|
||
|
||
# 方案 C: 正規化 signals 內的舊格式 severity
|
||
signals = []
|
||
for s in (record.signals or []):
|
||
signal_data = s.copy()
|
||
if "severity" in signal_data:
|
||
signal_data["severity"] = normalize_severity(signal_data["severity"])
|
||
signals.append(Signal(**signal_data))
|
||
|
||
decision_chain = (
|
||
AIDecisionChain(**record.decision_chain)
|
||
if record.decision_chain
|
||
else None
|
||
)
|
||
outcome = (
|
||
IncidentOutcome(**record.outcome)
|
||
if record.outcome
|
||
else None
|
||
)
|
||
|
||
# 方案 C: 正規化舊格式 Enum 值
|
||
normalized_status = normalize_status(record.status)
|
||
normalized_severity = normalize_severity(record.severity)
|
||
|
||
return Incident(
|
||
incident_id=record.incident_id,
|
||
status=IncidentStatus(normalized_status),
|
||
severity=Severity(normalized_severity),
|
||
signals=signals,
|
||
affected_services=record.affected_services or [],
|
||
decision_chain=decision_chain,
|
||
proposal_ids=record.proposal_ids or [],
|
||
outcome=outcome,
|
||
created_at=record.created_at,
|
||
updated_at=record.updated_at,
|
||
resolved_at=record.resolved_at,
|
||
closed_at=record.closed_at,
|
||
ttl_days=record.ttl_days,
|
||
persisted_to_pg=True, # 從 PG 讀取,必為 True
|
||
vectorized=record.vectorized,
|
||
)
|
||
|
||
# =========================================================================
|
||
# 雙層寫入核心邏輯
|
||
# =========================================================================
|
||
|
||
async def create_incident_from_signal(
|
||
self,
|
||
signal_data: dict[str, Any],
|
||
frequency_stats: dict[str, Any] | None = None,
|
||
) -> Incident | None:
|
||
"""
|
||
從 Signal 建立 Incident 並雙層寫入
|
||
|
||
Phase 6.2 核心邏輯:
|
||
1. 建立 Incident (含 Signal)
|
||
2. 寫入 Working Memory (Redis) - 7 天 TTL
|
||
3. 寫入 Episodic Memory (PostgreSQL) - 永久保留
|
||
4. 標記 persisted_to_pg = True
|
||
|
||
Phase 21 (ADR-037) 擴展:
|
||
5. 含異常頻率統計 (用於 Tier 分級修復策略)
|
||
|
||
Args:
|
||
signal_data: 從 Redis Stream 收到的 Signal 資料
|
||
frequency_stats: ADR-037 異常頻率統計 (可選)
|
||
|
||
Returns:
|
||
Incident | None: 成功返回 Incident,失敗返回 None
|
||
"""
|
||
try:
|
||
# 0. 去抖動 (Debounce) - 防止告警風暴
|
||
fingerprint = signal_data.get("fingerprint")
|
||
if fingerprint:
|
||
try:
|
||
redis_client = get_redis()
|
||
debounce_key = f"debounce:{fingerprint}"
|
||
# SETNX 若成功表示是新的,給予 3 分鐘 TTL (180s)
|
||
is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True)
|
||
if not is_new:
|
||
logger.info(
|
||
"incident_debounced",
|
||
fingerprint=fingerprint,
|
||
reason="Duplicate signal within 3 minutes",
|
||
)
|
||
return None
|
||
except Exception as e:
|
||
logger.warning("incident_debounce_redis_error", error=str(e))
|
||
|
||
# 1. 解析 Signal
|
||
signal = Signal(
|
||
alert_name=signal_data.get("alert_name", "unknown"),
|
||
severity=self._parse_severity(signal_data.get("severity", "warning")),
|
||
source=self._parse_source(signal_data.get("source", "manual")),
|
||
fired_at=datetime.now(UTC),
|
||
labels=self._parse_dict(signal_data.get("labels", "{}")),
|
||
annotations=self._parse_dict(signal_data.get("annotations", "{}")),
|
||
fingerprint=signal_data.get("fingerprint"),
|
||
)
|
||
|
||
# 2. 建立 Incident (含頻率統計)
|
||
# ADR-037: 統帥指示「重啟只是治標,太常發生的異常必須徹底解決」
|
||
from src.models.incident import IncidentFrequencyStats
|
||
|
||
freq_stats = None
|
||
if frequency_stats:
|
||
freq_stats = IncidentFrequencyStats(
|
||
anomaly_key=frequency_stats.get("anomaly_key", "unknown"),
|
||
count_1h=frequency_stats.get("count_1h", 0),
|
||
count_24h=frequency_stats.get("count_24h", 0),
|
||
count_7d=frequency_stats.get("count_7d", 0),
|
||
count_30d=frequency_stats.get("count_30d", 0),
|
||
escalation_level=frequency_stats.get("escalation_level"),
|
||
auto_repair_count=frequency_stats.get("auto_repair_count", 0),
|
||
)
|
||
|
||
incident = Incident(
|
||
severity=signal.severity,
|
||
signals=[signal],
|
||
affected_services=[signal_data.get("target", "unknown")],
|
||
frequency_stats=freq_stats,
|
||
)
|
||
|
||
logger.info(
|
||
"incident_created",
|
||
incident_id=incident.incident_id,
|
||
severity=incident.severity.value,
|
||
signal_count=len(incident.signals),
|
||
)
|
||
|
||
# 3. 寫入 Working Memory (Redis)
|
||
redis_success = await self.save_to_working_memory(incident)
|
||
|
||
# 4. 寫入 Episodic Memory (PostgreSQL)
|
||
pg_success = await self.save_to_episodic_memory(incident)
|
||
|
||
# 5. 更新狀態
|
||
if pg_success:
|
||
incident.persisted_to_pg = True
|
||
# 更新 Redis 中的狀態
|
||
if redis_success:
|
||
await self.save_to_working_memory(incident)
|
||
|
||
# 6. 記錄雙層寫入結果
|
||
logger.info(
|
||
"dual_layer_memory_result",
|
||
incident_id=incident.incident_id,
|
||
redis_success=redis_success,
|
||
pg_success=pg_success,
|
||
persisted_to_pg=incident.persisted_to_pg,
|
||
)
|
||
|
||
return incident
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"create_incident_error",
|
||
error=str(e),
|
||
)
|
||
return None
|
||
|
||
def _parse_source(
|
||
self,
|
||
source_str: str,
|
||
) -> Literal["prometheus", "signoz", "alertmanager", "manual", "telegram"]:
|
||
"""
|
||
解析來源字串,映射到 Signal 允許的 Literal 值
|
||
|
||
不在白名單中的來源一律映射為 'manual'
|
||
"""
|
||
valid_sources = {"prometheus", "signoz", "alertmanager", "manual", "telegram"}
|
||
if source_str.lower() in valid_sources:
|
||
return source_str.lower() # type: ignore
|
||
return "manual"
|
||
|
||
def _parse_severity(self, severity_str: str) -> Severity:
|
||
"""解析嚴重度字串"""
|
||
mapping = {
|
||
"critical": Severity.P0,
|
||
"high": Severity.P1,
|
||
"warning": Severity.P2,
|
||
"medium": Severity.P2,
|
||
"low": Severity.P3,
|
||
"info": Severity.P3,
|
||
}
|
||
return mapping.get(severity_str.lower(), Severity.P2)
|
||
|
||
def _parse_dict(self, value: str | dict) -> dict[str, str]:
|
||
"""解析字典字串或字典"""
|
||
if isinstance(value, dict):
|
||
return {str(k): str(v) for k, v in value.items()}
|
||
if isinstance(value, str):
|
||
try:
|
||
# 嘗試解析 JSON
|
||
parsed = json.loads(value.replace("'", '"'))
|
||
return {str(k): str(v) for k, v in parsed.items()}
|
||
except (json.JSONDecodeError, TypeError):
|
||
return {}
|
||
return {}
|
||
|
||
# =========================================================================
|
||
# Phase 17 P0: Router 層違規修復 - 新增方法
|
||
# =========================================================================
|
||
|
||
async def update_outcome(
|
||
self,
|
||
incident_id: str,
|
||
effectiveness_score: int | None = None,
|
||
human_feedback: str | None = None,
|
||
learning_notes: str | None = None,
|
||
should_remember: bool = True,
|
||
) -> Incident | None:
|
||
"""
|
||
更新 Incident 的 outcome (人類回饋)
|
||
|
||
Phase 17: 從 Router 層遷移至 Service 層
|
||
|
||
Args:
|
||
incident_id: 事件 ID
|
||
effectiveness_score: 有效性評分 (1-5)
|
||
human_feedback: 文字回饋
|
||
learning_notes: 學習筆記
|
||
should_remember: 是否納入長期記憶
|
||
|
||
Returns:
|
||
Incident | None: 更新後的事件,失敗返回 None
|
||
"""
|
||
from src.models.incident import IncidentOutcome
|
||
from src.repositories.incident_repository import get_incident_repository
|
||
from src.utils.timezone import now_taipei
|
||
|
||
# 1. 從 Working Memory 讀取
|
||
incident = await self.get_from_working_memory(incident_id)
|
||
if incident is None:
|
||
logger.warning("incident_not_found_for_outcome", incident_id=incident_id)
|
||
return None
|
||
|
||
# 2. 更新 outcome
|
||
if incident.outcome is None:
|
||
incident.outcome = IncidentOutcome()
|
||
|
||
if effectiveness_score is not None:
|
||
incident.outcome.effectiveness_score = effectiveness_score
|
||
if human_feedback is not None:
|
||
incident.outcome.human_feedback = human_feedback
|
||
if learning_notes is not None:
|
||
incident.outcome.learning_notes = learning_notes
|
||
incident.outcome.should_remember = should_remember
|
||
incident.updated_at = now_taipei()
|
||
|
||
# 3. 寫入 Working Memory
|
||
redis_success = await self.save_to_working_memory(incident)
|
||
if not redis_success:
|
||
logger.error("outcome_redis_write_failed", incident_id=incident_id)
|
||
return None
|
||
|
||
# 4. 同步到 Episodic Memory (PostgreSQL)
|
||
try:
|
||
repo = get_incident_repository()
|
||
await repo.update_outcome(
|
||
incident_id=incident_id,
|
||
outcome=incident.outcome.model_dump(mode="json"),
|
||
updated_at=now_taipei(),
|
||
)
|
||
logger.info("outcome_db_updated", incident_id=incident_id)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"outcome_db_update_failed",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
# DB 失敗不影響主流程
|
||
|
||
return incident
|
||
|
||
async def resolve_incident(self, incident_id: str) -> Incident | None:
|
||
"""
|
||
將 Incident 狀態更新為 RESOLVED
|
||
|
||
Phase 17: 從 Router 層遷移至 Service 層
|
||
|
||
Args:
|
||
incident_id: 事件 ID
|
||
|
||
Returns:
|
||
Incident | None: 更新後的事件,失敗返回 None
|
||
"""
|
||
from src.repositories.incident_repository import get_incident_repository
|
||
from src.utils.timezone import now_taipei
|
||
|
||
# 1. 從 Working Memory 讀取
|
||
incident = await self.get_from_working_memory(incident_id)
|
||
if incident is None:
|
||
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
|
||
return None
|
||
|
||
# 2. 更新狀態
|
||
incident.status = IncidentStatus.RESOLVED
|
||
incident.resolved_at = now_taipei()
|
||
incident.updated_at = now_taipei()
|
||
|
||
# 3. 寫入 Working Memory
|
||
redis_success = await self.save_to_working_memory(incident)
|
||
if not redis_success:
|
||
logger.error("resolve_redis_write_failed", incident_id=incident_id)
|
||
return None
|
||
|
||
# 4. 同步到 Episodic Memory
|
||
try:
|
||
repo = get_incident_repository()
|
||
await repo.update_status(
|
||
incident_id=incident_id,
|
||
status="resolved",
|
||
updated_at=now_taipei(),
|
||
)
|
||
logger.info("resolve_db_updated", incident_id=incident_id)
|
||
except Exception as e:
|
||
logger.warning(
|
||
"resolve_db_update_failed",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
|
||
# KB Phase 2-A: 自動萃取 KB 草稿 (fire-and-forget, 2026-04-03 ogt)
|
||
try:
|
||
import asyncio
|
||
from src.services.knowledge_extractor_service import get_knowledge_extractor
|
||
asyncio.create_task(
|
||
get_knowledge_extractor().extract_from_incident(incident)
|
||
)
|
||
except Exception:
|
||
logger.exception("kb_extract_task_create_failed", incident_id=incident_id)
|
||
|
||
# ADR-073 Phase 4-2: resolve 時觸發 KM conversion (2026-04-12 ogt)
|
||
# 將已解決的 Incident 轉換為結構化 KM 條目,驅動飛輪學習固化節點
|
||
try:
|
||
import asyncio
|
||
from src.services.km_conversion_service import get_km_conversion_service
|
||
asyncio.create_task(
|
||
get_km_conversion_service().convert(incident)
|
||
)
|
||
except Exception:
|
||
logger.exception("km_conversion_task_create_failed", incident_id=incident_id)
|
||
|
||
# 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷
|
||
# I1+S1 Fix: 委託 derive_key_from_incident() 統一推導
|
||
try:
|
||
from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter
|
||
counter = get_anomaly_counter()
|
||
anomaly_key = AnomalyCounter.derive_key_from_incident(incident)
|
||
if anomaly_key:
|
||
disposition = await counter.get_disposition_stats(anomaly_key)
|
||
has_system_resolution = (
|
||
disposition["auto_repair"] > 0
|
||
or disposition["human_approved"] > 0
|
||
or disposition["cold_start_trust"] > 0
|
||
)
|
||
if not has_system_resolution:
|
||
await counter.record_disposition(anomaly_key, "manual_resolved")
|
||
except Exception as _disp_e:
|
||
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
|
||
|
||
# MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝
|
||
# Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成
|
||
# 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑
|
||
try:
|
||
import asyncio
|
||
from src.services.report_generation_service import get_report_generation_service
|
||
|
||
alertname = (
|
||
incident.signals[0].labels.get("alertname", "UnknownAlert")
|
||
if incident.signals else "UnknownAlert"
|
||
)
|
||
title = f"{alertname} — {', '.join(incident.affected_services or ['N/A'])}"
|
||
root_cause = None
|
||
resolution_action = None
|
||
ai_provider = None
|
||
auto_repaired = False
|
||
if incident.decision_chain:
|
||
root_cause = incident.decision_chain.hypothesis
|
||
ai_provider = incident.decision_chain.model_used
|
||
if incident.outcome:
|
||
resolution_action = (incident.outcome.learning_notes or None)
|
||
auto_repaired = bool(incident.outcome.execution_success)
|
||
|
||
asyncio.create_task(
|
||
get_report_generation_service().trigger_postmortem(
|
||
incident_id=incident.incident_id,
|
||
title=title,
|
||
created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at,
|
||
resolved_at=incident.resolved_at,
|
||
root_cause=root_cause,
|
||
resolution_action=resolution_action,
|
||
ai_provider=ai_provider,
|
||
auto_repaired=auto_repaired,
|
||
)
|
||
)
|
||
except Exception as _pm_e:
|
||
logger.exception("postmortem_trigger_failed",
|
||
incident_id=incident_id, error=str(_pm_e))
|
||
|
||
return incident
|
||
|
||
async def find_by_proposal_id(self, proposal_id: str) -> Incident | None:
|
||
"""
|
||
根據 proposal_id 查找關聯的 Incident
|
||
|
||
Phase 17: 從 Router 層遷移至 Service 層
|
||
|
||
Args:
|
||
proposal_id: 提案 ID (UUID 字串)
|
||
|
||
Returns:
|
||
Incident | None: 找到的事件,未找到返回 None
|
||
"""
|
||
from uuid import UUID
|
||
|
||
redis_client = get_redis()
|
||
|
||
try:
|
||
target_uuid = UUID(proposal_id)
|
||
|
||
async for key in redis_client.scan_iter(
|
||
match=f"{INCIDENT_KEY_PREFIX}INC-*",
|
||
count=100,
|
||
):
|
||
data = await redis_client.get(key)
|
||
if data is None:
|
||
continue
|
||
|
||
try:
|
||
# 方案 C: 正規化舊格式 Enum 值
|
||
incident_dict = json.loads(data)
|
||
if "status" in incident_dict:
|
||
incident_dict["status"] = normalize_status(incident_dict["status"])
|
||
if "severity" in incident_dict:
|
||
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
|
||
|
||
for signal in incident_dict.get("signals", []):
|
||
if "severity" in signal:
|
||
signal["severity"] = normalize_severity(signal["severity"])
|
||
|
||
incident = Incident.model_validate(incident_dict)
|
||
|
||
if target_uuid in incident.proposal_ids:
|
||
return incident
|
||
|
||
except Exception as e:
|
||
logger.warning(
|
||
"incident_parse_error_in_find",
|
||
key=key,
|
||
error=str(e),
|
||
)
|
||
continue
|
||
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"find_by_proposal_id_error",
|
||
proposal_id=proposal_id,
|
||
error=str(e),
|
||
)
|
||
return None
|
||
|
||
async def trigger_reanalysis(self, incident_id: str) -> dict:
|
||
"""
|
||
觸發 Incident 重診 (ADR-050 P2: reanalyze button)
|
||
|
||
去重保護:同一 incident 10 分鐘內只觸發一次。
|
||
觸發後將 incident status 標記為 analyzing,等待 AI 自動接手。
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
|
||
Returns:
|
||
dict: {
|
||
"triggered": bool,
|
||
"message": str,
|
||
"already_analyzing": bool,
|
||
}
|
||
|
||
2026-04-01 Claude Code (ADR-050 P2): reanalyze button handler
|
||
"""
|
||
REANALYZE_TTL_SECONDS = 600 # 10 分鐘去重 TTL (ADR-050)
|
||
dedup_key = f"reanalyze_dedup:{incident_id}"
|
||
|
||
try:
|
||
redis_client = get_redis()
|
||
|
||
# 去重檢查 (SETNX: 只有第一次設定會成功)
|
||
is_new = await redis_client.set(dedup_key, "1", ex=REANALYZE_TTL_SECONDS, nx=True)
|
||
if not is_new:
|
||
logger.info(
|
||
"reanalyze_deduplicated",
|
||
incident_id=incident_id,
|
||
reason="Already triggered within 10 minutes",
|
||
)
|
||
return {
|
||
"triggered": False,
|
||
"message": "重診已在進行中,請 10 分鐘後再試",
|
||
"already_analyzing": True,
|
||
}
|
||
|
||
# 從 Working Memory 取得 Incident
|
||
incident = await self.get_from_working_memory(incident_id)
|
||
if not incident:
|
||
incident = await self.get_from_episodic_memory(incident_id)
|
||
|
||
if not incident:
|
||
# 刪除剛設定的去重 key(讓下次能重試)
|
||
await redis_client.delete(dedup_key)
|
||
logger.warning("reanalyze_incident_not_found", incident_id=incident_id)
|
||
return {
|
||
"triggered": False,
|
||
"message": f"找不到事件 {incident_id}",
|
||
"already_analyzing": False,
|
||
}
|
||
|
||
# 標記 status 為 analyzing(讓 AI 引擎接手)
|
||
# 使用延遲 import 避免循環依賴(同 create_incident_from_signal 模式)
|
||
from src.models.incident import IncidentStatus
|
||
# 使用 INVESTIGATING 若 ANALYZING 不存在
|
||
analyzing_status = getattr(IncidentStatus, "ANALYZING", None) or getattr(IncidentStatus, "INVESTIGATING", None)
|
||
if analyzing_status:
|
||
incident.status = analyzing_status
|
||
await self.save_to_working_memory(incident)
|
||
|
||
logger.info(
|
||
"reanalyze_triggered",
|
||
incident_id=incident_id,
|
||
severity=incident.severity.value,
|
||
)
|
||
|
||
return {
|
||
"triggered": True,
|
||
"message": "重診已排程,AI 正在分析中",
|
||
"already_analyzing": False,
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.exception("reanalyze_failed", incident_id=incident_id, error=str(e))
|
||
return {
|
||
"triggered": False,
|
||
"message": f"重診觸發失敗: {str(e)[:80]}",
|
||
"already_analyzing": False,
|
||
}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
_incident_service: IncidentService | None = None
|
||
|
||
|
||
def get_incident_service() -> IncidentService:
|
||
"""取得 Incident Service 實例 (Singleton)"""
|
||
global _incident_service
|
||
if _incident_service is None:
|
||
_incident_service = IncidentService()
|
||
return _incident_service
|