Files
awoooi/apps/api/src/services/incident_service.py
OG T 8de807c40d
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(GAP-D5 Task 4.2): Postmortem 自動組裝 hook
incident_service.resolve_incident() 結尾 fire-and-forget 呼叫
report_generation_service.trigger_postmortem(),補完孤兒服務的觸發路徑。

觸發條件(由 trigger_postmortem 內部判斷):
- duration > POSTMORTEM_MIN_DURATION_MINUTES (10min)
- 含 AI root_cause / resolution_action / provider / auto_repaired

背景:
- report_generation_service.py 539 行服務於先前 session 建立
- main.py:322 已啟動 run_daily_report_loop(Task 4.1 )
- trigger_postmortem 在 src/ 下無呼叫方 → 本 commit 補上

MASTER 藍圖 Phase 4 至此完整收官:
 Task 4.1 日度巡檢報告(08:00 台北排程,生產環境已跑)
 Task 4.2 Postmortem 自動組裝(本 commit 接上 resolve hook)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-14 18:25:15 +08:00

1205 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Incident Service - Phase 6.2 雙層記憶寫入
==========================================
功能:
- Working Memory (Redis): 活躍事件7 天 TTL
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
設計原則:
- 先寫 Redis (快),再寫 PostgreSQL (持久)
- 兩者都成功才算完成
- 失敗時記錄日誌但不中斷主流程
統帥鐵律:
- 禁止硬編碼 IP 或密碼,嚴格讀取 .env
- 所有寫入操作都必須有結構化日誌
C2 修正 (首席架構師審查 2026-04-10 Claude Sonnet 4.6 Asia/Taipei):
create_incident_for_approval + _extract_affected_services 從 Router 層移入此 Service 層
原違規: 業務邏輯 (Severity 映射, Signal 建立, Incident 建立) 放在 api/v1/webhooks.py
"""
import json
from datetime import UTC, datetime
from typing import Any, Literal
from uuid import UUID
import structlog
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import (
Incident,
IncidentStatus,
Severity,
Signal,
)
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# =============================================================================
# C2 修正: 從 webhooks.py 遷入的業務邏輯
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
# =============================================================================
# 風險等級 → 事件嚴重度映射 (原在 webhooks.py)
_RISK_TO_SEVERITY = {
"critical": Severity.P0,
"high": Severity.P1,
"medium": Severity.P2,
"low": Severity.P3,
}
# I1 修正: 提升為 module-level frozenset避免每次呼叫重建 (原在 webhooks.py 函數體內)
_INFRA_JOB_NAMES: frozenset[str] = frozenset(
j.lower().replace("-", "").replace("_", "")
for j in {"node", "node-exporter", "pushgateway", "blackbox",
"prometheus", "alertmanager", "cadvisor"}
)
def extract_affected_services(labels: dict, target_resource: str) -> list[str]:
"""
從告警 labels 提取真實服務名,防止 IP 或 alertname 污染 affected_services。
優先序:
1. component labelDocker-compose 層告警最可靠)
2. job label排除 node-exporter / pushgateway 等基礎設施 job
3. pod label取 deployment name去掉 hash suffix
4. target_resource不含冒號、不等於 alertname 時才採用)
5. 空列表(讓通用型 Playbook 透過空集合豁免規則匹配)
Phase 1 飛輪修復 — 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
C2 修正: 從 api/v1/webhooks.py 移入 Service 層(純業務邏輯,無 I/O
"""
alertname = labels.get("alertname", "")
if comp := labels.get("component"):
return [comp]
if job := labels.get("job"):
normalized = job.lower().replace("-", "").replace("_", "")
if normalized not in _INFRA_JOB_NAMES:
return [job]
if pod := labels.get("pod"):
parts = pod.rsplit("-", 2)
if len(parts) >= 3 and len(parts[-1]) == 5 and len(parts[-2]) in (9, 10):
return [parts[0]]
elif len(parts) >= 2:
return ["-".join(parts[:-1])]
if (target_resource
and ":" not in target_resource
and target_resource != alertname
and not target_resource[0].isdigit()):
return [target_resource]
return []
def classify_alert_early(alertname: str, severity: str, labels: dict | None = None) -> tuple[str, str]:
"""
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。
防止 HostBackupFailed 等告警被誤路由到 K8s executor。
規則優先順序(由高到低):
1. ConfigurationDrift / KubeConfigDrift → TYPE-4D (Config Drift 卡片)
2. severity=info/none → TYPE-1 (純資訊,無按鈕)
3. backup/heartbeat 關鍵字 → TYPE-1
4. Docker/Host 前綴 → infrastructure TYPE-3
5. Kube/Pod/Deploy/Node/Velero/ArgoCD 前綴 → kubernetes TYPE-3
6. Postgres/Redis 前綴 → database TYPE-3
7. 預設 → general TYPE-3
C3 修正 (首席架構師 CR 2026-04-13): 從 Router 層 (webhooks.py) 移入 Service 層
原違規: 業務邏輯函數定義在 api/v1/webhooks.py
Returns:
tuple[str, str]: (alert_category, notification_type)
"""
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
# 統帥決議kubernetes非 k8s_workload、host_resource 從 infrastructure 分離。
alertname_lower = alertname.lower()
# 1. Config Drift最高優先類型特殊不受 severity 影響)
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
return "config_drift", "TYPE-4D"
# 2. 告警鏈路健康meta-monitoring優先於 severity 判斷)
# 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager
if alertname in (
"AlertChainBroken_Alertmanager",
"AlertChainBroken_Sentry",
"NoAlertsReceived",
"NoAlertsReceived2Hours",
"AlertChainUnhealthy",
"PrometheusNotConnectedToAlertmanager",
):
return "alertchain_health", "TYPE-8M"
# 3. 資安告警(高優先,防止被 severity/prefix 規則覆蓋)
# ADR-075 TYPE-5S (2026-04-12 ogt)
if any(alertname.startswith(p) for p in (
"UnauthorizedSSH", "KubeAudit", "CVECritical", "WAFAttack",
"PodAbnormal", "SecurityBreach",
)):
return "secops", "TYPE-5S"
# 4. 飛輪/AI 系統健康(優先於 severity 判斷)
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or any(
alertname.startswith(p) for p in ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
):
return "flywheel_health", "TYPE-8M"
# 4a. 業務/FinOps 告警ADR-075 TYPE-6B
if any(alertname.startswith(p) for p in (
"AITokenCost", "GeminiAPIError", "SLOBurn", "APIErrorBudget",
"MomoScraper", "ScraperSuccess",
)):
return "business", "TYPE-6B"
# 5. 純資訊
if severity in ("info", "none"):
return "info", "TYPE-1"
# 5. Backup / Heartbeat — 純資訊,不進 LLM
# HostBackupFailed 必須在 Host prefix 前攔截,否則被歸 host_resource/TYPE-3
# 2026-04-12 ogt: 只針對已知主機備份監控 alertname不用寬泛關鍵字
# BackupJobFailed severity=warning 仍走 TYPE-3見測試 test_backup_keyword_warning_not_type1
_BACKUP_TYPE1_NAMES = {
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
"BackupRestoreTestFailed", "BackupRestoreTestStale",
}
# 2026-04-12 ogt: 補入 DeadMansSwitchHEARTBEAT_ALERT_NAMES 中但之前漏掉)
if (
"watchdog" in alertname_lower
or "deadmansswitch" in alertname_lower
or alertname == "Heartbeat"
or alertname in _BACKUP_TYPE1_NAMES
or alertname.startswith("HostBackup")
):
return "backup", "TYPE-1"
# 6. 主機資源(從 infrastructure 分離ADR-075 統帥決議)
if alertname.startswith("Host"):
return "host_resource", "TYPE-3"
# 7. Docker 容器
if alertname.startswith("Docker"):
return "infrastructure", "TYPE-3"
# 8. K8s統帥決議統一用 kubernetes不用 k8s_workload
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
return "kubernetes", "TYPE-3"
# 9. 資料庫
if alertname.startswith(("Postgres", "Redis")):
return "database", "TYPE-3"
# 10. 物件儲存
if alertname == "MinIODown":
return "storage", "TYPE-3"
# 11. DevOps 工具ADR-075 修正:從 general 分離)
if alertname in (
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
):
return "devops_tool", "TYPE-3"
# 12. 外部網站ADR-075 修正:從 general 分離)
if alertname in (
"MoWoooWorkDown", "TsenyangWebsiteDown",
"StockWoooWorkDown", "BitanWoooWorkDown",
):
return "external_site", "TYPE-3"
# 13. SSL 憑證ADR-075 修正:從 general 分離)
# ≥14 天→TYPE-1提醒無需審核<14 天→TYPE-3緊急審核
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
days = int((labels or {}).get("days_remaining", 0)) if labels else 0
return "ssl_cert", ("TYPE-1" if days >= 14 else "TYPE-3")
return "general", "TYPE-3"
async def create_incident_for_approval(
approval_id: str,
risk_level: str,
target_resource: str,
namespace: str,
alert_type: str,
message: str,
source: str = "alertmanager",
alertname: str | None = None,
alert_labels: dict | None = None,
notification_type: str | None = None, # ADR-073 Phase 2-2
alert_category: str | None = None, # ADR-073 Phase 2-2
) -> str:
"""
為 Approval 創建對應的 Incident (活躍事件同步)。
設計原則:
- Approval 和 Incident 必須同時存在
- Incident 存入 Redis (Working Memory) + PostgreSQL (Episodic Memory)
- 7 天 TTL 自動過期
C2 修正: 從 api/v1/webhooks.py 移入 Service 層(業務邏輯不屬 Router 層)
Returns:
str: Incident ID
"""
incident_service = get_incident_service()
severity = _RISK_TO_SEVERITY.get(risk_level.lower(), Severity.P2)
_labels: dict = {
"namespace": namespace,
"resource": target_resource,
"alertname": alertname or alert_type,
**(alert_labels or {}),
}
signal = Signal(
alert_name=alertname or alert_type,
severity=severity,
source=source,
fired_at=now_taipei(),
labels=_labels,
annotations={"message": message},
)
_affected_services = extract_affected_services(_labels, target_resource)
incident = Incident(
status=IncidentStatus.INVESTIGATING,
severity=severity,
signals=[signal],
affected_services=_affected_services,
proposal_ids=[UUID(approval_id)],
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
await incident_service.save_to_working_memory(incident)
try:
await incident_service.save_to_episodic_memory(incident)
except Exception as _pg_err:
logger.warning(
"incident_episodic_memory_failed",
incident_id=incident.incident_id,
error=str(_pg_err),
)
logger.info(
"incident_created_for_approval",
incident_id=incident.incident_id,
approval_id=approval_id,
severity=severity.value,
target=target_resource,
)
return incident.incident_id
# =============================================================================
# Legacy Value Normalization (方案 C - 代碼相容舊格式)
# =============================================================================
# 問題: Redis 有舊 Enum 值 (status='open', severity='critical')
# 解法: 解析時正規化,不動 Redis 資料
# 回滾: git revert (秒級恢復)
# =============================================================================
def normalize_status(value: str) -> str:
"""
正規化 IncidentStatus 舊格式值
舊值 → 新值:
- 'open''investigating'
"""
legacy_map = {
"open": "investigating",
}
return legacy_map.get(value, value)
def normalize_severity(value: str) -> str:
"""
正規化 Severity 舊格式值
舊值 → 新值:
- 'critical''P0'
- 'high''P1'
- 'warning''P2'
- 'medium''P2'
- 'info''P3'
- 'low''P3'
- 'none''P3'
"""
legacy_map = {
"critical": "P0",
"high": "P1",
"warning": "P2",
"medium": "P2",
"info": "P3",
"low": "P3",
"none": "P3",
}
return legacy_map.get(value, value)
# =============================================================================
# Constants
# =============================================================================
# Redis Key Prefix
INCIDENT_KEY_PREFIX = "incident:"
# Working Memory TTL: 7 天 = 604800 秒
WORKING_MEMORY_TTL = 604800
# =============================================================================
# Incident Service
# =============================================================================
class IncidentService:
"""
雙層記憶服務
職責:
1. Working Memory (Redis): 活躍事件快取
2. Episodic Memory (PostgreSQL): 歷史事件持久化
使用方式:
service = IncidentService()
incident = await service.create_incident_from_signal(signal_data)
"""
# =========================================================================
# Working Memory (Redis)
# =========================================================================
async def save_to_working_memory(self, incident: Incident) -> bool:
"""
將 Incident 寫入 Working Memory (Redis)
使用 Redis Hash 儲存Key 格式: incident:{incident_id}
TTL: 7 天 (604800 秒)
Returns:
bool: 是否成功寫入
"""
redis_client = get_redis()
key = f"{INCIDENT_KEY_PREFIX}{incident.incident_id}"
try:
# 序列化為 JSON
incident_json = incident.model_dump_json()
# SET with TTL
await redis_client.set(
key,
incident_json,
ex=WORKING_MEMORY_TTL,
)
logger.info(
"working_memory_saved",
incident_id=incident.incident_id,
key=key,
ttl_seconds=WORKING_MEMORY_TTL,
)
return True
except Exception as e:
logger.exception(
"working_memory_save_error",
incident_id=incident.incident_id,
error=str(e),
)
return False
async def get_from_working_memory(self, incident_id: str) -> Incident | None:
"""
從 Working Memory 讀取 Incident
方案 C: 解析時正規化舊格式 Enum 值
Returns:
Incident | None: 事件資料,若不存在則返回 None
"""
redis_client = get_redis()
key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
try:
data = await redis_client.get(key)
if data is None:
return None
# 方案 C: 正規化舊格式 Enum 值
incident_dict = json.loads(data)
if "status" in incident_dict:
incident_dict["status"] = normalize_status(incident_dict["status"])
if "severity" in incident_dict:
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
# 同時正規化 signals 內的 severity
for signal in incident_dict.get("signals", []):
if "severity" in signal:
signal["severity"] = normalize_severity(signal["severity"])
return Incident.model_validate(incident_dict)
except Exception as e:
logger.exception(
"working_memory_get_error",
incident_id=incident_id,
error=str(e),
)
return None
async def get_active_incidents(self) -> list[Incident]:
"""
列出所有活躍的 Incidents (從 Working Memory)
方案 C: 解析時正規化舊格式 Enum 值
Returns:
list[Incident]: 活躍事件列表 (investigating 或 mitigating)
"""
redis_client = get_redis()
incidents: list[Incident] = []
try:
# SCAN 所有 incident:* keys
async for key in redis_client.scan_iter(
match=f"{INCIDENT_KEY_PREFIX}*",
count=100,
):
# 排除索引 keys
if ":idx:" in key:
continue
data = await redis_client.get(key)
if data is None:
continue
try:
# 方案 C: 正規化舊格式 Enum 值
incident_dict = json.loads(data)
if "status" in incident_dict:
incident_dict["status"] = normalize_status(incident_dict["status"])
if "severity" in incident_dict:
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
# 正規化 signals 內的 severity
for signal in incident_dict.get("signals", []):
if "severity" in signal:
signal["severity"] = normalize_severity(signal["severity"])
incident = Incident.model_validate(incident_dict)
# 只返回活躍狀態的 Incident
if incident.status in (
IncidentStatus.INVESTIGATING,
IncidentStatus.MITIGATING,
):
incidents.append(incident)
except Exception as e:
logger.warning(
"incident_parse_error",
key=key,
error=str(e),
)
continue
logger.info(
"get_active_incidents",
count=len(incidents),
)
return incidents
except Exception as e:
logger.exception(
"get_active_incidents_error",
error=str(e),
)
return []
# =========================================================================
# Episodic Memory (PostgreSQL)
# =========================================================================
async def save_to_episodic_memory(self, incident: Incident) -> bool:
"""
將 Incident 寫入 Episodic Memory (PostgreSQL)
使用 SQLAlchemy async session 寫入 incidents 表。
Returns:
bool: 是否成功寫入
"""
try:
async with get_db_context() as db:
# 轉換為 SQLAlchemy model
# 使用 model_dump(mode="json") 確保 datetime 正確序列化
# 從 signals 提取 alertnameADR-073 Phase 2: incidents.alertname 欄位)
_alertname = (
incident.signals[0].labels.get("alertname")
or incident.signals[0].alert_name
if incident.signals
else None
)
record = IncidentRecord(
incident_id=incident.incident_id,
status=incident.status.value,
severity=incident.severity.value,
signals=[
s.model_dump(mode="json") for s in incident.signals
],
affected_services=incident.affected_services,
decision_chain=(
incident.decision_chain.model_dump(mode="json")
if incident.decision_chain
else None
),
proposal_ids=[str(pid) for pid in incident.proposal_ids],
outcome=(
incident.outcome.model_dump(mode="json")
if incident.outcome
else None
),
created_at=incident.created_at,
updated_at=incident.updated_at,
resolved_at=incident.resolved_at,
closed_at=incident.closed_at,
ttl_days=incident.ttl_days,
vectorized=incident.vectorized,
# ADR-073 Phase 2-2: 三個分類欄位(之前遺漏未寫入 DB
alertname=_alertname,
notification_type=incident.notification_type,
alert_category=incident.alert_category,
)
db.add(record)
# commit 由 get_db_context 自動處理
logger.info(
"episodic_memory_saved",
incident_id=incident.incident_id,
table="incidents",
)
return True
except Exception as e:
logger.exception(
"episodic_memory_save_error",
incident_id=incident.incident_id,
error=str(e),
)
return False
async def get_from_episodic_memory(self, incident_id: str) -> Incident | None:
"""
從 Episodic Memory 讀取 Incident
Returns:
Incident | None: 事件資料,若不存在則返回 None
"""
try:
async with get_db_context() as db:
from sqlalchemy import select
stmt = select(IncidentRecord).where(
IncidentRecord.incident_id == incident_id
)
result = await db.execute(stmt)
record = result.scalar_one_or_none()
if record is None:
return None
# 轉換回 Pydantic model
return self._record_to_incident(record)
except Exception as e:
logger.exception(
"episodic_memory_get_error",
incident_id=incident_id,
error=str(e),
)
return None
def _record_to_incident(self, record: IncidentRecord) -> Incident:
"""
將 SQLAlchemy record 轉換為 Pydantic Incident
方案 C: 解析時正規化舊格式 Enum 值
"""
from src.models.incident import AIDecisionChain, IncidentOutcome
# 方案 C: 正規化 signals 內的舊格式 severity
signals = []
for s in (record.signals or []):
signal_data = s.copy()
if "severity" in signal_data:
signal_data["severity"] = normalize_severity(signal_data["severity"])
signals.append(Signal(**signal_data))
decision_chain = (
AIDecisionChain(**record.decision_chain)
if record.decision_chain
else None
)
outcome = (
IncidentOutcome(**record.outcome)
if record.outcome
else None
)
# 方案 C: 正規化舊格式 Enum 值
normalized_status = normalize_status(record.status)
normalized_severity = normalize_severity(record.severity)
return Incident(
incident_id=record.incident_id,
status=IncidentStatus(normalized_status),
severity=Severity(normalized_severity),
signals=signals,
affected_services=record.affected_services or [],
decision_chain=decision_chain,
proposal_ids=record.proposal_ids or [],
outcome=outcome,
created_at=record.created_at,
updated_at=record.updated_at,
resolved_at=record.resolved_at,
closed_at=record.closed_at,
ttl_days=record.ttl_days,
persisted_to_pg=True, # 從 PG 讀取,必為 True
vectorized=record.vectorized,
)
# =========================================================================
# 雙層寫入核心邏輯
# =========================================================================
async def create_incident_from_signal(
self,
signal_data: dict[str, Any],
frequency_stats: dict[str, Any] | None = None,
) -> Incident | None:
"""
從 Signal 建立 Incident 並雙層寫入
Phase 6.2 核心邏輯:
1. 建立 Incident (含 Signal)
2. 寫入 Working Memory (Redis) - 7 天 TTL
3. 寫入 Episodic Memory (PostgreSQL) - 永久保留
4. 標記 persisted_to_pg = True
Phase 21 (ADR-037) 擴展:
5. 含異常頻率統計 (用於 Tier 分級修復策略)
Args:
signal_data: 從 Redis Stream 收到的 Signal 資料
frequency_stats: ADR-037 異常頻率統計 (可選)
Returns:
Incident | None: 成功返回 Incident失敗返回 None
"""
try:
# 0. 去抖動 (Debounce) - 防止告警風暴
fingerprint = signal_data.get("fingerprint")
if fingerprint:
try:
redis_client = get_redis()
debounce_key = f"debounce:{fingerprint}"
# SETNX 若成功表示是新的,給予 3 分鐘 TTL (180s)
is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True)
if not is_new:
logger.info(
"incident_debounced",
fingerprint=fingerprint,
reason="Duplicate signal within 3 minutes",
)
return None
except Exception as e:
logger.warning("incident_debounce_redis_error", error=str(e))
# 1. 解析 Signal
signal = Signal(
alert_name=signal_data.get("alert_name", "unknown"),
severity=self._parse_severity(signal_data.get("severity", "warning")),
source=self._parse_source(signal_data.get("source", "manual")),
fired_at=datetime.now(UTC),
labels=self._parse_dict(signal_data.get("labels", "{}")),
annotations=self._parse_dict(signal_data.get("annotations", "{}")),
fingerprint=signal_data.get("fingerprint"),
)
# 2. 建立 Incident (含頻率統計)
# ADR-037: 統帥指示「重啟只是治標,太常發生的異常必須徹底解決」
from src.models.incident import IncidentFrequencyStats
freq_stats = None
if frequency_stats:
freq_stats = IncidentFrequencyStats(
anomaly_key=frequency_stats.get("anomaly_key", "unknown"),
count_1h=frequency_stats.get("count_1h", 0),
count_24h=frequency_stats.get("count_24h", 0),
count_7d=frequency_stats.get("count_7d", 0),
count_30d=frequency_stats.get("count_30d", 0),
escalation_level=frequency_stats.get("escalation_level"),
auto_repair_count=frequency_stats.get("auto_repair_count", 0),
)
incident = Incident(
severity=signal.severity,
signals=[signal],
affected_services=[signal_data.get("target", "unknown")],
frequency_stats=freq_stats,
)
logger.info(
"incident_created",
incident_id=incident.incident_id,
severity=incident.severity.value,
signal_count=len(incident.signals),
)
# 3. 寫入 Working Memory (Redis)
redis_success = await self.save_to_working_memory(incident)
# 4. 寫入 Episodic Memory (PostgreSQL)
pg_success = await self.save_to_episodic_memory(incident)
# 5. 更新狀態
if pg_success:
incident.persisted_to_pg = True
# 更新 Redis 中的狀態
if redis_success:
await self.save_to_working_memory(incident)
# 6. 記錄雙層寫入結果
logger.info(
"dual_layer_memory_result",
incident_id=incident.incident_id,
redis_success=redis_success,
pg_success=pg_success,
persisted_to_pg=incident.persisted_to_pg,
)
return incident
except Exception as e:
logger.exception(
"create_incident_error",
error=str(e),
)
return None
def _parse_source(
self,
source_str: str,
) -> Literal["prometheus", "signoz", "alertmanager", "manual", "telegram"]:
"""
解析來源字串,映射到 Signal 允許的 Literal 值
不在白名單中的來源一律映射為 'manual'
"""
valid_sources = {"prometheus", "signoz", "alertmanager", "manual", "telegram"}
if source_str.lower() in valid_sources:
return source_str.lower() # type: ignore
return "manual"
def _parse_severity(self, severity_str: str) -> Severity:
"""解析嚴重度字串"""
mapping = {
"critical": Severity.P0,
"high": Severity.P1,
"warning": Severity.P2,
"medium": Severity.P2,
"low": Severity.P3,
"info": Severity.P3,
}
return mapping.get(severity_str.lower(), Severity.P2)
def _parse_dict(self, value: str | dict) -> dict[str, str]:
"""解析字典字串或字典"""
if isinstance(value, dict):
return {str(k): str(v) for k, v in value.items()}
if isinstance(value, str):
try:
# 嘗試解析 JSON
parsed = json.loads(value.replace("'", '"'))
return {str(k): str(v) for k, v in parsed.items()}
except (json.JSONDecodeError, TypeError):
return {}
return {}
# =========================================================================
# Phase 17 P0: Router 層違規修復 - 新增方法
# =========================================================================
async def update_outcome(
self,
incident_id: str,
effectiveness_score: int | None = None,
human_feedback: str | None = None,
learning_notes: str | None = None,
should_remember: bool = True,
) -> Incident | None:
"""
更新 Incident 的 outcome (人類回饋)
Phase 17: 從 Router 層遷移至 Service 層
Args:
incident_id: 事件 ID
effectiveness_score: 有效性評分 (1-5)
human_feedback: 文字回饋
learning_notes: 學習筆記
should_remember: 是否納入長期記憶
Returns:
Incident | None: 更新後的事件,失敗返回 None
"""
from src.models.incident import IncidentOutcome
from src.repositories.incident_repository import get_incident_repository
from src.utils.timezone import now_taipei
# 1. 從 Working Memory 讀取
incident = await self.get_from_working_memory(incident_id)
if incident is None:
logger.warning("incident_not_found_for_outcome", incident_id=incident_id)
return None
# 2. 更新 outcome
if incident.outcome is None:
incident.outcome = IncidentOutcome()
if effectiveness_score is not None:
incident.outcome.effectiveness_score = effectiveness_score
if human_feedback is not None:
incident.outcome.human_feedback = human_feedback
if learning_notes is not None:
incident.outcome.learning_notes = learning_notes
incident.outcome.should_remember = should_remember
incident.updated_at = now_taipei()
# 3. 寫入 Working Memory
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("outcome_redis_write_failed", incident_id=incident_id)
return None
# 4. 同步到 Episodic Memory (PostgreSQL)
try:
repo = get_incident_repository()
await repo.update_outcome(
incident_id=incident_id,
outcome=incident.outcome.model_dump(mode="json"),
updated_at=now_taipei(),
)
logger.info("outcome_db_updated", incident_id=incident_id)
except Exception as e:
logger.warning(
"outcome_db_update_failed",
incident_id=incident_id,
error=str(e),
)
# DB 失敗不影響主流程
return incident
async def resolve_incident(self, incident_id: str) -> Incident | None:
"""
將 Incident 狀態更新為 RESOLVED
Phase 17: 從 Router 層遷移至 Service 層
Args:
incident_id: 事件 ID
Returns:
Incident | None: 更新後的事件,失敗返回 None
"""
from src.repositories.incident_repository import get_incident_repository
from src.utils.timezone import now_taipei
# 1. 從 Working Memory 讀取
incident = await self.get_from_working_memory(incident_id)
if incident is None:
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
return None
# 2. 更新狀態
incident.status = IncidentStatus.RESOLVED
incident.resolved_at = now_taipei()
incident.updated_at = now_taipei()
# 3. 寫入 Working Memory
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
# 4. 同步到 Episodic Memory
try:
repo = get_incident_repository()
await repo.update_status(
incident_id=incident_id,
status="resolved",
updated_at=now_taipei(),
)
logger.info("resolve_db_updated", incident_id=incident_id)
except Exception as e:
logger.warning(
"resolve_db_update_failed",
incident_id=incident_id,
error=str(e),
)
# KB Phase 2-A: 自動萃取 KB 草稿 (fire-and-forget, 2026-04-03 ogt)
try:
import asyncio
from src.services.knowledge_extractor_service import get_knowledge_extractor
asyncio.create_task(
get_knowledge_extractor().extract_from_incident(incident)
)
except Exception:
logger.exception("kb_extract_task_create_failed", incident_id=incident_id)
# ADR-073 Phase 4-2: resolve 時觸發 KM conversion (2026-04-12 ogt)
# 將已解決的 Incident 轉換為結構化 KM 條目,驅動飛輪學習固化節點
try:
import asyncio
from src.services.km_conversion_service import get_km_conversion_service
asyncio.create_task(
get_km_conversion_service().convert(incident)
)
except Exception:
logger.exception("km_conversion_task_create_failed", incident_id=incident_id)
# 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷
# I1+S1 Fix: 委託 derive_key_from_incident() 統一推導
try:
from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter
counter = get_anomaly_counter()
anomaly_key = AnomalyCounter.derive_key_from_incident(incident)
if anomaly_key:
disposition = await counter.get_disposition_stats(anomaly_key)
has_system_resolution = (
disposition["auto_repair"] > 0
or disposition["human_approved"] > 0
or disposition["cold_start_trust"] > 0
)
if not has_system_resolution:
await counter.record_disposition(anomaly_key, "manual_resolved")
except Exception as _disp_e:
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
# MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝
# Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成
# 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑
try:
import asyncio
from src.services.report_generation_service import get_report_generation_service
alertname = (
incident.signals[0].labels.get("alertname", "UnknownAlert")
if incident.signals else "UnknownAlert"
)
title = f"{alertname}{', '.join(incident.affected_services or ['N/A'])}"
root_cause = None
resolution_action = None
ai_provider = None
auto_repaired = False
if incident.decision_chain:
root_cause = incident.decision_chain.hypothesis
ai_provider = incident.decision_chain.model_used
if incident.outcome:
resolution_action = (incident.outcome.learning_notes or None)
auto_repaired = bool(incident.outcome.execution_success)
asyncio.create_task(
get_report_generation_service().trigger_postmortem(
incident_id=incident.incident_id,
title=title,
created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at,
resolved_at=incident.resolved_at,
root_cause=root_cause,
resolution_action=resolution_action,
ai_provider=ai_provider,
auto_repaired=auto_repaired,
)
)
except Exception as _pm_e:
logger.exception("postmortem_trigger_failed",
incident_id=incident_id, error=str(_pm_e))
return incident
async def find_by_proposal_id(self, proposal_id: str) -> Incident | None:
"""
根據 proposal_id 查找關聯的 Incident
Phase 17: 從 Router 層遷移至 Service 層
Args:
proposal_id: 提案 ID (UUID 字串)
Returns:
Incident | None: 找到的事件,未找到返回 None
"""
from uuid import UUID
redis_client = get_redis()
try:
target_uuid = UUID(proposal_id)
async for key in redis_client.scan_iter(
match=f"{INCIDENT_KEY_PREFIX}INC-*",
count=100,
):
data = await redis_client.get(key)
if data is None:
continue
try:
# 方案 C: 正規化舊格式 Enum 值
incident_dict = json.loads(data)
if "status" in incident_dict:
incident_dict["status"] = normalize_status(incident_dict["status"])
if "severity" in incident_dict:
incident_dict["severity"] = normalize_severity(incident_dict["severity"])
for signal in incident_dict.get("signals", []):
if "severity" in signal:
signal["severity"] = normalize_severity(signal["severity"])
incident = Incident.model_validate(incident_dict)
if target_uuid in incident.proposal_ids:
return incident
except Exception as e:
logger.warning(
"incident_parse_error_in_find",
key=key,
error=str(e),
)
continue
return None
except Exception as e:
logger.exception(
"find_by_proposal_id_error",
proposal_id=proposal_id,
error=str(e),
)
return None
async def trigger_reanalysis(self, incident_id: str) -> dict:
"""
觸發 Incident 重診 (ADR-050 P2: reanalyze button)
去重保護:同一 incident 10 分鐘內只觸發一次。
觸發後將 incident status 標記為 analyzing等待 AI 自動接手。
Args:
incident_id: Incident ID
Returns:
dict: {
"triggered": bool,
"message": str,
"already_analyzing": bool,
}
2026-04-01 Claude Code (ADR-050 P2): reanalyze button handler
"""
REANALYZE_TTL_SECONDS = 600 # 10 分鐘去重 TTL (ADR-050)
dedup_key = f"reanalyze_dedup:{incident_id}"
try:
redis_client = get_redis()
# 去重檢查 (SETNX: 只有第一次設定會成功)
is_new = await redis_client.set(dedup_key, "1", ex=REANALYZE_TTL_SECONDS, nx=True)
if not is_new:
logger.info(
"reanalyze_deduplicated",
incident_id=incident_id,
reason="Already triggered within 10 minutes",
)
return {
"triggered": False,
"message": "重診已在進行中,請 10 分鐘後再試",
"already_analyzing": True,
}
# 從 Working Memory 取得 Incident
incident = await self.get_from_working_memory(incident_id)
if not incident:
incident = await self.get_from_episodic_memory(incident_id)
if not incident:
# 刪除剛設定的去重 key讓下次能重試
await redis_client.delete(dedup_key)
logger.warning("reanalyze_incident_not_found", incident_id=incident_id)
return {
"triggered": False,
"message": f"找不到事件 {incident_id}",
"already_analyzing": False,
}
# 標記 status 為 analyzing讓 AI 引擎接手)
# 使用延遲 import 避免循環依賴(同 create_incident_from_signal 模式)
from src.models.incident import IncidentStatus
# 使用 INVESTIGATING 若 ANALYZING 不存在
analyzing_status = getattr(IncidentStatus, "ANALYZING", None) or getattr(IncidentStatus, "INVESTIGATING", None)
if analyzing_status:
incident.status = analyzing_status
await self.save_to_working_memory(incident)
logger.info(
"reanalyze_triggered",
incident_id=incident_id,
severity=incident.severity.value,
)
return {
"triggered": True,
"message": "重診已排程AI 正在分析中",
"already_analyzing": False,
}
except Exception as e:
logger.exception("reanalyze_failed", incident_id=incident_id, error=str(e))
return {
"triggered": False,
"message": f"重診觸發失敗: {str(e)[:80]}",
"already_analyzing": False,
}
# =============================================================================
# Singleton
# =============================================================================
_incident_service: IncidentService | None = None
def get_incident_service() -> IncidentService:
"""取得 Incident Service 實例 (Singleton)"""
global _incident_service
if _incident_service is None:
_incident_service = IncidentService()
return _incident_service