Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m59s
Wave 8 P3.1-T2 PathA 啟用 + Solver F4 安全強化 + test 對齊:
PathA — DiagnosisAggregator 信號分類層補 PDI:
- ENABLE_DIAGNOSIS_AGGREGATOR default=False → True
· PathA 純信號分類層(OOMKilled/CrashLoop 等業務邏輯)
· 不重複呼叫 K8s/SignOz API(只取 PDI 已收集的 raw 資料)
· 安全 default on — 純邏輯處理,無外部依賴重疊
- diagnosis_aggregator.py +155 行(PathA 實作)
- pre_decision_investigator.py 已接 (commit 3a2cd151)
F4 — Solver critical risk reject:
- solver_agent.py: _validate_recommended_action 拒絕 risk=critical
· 鐵律:critical 動作必須走人工審批,不可變 Telegram 按鈕
· log warning + return None(被 _extract 過濾掉)
- _extract_recommended_actions 改返回 (list, status_str) tuple
· status="ok"/"empty"/"all_invalid" 供呼叫端決策
- protocol.py +16 / metrics.py +9 / ai_router.py +18 — 配套 metric + protocol field
測試對齊:
- test_solver_recommended_actions.py 拆 test_all_valid → low/medium/high accepted +
test_critical_rejected
- result tuple unpack: result, _ = _extract_recommended_actions(...)
- test_diagnosis_aggregator_stub.py: feature flag default 改 True 對齊 PathA
Tests: 51 passed (solver 28 + aggregator 16 + router fallback 8)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Multiple Engineers (Wave 8 P3.1-T2 PathA + F4) <noreply@anthropic.com>
746 lines
26 KiB
Python
746 lines
26 KiB
Python
"""
|
||
Diagnosis Aggregator - Phase 2 診斷資料整合層
|
||
==============================================
|
||
ADR-030: 智能自動修復系統
|
||
|
||
整合多來源診斷資料:
|
||
- K8s Diagnostics: Pod Events, Logs, Resource Usage
|
||
- SignOz Metrics: Gold Metrics, Error Logs
|
||
- Expert Rules: 規則匹配與診斷建議
|
||
|
||
設計原則:
|
||
- 非同步並行收集,最大化效能
|
||
- 錯誤容忍,部分失敗不影響整體
|
||
- 提供結構化 Context 給 LLM 分析
|
||
|
||
版本: v1.0
|
||
建立: 2026-03-26 (台北時區)
|
||
"""
|
||
|
||
import asyncio
import re
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any

import structlog

from src.services.k8s_diagnostics import (
    K8sDiagnostics,
    get_k8s_diagnostics_service,
)
from src.services.signoz_client import (
    GoldMetrics,
    get_signoz_client,
)
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Diagnosis Severity
|
||
# =============================================================================
|
||
|
||
|
||
class DiagnosisSeverity(str, Enum):
    """Severity attached to a diagnosis signal, listed from most to least urgent."""

    # Needs immediate handling (service outage, risk of data loss).
    CRITICAL = "critical"
    # Handle within 1 hour (severe performance degradation).
    HIGH = "high"
    # Handle within 24 hours (abnormal, but the service is still usable).
    MEDIUM = "medium"
    # Track and observe (minor anomaly).
    LOW = "low"
    # Informational only; no handling required.
    INFO = "info"
|
||
|
||
|
||
# =============================================================================
|
||
# Data Models
|
||
# =============================================================================
|
||
|
||
|
||
@dataclass
class DiagnosisSignal:
    """A single finding derived from one diagnostic data source."""

    # Where the finding came from: k8s_events, k8s_logs, signoz_metrics,
    # signoz_logs, expert_rules.
    source: str
    # What was found: oom_killed, crash_loop, high_error_rate, etc.
    signal_type: str
    severity: DiagnosisSeverity
    message: str
    # Supporting data for the finding.
    evidence: dict[str, Any] = field(default_factory=dict)
    # Defaults to the timezone-aware UTC time at construction.
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view with the severity as its string value and an ISO-format timestamp."""
        return dict(
            source=self.source,
            signal_type=self.signal_type,
            severity=self.severity.value,
            message=self.message,
            evidence=self.evidence,
            timestamp=self.timestamp.isoformat(),
        )
|
||
|
||
|
||
@dataclass
class DiagnosisContext:
    """
    Diagnosis context combining the data gathered from every source.

    This is the complete context handed to the LLM for analysis.
    """

    # Severities from most to least urgent. Deliberately un-annotated so the
    # dataclass machinery treats it as a class constant rather than a field.
    _SEVERITY_ORDER = (
        DiagnosisSeverity.CRITICAL,
        DiagnosisSeverity.HIGH,
        DiagnosisSeverity.MEDIUM,
        DiagnosisSeverity.LOW,
        DiagnosisSeverity.INFO,
    )

    # Identification
    target: str  # Pod name, Service name, etc.
    namespace: str = "awoooi-prod"
    collected_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    # Raw diagnostic data
    k8s_diagnostics: K8sDiagnostics | None = None
    gold_metrics: GoldMetrics | None = None
    error_logs: list[dict] = field(default_factory=list)

    # Signals (findings derived from each source)
    signals: list[DiagnosisSignal] = field(default_factory=list)

    # Expert System match result
    expert_match: dict[str, Any] | None = None

    # Errors encountered while collecting data
    collection_errors: list[str] = field(default_factory=list)

    @classmethod
    def _severity_rank(cls, severity: DiagnosisSeverity) -> int:
        """Return the urgency position of *severity* (0 = most urgent); unknown values rank last."""
        try:
            return cls._SEVERITY_ORDER.index(severity)
        except ValueError:
            return len(cls._SEVERITY_ORDER)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dict (for JSON serialization).

        Error logs are summarized as a count only; ``gold_metrics`` is reduced
        to rps, error_rate and p99_latency_ms.
        """
        metrics = self.gold_metrics
        gold_metrics_view = None
        if metrics:
            gold_metrics_view = {
                "rps": metrics.rps,
                "error_rate": metrics.error_rate,
                "p99_latency_ms": metrics.p99_latency_ms,
            }

        return {
            "target": self.target,
            "namespace": self.namespace,
            "collected_at": self.collected_at.isoformat(),
            "k8s_diagnostics": self.k8s_diagnostics.to_dict() if self.k8s_diagnostics else None,
            "gold_metrics": gold_metrics_view,
            "error_logs_count": len(self.error_logs),
            "signals": [s.to_dict() for s in self.signals],
            "expert_match": self.expert_match,
            "collection_errors": self.collection_errors,
        }

    @property
    def highest_severity(self) -> DiagnosisSeverity:
        """Return the most urgent severity among the signals, or INFO when there are none."""
        if not self.signals:
            return DiagnosisSeverity.INFO

        for level in self._SEVERITY_ORDER:
            if any(s.severity == level for s in self.signals):
                return level

        return DiagnosisSeverity.INFO

    def get_llm_prompt_context(self) -> str:
        """
        Build the prompt context used for LLM analysis.

        Each available piece of diagnostic data becomes its own section;
        sections are joined with blank lines. Sections whose data is missing
        or empty are omitted.
        """
        sections = []

        # 1. Target info
        sections.append(f"## 診斷目標\n- Target: {self.target}\n- Namespace: {self.namespace}")

        # 2. K8s diagnostics
        if self.k8s_diagnostics:
            k8s_summary = self.k8s_diagnostics.get_diagnosis_summary()
            sections.append(f"## K8s 診斷\n{k8s_summary}")

            # Warning event details (first 5, message truncated to 150 chars)
            if self.k8s_diagnostics.warning_events:
                events_text = "\n".join(
                    f"- [{e.reason}] {e.message[:150]}"
                    for e in self.k8s_diagnostics.warning_events[:5]
                )
                sections.append(f"## K8s 警告事件\n{events_text}")

        # 3. Gold metrics
        if self.gold_metrics:
            sections.append(f"## SignOz 黃金指標\n{self.gold_metrics.to_summary()}")

        # 4. Error logs (first 5). A log entry may carry message=None, so
        # normalize before slicing instead of relying on the .get() default.
        if self.error_logs:
            log_text = "\n".join(
                f"- [{log.get('severity', 'ERROR')}] {str(log.get('message') or '')[:100]}"
                for log in self.error_logs[:5]
            )
            sections.append(f"## 錯誤日誌 (最近 {len(self.error_logs)} 筆)\n{log_text}")

        # 5. Signals, most urgent first. Sorting on the severity *string* would
        # be alphabetical (critical, high, info, low, medium) and misplace INFO,
        # so rank explicitly. The sort is stable, so ties keep insertion order.
        if self.signals:
            ordered = sorted(self.signals, key=lambda sig: self._severity_rank(sig.severity))
            signals_text = "\n".join(
                f"- [{s.severity.value.upper()}] {s.source}: {s.message}"
                for s in ordered
            )
            sections.append(f"## 診斷信號\n{signals_text}")

        # 6. Expert match
        if self.expert_match:
            sections.append(
                f"## Expert System 匹配\n"
                f"- 規則: {self.expert_match.get('rule_name', 'N/A')}\n"
                f"- 說明: {self.expert_match.get('description', 'N/A')}\n"
                f"- 風險: {self.expert_match.get('risk_level', 'N/A')}\n"
                f"- 推理: {self.expert_match.get('reasoning', 'N/A')}"
            )

        return "\n\n".join(sections)
|
||
|
||
|
||
# =============================================================================
|
||
# Diagnosis Aggregator Service
|
||
# =============================================================================
|
||
|
||
|
||
class DiagnosisAggregator:
    """
    Diagnostic data aggregator.

    Combines diagnostic data from K8s, SignOz and the Expert System into a
    single DiagnosisContext for the LLM or decision engine to consume.
    """

    # Heuristic for Deployment-managed pod names:
    # <workload>-<replicaset hash>-<5-char pod suffix>.
    _DEPLOYMENT_POD_RE = re.compile(r"(?P<workload>.+)-[a-z0-9]{5,10}-[a-z0-9]{5}")

    # Look-back window of the error-log query; also quoted in the
    # frequent_errors signal message so the two cannot drift apart.
    _ERROR_LOG_WINDOW_MINUTES = 30

    # Substrings (matched case-insensitively) that mark a raw log line as an error.
    _ERROR_KEYWORDS = ("ERROR", "FATAL", "CRITICAL", "EXCEPTION", "TRACEBACK")

    def __init__(self):
        """Obtain the K8s diagnostics service and the SignOz client from their getters."""
        self.k8s_service = get_k8s_diagnostics_service()
        self.signoz_client = get_signoz_client()

    async def collect_pod_diagnosis(
        self,
        pod_name: str,
        namespace: str = "awoooi-prod",
        include_signoz: bool = True,
        include_error_logs: bool = True,
        expert_match: dict | None = None,
    ) -> DiagnosisContext:
        """
        Collect the full diagnosis for a Pod.

        Args:
            pod_name: Pod name (partial match supported)
            namespace: Namespace
            include_signoz: Whether to include SignOz metrics
            include_error_logs: Whether to include error logs
            expert_match: Expert System match result

        Returns:
            DiagnosisContext: the populated diagnosis context. Collection
            failures are recorded in ``collection_errors`` rather than raised.
        """
        context = DiagnosisContext(
            target=pod_name,
            namespace=namespace,
            expert_match=expert_match,
        )

        # Both optional collectors query by service name, so derive it once up
        # front. Binding it only inside the include_signoz branch left it
        # undefined when error logs were requested without metrics.
        service_name = self._pod_to_service_name(pod_name)

        # K8s diagnostics are always collected; the SignOz ones are optional.
        collectors = [self._collect_k8s_diagnostics(context, pod_name, namespace)]
        if include_signoz:
            collectors.append(self._collect_signoz_metrics(context, service_name))
        if include_error_logs:
            collectors.append(self._collect_error_logs(context, service_name))

        await self._run_collectors(context, collectors)

        # Turn the collected raw data into signals.
        self._analyze_signals(context)

        logger.info(
            "diagnosis_collected",
            target=pod_name,
            signals_count=len(context.signals),
            highest_severity=context.highest_severity.value,
            errors_count=len(context.collection_errors),
        )

        return context

    async def collect_service_diagnosis(
        self,
        service_name: str,
        namespace: str = "awoooi-prod",
        expert_match: dict | None = None,
    ) -> DiagnosisContext:
        """
        Collect the diagnosis for a Service (no specific Pod).

        Mainly used to analyse service-level monitoring alerts. Only SignOz
        metrics and error logs are gathered.
        """
        context = DiagnosisContext(
            target=service_name,
            namespace=namespace,
            expert_match=expert_match,
        )

        await self._run_collectors(
            context,
            [
                self._collect_signoz_metrics(context, service_name),
                self._collect_error_logs(context, service_name),
            ],
        )

        self._analyze_signals(context)

        return context

    # =========================================================================
    # Private Collection Methods
    # =========================================================================

    async def _run_collectors(
        self,
        context: DiagnosisContext,
        collectors: list[Any],
    ) -> None:
        """Run collector coroutines concurrently and record anything that escaped them.

        The collectors already catch ``Exception`` themselves; whatever still
        comes back from ``gather`` as an exception object is recorded instead
        of being dropped, so a partial failure stays visible.
        """
        outcomes = await asyncio.gather(*collectors, return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                context.collection_errors.append(f"Collector failed: {outcome!r}")
                logger.warning("diagnosis_collector_failed", error=repr(outcome))

    async def _collect_k8s_diagnostics(
        self,
        context: DiagnosisContext,
        pod_name: str,
        namespace: str,
    ) -> None:
        """Collect K8s diagnostics (including current and previous logs) into *context*."""
        try:
            diagnostics = await self.k8s_service.collect_diagnostics(
                pod_name=pod_name,
                namespace=namespace,
                include_logs=True,
                include_previous_logs=True,
                log_tail_lines=100,
            )
            context.k8s_diagnostics = diagnostics

            # Propagate errors the K8s service reported for itself.
            if diagnostics.errors:
                context.collection_errors.extend(
                    [f"k8s: {e}" for e in diagnostics.errors]
                )

        except Exception as e:
            # Best effort: a failing source must not abort the whole diagnosis.
            error_msg = f"K8s diagnostics failed: {e}"
            context.collection_errors.append(error_msg)
            logger.warning("k8s_diagnostics_collection_failed", error=str(e))

    async def _collect_signoz_metrics(
        self,
        context: DiagnosisContext,
        service_name: str,
    ) -> None:
        """Collect SignOz gold metrics over a 10-minute window into *context*."""
        try:
            metrics = await self.signoz_client.get_gold_metrics(
                service_name=service_name,
                namespace=context.namespace,
                time_window_minutes=10,
            )
            context.gold_metrics = metrics

        except Exception as e:
            # Best effort: a failing source must not abort the whole diagnosis.
            error_msg = f"SignOz metrics failed: {e}"
            context.collection_errors.append(error_msg)
            logger.warning("signoz_metrics_collection_failed", error=str(e))

    async def _collect_error_logs(
        self,
        context: DiagnosisContext,
        service_name: str,
    ) -> None:
        """Collect recent ERROR/FATAL/CRITICAL logs (at most 20 requested) into *context*."""
        try:
            logs = await self.signoz_client.get_logs(
                service_name=service_name,
                severity="ERROR,FATAL,CRITICAL",
                time_window_minutes=self._ERROR_LOG_WINDOW_MINUTES,
                limit=20,
            )
            context.error_logs = logs

        except Exception as e:
            # Best effort: a failing source must not abort the whole diagnosis.
            error_msg = f"Error logs failed: {e}"
            context.collection_errors.append(error_msg)
            logger.warning("error_logs_collection_failed", error=str(e))

    # =========================================================================
    # Signal Analysis
    # =========================================================================

    def _analyze_signals(self, context: DiagnosisContext) -> None:
        """Analyse whatever data *context* holds and append the resulting signals."""
        # 1. K8s signals
        if context.k8s_diagnostics:
            self._analyze_k8s_signals(context, context.k8s_diagnostics)

        # 2. SignOz metrics signals
        if context.gold_metrics:
            self._analyze_metrics_signals(context, context.gold_metrics)

        # 3. Error log signals
        if context.error_logs:
            self._analyze_log_signals(context, context.error_logs)

    def _analyze_k8s_signals(
        self,
        context: DiagnosisContext,
        k8s: K8sDiagnostics,
    ) -> None:
        """Derive signals from K8s pod status, resource usage and recent warning events."""
        status = k8s.pod_status
        if status:
            # CrashLoopBackOff
            if status.is_crash_loop():
                context.signals.append(DiagnosisSignal(
                    source="k8s_status",
                    signal_type="crash_loop",
                    severity=DiagnosisSeverity.CRITICAL,
                    message=f"Pod {k8s.pod_name} is in CrashLoopBackOff state",
                    evidence={
                        "restart_count": status.restart_count,
                        "container_statuses": status.container_statuses,
                    },
                ))

            # Image pull error
            if status.is_image_pull_error():
                context.signals.append(DiagnosisSignal(
                    source="k8s_status",
                    signal_type="image_pull_error",
                    severity=DiagnosisSeverity.HIGH,
                    message=f"Pod {k8s.pod_name} has image pull error",
                    evidence={
                        "container_statuses": status.container_statuses,
                    },
                ))

            # High restart count
            if status.restart_count > 5:
                context.signals.append(DiagnosisSignal(
                    source="k8s_status",
                    signal_type="high_restart_count",
                    severity=DiagnosisSeverity.MEDIUM,
                    message=f"Pod {k8s.pod_name} has high restart count: {status.restart_count}",
                    evidence={
                        "restart_count": status.restart_count,
                    },
                ))

        # High resource usage (both checks use a threshold of 80)
        usage = k8s.resource_usage
        if usage:
            if usage.is_cpu_high(threshold=80):
                context.signals.append(DiagnosisSignal(
                    source="k8s_metrics",
                    signal_type="high_cpu",
                    severity=DiagnosisSeverity.MEDIUM,
                    message=f"High CPU usage: {usage.cpu_percent:.1f}%",
                    evidence=usage.to_dict(),
                ))

            if usage.is_memory_high(threshold=80):
                context.signals.append(DiagnosisSignal(
                    source="k8s_metrics",
                    signal_type="high_memory",
                    severity=DiagnosisSeverity.HIGH,
                    message=f"High memory usage: {usage.memory_percent:.1f}%",
                    evidence=usage.to_dict(),
                ))

        # Warning events from the last 15 minutes only
        for event in k8s.warning_events:
            if not event.is_recent(minutes=15):
                continue

            reason = event.reason.lower()
            # OOMKilled takes precedence over FailedScheduling for one event.
            if "oom" in event.message.lower() or "oomkilled" in reason:
                context.signals.append(DiagnosisSignal(
                    source="k8s_events",
                    signal_type="oom_killed",
                    severity=DiagnosisSeverity.CRITICAL,
                    message=f"OOMKilled detected: {event.message[:100]}",
                    evidence=event.to_dict(),
                ))
            elif "failedscheduling" in reason:
                context.signals.append(DiagnosisSignal(
                    source="k8s_events",
                    signal_type="failed_scheduling",
                    severity=DiagnosisSeverity.HIGH,
                    message=f"Failed to schedule: {event.message[:100]}",
                    evidence=event.to_dict(),
                ))

    def _analyze_metrics_signals(
        self,
        context: DiagnosisContext,
        metrics: GoldMetrics,
    ) -> None:
        """Derive signals from SignOz gold metrics (error rate, P99 latency, traffic)."""
        # High error rate (> 5%); CRITICAL above 20%
        if metrics.error_rate > 5:
            context.signals.append(DiagnosisSignal(
                source="signoz_metrics",
                signal_type="high_error_rate",
                severity=DiagnosisSeverity.CRITICAL if metrics.error_rate > 20 else DiagnosisSeverity.HIGH,
                message=f"High error rate: {metrics.error_rate:.2f}%",
                evidence={
                    "error_rate": metrics.error_rate,
                    "error_count": metrics.error_count,
                    "total_requests": metrics.total_requests,
                },
            ))

        # High latency (P99 > 5s); HIGH from 10s upward
        if metrics.p99_latency_ms > 5000:
            context.signals.append(DiagnosisSignal(
                source="signoz_metrics",
                signal_type="high_latency",
                severity=DiagnosisSeverity.MEDIUM if metrics.p99_latency_ms < 10000 else DiagnosisSeverity.HIGH,
                message=f"High P99 latency: {metrics.p99_latency_ms:.0f}ms",
                evidence={
                    "p50_ms": metrics.p50_latency_ms,
                    "p95_ms": metrics.p95_latency_ms,
                    "p99_ms": metrics.p99_latency_ms,
                },
            ))

        # Low/no traffic
        if metrics.rps < 0.01 and metrics.total_requests < 10:
            context.signals.append(DiagnosisSignal(
                source="signoz_metrics",
                signal_type="no_traffic",
                severity=DiagnosisSeverity.LOW,
                message=f"Low/No traffic detected: {metrics.rps:.2f} RPS",
                evidence={
                    "rps": metrics.rps,
                    "total_requests": metrics.total_requests,
                },
            ))

    def _analyze_log_signals(
        self,
        context: DiagnosisContext,
        logs: list[dict],
    ) -> None:
        """Emit a frequent_errors signal when more than 10 error log entries were collected."""
        if not logs:
            return

        error_count = len(logs)
        if error_count <= 10:
            return

        # NOTE(review): _collect_error_logs requests limit=20, so on that path
        # the HIGH branch (>= 50) looks unreachable unless the client returns
        # more than requested — confirm the intended limit/threshold pairing.
        #
        # Sample the first few messages; an entry may carry message=None, so
        # normalize before slicing.
        sample_messages = [str(log.get("message") or "")[:100] for log in logs[:3]]
        context.signals.append(DiagnosisSignal(
            source="signoz_logs",
            signal_type="frequent_errors",
            severity=DiagnosisSeverity.MEDIUM if error_count < 50 else DiagnosisSeverity.HIGH,
            message=(
                f"Frequent errors detected: {error_count} errors "
                f"in last {self._ERROR_LOG_WINDOW_MINUTES} minutes"
            ),
            evidence={
                "error_count": error_count,
                "sample_messages": sample_messages,
            },
        ))

    def classify_signals_from_raw(
        self,
        k8s_data: dict | None = None,
        logs_data: str | None = None,
        metrics_data: dict | None = None,
    ) -> list[DiagnosisSignal]:
        """
        2026-04-27 P3.1-T2-PathA by Claude — signal classification layer for PDI.

        Pure-logic classification: takes raw data PDI has already collected and
        classifies it. Calls no external API (K8s/SignOz) and collects nothing.

        Args:
            k8s_data: EvidenceSnapshot.k8s_state (D1, dict already collected by PDI)
            logs_data: EvidenceSnapshot.recent_logs (D2, sanitized string)
            metrics_data: EvidenceSnapshot.metrics_snapshot (D3, dict already collected by PDI)

        Returns:
            list[DiagnosisSignal]: classified signals (empty list means no anomaly).
            Inputs that are missing, empty or of the wrong type are skipped.
        """
        # Throwaway context that only serves as a signal sink (no IO triggered).
        ctx = DiagnosisContext(target="_classify_only")

        # D1: classify straight from known dict keys rather than rebuilding a
        # K8sDiagnostics object from the dict.
        if k8s_data and isinstance(k8s_data, dict):
            self._classify_k8s_dict_signals(ctx, k8s_data)

        # D3: metrics_snapshot, analysed like GoldMetrics
        if metrics_data and isinstance(metrics_data, dict):
            self._classify_metrics_dict_signals(ctx, metrics_data)

        # D2: logs string, classified by error-line count
        if logs_data and isinstance(logs_data, str):
            self._classify_log_string_signals(ctx, logs_data)

        return ctx.signals

    def _classify_k8s_dict_signals(
        self,
        context: DiagnosisContext,
        k8s_data: dict,
    ) -> None:
        """
        2026-04-27 P3.1-T2-PathA by Claude — extract signals from the PDI k8s_state dict.

        Reads the ``phase``, ``reason`` and ``restart_count`` keys directly;
        does not depend on a K8sDiagnostics object.
        """
        phase = str(k8s_data.get("phase", "")).lower()
        reason = str(k8s_data.get("reason", "")).lower()

        # CrashLoopBackOff
        if "crashloop" in phase or "crashloopbackoff" in reason:
            context.signals.append(DiagnosisSignal(
                source="k8s_state",
                signal_type="crash_loop",
                severity=DiagnosisSeverity.CRITICAL,
                message=f"CrashLoopBackOff detected (phase={k8s_data.get('phase', '?')})",
                evidence={"phase": k8s_data.get("phase"), "reason": k8s_data.get("reason")},
            ))

        # OOMKilled ("oom" in reason already covers "oomkilled" in reason)
        if "oomkilled" in phase or "oom" in reason:
            context.signals.append(DiagnosisSignal(
                source="k8s_state",
                signal_type="oom_killed",
                severity=DiagnosisSeverity.CRITICAL,
                message=f"OOMKilled detected (reason={k8s_data.get('reason', '?')})",
                evidence={"phase": k8s_data.get("phase"), "reason": k8s_data.get("reason")},
            ))

        # Image pull error
        if "imagepullerr" in reason or "errimagepull" in reason or "imagepullbackoff" in reason:
            context.signals.append(DiagnosisSignal(
                source="k8s_state",
                signal_type="image_pull_error",
                severity=DiagnosisSeverity.HIGH,
                message=f"Image pull error (reason={k8s_data.get('reason', '?')})",
                evidence={"reason": k8s_data.get("reason")},
            ))

        # High restart count. Values that cannot be converted count as 0;
        # OverflowError covers int(float("inf")).
        try:
            rc = int(k8s_data.get("restart_count", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            rc = 0
        if rc > 5:
            context.signals.append(DiagnosisSignal(
                source="k8s_state",
                signal_type="high_restart_count",
                severity=DiagnosisSeverity.MEDIUM,
                message=f"High restart count: {rc}",
                evidence={"restart_count": rc},
            ))

    @staticmethod
    def _coerce_float(value: Any) -> float:
        """Convert *value* to float, treating missing, falsy or unconvertible values as 0.0."""
        try:
            return float(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0.0

    def _classify_metrics_dict_signals(
        self,
        context: DiagnosisContext,
        metrics_data: dict,
    ) -> None:
        """
        2026-04-27 P3.1-T2-PathA by Claude — extract signals from the PDI metrics_snapshot dict.

        Reads the ``error_rate`` and ``p99_latency_ms`` keys and applies the
        same thresholds as the GoldMetrics analysis.
        """
        error_rate = self._coerce_float(metrics_data.get("error_rate", 0))
        p99_ms = self._coerce_float(metrics_data.get("p99_latency_ms", 0))

        if error_rate > 5:
            context.signals.append(DiagnosisSignal(
                source="metrics_snapshot",
                signal_type="high_error_rate",
                severity=DiagnosisSeverity.CRITICAL if error_rate > 20 else DiagnosisSeverity.HIGH,
                message=f"High error rate: {error_rate:.2f}%",
                evidence={"error_rate": error_rate},
            ))

        if p99_ms > 5000:
            context.signals.append(DiagnosisSignal(
                source="metrics_snapshot",
                signal_type="high_latency",
                severity=DiagnosisSeverity.HIGH if p99_ms >= 10000 else DiagnosisSeverity.MEDIUM,
                message=f"High P99 latency: {p99_ms:.0f}ms",
                evidence={"p99_latency_ms": p99_ms},
            ))

    def _classify_log_string_signals(
        self,
        context: DiagnosisContext,
        logs_data: str,
    ) -> None:
        """
        2026-04-27 P3.1-T2-PathA by Claude — extract signals from the PDI recent_logs string.

        Counts lines containing any error keyword (case-insensitive) and emits
        a frequent_errors signal when there are more than 10.
        """
        error_lines = []
        for line in logs_data.splitlines():
            # Upper-case once per line rather than once per keyword.
            upper_line = line.upper()
            if any(kw in upper_line for kw in self._ERROR_KEYWORDS):
                error_lines.append(line)

        if len(error_lines) > 10:
            context.signals.append(DiagnosisSignal(
                source="recent_logs",
                signal_type="frequent_errors",
                severity=DiagnosisSeverity.HIGH if len(error_lines) >= 50 else DiagnosisSeverity.MEDIUM,
                message=f"Frequent error lines in logs: {len(error_lines)}",
                evidence={"error_line_count": len(error_lines), "sample": error_lines[:3]},
            ))

    # =========================================================================
    # Utilities
    # =========================================================================

    def _pod_to_service_name(self, pod_name: str) -> str:
        """
        Infer the Service name from a Pod name.

        Examples:
        - awoooi-api-7f9d8b6c5d-x2k4j -> awoooi-api
        - awoooi-web-5c8d7e6f4a-h3m9n -> awoooi-web

        The two trailing segments are removed only when they look like the
        suffix a Deployment generates. Any other name (a partial name, a name
        with an ordinal suffix, ...) is returned unchanged rather than being
        truncated blindly. This is a heuristic: a workload whose own name
        happens to end in two such segments is still shortened.
        """
        matched = self._DEPLOYMENT_POD_RE.fullmatch(pod_name)
        if matched is None:
            return pod_name
        return matched.group("workload")
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
_aggregator: DiagnosisAggregator | None = None


def get_diagnosis_aggregator() -> DiagnosisAggregator:
    """Return the module-level DiagnosisAggregator, constructing it on first call."""
    global _aggregator
    if _aggregator is not None:
        return _aggregator
    _aggregator = DiagnosisAggregator()
    return _aggregator
|