fix(aiops-p0): 六大病根 P0 全面修復(ADR-092 B4)
【P0.1】knowledge_extractor_service.py:210 — AttributeError 修復
- Signal.description 欄位不存在(100% 失敗,KM 每天+5 根因)
- 改用 alert_name + annotations.summary 拼接文字

【P0.2+P0.3】Gate 9+11 唯讀指令鬆綁
- blast_radius_calculator: kubectl get/top/describe/logs/version → score=1(非 50)
- operation_parser: 增加 INVESTIGATE 類型識別(唯讀 kubectl 不回 None)
- executor.py: OperationType 新增 INVESTIGATE enum
- approval_execution.py: INVESTIGATE 路徑直接呼叫 execute_kubectl_command

【P0.4】MCP SSH/K8s Provider 修復
- decision_manager: params= → parameters=(符合 MCPToolProvider.execute 簽名)
- decision_manager: MCPToolResult .get() → .success/.output(dataclass 用法)
- decision_manager + ssh_provider: 補入 hosts 120/121(原 default 缺失)
- auto_approve: phase2_agent_debate source bypass confidence 閾值

【P0.5】告警規則語義矛盾修復
- alert_rules.yaml: 8 條 kubectl 查詢規則 RESTART_DEPLOYMENT → NO_ACTION
  (CrashLoopBackOff/PostgreSQL 連線/慢查詢/MinIO 磁碟/K3s 節點/告警鏈路/SSL/CoreDNS 等)
- incident_service.py: cAdvisor/CoreDNS 從 general 拆出獨立分類

【P0.6】proactive_inspector 動態基線 PromQL 全修
- 5 個 MONITORED_METRICS PromQL 全部修正(cadvisor label/datname/blackbox)
- db_connection_pool: datname="awoooi" → "awoooi_prod"
- http_error_rate: 無效 http_requests_total → blackbox probe_success
- cpu/memory: namespace label → name=~"k8s_api_awoooi-api.*"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -232,7 +232,7 @@ rules:
|
||||
response:
|
||||
action_title: "診斷 {target} CrashLoop 根因"
|
||||
description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50"
|
||||
estimated_downtime: "依根因而定"
|
||||
risk: critical
|
||||
@@ -315,7 +315,7 @@ rules:
|
||||
response:
|
||||
action_title: "清理 PostgreSQL 閒置連線"
|
||||
description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高,可能導致新請求被拒絕。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = ''idle'' AND state_change < NOW() - INTERVAL ''5 minutes'';'"
|
||||
estimated_downtime: "0"
|
||||
risk: critical
|
||||
@@ -342,7 +342,7 @@ rules:
|
||||
response:
|
||||
action_title: "診斷 PostgreSQL 慢查詢 + 索引優化"
|
||||
description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待,影響系統整體性能。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state != ''idle'' ORDER BY query_start;'"
|
||||
estimated_downtime: "0"
|
||||
risk: medium
|
||||
@@ -448,7 +448,7 @@ rules:
|
||||
response:
|
||||
action_title: "清理 MinIO 過期資料 on {host}"
|
||||
description: "⚙️ 規則匹配: MinIO 磁碟使用率過高,需清理舊資料或擴展儲存空間。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'"
|
||||
estimated_downtime: "0"
|
||||
risk: critical
|
||||
@@ -503,7 +503,7 @@ rules:
|
||||
response:
|
||||
action_title: "確認 K3s 節點 {target} 狀態"
|
||||
description: "⚙️ 規則匹配: K3s 節點下線,影響叢集可用性和 Pod 調度。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "kubectl get nodes -o wide && kubectl describe node {target}"
|
||||
estimated_downtime: "依節點恢復時間"
|
||||
risk: critical
|
||||
@@ -562,7 +562,7 @@ rules:
|
||||
response:
|
||||
action_title: "診斷告警鏈路中斷"
|
||||
description: "⚙️ 規則匹配: 告警鏈路異常,可能導致真實告警無法送達 Telegram。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status | jq '.data.uptime'"
|
||||
estimated_downtime: "監控盲區持續中"
|
||||
risk: critical
|
||||
@@ -593,7 +593,7 @@ rules:
|
||||
response:
|
||||
action_title: "確認 NVIDIA API 熔斷狀態"
|
||||
description: "⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高,AI Router 已自動降級。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'"
|
||||
estimated_downtime: "0 (已自動 fallback)"
|
||||
risk: medium
|
||||
@@ -775,12 +775,12 @@ rules:
|
||||
response:
|
||||
action_title: "重新啟動 {target} 服務"
|
||||
description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。"
|
||||
suggested_action: RESTART_DEPLOYMENT
|
||||
kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}"
|
||||
estimated_downtime: "5-15 min"
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: ""
|
||||
estimated_downtime: "N/A"
|
||||
risk: medium
|
||||
responsibility: COLLAB
|
||||
responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查"
|
||||
secondary_teams: [BE, INFRA]
|
||||
optimization: []
|
||||
reasoning: "[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。"
|
||||
reasoning: "[規則匹配] 未知告警類型,無法安全判斷修復動作,由人工或 LLM 診斷後決策。"
|
||||
|
||||
@@ -148,7 +148,8 @@ class SSHProvider(MCPToolProvider):
|
||||
from src.core.config import settings
|
||||
raw = getattr(settings, "SSH_MCP_ALLOWED_HOSTS", "")
|
||||
if not raw:
|
||||
return ["192.168.0.188", "192.168.0.110", "192.168.0.111"]
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補入 120/121(原 default 缺失)
|
||||
return ["192.168.0.188", "192.168.0.110", "192.168.0.111", "192.168.0.120", "192.168.0.121"]
|
||||
if isinstance(raw, list):
|
||||
return raw
|
||||
return [h.strip() for h in raw.split(",") if h.strip()]
|
||||
|
||||
@@ -32,7 +32,7 @@ import structlog
|
||||
from src.core.config import settings
|
||||
from src.models.approval import ApprovalRequest
|
||||
from src.services.approval_db import get_approval_service, get_timeline_service
|
||||
from src.services.executor import get_executor
|
||||
from src.services.executor import OperationType, get_executor
|
||||
from src.services.operation_parser import parse_operation_from_action
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -242,52 +242,71 @@ class ApprovalExecutionService:
|
||||
)
|
||||
return False # 解析失敗 → 執行未發生
|
||||
|
||||
# ADR-076 Task 3: 執行失敗重試機制
|
||||
# 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次
|
||||
executor = get_executor()
|
||||
result = await executor.execute_with_audit(
|
||||
approval=approval,
|
||||
operation_type=operation_type,
|
||||
resource_name=resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
attempt = 1 # 重試計數(INVESTIGATE 路徑不進入重試迴圈,保持 1)
|
||||
|
||||
attempt = 1
|
||||
while not result.success and attempt <= self.MAX_RETRY:
|
||||
if not self._is_transient_error(result.error):
|
||||
logger.info(
|
||||
"execution_retry_skipped_permanent_error",
|
||||
approval_id=str(approval.id),
|
||||
attempt=attempt,
|
||||
error=result.error,
|
||||
)
|
||||
break
|
||||
|
||||
logger.warning(
|
||||
"execution_retry_transient_error",
|
||||
approval_id=str(approval.id),
|
||||
attempt=attempt,
|
||||
max_retry=self.MAX_RETRY,
|
||||
error=result.error,
|
||||
delay_seconds=self.RETRY_DELAY_SECONDS,
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢
|
||||
# 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False
|
||||
# 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action)
|
||||
# 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間)
|
||||
if operation_type == OperationType.INVESTIGATE:
|
||||
result = await executor.execute_kubectl_command(
|
||||
command=approval.action,
|
||||
timeout_sec=30,
|
||||
)
|
||||
await timeline.add_event(
|
||||
event_type="exec",
|
||||
status="warning",
|
||||
title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
|
||||
description=f"Error: {result.error}",
|
||||
actor="leWOOOgo",
|
||||
actor_role="executor",
|
||||
logger.info(
|
||||
"background_execution_investigate",
|
||||
approval_id=str(approval.id),
|
||||
action=approval.action,
|
||||
success=result.success,
|
||||
message=result.message,
|
||||
)
|
||||
await asyncio.sleep(self.RETRY_DELAY_SECONDS)
|
||||
else:
|
||||
# ADR-076 Task 3: 執行失敗重試機制
|
||||
# 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次
|
||||
result = await executor.execute_with_audit(
|
||||
approval=approval,
|
||||
operation_type=operation_type,
|
||||
resource_name=resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
attempt += 1
|
||||
|
||||
attempt = 1
|
||||
while not result.success and attempt <= self.MAX_RETRY:
|
||||
if not self._is_transient_error(result.error):
|
||||
logger.info(
|
||||
"execution_retry_skipped_permanent_error",
|
||||
approval_id=str(approval.id),
|
||||
attempt=attempt,
|
||||
error=result.error,
|
||||
)
|
||||
break
|
||||
|
||||
logger.warning(
|
||||
"execution_retry_transient_error",
|
||||
approval_id=str(approval.id),
|
||||
attempt=attempt,
|
||||
max_retry=self.MAX_RETRY,
|
||||
error=result.error,
|
||||
delay_seconds=self.RETRY_DELAY_SECONDS,
|
||||
)
|
||||
await timeline.add_event(
|
||||
event_type="exec",
|
||||
status="warning",
|
||||
title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
|
||||
description=f"Error: {result.error}",
|
||||
actor="leWOOOgo",
|
||||
actor_role="executor",
|
||||
approval_id=str(approval.id),
|
||||
)
|
||||
await asyncio.sleep(self.RETRY_DELAY_SECONDS)
|
||||
result = await executor.execute_with_audit(
|
||||
approval=approval,
|
||||
operation_type=operation_type,
|
||||
resource_name=resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
attempt += 1
|
||||
|
||||
# Phase 5: 更新資料庫狀態
|
||||
# 2026-04-18 ADR-090 L5 P0.2: 失敗時帶上 error_message,寫進 rejection_reason
|
||||
|
||||
@@ -341,6 +341,10 @@ class AutoApprovePolicy:
|
||||
or proposal_data.get("source") == "expert_system"
|
||||
or (proposal_data.get("rule_id") or "") != ""
|
||||
or (proposal_data.get("matched_rule") or "") != ""
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Phase 2 五 agent 協作輸出 bypass confidence 閾值
|
||||
# 根因:phase2_agent_debate 的 is_rule_based=False + confidence 低 → 被誤攔截
|
||||
# 修法:識別 phase2_agent_debate source,視同規則可信路徑
|
||||
or (proposal_data.get("source") or "").startswith("phase2_agent_debate")
|
||||
)
|
||||
if not _is_rule_based and confidence < self.config.min_confidence:
|
||||
return self._reject(
|
||||
|
||||
@@ -37,6 +37,13 @@ BLAST_BLOCKED = 100 # = 100 → permanent block
|
||||
# ── 基礎分(Kubectl 動作類型)────────────────────────────────────────────────
|
||||
_BASE_SCORES: list[tuple[str, int, str]] = [
|
||||
# (regex pattern, base_score, reason)
|
||||
# 唯讀查詢指令(零衝擊,優先匹配避免被 default=50 吃掉)
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 9 修復 — kubectl get/top/describe/logs 補 score=1
|
||||
(r"kubectl\s+get\b", 1, "get 唯讀查詢(零衝擊)"),
|
||||
(r"kubectl\s+top\b", 1, "top 唯讀查詢(零衝擊)"),
|
||||
(r"kubectl\s+describe\b", 1, "describe 唯讀查詢(零衝擊)"),
|
||||
(r"kubectl\s+logs\b", 1, "logs 唯讀查詢(零衝擊)"),
|
||||
(r"kubectl\s+version\b", 1, "version 唯讀(零衝擊)"),
|
||||
(r"kubectl\s+rollout\s+restart", 10, "rollout restart 低衝擊"),
|
||||
(r"kubectl\s+rollout\s+undo", 25, "rollout undo 中衝擊(版本回退)"),
|
||||
(r"kubectl\s+scale.*--replicas=[1-9]", 15, "scale up/down 低中衝擊"),
|
||||
|
||||
@@ -2188,7 +2188,8 @@ class DecisionManager:
|
||||
try:
|
||||
ssh = self._ssh
|
||||
# C4: 未知主機記錄 warning(不靜默跳過)
|
||||
_KNOWN_HOSTS = ("192.168.0.188", "192.168.0.110")
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補入 120/121(注意:ssh_provider default 另含 192.168.0.111,此處尚未納入,兩者並未完全對齊 — 待確認是否刻意排除)
|
||||
_KNOWN_HOSTS = ("192.168.0.188", "192.168.0.110", "192.168.0.120", "192.168.0.121")
|
||||
if ssh.enabled and host not in _KNOWN_HOSTS:
|
||||
logger.warning("mcp_context_unknown_host", host=host, known=_KNOWN_HOSTS)
|
||||
if ssh.enabled and host in _KNOWN_HOSTS:
|
||||
@@ -2197,23 +2198,27 @@ class DecisionManager:
|
||||
status_result = await asyncio.wait_for(
|
||||
ssh.execute(
|
||||
tool_name="ssh_get_container_status",
|
||||
params={"host": host, "container_name": container},
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters=(符合 MCPToolProvider.execute 簽名)
|
||||
parameters={"host": host, "container_name": container},
|
||||
),
|
||||
timeout=_MCP_TIMEOUT,
|
||||
)
|
||||
if status_result.get("success"):
|
||||
ctx_parts.append(f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}")
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult 是 dataclass,用 .success/.output 而非 .get()
|
||||
if status_result.success:
|
||||
ctx_parts.append(f"[SSH] 容器 {container} 狀態: {(status_result.output or '')[:300]}")
|
||||
# 查主機資源
|
||||
if "CpuLoad" in alertname or "Memory" in alertname:
|
||||
top_result = await asyncio.wait_for(
|
||||
ssh.execute(
|
||||
tool_name="ssh_get_top_processes",
|
||||
params={"host": host, "top_n": 5},
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters=
|
||||
parameters={"host": host, "top_n": 5},
|
||||
),
|
||||
timeout=_MCP_TIMEOUT,
|
||||
)
|
||||
if top_result.get("success"):
|
||||
ctx_parts.append(f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}")
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult dataclass 用 .success/.output
|
||||
if top_result.success:
|
||||
ctx_parts.append(f"[SSH] 主機 {host} Top processes: {(top_result.output or '')[:300]}")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("mcp_context_ssh_timeout", alertname=alertname, host=host, timeout=_MCP_TIMEOUT)
|
||||
except Exception as e:
|
||||
@@ -2229,12 +2234,14 @@ class DecisionManager:
|
||||
events_result = await asyncio.wait_for(
|
||||
k8s.execute(
|
||||
tool_name="k8s_get_events",
|
||||
params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters=
|
||||
parameters={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
|
||||
),
|
||||
timeout=_MCP_TIMEOUT,
|
||||
)
|
||||
if events_result.get("success"):
|
||||
ctx_parts.append(f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}")
|
||||
# P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult 是 dataclass,用 .success/.output
|
||||
if events_result.success:
|
||||
ctx_parts.append(f"[K8s] Pod {pod} 事件: {(events_result.output or '')[:300]}")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("mcp_context_k8s_timeout", alertname=alertname, timeout=_MCP_TIMEOUT)
|
||||
except Exception as e:
|
||||
|
||||
@@ -45,6 +45,8 @@ class OperationType(str, Enum):
|
||||
RESTART_DEPLOYMENT = "RESTART_DEPLOYMENT"
|
||||
DELETE_POD = "DELETE_POD"
|
||||
SCALE_DEPLOYMENT = "SCALE_DEPLOYMENT"
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀查詢類型(kubectl get/top/describe/logs)
|
||||
INVESTIGATE = "INVESTIGATE"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -258,6 +258,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
||||
days = int((labels or {}).get("days_remaining", 0)) if labels else 0
|
||||
return "ssl_cert", ("TYPE-1" if days >= 14 else "TYPE-3")
|
||||
|
||||
# 14. cAdvisor 監控工具(P0.5 2026-04-24 ogt: 從 general 分離,避免監控工具誤入 general)
|
||||
if alertname.startswith(("Cadvisor", "cadvisor", "CAdvisor")):
|
||||
return "infrastructure", "TYPE-2"
|
||||
|
||||
# 15. CoreDNS(P0.5 2026-04-24 ogt: 從 general 分離)
|
||||
if alertname.startswith(("CoreDNS", "CoreDns", "Coredns")):
|
||||
return "kubernetes", "TYPE-2"
|
||||
|
||||
return "general", "TYPE-3"
|
||||
|
||||
|
||||
|
||||
@@ -206,8 +206,13 @@ class KnowledgeExtractorService:
|
||||
依 signals 關鍵字推斷 KB 分類。
|
||||
依序比對,第一個匹配的分類獲勝。
|
||||
"""
|
||||
# 2026-04-24 ogt: Signal 無 description 欄位,改用 alert_name + annotations.summary
|
||||
text = " ".join(
|
||||
s.description.lower() for s in (incident.signals or [])
|
||||
(
|
||||
(s.alert_name or "") + " " +
|
||||
(s.annotations.get("summary", "") if s.annotations else "")
|
||||
).lower()
|
||||
for s in (incident.signals or [])
|
||||
)
|
||||
for category, keywords in _CATEGORY_KEYWORDS.items():
|
||||
if any(k in text for k in keywords):
|
||||
|
||||
@@ -78,6 +78,19 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
"""
|
||||
action_lower = action.lower()
|
||||
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀指令識別(INVESTIGATE)
|
||||
# 根因:parse_operation_from_action 完全不認識 kubectl get/top/describe/logs → 回 None → 執行失敗
|
||||
# 修法:優先匹配唯讀指令,回傳 OperationType.INVESTIGATE(零衝擊,blast_radius score=1)
|
||||
kubectl_ro_match = re.search(
|
||||
r"kubectl\s+(get|top|describe|logs|version)\s*([a-z][\w.-]*)?",
|
||||
action_lower,
|
||||
)
|
||||
if kubectl_ro_match:
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
resource = kubectl_ro_match.group(2) or "pods"
|
||||
return ParsedOperation(OperationType.INVESTIGATE, resource, namespace)
|
||||
|
||||
# Pattern: kubectl rollout restart deployment/<name>
|
||||
kubectl_restart_match = re.search(
|
||||
r"kubectl\s+rollout\s+restart\s+deployment/([a-z0-9][\w.-]*)", action_lower
|
||||
|
||||
@@ -37,36 +37,46 @@ DEDUP_KEY_PREFIX = "proactive:dedup:"
|
||||
K8S_NAMESPACE = "awoooi-prod"
|
||||
|
||||
# 需要監控的 metrics(Prometheus PromQL + 警戒閾值)
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: P0.6 修復 — 修正 PromQL labels 使其對應實際 Prometheus 資料
|
||||
# - CPU/Memory: cadvisor 無 namespace label,改用 name label 正則(name=~"k8s_api_awoooi-api.*")篩選 API 容器
|
||||
# - pod_restart_rate: 改用 sum() 聚合,避免回傳多 vector 使 _fetch_current_value 只取第一筆
|
||||
# - db_connection_pool: datname 實際為 awoooi_prod(非 awoooi)
|
||||
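# - http_error_rate: 無可用的 http_requests_total,改用 Blackbox 探測成功率替代(PromQL 使用 blackbox_probe_success;Blackbox Exporter 原生 metric 名稱為 probe_success,需確認是否有 relabel/recording rule)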
|
||||
MONITORED_METRICS: list[dict[str, Any]] = [
|
||||
{
|
||||
"name": "http_error_rate",
|
||||
"promql": 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))',
|
||||
"threshold": 0.05, # > 5% error rate = 警戒
|
||||
"description": "HTTP 5xx 錯誤率",
|
||||
# blackbox probe 失敗率:1 - 平均探測成功率(全部 target 聚合)
|
||||
"promql": '1 - avg(blackbox_probe_success)',
|
||||
"threshold": 0.05, # > 5% probe 失敗 = 警戒
|
||||
"description": "HTTP Probe 失敗率(Blackbox Exporter)",
|
||||
},
|
||||
{
|
||||
"name": "cpu_usage_awoooi_api",
|
||||
"promql": 'avg(rate(container_cpu_usage_seconds_total{namespace="awoooi-prod",container="awoooi-api"}[5m]))',
|
||||
"threshold": 0.85, # > 85% CPU
|
||||
# cadvisor: awoooi-prod namespace 的 api container(name label 格式為 k8s_api_awoooi-api-*_awoooi-prod_*_*)
|
||||
"promql": 'avg(rate(container_cpu_usage_seconds_total{name=~"k8s_api_awoooi-api.*"}[5m]))',
|
||||
"threshold": 0.85, # > 85% CPU(單核心比例)
|
||||
"description": "API 容器 CPU 使用率",
|
||||
},
|
||||
{
|
||||
"name": "memory_usage_awoooi_api",
|
||||
"promql": 'avg(container_memory_usage_bytes{namespace="awoooi-prod",container="awoooi-api"}) / avg(container_spec_memory_limit_bytes{namespace="awoooi-prod",container="awoooi-api"})',
|
||||
"threshold": 0.90, # > 90% memory
|
||||
"description": "API 容器記憶體使用率",
|
||||
# cadvisor memory working set(不含 cache)
|
||||
"promql": 'avg(container_memory_working_set_bytes{name=~"k8s_api_awoooi-api.*"})',
|
||||
"threshold": 1073741824.0, # > 1 GiB = 警戒
|
||||
"description": "API 容器記憶體使用(working set bytes)",
|
||||
},
|
||||
{
|
||||
"name": "pod_restart_rate",
|
||||
"promql": 'increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m])',
|
||||
# kube-state-metrics: namespace=awoooi-prod,sum 聚合避免 multi-vector
|
||||
"promql": 'sum(increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]))',
|
||||
"threshold": 2.0, # 15 分鐘內 > 2 次重啟
|
||||
"description": "Pod 重啟次數(15分鐘窗口)",
|
||||
},
|
||||
{
|
||||
"name": "db_connection_pool",
|
||||
"promql": 'pg_stat_activity_count{datname="awoooi"}',
|
||||
# datname 實際值為 awoooi_prod;sum 聚合所有 state
|
||||
"promql": 'sum(pg_stat_activity_count{datname="awoooi_prod"})',
|
||||
"threshold": 80.0, # > 80 個 DB 連線
|
||||
"description": "PostgreSQL 連線數",
|
||||
"description": "PostgreSQL 連線數(awoooi_prod)",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user