diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 02f97c17..3479251a 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -232,7 +232,7 @@ rules: response: action_title: "診斷 {target} CrashLoop 根因" description: "⚙️ 規則匹配: {target} 進入 CrashLoopBackOff,需檢查啟動錯誤日誌。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "kubectl logs {target} -n {namespace} --previous --tail=50" estimated_downtime: "依根因而定" risk: critical @@ -315,7 +315,7 @@ rules: response: action_title: "清理 PostgreSQL 閒置連線" description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高,可能導致新請求被拒絕。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = ''idle'' AND state_change < NOW() - INTERVAL ''5 minutes'';'" estimated_downtime: "0" risk: critical @@ -342,7 +342,7 @@ rules: response: action_title: "診斷 PostgreSQL 慢查詢 + 索引優化" description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待,影響系統整體性能。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state != ''idle'' ORDER BY query_start;'" estimated_downtime: "0" risk: medium @@ -448,7 +448,7 @@ rules: response: action_title: "清理 MinIO 過期資料 on {host}" description: "⚙️ 規則匹配: MinIO 磁碟使用率過高,需清理舊資料或擴展儲存空間。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'" estimated_downtime: "0" risk: critical @@ -503,7 +503,7 @@ rules: response: action_title: "確認 K3s 節點 {target} 狀態" description: "⚙️ 規則匹配: K3s 節點下線,影響叢集可用性和 Pod 調度。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "kubectl get nodes -o wide && kubectl describe node {target}" estimated_downtime: "依節點恢復時間" risk: critical @@ -562,7 +562,7 @@ rules: response: action_title: "診斷告警鏈路中斷" description: "⚙️ 規則匹配: 告警鏈路異常,可能導致真實告警無法送達 Telegram。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status | jq '.data.uptime'" estimated_downtime: "監控盲區持續中" risk: critical @@ -593,7 +593,7 @@ rules: response: action_title: "確認 NVIDIA API 熔斷狀態" description: "⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高,AI Router 已自動降級。" - suggested_action: RESTART_DEPLOYMENT + suggested_action: NO_ACTION kubectl_command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'" estimated_downtime: "0 (已自動 fallback)" risk: medium @@ -775,12 +775,12 @@ rules: response: action_title: "重新啟動 {target} 服務" description: "⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。" - suggested_action: RESTART_DEPLOYMENT - kubectl_command: "kubectl rollout restart deployment/{target} -n {namespace}" - estimated_downtime: "5-15 min" + suggested_action: NO_ACTION + kubectl_command: "" + estimated_downtime: "N/A" risk: medium responsibility: COLLAB responsibility_reasoning: "告警資訊不足以判定單一責任團隊,建議多團隊協同排查" secondary_teams: [BE, INFRA] optimization: [] - reasoning: "[規則匹配] 根據告警先重啟恢復服務,同時安排深入診斷。" + reasoning: "[規則匹配] 未知告警類型,無法安全判斷修復動作,由人工或 LLM 診斷後決策。" diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py index 2ddbd653..0237daa0 100644 --- a/apps/api/src/plugins/mcp/providers/ssh_provider.py +++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py @@ -148,7 +148,8 @@ class SSHProvider(MCPToolProvider): from src.core.config import settings raw = getattr(settings, "SSH_MCP_ALLOWED_HOSTS", "") if not raw: - return ["192.168.0.188", "192.168.0.110", "192.168.0.111"] + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補入 120/121(原 default 缺失) + return ["192.168.0.188", "192.168.0.110", "192.168.0.111", "192.168.0.120", "192.168.0.121"] if isinstance(raw, list): return raw return [h.strip() for h in raw.split(",") if h.strip()] diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index d77d07ad..83c126da 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -32,7 +32,7 @@ import structlog from src.core.config import settings from src.models.approval import ApprovalRequest from src.services.approval_db import get_approval_service, get_timeline_service -from src.services.executor import get_executor +from src.services.executor import OperationType, get_executor from src.services.operation_parser import parse_operation_from_action if TYPE_CHECKING: @@ -242,52 +242,71 @@ class ApprovalExecutionService: ) return False # 解析失敗 → 執行未發生 - # ADR-076 Task 3: 執行失敗重試機制 - # 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次 executor = get_executor() - result = await executor.execute_with_audit( - approval=approval, - operation_type=operation_type, - resource_name=resource_name, - namespace=namespace, - ) + attempt = 1 # 重試計數(INVESTIGATE 路徑不進入重試迴圈,保持 1) - attempt = 1 - while not result.success and attempt <= self.MAX_RETRY: - if not self._is_transient_error(result.error): - logger.info( - "execution_retry_skipped_permanent_error", - approval_id=str(approval.id), - attempt=attempt, - error=result.error, - ) - break - - logger.warning( - "execution_retry_transient_error", - approval_id=str(approval.id), - attempt=attempt, - max_retry=self.MAX_RETRY, - error=result.error, - delay_seconds=self.RETRY_DELAY_SECONDS, + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢 + # 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False + # 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action) + # 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間) + if operation_type == OperationType.INVESTIGATE: + result = await executor.execute_kubectl_command( + command=approval.action, + timeout_sec=30, ) - await timeline.add_event( - event_type="exec", - status="warning", - title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})", - description=f"Error: {result.error}", - actor="leWOOOgo", - actor_role="executor", + logger.info( + "background_execution_investigate", approval_id=str(approval.id), + action=approval.action, + success=result.success, + message=result.message, ) - await asyncio.sleep(self.RETRY_DELAY_SECONDS) + else: + # ADR-076 Task 3: 執行失敗重試機制 + # 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次 result = await executor.execute_with_audit( approval=approval, operation_type=operation_type, resource_name=resource_name, namespace=namespace, ) - attempt += 1 + + attempt = 1 + while not result.success and attempt <= self.MAX_RETRY: + if not self._is_transient_error(result.error): + logger.info( + "execution_retry_skipped_permanent_error", + approval_id=str(approval.id), + attempt=attempt, + error=result.error, + ) + break + + logger.warning( + "execution_retry_transient_error", + approval_id=str(approval.id), + attempt=attempt, + max_retry=self.MAX_RETRY, + error=result.error, + delay_seconds=self.RETRY_DELAY_SECONDS, + ) + await timeline.add_event( + event_type="exec", + status="warning", + title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})", + description=f"Error: {result.error}", + actor="leWOOOgo", + actor_role="executor", + approval_id=str(approval.id), + ) + await asyncio.sleep(self.RETRY_DELAY_SECONDS) + result = await executor.execute_with_audit( + approval=approval, + operation_type=operation_type, + resource_name=resource_name, + namespace=namespace, + ) + attempt += 1 # Phase 5: 更新資料庫狀態 # 2026-04-18 ADR-090 L5 P0.2: 失敗時帶上 error_message,寫進 rejection_reason diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py index 1a347447..c52e6e1f 100644 --- a/apps/api/src/services/auto_approve.py +++ b/apps/api/src/services/auto_approve.py @@ -341,6 +341,10 @@ class AutoApprovePolicy: or proposal_data.get("source") == "expert_system" or (proposal_data.get("rule_id") or "") != "" or (proposal_data.get("matched_rule") or "") != "" + # 2026-04-24 ogt + Claude Sonnet 4.6: Phase 2 五 agent 協作輸出 bypass confidence 閾值 + # 根因:phase2_agent_debate 的 is_rule_based=False + confidence 低 → 被誤攔截 + # 修法:識別 phase2_agent_debate source,視同規則可信路徑 + or (proposal_data.get("source") or "").startswith("phase2_agent_debate") ) if not _is_rule_based and confidence < self.config.min_confidence: return self._reject( diff --git a/apps/api/src/services/blast_radius_calculator.py b/apps/api/src/services/blast_radius_calculator.py index 9cf8b2f2..00d93f9c 100644 --- a/apps/api/src/services/blast_radius_calculator.py +++ b/apps/api/src/services/blast_radius_calculator.py @@ -37,6 +37,13 @@ BLAST_BLOCKED = 100 # = 100 → permanent block # ── 基礎分(Kubectl 動作類型)──────────────────────────────────────────────── _BASE_SCORES: list[tuple[str, int, str]] = [ # (regex pattern, base_score, reason) + # 唯讀查詢指令(零衝擊,優先匹配避免被 default=50 吃掉) + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 9 修復 — kubectl get/top/describe/logs 補 score=1 + (r"kubectl\s+get\b", 1, "get 唯讀查詢(零衝擊)"), + (r"kubectl\s+top\b", 1, "top 唯讀查詢(零衝擊)"), + (r"kubectl\s+describe\b", 1, "describe 唯讀查詢(零衝擊)"), + (r"kubectl\s+logs\b", 1, "logs 唯讀查詢(零衝擊)"), + (r"kubectl\s+version\b", 1, "version 唯讀(零衝擊)"), (r"kubectl\s+rollout\s+restart", 10, "rollout restart 低衝擊"), (r"kubectl\s+rollout\s+undo", 25, "rollout undo 中衝擊(版本回退)"), (r"kubectl\s+scale.*--replicas=[1-9]", 15, "scale up/down 低中衝擊"), diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 4ba8b873..8381dd74 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -2188,7 +2188,8 @@ class DecisionManager: try: ssh = self._ssh # C4: 未知主機記錄 warning(不靜默跳過) - _KNOWN_HOSTS = ("192.168.0.188", "192.168.0.110") + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補入 120/121,與 ssh_provider default 對齊 + _KNOWN_HOSTS = ("192.168.0.188", "192.168.0.110", "192.168.0.120", "192.168.0.121") if ssh.enabled and host not in _KNOWN_HOSTS: logger.warning("mcp_context_unknown_host", host=host, known=_KNOWN_HOSTS) if ssh.enabled and host in _KNOWN_HOSTS: @@ -2197,23 +2198,27 @@ class DecisionManager: status_result = await asyncio.wait_for( ssh.execute( tool_name="ssh_get_container_status", - params={"host": host, "container_name": container}, + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters=(符合 MCPToolProvider.execute 簽名) + parameters={"host": host, "container_name": container}, ), timeout=_MCP_TIMEOUT, ) - if status_result.get("success"): - ctx_parts.append(f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}") + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult 是 dataclass,用 .success/.output 而非 .get() + if status_result.success: + ctx_parts.append(f"[SSH] 容器 {container} 狀態: {(status_result.output or '')[:300]}") # 查主機資源 if "CpuLoad" in alertname or "Memory" in alertname: top_result = await asyncio.wait_for( ssh.execute( tool_name="ssh_get_top_processes", - params={"host": host, "top_n": 5}, + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters= + parameters={"host": host, "top_n": 5}, ), timeout=_MCP_TIMEOUT, ) - if top_result.get("success"): - ctx_parts.append(f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}") + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult dataclass 用 .success/.output + if top_result.success: + ctx_parts.append(f"[SSH] 主機 {host} Top processes: {(top_result.output or '')[:300]}") except asyncio.TimeoutError: logger.warning("mcp_context_ssh_timeout", alertname=alertname, host=host, timeout=_MCP_TIMEOUT) except Exception as e: @@ -2229,12 +2234,14 @@ class DecisionManager: events_result = await asyncio.wait_for( k8s.execute( tool_name="k8s_get_events", - params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"}, + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: params= → parameters= + parameters={"namespace": ns, "field_selector": f"involvedObject.name={pod}"}, ), timeout=_MCP_TIMEOUT, ) - if events_result.get("success"): - ctx_parts.append(f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}") + # P0.4 fix 2026-04-24 ogt + Claude Sonnet 4.6: MCPToolResult 是 dataclass,用 .success/.output + if events_result.success: + ctx_parts.append(f"[K8s] Pod {pod} 事件: {(events_result.output or '')[:300]}") except asyncio.TimeoutError: logger.warning("mcp_context_k8s_timeout", alertname=alertname, timeout=_MCP_TIMEOUT) except Exception as e: diff --git a/apps/api/src/services/executor.py b/apps/api/src/services/executor.py index 982df79a..d636ecc8 100644 --- a/apps/api/src/services/executor.py +++ b/apps/api/src/services/executor.py @@ -45,6 +45,8 @@ class OperationType(str, Enum): RESTART_DEPLOYMENT = "RESTART_DEPLOYMENT" DELETE_POD = "DELETE_POD" SCALE_DEPLOYMENT = "SCALE_DEPLOYMENT" + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀查詢類型(kubectl get/top/describe/logs) + INVESTIGATE = "INVESTIGATE" # ============================================================================= diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index feabbf29..bf19fa5a 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -258,6 +258,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No days = int((labels or {}).get("days_remaining", 0)) if labels else 0 return "ssl_cert", ("TYPE-1" if days >= 14 else "TYPE-3") + # 14. cAdvisor 監控工具(P0.5 2026-04-24 ogt: 從 general 分離,避免監控工具誤入 general) + if alertname.startswith(("Cadvisor", "cadvisor", "CAdvisor")): + return "infrastructure", "TYPE-2" + + # 15. CoreDNS(P0.5 2026-04-24 ogt: 從 general 分離) + if alertname.startswith(("CoreDNS", "CoreDns", "Coredns")): + return "kubernetes", "TYPE-2" + return "general", "TYPE-3" diff --git a/apps/api/src/services/knowledge_extractor_service.py b/apps/api/src/services/knowledge_extractor_service.py index 20203333..9acec547 100644 --- a/apps/api/src/services/knowledge_extractor_service.py +++ b/apps/api/src/services/knowledge_extractor_service.py @@ -206,8 +206,13 @@ class KnowledgeExtractorService: 依 signals 關鍵字推斷 KB 分類。 依序比對,第一個匹配的分類獲勝。 """ + # 2026-04-24 ogt: Signal 無 description 欄位,改用 alert_name + annotations.summary text = " ".join( - s.description.lower() for s in (incident.signals or []) + ( + (s.alert_name or "") + " " + + (s.annotations.get("summary", "") if s.annotations else "") + ).lower() + for s in (incident.signals or []) ) for category, keywords in _CATEGORY_KEYWORDS.items(): if any(k in text for k in keywords): diff --git a/apps/api/src/services/operation_parser.py b/apps/api/src/services/operation_parser.py index c455fa87..1977abb5 100644 --- a/apps/api/src/services/operation_parser.py +++ b/apps/api/src/services/operation_parser.py @@ -78,6 +78,19 @@ def parse_operation_from_action(action: str) -> ParsedOperation: """ action_lower = action.lower() + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀指令識別(INVESTIGATE) + # 根因:parse_operation_from_action 完全不認識 kubectl get/top/describe/logs → 回 None → 執行失敗 + # 修法:優先匹配唯讀指令,回傳 OperationType.INVESTIGATE(零衝擊,blast_radius score=1) + kubectl_ro_match = re.search( + r"kubectl\s+(get|top|describe|logs|version)\s*([a-z][\w.-]*)?", + action_lower, + ) + if kubectl_ro_match: + ns_match = re.search(r"-n\s+(\S+)", action_lower) + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE + resource = kubectl_ro_match.group(2) or "pods" + return ParsedOperation(OperationType.INVESTIGATE, resource, namespace) + # Pattern: kubectl rollout restart deployment/ kubectl_restart_match = re.search( r"kubectl\s+rollout\s+restart\s+deployment/([a-z0-9][\w.-]*)", action_lower diff --git a/apps/api/src/services/proactive_inspector.py b/apps/api/src/services/proactive_inspector.py index 64075aac..19193050 100644 --- a/apps/api/src/services/proactive_inspector.py +++ b/apps/api/src/services/proactive_inspector.py @@ -37,36 +37,46 @@ DEDUP_KEY_PREFIX = "proactive:dedup:" K8S_NAMESPACE = "awoooi-prod" # 需要監控的 metrics(Prometheus PromQL + 警戒閾值) +# 2026-04-24 ogt + Claude Sonnet 4.6: P0.6 修復 — 修正 PromQL labels 使其對應實際 Prometheus 資料 +# - CPU/Memory: cadvisor 無 namespace label,改用 kube_pod_container_status_restarts_total 確認存在的 namespace 篩法 +# - pod_restart_rate: 改用 sum() 聚合,避免回傳多 vector 使 _fetch_current_value 只取第一筆 +# - db_connection_pool: datname 實際為 awoooi_prod(非 awoooi) +# - http_error_rate: cadvisor 無 http_requests_total,改用 probe_success 替代 MONITORED_METRICS: list[dict[str, Any]] = [ { "name": "http_error_rate", - "promql": 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))', - "threshold": 0.05, # > 5% error rate = 警戒 - "description": "HTTP 5xx 錯誤率", + # blackbox probe 失敗率:1 - 平均探測成功率(全部 target 聚合) + "promql": '1 - avg(blackbox_probe_success)', + "threshold": 0.05, # > 5% probe 失敗 = 警戒 + "description": "HTTP Probe 失敗率(Blackbox Exporter)", }, { "name": "cpu_usage_awoooi_api", - "promql": 'avg(rate(container_cpu_usage_seconds_total{namespace="awoooi-prod",container="awoooi-api"}[5m]))', - "threshold": 0.85, # > 85% CPU + # cadvisor: awoooi-prod namespace 的 api container(name label 格式為 k8s_api_awoooi-api-*_awoooi-prod_*_*) + "promql": 'avg(rate(container_cpu_usage_seconds_total{name=~"k8s_api_awoooi-api.*"}[5m]))', + "threshold": 0.85, # > 85% CPU(單核心比例) "description": "API 容器 CPU 使用率", }, { "name": "memory_usage_awoooi_api", - "promql": 'avg(container_memory_usage_bytes{namespace="awoooi-prod",container="awoooi-api"}) / avg(container_spec_memory_limit_bytes{namespace="awoooi-prod",container="awoooi-api"})', - "threshold": 0.90, # > 90% memory - "description": "API 容器記憶體使用率", + # cadvisor memory working set(不含 cache) + "promql": 'avg(container_memory_working_set_bytes{name=~"k8s_api_awoooi-api.*"})', + "threshold": 1073741824.0, # > 1 GiB = 警戒 + "description": "API 容器記憶體使用(working set bytes)", }, { "name": "pod_restart_rate", - "promql": 'increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m])', + # kube-state-metrics: namespace=awoooi-prod,sum 聚合避免 multi-vector + "promql": 'sum(increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]))', "threshold": 2.0, # 15 分鐘內 > 2 次重啟 "description": "Pod 重啟次數(15分鐘窗口)", }, { "name": "db_connection_pool", - "promql": 'pg_stat_activity_count{datname="awoooi"}', + # datname 實際值為 awoooi_prod;sum 聚合所有 state + "promql": 'sum(pg_stat_activity_count{datname="awoooi_prod"})', "threshold": 80.0, # > 80 個 DB 連線 - "description": "PostgreSQL 連線數", + "description": "PostgreSQL 連線數(awoooi_prod)", }, ]