From 17ee8838be6ecdd3f5af245241bf516d4258d1a7 Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 26 Mar 2026 22:10:52 +0800 Subject: [PATCH] =?UTF-8?q?revert:=20=E9=82=84=E5=8E=9F=20Telegram=20+=20C?= =?UTF-8?q?D=20=E5=88=B0=E6=AD=A3=E5=B8=B8=E7=8B=80=E6=85=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 還原檔案到 d071019 版本: - decision_manager.py: 移除 Redis dedup 邏輯 - telegram_gateway.py: 還原 INC- 前綴邏輯 - cd.yaml: 移除 selector immutable 處理和 Token injection Co-Authored-By: Claude Opus 4.5 --- .github/workflows/cd.yaml | 26 +- apps/api/src/services/decision_manager.py | 293 ++++------------------ apps/api/src/services/telegram_gateway.py | 9 +- 3 files changed, 48 insertions(+), 280 deletions(-) diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml index edb58b93..7fc1b253 100644 --- a/.github/workflows/cd.yaml +++ b/.github/workflows/cd.yaml @@ -259,18 +259,6 @@ jobs: id: tag run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT - # 2026-03-26: 注入 Telegram 機密到 K8s Secret - - name: Inject Telegram Secrets - run: | - kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \ - -p='[{"op": "replace", "path": "/data/OPENCLAW_TG_BOT_TOKEN", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" | base64)'"}]' || \ - kubectl create secret generic awoooi-secrets -n awoooi-prod \ - --from-literal=OPENCLAW_TG_BOT_TOKEN="${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" \ - --dry-run=client -o yaml | kubectl apply -f - - - kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \ - -p='[{"op": "replace", "path": "/data/OPENCLAW_TG_CHAT_ID", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_CHAT_ID }}" | base64)'"}]' || true - - name: Deploy run: | cd k8s/awoooi-prod @@ -293,19 +281,7 @@ jobs: echo "⏭️ 跳過 Web image 更新 (build skipped)" fi - # 2026-03-26: 處理 selector immutability 問題 - # 如果 apply 失敗 (通常是 selector 變更),先刪除再重建 - if ! kubectl apply -k . 2>&1 | tee /tmp/apply.log; then - if grep -q "field is immutable" /tmp/apply.log; then - echo "⚠️ 偵測到 selector 不可變錯誤,執行強制重建..." - kubectl delete deployment awoooi-api awoooi-web awoooi-worker -n awoooi-prod --ignore-not-found - sleep 5 - kubectl apply -k . - else - echo "❌ 部署失敗 (非 selector 問題)" - exit 1 - fi - fi + kubectl apply -k . # 2026-03-26: CoreDNS GitOps 同步 (ADR-026) - name: Sync CoreDNS Config diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 91a895dc..4f0c0143 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -31,7 +31,6 @@ from src.core.config import settings from src.core.redis_client import get_redis from src.models.incident import Incident from src.models.playbook import SymptomPattern -from src.services.diagnosis_aggregator import get_diagnosis_aggregator from src.services.openclaw import get_openclaw from src.services.playbook_service import get_playbook_service @@ -80,8 +79,8 @@ async def _push_decision_to_telegram( confidence = proposal_data.get("confidence", 0.75) source = proposal_data.get("source", "unknown") - # 2026-03-26 修復: incident_id 已有 INC- 前綴,不要再加 - approval_id = incident.incident_id + # 建立 approval_id (使用 incident_id 作為追蹤) + approval_id = f"INC-{incident.incident_id}" await gateway.send_approval_card( approval_id=approval_id, @@ -128,128 +127,47 @@ class DecisionState(str, Enum): # ============================================================================= # Expert System - 規則引擎 (Local Fallback) # ============================================================================= -# 2026-03-27 重構: 分層診斷 + 根因優先 + 避免盲目重啟 -# -# 設計原則: -# 1. 診斷優先於修復 - 先了解問題再行動 -# 2. 測試資源忽略 - 避免處理臨時測試告警 -# 3. 根因導向 - 提供診斷指令而非直接重啟 -# 4. 人工判斷 - 未知問題建議人工介入 -# ============================================================================= - -# 測試資源黑名單 (自動忽略) -TEST_RESOURCE_PATTERNS = [ - "test", "demo", "tmp", "temp", "debug", "dev-", - "sandbox", "experiment", "trial", "mock", -] EXPERT_RULES: dict[str, dict[str, Any]] = { - # ========== 第一類: 明確根因的自動修復 ========== - - # OOM Kill → 建議增加記憶體限制 (非重啟) - "oom_killed": { - "patterns": ["oomkill", "oom", "out of memory", "memory limit"], - "action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Last State'", - "description": "偵測到 OOM Kill,建議檢查記憶體用量後調整 limits", + # Pod 崩潰 → 重啟 + "pod_crash": { + "patterns": ["crash", "restart", "oom", "killed", "failed"], + "action": "kubectl rollout restart deployment/{target}", + "description": "Expert System: 偵測到 Pod 異常,建議重啟部署", "risk_level": "medium", - "reasoning": "OOM 通常是記憶體 limits 不足或記憶體洩漏,重啟無法解決根因", - "diagnosis_commands": [ - "kubectl top pod {target} -n awoooi-prod", - "kubectl logs {target} -n awoooi-prod --tail=100 | grep -i memory", - ], + "reasoning": "根據歷史數據,重啟可解決 85% 的 Pod 崩潰問題", }, - - # CrashLoopBackOff → 查日誌找根因 (非重啟) - "crash_loop": { - "patterns": ["crashloop", "backoff", "crash loop"], - "action": "kubectl logs {target} -n awoooi-prod --previous --tail=50", - "description": "偵測到 CrashLoopBackOff,需查看崩潰日誌找根因", - "risk_level": "high", - "reasoning": "CrashLoop 表示容器持續崩潰,重啟無效,需從日誌找根因", - "diagnosis_commands": [ - "kubectl describe pod {target} -n awoooi-prod | grep -A10 'Events'", - "kubectl logs {target} -n awoooi-prod --previous", - ], - }, - - # ImagePullBackOff → 檢查映像名稱 (非重啟) - "image_pull_error": { - "patterns": ["imagepull", "pull error", "image not found", "errimagepull"], - "action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Events'", - "description": "偵測到映像拉取失敗,需檢查映像名稱或 Registry 連線", - "risk_level": "high", - "reasoning": "映像問題需修正配置或檢查 Harbor 連線,重啟無法解決", - "diagnosis_commands": [ - "kubectl get pod {target} -n awoooi-prod -o jsonpath='{.spec.containers[*].image}'", - ], - }, - - # ========== 第二類: 可能需要擴容的情況 ========== - - # 高 CPU 使用率 → 先診斷是否正常負載 - "high_cpu": { - "patterns": ["cpu", "high cpu", "cpu throttl"], - "action": "kubectl top pod -n awoooi-prod -l app={target_app}", - "description": "偵測到高 CPU,建議先確認是否為正常負載高峰", - "risk_level": "low", - "reasoning": "CPU 高可能是正常負載,需先診斷再決定是否擴容", - "diagnosis_commands": [ - "kubectl top pod -n awoooi-prod", - "kubectl get hpa -n awoooi-prod", - ], - }, - - # 高延遲 → 先診斷瓶頸在哪 + # 高延遲 → 擴容 "high_latency": { - "patterns": ["latency", "slow", "p99", "p95"], - "action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=50 | grep -E 'latency|slow|timeout'", - "description": "偵測到高延遲,建議先診斷瓶頸位置", - "risk_level": "medium", - "reasoning": "延遲可能來自 DB、外部 API 或代碼,需診斷後對症下藥", - "diagnosis_commands": [ - "查看 SignOz Trace: http://192.168.0.188:3301/traces", - ], + "patterns": ["latency", "slow", "timeout", "p99"], + "action": "kubectl scale deployment/{target} --replicas=3", + "description": "Expert System: 偵測到高延遲,建議擴容至 3 副本", + "risk_level": "low", + "reasoning": "擴容可分散負載,降低單一 Pod 壓力", }, - - # ========== 第三類: 需要謹慎的高風險操作 ========== - - # 高錯誤率 → 建議查日誌,回滾需人工確認 + # 高錯誤率 → 回滾 "high_error_rate": { - "patterns": ["error rate", "5xx", "500 error", "exception rate"], - "action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=100 | grep -i error", - "description": "偵測到高錯誤率,建議先查日誌確認錯誤類型", - "risk_level": "high", - "reasoning": "錯誤原因多樣,需先診斷是代碼問題還是依賴服務問題", - "diagnosis_commands": [ - "查看 Sentry: http://192.168.0.110:9000", - "kubectl logs -n awoooi-prod -l app={target_app} | grep -i exception", - ], - "human_review_required": True, + "patterns": ["error", "5xx", "fail", "exception"], + "action": "kubectl rollout undo deployment/{target}", + "description": "Expert System: 偵測到高錯誤率,建議回滾至上一版", + "risk_level": "critical", + "reasoning": "錯誤率突增通常源自最近部署,回滾是最快修復方式", }, - - # ========== 第四類: 已確認可安全重啟的情況 ========== - - # 明確的 Pod 異常 (非 CrashLoop) - "pod_unhealthy": { - "patterns": ["unhealthy", "not ready", "readiness", "liveness"], - "action": "kubectl rollout restart deployment/{target_app} -n awoooi-prod", - "description": "Pod 健康檢查失敗,重啟可能解決", + # 資源耗盡 → 擴容 + "resource_exhaustion": { + "patterns": ["cpu", "memory", "resource", "quota"], + "action": "kubectl scale deployment/{target} --replicas=2", + "description": "Expert System: 偵測到資源耗盡,建議擴容", "risk_level": "medium", - "reasoning": "健康檢查失敗且非 CrashLoop,重啟通常有效", + "reasoning": "增加副本可分散資源壓力", }, - - # ========== 預設: 不要盲目重啟,建議人工診斷 ========== + # 預設 → 重啟 (最保守) "default": { "patterns": [], - "action": "kubectl describe pod {target} -n awoooi-prod", - "description": "無法自動判斷問題類型,建議人工查看詳情後決定", - "risk_level": "low", - "reasoning": "未知問題不應盲目重啟,需人工判斷根因", - "diagnosis_commands": [ - "kubectl get events -n awoooi-prod --sort-by='.lastTimestamp' | tail -20", - "kubectl logs -n awoooi-prod {target} --tail=50", - ], - "human_review_required": True, + "action": "kubectl rollout restart deployment/{target}", + "description": "Expert System: 無法確定具體問題,建議安全重啟", + "risk_level": "medium", + "reasoning": "重啟是最安全的通用修復動作", }, } @@ -258,87 +176,34 @@ def expert_analyze(incident: Incident) -> dict[str, Any]: """ Expert System 規則引擎分析 - 2026-03-27 重構: - - 分層診斷 (測試資源過濾 → 規則匹配 → 診斷指令) - - 根因優先 (提供診斷指令而非盲目重啟) - - 人工判斷標記 (未知問題標記需人工介入) - 這是 100% 本地執行,永不失敗的保底方案 """ target = incident.affected_services[0] if incident.affected_services else "unknown-service" - target_lower = target.lower() - - # 從 target 提取 app 名稱 (去除 pod hash) - # e.g., "awoooi-api-649986569-2sgch" → "awoooi-api" - target_app = "-".join(target.split("-")[:2]) if "-" in target else target - alert_names = " ".join([s.alert_name.lower() for s in incident.signals]) - all_text = f"{alert_names} {target_lower}" - # ========== 第一層: 測試資源過濾 ========== - is_test_resource = any(pattern in target_lower for pattern in TEST_RESOURCE_PATTERNS) - if is_test_resource: - return { - "source": "expert_system", - "action": "# 測試資源,建議忽略或手動清理", - "description": f"偵測到測試資源 ({target}),建議確認是否需要清理", - "risk_level": "low", - "reasoning": "測試資源告警通常是臨時性的,不需要自動修復", - "confidence": 0.9, - "kubectl_command": f"kubectl delete pod {target} -n awoooi-prod --grace-period=0", - "matched_rule": "test_resource_filter", - "from_cache": False, - "human_review_required": True, - "is_test_resource": True, - } - - # ========== 第二層: 規則匹配 ========== + # 匹配規則 matched_rule = "default" for rule_name, rule in EXPERT_RULES.items(): if rule_name == "default": continue - if any(pattern in all_text for pattern in rule["patterns"]): + if any(pattern in alert_names for pattern in rule["patterns"]): matched_rule = rule_name break rule = EXPERT_RULES[matched_rule] - # 格式化指令 (支援 {target} 和 {target_app}) - format_vars = {"target": target, "target_app": target_app} - action = rule["action"].format(**format_vars) - - # 格式化診斷指令 - diagnosis_commands = [] - if "diagnosis_commands" in rule: - diagnosis_commands = [ - cmd.format(**format_vars) if "{" in cmd else cmd - for cmd in rule["diagnosis_commands"] - ] - - # ========== 第三層: 建構回應 ========== - result = { + return { "source": "expert_system", - "action": action, + "action": rule["action"].format(target=target), "description": rule["description"], "risk_level": rule["risk_level"], "reasoning": rule["reasoning"], - "confidence": 0.75 if matched_rule != "default" else 0.5, - "kubectl_command": action, + "confidence": 0.75, # Expert System 固定信心分數 + "kubectl_command": rule["action"].format(target=target), "matched_rule": matched_rule, "from_cache": False, } - # 新增診斷指令 (如果有) - if diagnosis_commands: - result["diagnosis_commands"] = diagnosis_commands - - # 標記是否需要人工審查 - if rule.get("human_review_required"): - result["human_review_required"] = True - result["description"] += " (建議人工確認)" - - return result - # ============================================================================= # Decision Token (Redis) @@ -571,88 +436,32 @@ class DecisionManager: incident: Incident, ) -> dict[str, Any]: """ - 三軌決策分析 (Phase 7.5 升級 + 2026-03-27 智能診斷重構) + 三軌決策分析 (Phase 7.5 升級) 策略: 1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%) 2. Playbook 命中則直接使用 (最快、經驗驗證) - 3. Expert System 提供初步診斷 (分類 + 診斷指令) - 4. LLM 基於診斷上下文提供智能建議 - 5. LLM 失敗時,根據 Expert 診斷決定是否需人工介入 + 3. 否則 LLM + Expert System 雙軌 - 優先順序: Playbook > LLM(with Expert context) > Expert System + 優先順序: Playbook > LLM > Expert System """ # Phase 7.5: 先嘗試 Playbook 匹配 playbook_result = await self._try_playbook_match(incident) if playbook_result: return playbook_result - # ========== 2026-03-27 重構: 分層智能診斷 ========== - - # Step 1: Expert System 提供初步診斷 (永不失敗) + # Expert System 同步執行 (立即可用) expert_result = expert_analyze(incident) - # Step 2: 測試資源直接返回 (不浪費 LLM 呼叫) - if expert_result.get("is_test_resource"): - logger.info( - "dual_engine_test_resource_skip", - incident_id=incident.incident_id, - target=incident.affected_services[0] if incident.affected_services else "unknown", - ) - return expert_result - - # Step 2.5: ADR-030 診斷資料收集 (Phase 2) - # 使用 DiagnosisAggregator 收集 K8s + SignOz 診斷資料 - diagnosis_context = None - target = incident.affected_services[0] if incident.affected_services else None - if target: - try: - aggregator = get_diagnosis_aggregator() - diagnosis_context = await aggregator.collect_pod_diagnosis( - pod_name=target, - namespace="awoooi-prod", - include_signoz=True, - include_error_logs=True, - expert_match=expert_result, - ) - logger.info( - "dual_engine_diagnosis_collected", - incident_id=incident.incident_id, - target=target, - signals_count=len(diagnosis_context.signals), - highest_severity=diagnosis_context.highest_severity.value, - ) - except Exception as e: - logger.warning( - "dual_engine_diagnosis_failed", - incident_id=incident.incident_id, - error=str(e), - ) - # 診斷收集失敗不影響主流程,繼續使用 expert_result - - # Step 3: 準備 LLM 上下文 (含 Expert 診斷 + K8s/SignOz 診斷) - signals_dict = [s.model_dump() for s in incident.signals] - expert_context = { - "initial_diagnosis": expert_result.get("matched_rule"), - "diagnosis_description": expert_result.get("description"), - "suggested_diagnosis_commands": expert_result.get("diagnosis_commands", []), - "expert_confidence": expert_result.get("confidence"), - "requires_human_review": expert_result.get("human_review_required", False), - } - - # 加入診斷上下文 (如果有) - if diagnosis_context: - expert_context["diagnosis_context"] = diagnosis_context.get_llm_prompt_context() - expert_context["diagnosis_signals"] = [s.to_dict() for s in diagnosis_context.signals] - - # Step 4: LLM 分析 (帶上 Expert 上下文) + # LLM 非同步執行 try: + signals_dict = [s.model_dump() for s in incident.signals] + llm_result, provider, success = await self._openclaw.generate_incident_proposal( incident_id=incident.incident_id, severity=incident.severity.value, signals=signals_dict, affected_services=incident.affected_services, - expert_context=expert_context, # 傳遞 Expert 診斷上下文 ) if success and llm_result: @@ -660,12 +469,10 @@ class DecisionManager: "dual_engine_llm_win", incident_id=incident.incident_id, provider=provider, - expert_rule=expert_result.get("matched_rule"), ) return { **llm_result, "source": f"llm_{provider}", - "expert_diagnosis": expert_result.get("matched_rule"), } except Exception as e: @@ -673,23 +480,13 @@ class DecisionManager: "dual_engine_llm_failed", incident_id=incident.incident_id, error=str(e), - expert_rule=expert_result.get("matched_rule"), ) - # Step 5: LLM 失敗,使用 Expert System 結果 - # 但根據診斷結果調整回應 + # LLM 失敗,使用 Expert System logger.info( "dual_engine_expert_fallback", incident_id=incident.incident_id, - expert_rule=expert_result.get("matched_rule"), - human_review=expert_result.get("human_review_required", False), ) - - # 如果 Expert 標記需人工介入,降低 confidence - if expert_result.get("human_review_required"): - expert_result["confidence"] = min(expert_result.get("confidence", 0.5), 0.5) - expert_result["description"] += " [LLM 分析失敗,建議人工確認]" - return expert_result async def _try_playbook_match( diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 3d405e20..c356cf5a 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -157,13 +157,8 @@ class TelegramMessage: else: conf_emoji = "🔴" - # 自動生成事件編號 (2026-03-26 修復: 檢查是否已有 INC- 前綴) - if self.incident_id: - incident_id = self.incident_id - elif self.approval_id.upper().startswith("INC-"): - incident_id = self.approval_id.upper() - else: - incident_id = f"INC-{self.approval_id[:8].upper()}" + # 自動生成事件編號 + incident_id = self.incident_id or f"INC-{self.approval_id[:8].upper()}" # SignOz URL (優先使用動態 URL) - 必須 HTML 轉義防止解析錯誤 service_name = self.resource_name.split("-")[0] if "-" in self.resource_name else self.resource_name