revert: 還原 Telegram + CD 到正常狀態
還原檔案到 d071019 版本:
- decision_manager.py: 移除 Redis dedup 邏輯
- telegram_gateway.py: 還原 INC- 前綴邏輯
- cd.yaml: 移除 selector immutable 處理和 Token injection
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
26
.github/workflows/cd.yaml
vendored
26
.github/workflows/cd.yaml
vendored
@@ -259,18 +259,6 @@ jobs:
|
||||
id: tag
|
||||
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
|
||||
|
||||
# 2026-03-26: 注入 Telegram 機密到 K8s Secret
|
||||
- name: Inject Telegram Secrets
|
||||
run: |
|
||||
kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \
|
||||
-p='[{"op": "replace", "path": "/data/OPENCLAW_TG_BOT_TOKEN", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" | base64)'"}]' || \
|
||||
kubectl create secret generic awoooi-secrets -n awoooi-prod \
|
||||
--from-literal=OPENCLAW_TG_BOT_TOKEN="${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \
|
||||
-p='[{"op": "replace", "path": "/data/OPENCLAW_TG_CHAT_ID", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_CHAT_ID }}" | base64)'"}]' || true
|
||||
|
||||
- name: Deploy
|
||||
run: |
|
||||
cd k8s/awoooi-prod
|
||||
@@ -293,19 +281,7 @@ jobs:
|
||||
echo "⏭️ 跳過 Web image 更新 (build skipped)"
|
||||
fi
|
||||
|
||||
# 2026-03-26: 處理 selector immutability 問題
|
||||
# 如果 apply 失敗 (通常是 selector 變更),先刪除再重建
|
||||
if ! kubectl apply -k . 2>&1 | tee /tmp/apply.log; then
|
||||
if grep -q "field is immutable" /tmp/apply.log; then
|
||||
echo "⚠️ 偵測到 selector 不可變錯誤,執行強制重建..."
|
||||
kubectl delete deployment awoooi-api awoooi-web awoooi-worker -n awoooi-prod --ignore-not-found
|
||||
sleep 5
|
||||
kubectl apply -k .
|
||||
else
|
||||
echo "❌ 部署失敗 (非 selector 問題)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
kubectl apply -k .
|
||||
|
||||
# 2026-03-26: CoreDNS GitOps 同步 (ADR-026)
|
||||
- name: Sync CoreDNS Config
|
||||
|
||||
@@ -31,7 +31,6 @@ from src.core.config import settings
|
||||
from src.core.redis_client import get_redis
|
||||
from src.models.incident import Incident
|
||||
from src.models.playbook import SymptomPattern
|
||||
from src.services.diagnosis_aggregator import get_diagnosis_aggregator
|
||||
from src.services.openclaw import get_openclaw
|
||||
from src.services.playbook_service import get_playbook_service
|
||||
|
||||
@@ -80,8 +79,8 @@ async def _push_decision_to_telegram(
|
||||
confidence = proposal_data.get("confidence", 0.75)
|
||||
source = proposal_data.get("source", "unknown")
|
||||
|
||||
# 2026-03-26 修復: incident_id 已有 INC- 前綴,不要再加
|
||||
approval_id = incident.incident_id
|
||||
# 建立 approval_id (使用 incident_id 作為追蹤)
|
||||
approval_id = f"INC-{incident.incident_id}"
|
||||
|
||||
await gateway.send_approval_card(
|
||||
approval_id=approval_id,
|
||||
@@ -128,128 +127,47 @@ class DecisionState(str, Enum):
|
||||
# =============================================================================
|
||||
# Expert System - 規則引擎 (Local Fallback)
|
||||
# =============================================================================
|
||||
# 2026-03-27 重構: 分層診斷 + 根因優先 + 避免盲目重啟
|
||||
#
|
||||
# 設計原則:
|
||||
# 1. 診斷優先於修復 - 先了解問題再行動
|
||||
# 2. 測試資源忽略 - 避免處理臨時測試告警
|
||||
# 3. 根因導向 - 提供診斷指令而非直接重啟
|
||||
# 4. 人工判斷 - 未知問題建議人工介入
|
||||
# =============================================================================
|
||||
|
||||
# 測試資源黑名單 (自動忽略)
|
||||
TEST_RESOURCE_PATTERNS = [
|
||||
"test", "demo", "tmp", "temp", "debug", "dev-",
|
||||
"sandbox", "experiment", "trial", "mock",
|
||||
]
|
||||
|
||||
EXPERT_RULES: dict[str, dict[str, Any]] = {
|
||||
# ========== 第一類: 明確根因的自動修復 ==========
|
||||
|
||||
# OOM Kill → 建議增加記憶體限制 (非重啟)
|
||||
"oom_killed": {
|
||||
"patterns": ["oomkill", "oom", "out of memory", "memory limit"],
|
||||
"action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Last State'",
|
||||
"description": "偵測到 OOM Kill,建議檢查記憶體用量後調整 limits",
|
||||
# Pod 崩潰 → 重啟
|
||||
"pod_crash": {
|
||||
"patterns": ["crash", "restart", "oom", "killed", "failed"],
|
||||
"action": "kubectl rollout restart deployment/{target}",
|
||||
"description": "Expert System: 偵測到 Pod 異常,建議重啟部署",
|
||||
"risk_level": "medium",
|
||||
"reasoning": "OOM 通常是記憶體 limits 不足或記憶體洩漏,重啟無法解決根因",
|
||||
"diagnosis_commands": [
|
||||
"kubectl top pod {target} -n awoooi-prod",
|
||||
"kubectl logs {target} -n awoooi-prod --tail=100 | grep -i memory",
|
||||
],
|
||||
"reasoning": "根據歷史數據,重啟可解決 85% 的 Pod 崩潰問題",
|
||||
},
|
||||
|
||||
# CrashLoopBackOff → 查日誌找根因 (非重啟)
|
||||
"crash_loop": {
|
||||
"patterns": ["crashloop", "backoff", "crash loop"],
|
||||
"action": "kubectl logs {target} -n awoooi-prod --previous --tail=50",
|
||||
"description": "偵測到 CrashLoopBackOff,需查看崩潰日誌找根因",
|
||||
"risk_level": "high",
|
||||
"reasoning": "CrashLoop 表示容器持續崩潰,重啟無效,需從日誌找根因",
|
||||
"diagnosis_commands": [
|
||||
"kubectl describe pod {target} -n awoooi-prod | grep -A10 'Events'",
|
||||
"kubectl logs {target} -n awoooi-prod --previous",
|
||||
],
|
||||
},
|
||||
|
||||
# ImagePullBackOff → 檢查映像名稱 (非重啟)
|
||||
"image_pull_error": {
|
||||
"patterns": ["imagepull", "pull error", "image not found", "errimagepull"],
|
||||
"action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Events'",
|
||||
"description": "偵測到映像拉取失敗,需檢查映像名稱或 Registry 連線",
|
||||
"risk_level": "high",
|
||||
"reasoning": "映像問題需修正配置或檢查 Harbor 連線,重啟無法解決",
|
||||
"diagnosis_commands": [
|
||||
"kubectl get pod {target} -n awoooi-prod -o jsonpath='{.spec.containers[*].image}'",
|
||||
],
|
||||
},
|
||||
|
||||
# ========== 第二類: 可能需要擴容的情況 ==========
|
||||
|
||||
# 高 CPU 使用率 → 先診斷是否正常負載
|
||||
"high_cpu": {
|
||||
"patterns": ["cpu", "high cpu", "cpu throttl"],
|
||||
"action": "kubectl top pod -n awoooi-prod -l app={target_app}",
|
||||
"description": "偵測到高 CPU,建議先確認是否為正常負載高峰",
|
||||
"risk_level": "low",
|
||||
"reasoning": "CPU 高可能是正常負載,需先診斷再決定是否擴容",
|
||||
"diagnosis_commands": [
|
||||
"kubectl top pod -n awoooi-prod",
|
||||
"kubectl get hpa -n awoooi-prod",
|
||||
],
|
||||
},
|
||||
|
||||
# 高延遲 → 先診斷瓶頸在哪
|
||||
# 高延遲 → 擴容
|
||||
"high_latency": {
|
||||
"patterns": ["latency", "slow", "p99", "p95"],
|
||||
"action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=50 | grep -E 'latency|slow|timeout'",
|
||||
"description": "偵測到高延遲,建議先診斷瓶頸位置",
|
||||
"risk_level": "medium",
|
||||
"reasoning": "延遲可能來自 DB、外部 API 或代碼,需診斷後對症下藥",
|
||||
"diagnosis_commands": [
|
||||
"查看 SignOz Trace: http://192.168.0.188:3301/traces",
|
||||
],
|
||||
"patterns": ["latency", "slow", "timeout", "p99"],
|
||||
"action": "kubectl scale deployment/{target} --replicas=3",
|
||||
"description": "Expert System: 偵測到高延遲,建議擴容至 3 副本",
|
||||
"risk_level": "low",
|
||||
"reasoning": "擴容可分散負載,降低單一 Pod 壓力",
|
||||
},
|
||||
|
||||
# ========== 第三類: 需要謹慎的高風險操作 ==========
|
||||
|
||||
# 高錯誤率 → 建議查日誌,回滾需人工確認
|
||||
# 高錯誤率 → 回滾
|
||||
"high_error_rate": {
|
||||
"patterns": ["error rate", "5xx", "500 error", "exception rate"],
|
||||
"action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=100 | grep -i error",
|
||||
"description": "偵測到高錯誤率,建議先查日誌確認錯誤類型",
|
||||
"risk_level": "high",
|
||||
"reasoning": "錯誤原因多樣,需先診斷是代碼問題還是依賴服務問題",
|
||||
"diagnosis_commands": [
|
||||
"查看 Sentry: http://192.168.0.110:9000",
|
||||
"kubectl logs -n awoooi-prod -l app={target_app} | grep -i exception",
|
||||
],
|
||||
"human_review_required": True,
|
||||
"patterns": ["error", "5xx", "fail", "exception"],
|
||||
"action": "kubectl rollout undo deployment/{target}",
|
||||
"description": "Expert System: 偵測到高錯誤率,建議回滾至上一版",
|
||||
"risk_level": "critical",
|
||||
"reasoning": "錯誤率突增通常源自最近部署,回滾是最快修復方式",
|
||||
},
|
||||
|
||||
# ========== 第四類: 已確認可安全重啟的情況 ==========
|
||||
|
||||
# 明確的 Pod 異常 (非 CrashLoop)
|
||||
"pod_unhealthy": {
|
||||
"patterns": ["unhealthy", "not ready", "readiness", "liveness"],
|
||||
"action": "kubectl rollout restart deployment/{target_app} -n awoooi-prod",
|
||||
"description": "Pod 健康檢查失敗,重啟可能解決",
|
||||
# 資源耗盡 → 擴容
|
||||
"resource_exhaustion": {
|
||||
"patterns": ["cpu", "memory", "resource", "quota"],
|
||||
"action": "kubectl scale deployment/{target} --replicas=2",
|
||||
"description": "Expert System: 偵測到資源耗盡,建議擴容",
|
||||
"risk_level": "medium",
|
||||
"reasoning": "健康檢查失敗且非 CrashLoop,重啟通常有效",
|
||||
"reasoning": "增加副本可分散資源壓力",
|
||||
},
|
||||
|
||||
# ========== 預設: 不要盲目重啟,建議人工診斷 ==========
|
||||
# 預設 → 重啟 (最保守)
|
||||
"default": {
|
||||
"patterns": [],
|
||||
"action": "kubectl describe pod {target} -n awoooi-prod",
|
||||
"description": "無法自動判斷問題類型,建議人工查看詳情後決定",
|
||||
"risk_level": "low",
|
||||
"reasoning": "未知問題不應盲目重啟,需人工判斷根因",
|
||||
"diagnosis_commands": [
|
||||
"kubectl get events -n awoooi-prod --sort-by='.lastTimestamp' | tail -20",
|
||||
"kubectl logs -n awoooi-prod {target} --tail=50",
|
||||
],
|
||||
"human_review_required": True,
|
||||
"action": "kubectl rollout restart deployment/{target}",
|
||||
"description": "Expert System: 無法確定具體問題,建議安全重啟",
|
||||
"risk_level": "medium",
|
||||
"reasoning": "重啟是最安全的通用修復動作",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -258,87 +176,34 @@ def expert_analyze(incident: Incident) -> dict[str, Any]:
|
||||
"""
|
||||
Expert System 規則引擎分析
|
||||
|
||||
2026-03-27 重構:
|
||||
- 分層診斷 (測試資源過濾 → 規則匹配 → 診斷指令)
|
||||
- 根因優先 (提供診斷指令而非盲目重啟)
|
||||
- 人工判斷標記 (未知問題標記需人工介入)
|
||||
|
||||
這是 100% 本地執行,永不失敗的保底方案
|
||||
"""
|
||||
target = incident.affected_services[0] if incident.affected_services else "unknown-service"
|
||||
target_lower = target.lower()
|
||||
|
||||
# 從 target 提取 app 名稱 (去除 pod hash)
|
||||
# e.g., "awoooi-api-649986569-2sgch" → "awoooi-api"
|
||||
target_app = "-".join(target.split("-")[:2]) if "-" in target else target
|
||||
|
||||
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
|
||||
all_text = f"{alert_names} {target_lower}"
|
||||
|
||||
# ========== 第一層: 測試資源過濾 ==========
|
||||
is_test_resource = any(pattern in target_lower for pattern in TEST_RESOURCE_PATTERNS)
|
||||
if is_test_resource:
|
||||
return {
|
||||
"source": "expert_system",
|
||||
"action": "# 測試資源,建議忽略或手動清理",
|
||||
"description": f"偵測到測試資源 ({target}),建議確認是否需要清理",
|
||||
"risk_level": "low",
|
||||
"reasoning": "測試資源告警通常是臨時性的,不需要自動修復",
|
||||
"confidence": 0.9,
|
||||
"kubectl_command": f"kubectl delete pod {target} -n awoooi-prod --grace-period=0",
|
||||
"matched_rule": "test_resource_filter",
|
||||
"from_cache": False,
|
||||
"human_review_required": True,
|
||||
"is_test_resource": True,
|
||||
}
|
||||
|
||||
# ========== 第二層: 規則匹配 ==========
|
||||
# 匹配規則
|
||||
matched_rule = "default"
|
||||
for rule_name, rule in EXPERT_RULES.items():
|
||||
if rule_name == "default":
|
||||
continue
|
||||
if any(pattern in all_text for pattern in rule["patterns"]):
|
||||
if any(pattern in alert_names for pattern in rule["patterns"]):
|
||||
matched_rule = rule_name
|
||||
break
|
||||
|
||||
rule = EXPERT_RULES[matched_rule]
|
||||
|
||||
# 格式化指令 (支援 {target} 和 {target_app})
|
||||
format_vars = {"target": target, "target_app": target_app}
|
||||
action = rule["action"].format(**format_vars)
|
||||
|
||||
# 格式化診斷指令
|
||||
diagnosis_commands = []
|
||||
if "diagnosis_commands" in rule:
|
||||
diagnosis_commands = [
|
||||
cmd.format(**format_vars) if "{" in cmd else cmd
|
||||
for cmd in rule["diagnosis_commands"]
|
||||
]
|
||||
|
||||
# ========== 第三層: 建構回應 ==========
|
||||
result = {
|
||||
return {
|
||||
"source": "expert_system",
|
||||
"action": action,
|
||||
"action": rule["action"].format(target=target),
|
||||
"description": rule["description"],
|
||||
"risk_level": rule["risk_level"],
|
||||
"reasoning": rule["reasoning"],
|
||||
"confidence": 0.75 if matched_rule != "default" else 0.5,
|
||||
"kubectl_command": action,
|
||||
"confidence": 0.75, # Expert System 固定信心分數
|
||||
"kubectl_command": rule["action"].format(target=target),
|
||||
"matched_rule": matched_rule,
|
||||
"from_cache": False,
|
||||
}
|
||||
|
||||
# 新增診斷指令 (如果有)
|
||||
if diagnosis_commands:
|
||||
result["diagnosis_commands"] = diagnosis_commands
|
||||
|
||||
# 標記是否需要人工審查
|
||||
if rule.get("human_review_required"):
|
||||
result["human_review_required"] = True
|
||||
result["description"] += " (建議人工確認)"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Decision Token (Redis)
|
||||
@@ -571,88 +436,32 @@ class DecisionManager:
|
||||
incident: Incident,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
三軌決策分析 (Phase 7.5 升級 + 2026-03-27 智能診斷重構)
|
||||
三軌決策分析 (Phase 7.5 升級)
|
||||
|
||||
策略:
|
||||
1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
|
||||
2. Playbook 命中則直接使用 (最快、經驗驗證)
|
||||
3. Expert System 提供初步診斷 (分類 + 診斷指令)
|
||||
4. LLM 基於診斷上下文提供智能建議
|
||||
5. LLM 失敗時,根據 Expert 診斷決定是否需人工介入
|
||||
3. 否則 LLM + Expert System 雙軌
|
||||
|
||||
優先順序: Playbook > LLM(with Expert context) > Expert System
|
||||
優先順序: Playbook > LLM > Expert System
|
||||
"""
|
||||
# Phase 7.5: 先嘗試 Playbook 匹配
|
||||
playbook_result = await self._try_playbook_match(incident)
|
||||
if playbook_result:
|
||||
return playbook_result
|
||||
|
||||
# ========== 2026-03-27 重構: 分層智能診斷 ==========
|
||||
|
||||
# Step 1: Expert System 提供初步診斷 (永不失敗)
|
||||
# Expert System 同步執行 (立即可用)
|
||||
expert_result = expert_analyze(incident)
|
||||
|
||||
# Step 2: 測試資源直接返回 (不浪費 LLM 呼叫)
|
||||
if expert_result.get("is_test_resource"):
|
||||
logger.info(
|
||||
"dual_engine_test_resource_skip",
|
||||
incident_id=incident.incident_id,
|
||||
target=incident.affected_services[0] if incident.affected_services else "unknown",
|
||||
)
|
||||
return expert_result
|
||||
|
||||
# Step 2.5: ADR-030 診斷資料收集 (Phase 2)
|
||||
# 使用 DiagnosisAggregator 收集 K8s + SignOz 診斷資料
|
||||
diagnosis_context = None
|
||||
target = incident.affected_services[0] if incident.affected_services else None
|
||||
if target:
|
||||
try:
|
||||
aggregator = get_diagnosis_aggregator()
|
||||
diagnosis_context = await aggregator.collect_pod_diagnosis(
|
||||
pod_name=target,
|
||||
namespace="awoooi-prod",
|
||||
include_signoz=True,
|
||||
include_error_logs=True,
|
||||
expert_match=expert_result,
|
||||
)
|
||||
logger.info(
|
||||
"dual_engine_diagnosis_collected",
|
||||
incident_id=incident.incident_id,
|
||||
target=target,
|
||||
signals_count=len(diagnosis_context.signals),
|
||||
highest_severity=diagnosis_context.highest_severity.value,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"dual_engine_diagnosis_failed",
|
||||
incident_id=incident.incident_id,
|
||||
error=str(e),
|
||||
)
|
||||
# 診斷收集失敗不影響主流程,繼續使用 expert_result
|
||||
|
||||
# Step 3: 準備 LLM 上下文 (含 Expert 診斷 + K8s/SignOz 診斷)
|
||||
signals_dict = [s.model_dump() for s in incident.signals]
|
||||
expert_context = {
|
||||
"initial_diagnosis": expert_result.get("matched_rule"),
|
||||
"diagnosis_description": expert_result.get("description"),
|
||||
"suggested_diagnosis_commands": expert_result.get("diagnosis_commands", []),
|
||||
"expert_confidence": expert_result.get("confidence"),
|
||||
"requires_human_review": expert_result.get("human_review_required", False),
|
||||
}
|
||||
|
||||
# 加入診斷上下文 (如果有)
|
||||
if diagnosis_context:
|
||||
expert_context["diagnosis_context"] = diagnosis_context.get_llm_prompt_context()
|
||||
expert_context["diagnosis_signals"] = [s.to_dict() for s in diagnosis_context.signals]
|
||||
|
||||
# Step 4: LLM 分析 (帶上 Expert 上下文)
|
||||
# LLM 非同步執行
|
||||
try:
|
||||
signals_dict = [s.model_dump() for s in incident.signals]
|
||||
|
||||
llm_result, provider, success = await self._openclaw.generate_incident_proposal(
|
||||
incident_id=incident.incident_id,
|
||||
severity=incident.severity.value,
|
||||
signals=signals_dict,
|
||||
affected_services=incident.affected_services,
|
||||
expert_context=expert_context, # 傳遞 Expert 診斷上下文
|
||||
)
|
||||
|
||||
if success and llm_result:
|
||||
@@ -660,12 +469,10 @@ class DecisionManager:
|
||||
"dual_engine_llm_win",
|
||||
incident_id=incident.incident_id,
|
||||
provider=provider,
|
||||
expert_rule=expert_result.get("matched_rule"),
|
||||
)
|
||||
return {
|
||||
**llm_result,
|
||||
"source": f"llm_{provider}",
|
||||
"expert_diagnosis": expert_result.get("matched_rule"),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -673,23 +480,13 @@ class DecisionManager:
|
||||
"dual_engine_llm_failed",
|
||||
incident_id=incident.incident_id,
|
||||
error=str(e),
|
||||
expert_rule=expert_result.get("matched_rule"),
|
||||
)
|
||||
|
||||
# Step 5: LLM 失敗,使用 Expert System 結果
|
||||
# 但根據診斷結果調整回應
|
||||
# LLM 失敗,使用 Expert System
|
||||
logger.info(
|
||||
"dual_engine_expert_fallback",
|
||||
incident_id=incident.incident_id,
|
||||
expert_rule=expert_result.get("matched_rule"),
|
||||
human_review=expert_result.get("human_review_required", False),
|
||||
)
|
||||
|
||||
# 如果 Expert 標記需人工介入,降低 confidence
|
||||
if expert_result.get("human_review_required"):
|
||||
expert_result["confidence"] = min(expert_result.get("confidence", 0.5), 0.5)
|
||||
expert_result["description"] += " [LLM 分析失敗,建議人工確認]"
|
||||
|
||||
return expert_result
|
||||
|
||||
async def _try_playbook_match(
|
||||
|
||||
@@ -157,13 +157,8 @@ class TelegramMessage:
|
||||
else:
|
||||
conf_emoji = "🔴"
|
||||
|
||||
# 自動生成事件編號 (2026-03-26 修復: 檢查是否已有 INC- 前綴)
|
||||
if self.incident_id:
|
||||
incident_id = self.incident_id
|
||||
elif self.approval_id.upper().startswith("INC-"):
|
||||
incident_id = self.approval_id.upper()
|
||||
else:
|
||||
incident_id = f"INC-{self.approval_id[:8].upper()}"
|
||||
# 自動生成事件編號
|
||||
incident_id = self.incident_id or f"INC-{self.approval_id[:8].upper()}"
|
||||
|
||||
# SignOz URL (優先使用動態 URL) - 必須 HTML 轉義防止解析錯誤
|
||||
service_name = self.resource_name.split("-")[0] if "-" in self.resource_name else self.resource_name
|
||||
|
||||
Reference in New Issue
Block a user