revert: 還原 Telegram + CD 到正常狀態

還原檔案到 d071019 版本:
- decision_manager.py: 移除 Redis dedup 邏輯;還原 approval_id 的 INC- 前綴;還原 Expert System 規則與雙軌決策流程 (移除 2026-03-27 分層診斷重構與 DiagnosisAggregator 整合)
- telegram_gateway.py: 還原 INC- 前綴邏輯
- cd.yaml: 移除 selector immutable 處理和 Token injection

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-26 22:10:52 +08:00
parent 3c034526a5
commit 17ee8838be
3 changed files with 48 additions and 280 deletions

View File

@@ -259,18 +259,6 @@ jobs:
id: tag
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
# 2026-03-26: 注入 Telegram 機密到 K8s Secret
- name: Inject Telegram Secrets
run: |
kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \
-p='[{"op": "replace", "path": "/data/OPENCLAW_TG_BOT_TOKEN", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" | base64)'"}]' || \
kubectl create secret generic awoooi-secrets -n awoooi-prod \
--from-literal=OPENCLAW_TG_BOT_TOKEN="${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" \
--dry-run=client -o yaml | kubectl apply -f -
kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \
-p='[{"op": "replace", "path": "/data/OPENCLAW_TG_CHAT_ID", "value": "'$(echo -n "${{ secrets.OPENCLAW_TG_CHAT_ID }}" | base64)'"}]' || true
- name: Deploy
run: |
cd k8s/awoooi-prod
@@ -293,19 +281,7 @@ jobs:
echo "⏭️ 跳過 Web image 更新 (build skipped)"
fi
# 2026-03-26: 處理 selector immutability 問題
# 如果 apply 失敗 (通常是 selector 變更),先刪除再重建
if ! kubectl apply -k . 2>&1 | tee /tmp/apply.log; then
if grep -q "field is immutable" /tmp/apply.log; then
echo "⚠️ 偵測到 selector 不可變錯誤,執行強制重建..."
kubectl delete deployment awoooi-api awoooi-web awoooi-worker -n awoooi-prod --ignore-not-found
sleep 5
kubectl apply -k .
else
echo "❌ 部署失敗 (非 selector 問題)"
exit 1
fi
fi
kubectl apply -k .
# 2026-03-26: CoreDNS GitOps 同步 (ADR-026)
- name: Sync CoreDNS Config

View File

@@ -31,7 +31,6 @@ from src.core.config import settings
from src.core.redis_client import get_redis
from src.models.incident import Incident
from src.models.playbook import SymptomPattern
from src.services.diagnosis_aggregator import get_diagnosis_aggregator
from src.services.openclaw import get_openclaw
from src.services.playbook_service import get_playbook_service
@@ -80,8 +79,8 @@ async def _push_decision_to_telegram(
confidence = proposal_data.get("confidence", 0.75)
source = proposal_data.get("source", "unknown")
# 2026-03-26 修復: incident_id 已有 INC- 前綴,不要再加
approval_id = incident.incident_id
# 建立 approval_id (使用 incident_id 作為追蹤)
approval_id = f"INC-{incident.incident_id}"
await gateway.send_approval_card(
approval_id=approval_id,
@@ -128,128 +127,47 @@ class DecisionState(str, Enum):
# =============================================================================
# Expert System - 規則引擎 (Local Fallback)
# =============================================================================
# 2026-03-27 重構: 分層診斷 + 根因優先 + 避免盲目重啟
#
# 設計原則:
# 1. 診斷優先於修復 - 先了解問題再行動
# 2. 測試資源忽略 - 避免處理臨時測試告警
# 3. 根因導向 - 提供診斷指令而非直接重啟
# 4. 人工判斷 - 未知問題建議人工介入
# =============================================================================
# 測試資源黑名單 (自動忽略)
TEST_RESOURCE_PATTERNS = [
"test", "demo", "tmp", "temp", "debug", "dev-",
"sandbox", "experiment", "trial", "mock",
]
EXPERT_RULES: dict[str, dict[str, Any]] = {
# ========== 第一類: 明確根因的自動修復 ==========
# OOM Kill → 建議增加記憶體限制 (非重啟)
"oom_killed": {
"patterns": ["oomkill", "oom", "out of memory", "memory limit"],
"action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Last State'",
"description": "偵測到 OOM Kill建議檢查記憶體用量後調整 limits",
# Pod 崩潰 → 重啟
"pod_crash": {
"patterns": ["crash", "restart", "oom", "killed", "failed"],
"action": "kubectl rollout restart deployment/{target}",
"description": "Expert System: 偵測到 Pod 異常,建議重啟部署",
"risk_level": "medium",
"reasoning": "OOM 通常是記憶體 limits 不足或記憶體洩漏,重啟無法解決根因",
"diagnosis_commands": [
"kubectl top pod {target} -n awoooi-prod",
"kubectl logs {target} -n awoooi-prod --tail=100 | grep -i memory",
],
"reasoning": "根據歷史數據,重啟可解決 85% 的 Pod 崩潰問題",
},
# CrashLoopBackOff → 查日誌找根因 (非重啟)
"crash_loop": {
"patterns": ["crashloop", "backoff", "crash loop"],
"action": "kubectl logs {target} -n awoooi-prod --previous --tail=50",
"description": "偵測到 CrashLoopBackOff需查看崩潰日誌找根因",
"risk_level": "high",
"reasoning": "CrashLoop 表示容器持續崩潰,重啟無效,需從日誌找根因",
"diagnosis_commands": [
"kubectl describe pod {target} -n awoooi-prod | grep -A10 'Events'",
"kubectl logs {target} -n awoooi-prod --previous",
],
},
# ImagePullBackOff → 檢查映像名稱 (非重啟)
"image_pull_error": {
"patterns": ["imagepull", "pull error", "image not found", "errimagepull"],
"action": "kubectl describe pod {target} -n awoooi-prod | grep -A5 'Events'",
"description": "偵測到映像拉取失敗,需檢查映像名稱或 Registry 連線",
"risk_level": "high",
"reasoning": "映像問題需修正配置或檢查 Harbor 連線,重啟無法解決",
"diagnosis_commands": [
"kubectl get pod {target} -n awoooi-prod -o jsonpath='{.spec.containers[*].image}'",
],
},
# ========== 第二類: 可能需要擴容的情況 ==========
# 高 CPU 使用率 → 先診斷是否正常負載
"high_cpu": {
"patterns": ["cpu", "high cpu", "cpu throttl"],
"action": "kubectl top pod -n awoooi-prod -l app={target_app}",
"description": "偵測到高 CPU建議先確認是否為正常負載高峰",
"risk_level": "low",
"reasoning": "CPU 高可能是正常負載,需先診斷再決定是否擴容",
"diagnosis_commands": [
"kubectl top pod -n awoooi-prod",
"kubectl get hpa -n awoooi-prod",
],
},
# 高延遲 → 先診斷瓶頸在哪
# 高延遲 → 擴容
"high_latency": {
"patterns": ["latency", "slow", "p99", "p95"],
"action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=50 | grep -E 'latency|slow|timeout'",
"description": "偵測到高延遲,建議先診斷瓶頸位置",
"risk_level": "medium",
"reasoning": "延遲可能來自 DB、外部 API 或代碼,需診斷後對症下藥",
"diagnosis_commands": [
"查看 SignOz Trace: http://192.168.0.188:3301/traces",
],
"patterns": ["latency", "slow", "timeout", "p99"],
"action": "kubectl scale deployment/{target} --replicas=3",
"description": "Expert System: 偵測到高延遲,建議擴容至 3 副本",
"risk_level": "low",
"reasoning": "擴容可分散負載,降低單一 Pod 壓力",
},
# ========== 第三類: 需要謹慎的高風險操作 ==========
# 高錯誤率 → 建議查日誌,回滾需人工確認
# 高錯誤率 → 回滾
"high_error_rate": {
"patterns": ["error rate", "5xx", "500 error", "exception rate"],
"action": "kubectl logs -n awoooi-prod -l app={target_app} --tail=100 | grep -i error",
"description": "偵測到高錯誤率,建議先查日誌確認錯誤類型",
"risk_level": "high",
"reasoning": "錯誤原因多樣,需先診斷是代碼問題還是依賴服務問題",
"diagnosis_commands": [
"查看 Sentry: http://192.168.0.110:9000",
"kubectl logs -n awoooi-prod -l app={target_app} | grep -i exception",
],
"human_review_required": True,
"patterns": ["error", "5xx", "fail", "exception"],
"action": "kubectl rollout undo deployment/{target}",
"description": "Expert System: 偵測到高錯誤率,建議回滾至上一版",
"risk_level": "critical",
"reasoning": "錯誤率突增通常源自最近部署,回滾是最快修復方式",
},
# ========== 第四類: 已確認可安全重啟的情況 ==========
# 明確的 Pod 異常 (非 CrashLoop)
"pod_unhealthy": {
"patterns": ["unhealthy", "not ready", "readiness", "liveness"],
"action": "kubectl rollout restart deployment/{target_app} -n awoooi-prod",
"description": "Pod 健康檢查失敗,重啟可能解決",
# 資源耗盡 → 擴容
"resource_exhaustion": {
"patterns": ["cpu", "memory", "resource", "quota"],
"action": "kubectl scale deployment/{target} --replicas=2",
"description": "Expert System: 偵測到資源耗盡,建議擴容",
"risk_level": "medium",
"reasoning": "健康檢查失敗且非 CrashLoop重啟通常有效",
"reasoning": "增加副本可分散資源壓力",
},
# ========== 預設: 不要盲目重啟,建議人工診斷 ==========
# 預設 → 重啟 (最保守)
"default": {
"patterns": [],
"action": "kubectl describe pod {target} -n awoooi-prod",
"description": "無法自動判斷問題類型,建議人工查看詳情後決定",
"risk_level": "low",
"reasoning": "未知問題不應盲目重啟,需人工判斷根因",
"diagnosis_commands": [
"kubectl get events -n awoooi-prod --sort-by='.lastTimestamp' | tail -20",
"kubectl logs -n awoooi-prod {target} --tail=50",
],
"human_review_required": True,
"action": "kubectl rollout restart deployment/{target}",
"description": "Expert System: 無法確定具體問題,建議安全重啟",
"risk_level": "medium",
"reasoning": "重啟是最安全的通用修復動作",
},
}
@@ -258,87 +176,34 @@ def expert_analyze(incident: Incident) -> dict[str, Any]:
"""
Expert System 規則引擎分析
2026-03-27 重構:
- 分層診斷 (測試資源過濾 → 規則匹配 → 診斷指令)
- 根因優先 (提供診斷指令而非盲目重啟)
- 人工判斷標記 (未知問題標記需人工介入)
這是 100% 本地執行,永不失敗的保底方案
"""
target = incident.affected_services[0] if incident.affected_services else "unknown-service"
target_lower = target.lower()
# 從 target 提取 app 名稱 (去除 pod hash)
# e.g., "awoooi-api-649986569-2sgch" → "awoooi-api"
target_app = "-".join(target.split("-")[:2]) if "-" in target else target
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
all_text = f"{alert_names} {target_lower}"
# ========== 第一層: 測試資源過濾 ==========
is_test_resource = any(pattern in target_lower for pattern in TEST_RESOURCE_PATTERNS)
if is_test_resource:
return {
"source": "expert_system",
"action": "# 測試資源,建議忽略或手動清理",
"description": f"偵測到測試資源 ({target}),建議確認是否需要清理",
"risk_level": "low",
"reasoning": "測試資源告警通常是臨時性的,不需要自動修復",
"confidence": 0.9,
"kubectl_command": f"kubectl delete pod {target} -n awoooi-prod --grace-period=0",
"matched_rule": "test_resource_filter",
"from_cache": False,
"human_review_required": True,
"is_test_resource": True,
}
# ========== 第二層: 規則匹配 ==========
# 匹配規則
matched_rule = "default"
for rule_name, rule in EXPERT_RULES.items():
if rule_name == "default":
continue
if any(pattern in all_text for pattern in rule["patterns"]):
if any(pattern in alert_names for pattern in rule["patterns"]):
matched_rule = rule_name
break
rule = EXPERT_RULES[matched_rule]
# 格式化指令 (支援 {target} 和 {target_app})
format_vars = {"target": target, "target_app": target_app}
action = rule["action"].format(**format_vars)
# 格式化診斷指令
diagnosis_commands = []
if "diagnosis_commands" in rule:
diagnosis_commands = [
cmd.format(**format_vars) if "{" in cmd else cmd
for cmd in rule["diagnosis_commands"]
]
# ========== 第三層: 建構回應 ==========
result = {
return {
"source": "expert_system",
"action": action,
"action": rule["action"].format(target=target),
"description": rule["description"],
"risk_level": rule["risk_level"],
"reasoning": rule["reasoning"],
"confidence": 0.75 if matched_rule != "default" else 0.5,
"kubectl_command": action,
"confidence": 0.75, # Expert System 固定信心分數
"kubectl_command": rule["action"].format(target=target),
"matched_rule": matched_rule,
"from_cache": False,
}
# 新增診斷指令 (如果有)
if diagnosis_commands:
result["diagnosis_commands"] = diagnosis_commands
# 標記是否需要人工審查
if rule.get("human_review_required"):
result["human_review_required"] = True
result["description"] += " (建議人工確認)"
return result
# =============================================================================
# Decision Token (Redis)
@@ -571,88 +436,32 @@ class DecisionManager:
incident: Incident,
) -> dict[str, Any]:
"""
三軌決策分析 (Phase 7.5 升級 + 2026-03-27 智能診斷重構)
三軌決策分析 (Phase 7.5 升級)
策略:
1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
2. Playbook 命中則直接使用 (最快、經驗驗證)
3. Expert System 提供初步診斷 (分類 + 診斷指令)
4. LLM 基於診斷上下文提供智能建議
5. LLM 失敗時,根據 Expert 診斷決定是否需人工介入
3. 否則 LLM + Expert System 雙軌
優先順序: Playbook > LLM(with Expert context) > Expert System
優先順序: Playbook > LLM > Expert System
"""
# Phase 7.5: 先嘗試 Playbook 匹配
playbook_result = await self._try_playbook_match(incident)
if playbook_result:
return playbook_result
# ========== 2026-03-27 重構: 分層智能診斷 ==========
# Step 1: Expert System 提供初步診斷 (永不失敗)
# Expert System 同步執行 (立即可用)
expert_result = expert_analyze(incident)
# Step 2: 測試資源直接返回 (不浪費 LLM 呼叫)
if expert_result.get("is_test_resource"):
logger.info(
"dual_engine_test_resource_skip",
incident_id=incident.incident_id,
target=incident.affected_services[0] if incident.affected_services else "unknown",
)
return expert_result
# Step 2.5: ADR-030 診斷資料收集 (Phase 2)
# 使用 DiagnosisAggregator 收集 K8s + SignOz 診斷資料
diagnosis_context = None
target = incident.affected_services[0] if incident.affected_services else None
if target:
try:
aggregator = get_diagnosis_aggregator()
diagnosis_context = await aggregator.collect_pod_diagnosis(
pod_name=target,
namespace="awoooi-prod",
include_signoz=True,
include_error_logs=True,
expert_match=expert_result,
)
logger.info(
"dual_engine_diagnosis_collected",
incident_id=incident.incident_id,
target=target,
signals_count=len(diagnosis_context.signals),
highest_severity=diagnosis_context.highest_severity.value,
)
except Exception as e:
logger.warning(
"dual_engine_diagnosis_failed",
incident_id=incident.incident_id,
error=str(e),
)
# 診斷收集失敗不影響主流程,繼續使用 expert_result
# Step 3: 準備 LLM 上下文 (含 Expert 診斷 + K8s/SignOz 診斷)
signals_dict = [s.model_dump() for s in incident.signals]
expert_context = {
"initial_diagnosis": expert_result.get("matched_rule"),
"diagnosis_description": expert_result.get("description"),
"suggested_diagnosis_commands": expert_result.get("diagnosis_commands", []),
"expert_confidence": expert_result.get("confidence"),
"requires_human_review": expert_result.get("human_review_required", False),
}
# 加入診斷上下文 (如果有)
if diagnosis_context:
expert_context["diagnosis_context"] = diagnosis_context.get_llm_prompt_context()
expert_context["diagnosis_signals"] = [s.to_dict() for s in diagnosis_context.signals]
# Step 4: LLM 分析 (帶上 Expert 上下文)
# LLM 非同步執行
try:
signals_dict = [s.model_dump() for s in incident.signals]
llm_result, provider, success = await self._openclaw.generate_incident_proposal(
incident_id=incident.incident_id,
severity=incident.severity.value,
signals=signals_dict,
affected_services=incident.affected_services,
expert_context=expert_context, # 傳遞 Expert 診斷上下文
)
if success and llm_result:
@@ -660,12 +469,10 @@ class DecisionManager:
"dual_engine_llm_win",
incident_id=incident.incident_id,
provider=provider,
expert_rule=expert_result.get("matched_rule"),
)
return {
**llm_result,
"source": f"llm_{provider}",
"expert_diagnosis": expert_result.get("matched_rule"),
}
except Exception as e:
@@ -673,23 +480,13 @@ class DecisionManager:
"dual_engine_llm_failed",
incident_id=incident.incident_id,
error=str(e),
expert_rule=expert_result.get("matched_rule"),
)
# Step 5: LLM 失敗,使用 Expert System 結果
# 但根據診斷結果調整回應
# LLM 失敗,使用 Expert System
logger.info(
"dual_engine_expert_fallback",
incident_id=incident.incident_id,
expert_rule=expert_result.get("matched_rule"),
human_review=expert_result.get("human_review_required", False),
)
# 如果 Expert 標記需人工介入,降低 confidence
if expert_result.get("human_review_required"):
expert_result["confidence"] = min(expert_result.get("confidence", 0.5), 0.5)
expert_result["description"] += " [LLM 分析失敗,建議人工確認]"
return expert_result
async def _try_playbook_match(

View File

@@ -157,13 +157,8 @@ class TelegramMessage:
else:
conf_emoji = "🔴"
# 自動生成事件編號 (2026-03-26 修復: 檢查是否已有 INC- 前綴)
if self.incident_id:
incident_id = self.incident_id
elif self.approval_id.upper().startswith("INC-"):
incident_id = self.approval_id.upper()
else:
incident_id = f"INC-{self.approval_id[:8].upper()}"
# 自動生成事件編號
incident_id = self.incident_id or f"INC-{self.approval_id[:8].upper()}"
# SignOz URL (優先使用動態 URL) - 必須 HTML 轉義防止解析錯誤
service_name = self.resource_name.split("-")[0] if "-" in self.resource_name else self.resource_name