fix(prod): 修復四個生產致命 bug — outcome 寫入 / OpenClaw / Telegram 通知 / LLM 規則顯示
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. decision_manager: 移除 UPDATE incidents 中的 verification_result 欄位
(incidents 表無此欄位 → 所有 outcome 寫入皆失敗並回報 outcome_write_failed)
2. failure_watcher: get_openclaw_service → get_openclaw
(函數名錯誤 → OpenClaw 分析全部 ImportError 崩潰)
3. failure_watcher: tg.send_message → tg.send_notification
(TelegramGateway 無 send_message 方法 → 修復通知無法送出)
4. decision_manager: expert_analyze 補齊 initial_diagnosis / diagnosis_description 兩個 key
(openclaw.py 讀取這兩個 key,但 expert_analyze 原本只回傳 matched_rule / description
→ LLM 永遠看到 Matched Rule=unknown,無法正確分析)
2026-04-15 ogt + Claude Sonnet 4.6(亞太): 生產緊急修復
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -842,15 +842,11 @@ async def _push_auto_repair_result(
|
||||
from src.db.models import IncidentRecord
|
||||
from sqlalchemy import update as _upd_outcome
|
||||
_outcome = "auto_repaired" if success else "auto_repair_failed"
|
||||
_verification = (
|
||||
f"自動修復{'成功' if success else '失敗'}:{action[:120] if action else '未知'}"
|
||||
+ (f" | 錯誤:{error[:80]}" if error else "")
|
||||
)
|
||||
async with get_db_context() as _odb:
|
||||
await _odb.execute(
|
||||
_upd_outcome(IncidentRecord)
|
||||
.where(IncidentRecord.incident_id == inc_id)
|
||||
.values(outcome=_outcome, verification_result=_verification)
|
||||
.values(outcome=_outcome)
|
||||
)
|
||||
await _odb.commit()
|
||||
logger.info("outcome_written", incident_id=inc_id, outcome=_outcome)
|
||||
@@ -1018,13 +1014,15 @@ def expert_analyze(incident: Incident) -> dict[str, Any]:
|
||||
"source": "expert_system",
|
||||
"action": rule["action"].format(target=target),
|
||||
"description": rule["description"],
|
||||
"diagnosis_description": rule["description"], # openclaw.py reads this key
|
||||
"risk_level": rule["risk_level"],
|
||||
"reasoning": f"[規則匹配] {rule['reasoning']}", # 明確標示來源
|
||||
"confidence": 0.0, # 🔴 規則匹配不是 AI 仲裁,信心度設 0
|
||||
"kubectl_command": rule["action"].format(target=target),
|
||||
"matched_rule": matched_rule,
|
||||
"initial_diagnosis": matched_rule, # openclaw.py reads this key
|
||||
"from_cache": False,
|
||||
"is_rule_based": True, # 新增標記
|
||||
"is_rule_based": True,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -569,9 +569,9 @@ class FailureWatcherService(IFailureWatcher):
|
||||
整合 SignOz 監控數據提供更精準的 RCA。
|
||||
"""
|
||||
try:
|
||||
from src.services.openclaw import get_openclaw_service
|
||||
from src.services.openclaw import get_openclaw
|
||||
|
||||
openclaw = get_openclaw_service()
|
||||
openclaw = get_openclaw()
|
||||
|
||||
# 建構告警上下文
|
||||
alert_context = {
|
||||
@@ -738,7 +738,7 @@ class FailureWatcherService(IFailureWatcher):
|
||||
f"└ 💡 建議: {analysis.get('suggested_repair', '需人工分析')}\n\n"
|
||||
f"請在 Dashboard 授權或使用 /repair {audit_log_id[:8]}"
|
||||
)
|
||||
await tg.send_message(message)
|
||||
await tg.send_notification(message)
|
||||
|
||||
logger.info(
|
||||
"repair_request_sent",
|
||||
@@ -770,7 +770,7 @@ class FailureWatcherService(IFailureWatcher):
|
||||
f"├ 📋 AuditLog: <code>{audit_log_id[:8]}...</code>\n"
|
||||
f"└ 📝 結果: {repair_result}"
|
||||
)
|
||||
await tg.send_message(message)
|
||||
await tg.send_notification(message)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
|
||||
Reference in New Issue
Block a user