fix(aiops): ADR-072 P1 Bug 修復 — BUG-004/005/006
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
BUG-004 KM vectorization 108/112 = False: km_conversion_service: KM entry 建立後(embedding 已背景觸發), 補寫 incidents.vectorized = True,飛輪閉環(ADR-068)學習指標正常 BUG-005 15 ready decisions 無人審核: decision_manager: 新增 resend_stale_ready_tokens(), 掃描 Redis decision:* key,找出 state=ready 且 dedup_key 過期的 token, 重新推送 Telegram 審核卡片 main.py: lifespan startup 排程 resend_stale_ready_tokens()(asyncio.create_task 非阻塞) BUG-006 outcome/verification_result 全 null: _push_auto_repair_result: Telegram 推送前先寫入 incidents.outcome + incidents.verification_result 到 DB 2026-04-11 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -307,6 +307,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
await init_signal_worker()
|
||||
logger.info("signal_worker_initialized")
|
||||
|
||||
# BUG-005 修復 2026-04-11: 啟動時掃描 Redis 中所有 state=ready 但未送 Telegram 的 token
|
||||
# dedup TTL 10 分鐘過期後,ready decisions 就沒有補送機制 → 長期卡在 ready 無人審核
|
||||
try:
|
||||
from src.services.decision_manager import get_decision_manager
|
||||
asyncio.create_task(get_decision_manager().resend_stale_ready_tokens())
|
||||
logger.info("stale_ready_tokens_resend_scheduled")
|
||||
except Exception as e:
|
||||
logger.warning("stale_ready_tokens_resend_schedule_failed", error=str(e))
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
|
||||
@@ -714,6 +714,29 @@ async def _push_auto_repair_result(
|
||||
f"└ 錯誤: {error[:100] if error else '未知錯誤'}"
|
||||
)
|
||||
|
||||
# BUG-006 修復 2026-04-11: outcome + verification_result 全為 null
|
||||
# 原因:_push_auto_repair_result 只送 Telegram,沒寫 DB
|
||||
# 修復:寫入 incidents 表 outcome/verification_result 欄位
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord
|
||||
from sqlalchemy import update as _upd_outcome
|
||||
_outcome = "auto_repaired" if success else "auto_repair_failed"
|
||||
_verification = (
|
||||
f"自動修復{'成功' if success else '失敗'}:{action[:120] if action else '未知'}"
|
||||
+ (f" | 錯誤:{error[:80]}" if error else "")
|
||||
)
|
||||
async with get_db_context() as _odb:
|
||||
await _odb.execute(
|
||||
_upd_outcome(IncidentRecord)
|
||||
.where(IncidentRecord.incident_id == inc_id)
|
||||
.values(outcome=_outcome, verification_result=_verification)
|
||||
)
|
||||
await _odb.commit()
|
||||
logger.info("outcome_written", incident_id=inc_id, outcome=_outcome)
|
||||
except Exception as _oe:
|
||||
logger.warning("outcome_write_failed", incident_id=inc_id, error=str(_oe))
|
||||
|
||||
# 優先: reply 原告警訊息並換掉按鈕
|
||||
appended = await gateway.append_incident_update(
|
||||
incident_id=inc_id,
|
||||
@@ -1664,6 +1687,75 @@ class DecisionManager:
|
||||
await self._save_token(token)
|
||||
return token
|
||||
|
||||
async def resend_stale_ready_tokens(self) -> int:
    """
    Re-push Telegram review cards for decision tokens stuck in READY.

    BUG-005 fix (2026-04-11): scans every Redis ``decision:*`` key, keeps the
    tokens whose ``state`` is READY and whose ``telegram_sent:{incident_id}``
    dedup key no longer exists, and schedules a new Telegram push for each
    incident that is still open.

    Intended triggers: API startup (lifespan) and a manual admin call.

    The whole scan is best-effort: a failure on one token is logged and the
    scan moves on; a failure of the scan itself is logged and the count
    gathered so far is returned. Nothing is raised to the caller.

    Returns:
        Number of incidents for which a push was scheduled. The pushes are
        fire-and-forget, so this is not a delivery count.
    """
    import json as _json

    from src.core.redis_client import get_redis
    from src.db.base import get_db_context
    from src.repositories.incident_repository import IncidentDBRepository

    redis = get_redis()
    resent = 0
    # Incidents already pushed during this scan. SCAN may return the same key
    # more than once and several tokens may reference one incident; because the
    # push runs in the background, the Redis dedup key cannot be relied on to
    # suppress a second card within the same scan.
    pushed_incident_ids: set[str] = set()

    try:
        cursor = 0
        while True:
            cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
            for key in keys:
                try:
                    raw = await redis.get(key)
                    if not raw:
                        continue

                    data = _json.loads(raw)
                    if not isinstance(data, dict):
                        continue
                    if data.get("state") != DecisionState.READY.value:
                        continue

                    incident_id = data.get("incident_id", "")
                    # No incident to look up, or already handled in this scan.
                    if not incident_id or incident_id in pushed_incident_ids:
                        continue

                    dedup_key = f"telegram_sent:{incident_id}"
                    if await redis.exists(dedup_key):
                        continue  # dedup window still active, card was sent recently

                    # Load the incident to make sure it still needs a review.
                    async with get_db_context() as _db:
                        incident = await IncidentDBRepository(_db).get_by_id(incident_id)
                    if not incident:
                        continue

                    # Accept both a plain string and an Enum member for status:
                    # str() of an Enum member is "Class.MEMBER", which would
                    # never equal "resolved"/"closed".
                    status = getattr(incident, "status", "")
                    status_text = str(getattr(status, "value", status)).lower()
                    if status_text in ("resolved", "closed"):
                        continue

                    proposal_data = data.get("proposal_data") or {}
                    if not proposal_data:
                        continue

                    _fire_and_forget(
                        _push_decision_to_telegram(incident, proposal_data)
                    )
                    pushed_incident_ids.add(incident_id)
                    resent += 1
                    logger.info(
                        "stale_ready_token_resent",
                        incident_id=incident_id,
                        token=data.get("token", ""),
                    )
                except Exception as _te:
                    # Best-effort per token: one bad entry must not stop the scan.
                    logger.debug(
                        "stale_ready_token_scan_error",
                        key=str(key),
                        error=str(_te),
                    )

            # SCAN signals completion by returning cursor 0.
            if cursor == 0:
                break

    except Exception as e:
        logger.warning("resend_stale_ready_tokens_failed", error=str(e))

    logger.info("stale_ready_tokens_scan_done", resent=resent)
    return resent
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
|
||||
@@ -168,6 +168,34 @@ class KMConversionService:
|
||||
except Exception as _e:
|
||||
logger.warning("km_op_log_failed", incident_id=incident.incident_id, error=str(_e))
|
||||
|
||||
# BUG-004 修復 2026-04-11: KM entry 建立後,knowledge_service 背景觸發 embedding,
|
||||
# 但 incidents.vectorized 沒有被設為 True → 飛輪閉環(ADR-068)學習效果歸零。
|
||||
# 此處不等待 embedding 背景任務完成,KM entry 建立後立即更新 incidents.vectorized = True。
|
||||
# 注意:embedding 為背景 asyncio task,此處標記 vectorized=True 代表「已觸發向量化」
|
||||
# 真正完成以 knowledge_embedding_saved log 為準,但 vectorized flag 用於篩選補轉換
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord
|
||||
from sqlalchemy import update as _sa_update
|
||||
async with get_db_context() as _db:
|
||||
await _db.execute(
|
||||
_sa_update(IncidentRecord)
|
||||
.where(IncidentRecord.incident_id == incident.incident_id)
|
||||
.values(vectorized=True)
|
||||
)
|
||||
await _db.commit()
|
||||
logger.info(
|
||||
"km_incident_vectorized_flagged",
|
||||
incident_id=incident.incident_id,
|
||||
km_entry_id=km_entry.entry_id,
|
||||
)
|
||||
except Exception as _ve:
|
||||
logger.warning(
|
||||
"km_vectorized_flag_failed",
|
||||
incident_id=incident.incident_id,
|
||||
error=str(_ve),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"km_converted",
|
||||
incident_id=incident.incident_id,
|
||||
|
||||
Reference in New Issue
Block a user