fix(aiops): ADR-072 P1 Bug 修復 — BUG-004/005/006
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

BUG-004 KM vectorization 108/112 = False:
  km_conversion_service: KM entry 建立後(embedding 已背景觸發),
  補寫 incidents.vectorized = True,飛輪閉環(ADR-068)學習指標正常

BUG-005 15 ready decisions 無人審核:
  decision_manager: 新增 resend_stale_ready_tokens(),
  掃描 Redis decision:* key,找出 state=ready 且 dedup_key 過期的 token,
  重新推送 Telegram 審核卡片
  main.py: lifespan startup 排程 resend_stale_ready_tokens()(asyncio.create_task 非阻塞)

BUG-006 outcome/verification_result 全 null:
  _push_auto_repair_result: Telegram 推送前先寫入
  incidents.outcome + incidents.verification_result 到 DB

2026-04-11 Claude Sonnet 4.6 Asia/Taipei

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 20:24:41 +08:00
parent 2185e1755c
commit 5aa0244c9a
3 changed files with 129 additions and 0 deletions

View File

@@ -307,6 +307,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
await init_signal_worker()
logger.info("signal_worker_initialized")
# BUG-005 修復 2026-04-11: 啟動時掃描 Redis 中所有 state=ready 但未送 Telegram 的 token
# dedup TTL 10 分鐘過期後,ready decisions 就沒有補送機制 → 長期卡在 ready 無人審核
try:
from src.services.decision_manager import get_decision_manager
asyncio.create_task(get_decision_manager().resend_stale_ready_tokens())
logger.info("stale_ready_tokens_resend_scheduled")
except Exception as e:
logger.warning("stale_ready_tokens_resend_schedule_failed", error=str(e))
yield
# Shutdown

View File

@@ -714,6 +714,29 @@ async def _push_auto_repair_result(
f"└ 錯誤: {error[:100] if error else '未知錯誤'}"
)
# BUG-006 修復 2026-04-11: outcome + verification_result 全為 null
# 原因:_push_auto_repair_result 只送 Telegram,沒寫 DB
# 修復:寫入 incidents 表 outcome/verification_result 欄位
try:
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from sqlalchemy import update as _upd_outcome
_outcome = "auto_repaired" if success else "auto_repair_failed"
_verification = (
f"自動修復{'成功' if success else '失敗'}{action[:120] if action else '未知'}"
+ (f" | 錯誤:{error[:80]}" if error else "")
)
async with get_db_context() as _odb:
await _odb.execute(
_upd_outcome(IncidentRecord)
.where(IncidentRecord.incident_id == inc_id)
.values(outcome=_outcome, verification_result=_verification)
)
await _odb.commit()
logger.info("outcome_written", incident_id=inc_id, outcome=_outcome)
except Exception as _oe:
logger.warning("outcome_write_failed", incident_id=inc_id, error=str(_oe))
# 優先: reply 原告警訊息並換掉按鈕
appended = await gateway.append_incident_update(
incident_id=inc_id,
@@ -1664,6 +1687,75 @@ class DecisionManager:
await self._save_token(token)
return token
async def resend_stale_ready_tokens(self) -> int:
    """Re-push Telegram review cards for stale READY decision tokens.

    BUG-005 fix (2026-04-11). Iterates every Redis key matching
    ``decision:*`` and, for each token that

    * has ``state == DecisionState.READY``,
    * carries a non-empty ``proposal_data`` payload,
    * has no live ``telegram_sent:{incident_id}`` dedup key, and
    * belongs to an incident that exists and is not resolved/closed,

    schedules ``_push_decision_to_telegram`` (fire-and-forget) so the
    review card is sent again.

    Each incident is pushed at most once per call. Redis SCAN may return
    the same key more than once, and several tokens may reference the same
    incident; because the push is only scheduled (not awaited), the Redis
    dedup key cannot be relied on to suppress duplicates within one scan.

    Triggered at API startup (lifespan) and by the manual admin API.

    Returns:
        Number of incidents for which a re-push was scheduled.
    """
    import json

    from src.core.redis_client import get_redis
    from src.db.base import get_db_context
    from src.repositories.incident_repository import IncidentDBRepository

    redis = get_redis()
    resent = 0
    # Incident ids already scheduled during this scan (in-run dedup).
    pushed_incidents = set()
    try:
        # Walk all decision:* keys with a cursor-based SCAN.
        cursor = 0
        while True:
            cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
            for key in keys:
                # Per-key isolation: one malformed entry must not abort the scan.
                try:
                    raw = await redis.get(key)
                    if not raw:
                        continue
                    data = json.loads(raw)
                    if data.get("state") != DecisionState.READY.value:
                        continue

                    incident_id = data.get("incident_id", "")
                    if not incident_id or incident_id in pushed_incidents:
                        continue

                    # Cheap payload check first, before any further I/O.
                    proposal_data = data.get("proposal_data") or {}
                    if not proposal_data:
                        continue

                    dedup_key = f"telegram_sent:{incident_id}"
                    if await redis.exists(dedup_key):
                        continue  # dedup window still active, skip

                    # Load the incident and make sure it is still open.
                    async with get_db_context() as _db:
                        incident = await IncidentDBRepository(_db).get_by_id(incident_id)
                        if not incident:
                            continue
                        status = str(getattr(incident, "status", "")).lower()
                        if status in ("resolved", "closed"):
                            continue

                    _fire_and_forget(
                        _push_decision_to_telegram(incident, proposal_data)
                    )
                    pushed_incidents.add(incident_id)
                    resent += 1
                    logger.info(
                        "stale_ready_token_resent",
                        incident_id=incident_id,
                        token=data.get("token", ""),
                    )
                except Exception as _te:
                    logger.debug("stale_ready_token_scan_error", error=str(_te))
            # SCAN signals completion by returning cursor 0.
            if cursor == 0:
                break
    except Exception as e:
        # Best-effort maintenance task: never propagate into startup.
        logger.warning("resend_stale_ready_tokens_failed", error=str(e))
    logger.info("stale_ready_tokens_scan_done", resent=resent)
    return resent
# =============================================================================
# Singleton

View File

@@ -168,6 +168,34 @@ class KMConversionService:
except Exception as _e:
logger.warning("km_op_log_failed", incident_id=incident.incident_id, error=str(_e))
# BUG-004 修復 2026-04-11: KM entry 建立後,knowledge_service 背景觸發 embedding,
# 但 incidents.vectorized 沒有被設為 True → 飛輪閉環(ADR-068)學習效果歸零。
# 等 embedding 背景任務啟動後(短延遲)更新 incidents.vectorized = True。
# 注意:embedding 為背景 asyncio task,此處標記 vectorized=True 代表「已觸發向量化」,
# 真正完成以 knowledge_embedding_saved log 為準,但 vectorized flag 用於篩選補轉換。
try:
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from sqlalchemy import update as _sa_update
async with get_db_context() as _db:
await _db.execute(
_sa_update(IncidentRecord)
.where(IncidentRecord.incident_id == incident.incident_id)
.values(vectorized=True)
)
await _db.commit()
logger.info(
"km_incident_vectorized_flagged",
incident_id=incident.incident_id,
km_entry_id=km_entry.entry_id,
)
except Exception as _ve:
logger.warning(
"km_vectorized_flag_failed",
incident_id=incident.incident_id,
error=str(_ve),
)
logger.info(
"km_converted",
incident_id=incident.incident_id,