From 5aa0244c9a25ce9b8d849a8dbfe37cf12acbdff0 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 11 Apr 2026 20:24:41 +0800 Subject: [PATCH] =?UTF-8?q?fix(aiops):=20ADR-072=20P1=20Bug=20=E4=BF=AE?= =?UTF-8?q?=E5=BE=A9=20=E2=80=94=20BUG-004/005/006?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG-004 KM vectorization 108/112 = False: km_conversion_service: KM entry 建立後(embedding 已背景觸發), 補寫 incidents.vectorized = True,飛輪閉環(ADR-068)學習指標正常 BUG-005 15 ready decisions 無人審核: decision_manager: 新增 resend_stale_ready_tokens(), 掃描 Redis decision:* key,找出 state=ready 且 dedup_key 過期的 token, 重新推送 Telegram 審核卡片 main.py: lifespan startup 排程 resend_stale_ready_tokens()(asyncio.create_task 非阻塞) BUG-006 outcome/verification_result 全 null: _push_auto_repair_result: Telegram 推送前先寫入 incidents.outcome + incidents.verification_result 到 DB 2026-04-11 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/main.py | 9 ++ apps/api/src/services/decision_manager.py | 92 +++++++++++++++++++ .../api/src/services/km_conversion_service.py | 28 ++++++ 3 files changed, 129 insertions(+) diff --git a/apps/api/src/main.py b/apps/api/src/main.py index b344537e..1182d1af 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -307,6 +307,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: await init_signal_worker() logger.info("signal_worker_initialized") + # BUG-005 修復 2026-04-11: 啟動時掃描 Redis 中所有 state=ready 但未送 Telegram 的 token + # dedup TTL 10 分鐘過期後,ready decisions 就沒有補送機制 → 長期卡在 ready 無人審核 + try: + from src.services.decision_manager import get_decision_manager + asyncio.create_task(get_decision_manager().resend_stale_ready_tokens()) + logger.info("stale_ready_tokens_resend_scheduled") + except Exception as e: + logger.warning("stale_ready_tokens_resend_schedule_failed", error=str(e)) + yield # Shutdown diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 
async def resend_stale_ready_tokens(self) -> int:
    """Re-push Telegram review cards for decisions stuck in the READY state.

    BUG-005 fix (2026-04-11): scans every Redis ``decision:*`` key, keeps the
    tokens whose ``state`` is READY and whose ``telegram_sent:<incident_id>``
    dedup key no longer exists, and schedules a new Telegram push for each
    incident that is still open.

    Intended triggers: API startup (lifespan) and a manual admin call.

    Returns:
        The number of incidents for which a review card was re-scheduled.
        Each incident is counted (and pushed) at most once per call.

    Raises:
        Nothing. Per-key failures are logged and skipped; a failure of the
        scan itself is logged and the count gathered so far is returned.
    """
    import json

    from src.core.redis_client import get_redis
    from src.db.base import get_db_context
    from src.repositories.incident_repository import IncidentDBRepository

    redis = get_redis()
    resent = 0
    # SCAN may return the same key more than once, and several tokens can
    # reference one incident. The push is fire-and-forget, so the dedup key
    # cannot be relied on to exist yet; de-duplicate locally instead.
    resent_incident_ids = set()
    terminal_statuses = ("resolved", "closed")

    try:
        cursor = 0
        while True:
            cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
            for key in keys:
                try:
                    raw = await redis.get(key)
                    if not raw:
                        continue
                    data = json.loads(raw)
                    if not isinstance(data, dict):
                        continue
                    if data.get("state") != DecisionState.READY.value:
                        continue

                    incident_id = data.get("incident_id", "")
                    # A token without an incident id cannot be looked up.
                    if not incident_id or incident_id in resent_incident_ids:
                        continue

                    dedup_key = f"telegram_sent:{incident_id}"
                    if await redis.exists(dedup_key):
                        continue  # dedup key still alive: card was sent recently

                    # Cheap payload check first, so no DB query is spent on
                    # tokens that could not be pushed anyway.
                    proposal_data = data.get("proposal_data") or {}
                    if not proposal_data:
                        continue

                    # Load the incident to make sure it is still open.
                    async with get_db_context() as _db:
                        incident = await IncidentDBRepository(_db).get_by_id(incident_id)
                    if not incident:
                        continue

                    # str() of an Enum member is "Class.MEMBER", so compare on
                    # the member's value when there is one.
                    status = getattr(incident, "status", "")
                    status_text = str(getattr(status, "value", status)).lower()
                    if status_text in terminal_statuses:
                        continue

                    # NOTE(review): `incident` is used after its session is
                    # closed; confirm the push only reads already-loaded fields.
                    _fire_and_forget(
                        _push_decision_to_telegram(incident, proposal_data)
                    )
                    resent_incident_ids.add(incident_id)
                    resent += 1
                    logger.info(
                        "stale_ready_token_resent",
                        incident_id=incident_id,
                        token=data.get("token", ""),
                    )
                except Exception as _te:
                    # Best effort: one bad key must not abort the whole scan.
                    logger.debug(
                        "stale_ready_token_scan_error",
                        key=str(key),
                        error=str(_te),
                    )

            if cursor == 0:
                break

    except Exception as e:
        logger.warning("resend_stale_ready_tokens_failed", error=str(e))

    logger.info("stale_ready_tokens_scan_done", resent=resent)
    return resent
sqlalchemy import update as _sa_update + async with get_db_context() as _db: + await _db.execute( + _sa_update(IncidentRecord) + .where(IncidentRecord.incident_id == incident.incident_id) + .values(vectorized=True) + ) + await _db.commit() + logger.info( + "km_incident_vectorized_flagged", + incident_id=incident.incident_id, + km_entry_id=km_entry.entry_id, + ) + except Exception as _ve: + logger.warning( + "km_vectorized_flag_failed", + incident_id=incident.incident_id, + error=str(_ve), + ) + logger.info( "km_converted", incident_id=incident.incident_id,