diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index d5eeb5b8..14a1d58f 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -506,13 +506,23 @@ async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str return None # alertname → 關鍵字映射(主機層告警常見類型) + # I2 修復 2026-04-11: HostHighDiskUsage → HostOutOfDiskSpace(與 alerts-unified.yml 一致) + # DockerContainerUnhealthy/HostOutOfDiskSpace keywords=[] 走 fallback(找第一個非 infra pod) + # 並加 log 便於追蹤 fallback 路徑 _ALERTNAME_KEYWORDS: dict[str, list[str]] = { - "HostHighCpuLoad": ["api", "web"], - "HostOutOfMemory": ["api", "web"], + "HostHighCpuLoad": ["api", "web"], + "HostOutOfMemory": ["api", "web"], "DockerContainerUnhealthy": [], - "HostHighDiskUsage": [], + "DockerContainerExited": [], + "HostOutOfDiskSpace": [], } keywords = _ALERTNAME_KEYWORDS.get(alertname, []) + if not keywords and alertname in _ALERTNAME_KEYWORDS: + logger.debug( + "resolve_target_k8s_fallback_to_first_pod", + alertname=alertname, + reason="alertname 有對應但 keywords=[],走 fallback 取第一個非 infra pod", + ) import re as _re for line in pod_lines: @@ -1755,6 +1765,7 @@ class DecisionManager: Returns: 重新推送的 token 數量 """ + import json as _json from src.core.redis_client import get_redis from src.db.base import get_db_context from src.repositories.incident_repository import IncidentDBRepository @@ -1772,7 +1783,6 @@ class DecisionManager: raw = await redis.get(key) if not raw: continue - import json as _json data = _json.loads(raw) if data.get("state") != DecisionState.READY.value: continue diff --git a/apps/api/src/services/drift_interpreter.py b/apps/api/src/services/drift_interpreter.py index f4275fdc..802e568c 100644 --- a/apps/api/src/services/drift_interpreter.py +++ b/apps/api/src/services/drift_interpreter.py @@ -106,8 +106,12 @@ class NemotronDriftInterpreter: 改用 Ollama httpx 直接呼叫,繞過 nvidia_provider,與 drift_narrator_service 一致 """ import httpx + from src.core.config import get_settings - OLLAMA_URL = "http://192.168.0.111:11434" + # C1 修復 2026-04-11: 禁止寫死內網 IP(feedback_frontend_internal_ip_ban 鐵律) + # 改從 settings.OLLAMA_URL 讀取(已有此設定,default=http://192.168.0.111:11434) + _settings = get_settings() + OLLAMA_URL = getattr(_settings, "OLLAMA_URL", "http://192.168.0.111:11434") MODEL = "qwen2.5:7b-instruct" TIMEOUT = 45.0 diff --git a/apps/api/src/services/km_conversion_service.py b/apps/api/src/services/km_conversion_service.py index 99a65805..cd5b3808 100644 --- a/apps/api/src/services/km_conversion_service.py +++ b/apps/api/src/services/km_conversion_service.py @@ -196,6 +196,33 @@ class KMConversionService: error=str(_ve), ) + # C2 修復 2026-04-11: DB 更新後,同步更新 Redis Working Memory 中的 vectorized 欄位 + # 審計查 Redis Incident 物件,若不同步則審計仍顯示 vectorized=False + # Key 格式: incident:{incident_id}(與 incident_service.save_to_working_memory 一致) + try: + import json as _json + from src.core.redis_client import get_redis + _redis = get_redis() + _redis_key = f"incident:{incident.incident_id}" + _raw = await _redis.get(_redis_key) + if _raw: + _data = _json.loads(_raw) + if not _data.get("vectorized"): + _data["vectorized"] = True + _ttl = await _redis.ttl(_redis_key) + _ex = _ttl if _ttl and _ttl > 0 else 604800 + await _redis.set(_redis_key, _json.dumps(_data), ex=_ex) + logger.info( + "km_incident_vectorized_redis_synced", + incident_id=incident.incident_id, + ) + except Exception as _re: + logger.debug( + "km_vectorized_redis_sync_failed", + incident_id=incident.incident_id, + error=str(_re), + ) + logger.info( "km_converted", incident_id=incident.incident_id, diff --git a/docs/adr/ADR-072-aiops-bug-fixes.md b/docs/adr/ADR-072-aiops-bug-fixes.md index f3ac2f6c..911a34dc 100644 --- a/docs/adr/ADR-072-aiops-bug-fixes.md +++ b/docs/adr/ADR-072-aiops-bug-fixes.md @@ -96,8 +96,30 @@ BUG-001 (drift_interpreter) → BUG-002 (deployment_name) → BUG-003 (nemotron_ | BUG-001 | K8s config drift 告警 → Telegram 出現可讀摘要(非錯誤訊息)| | BUG-002 | HostHighCpuLoad 告警 → DecisionToken status = completed(非 error)| | BUG-003 | 無效 deployment name → nemotron 拒絕並重新分析 | -| BUG-004 | 新 Incident 完成後 vectorized = True + persisted_to_pg = True | +| BUG-004 | 新 Incident 完成後 Redis + DB 兩處 vectorized = True(C2 修復:同步 Redis)| | BUG-005 | ready decisions → Telegram 發出審核卡片 | | BUG-006 | 修復完成後 outcome + verification_result 不為 null | -| BUG-007 | 新告警 severity label 正確 → P0/P1/P2 分級出現 | +| BUG-007 | 確認不需修:alerts-unified.yml 全 42 規則均有 severity label | | BUG-008 | HostHighCpuLoad → incident_type = "host_cpu"(非 "custom")| + +--- + +## Code Review 發現(2026-04-11 首席架構師審查) + +### 已修復(C1/C2/I2/M2) + +| 項目 | 問題 | 修復 | +|------|------|------| +| C1 | `drift_interpreter.py` 寫死內網 IP `192.168.0.111` | 改從 `settings.OLLAMA_URL` 讀取 | +| C2 | BUG-004 只更新 DB,Redis Working Memory `vectorized` 未同步 | 補 Redis JSON patch 同步 | +| I2 | `_ALERTNAME_KEYWORDS` 用 `HostHighDiskUsage`(與 alerts-unified.yml 不符) | 改為 `HostOutOfDiskSpace` + 補 `DockerContainerExited` + fallback log | +| M2 | `import json as _json` 在 for 迴圈體內 | 移至方法頂部 | + +### 已記錄技術債(不阻塞合併) + +| 項目 | 說明 | 後續 | +|------|------|------| +| I1 | BUG-008 規格說整合 ADR-064 Rule Engine,實際改用靜態 dict 擴充 | 下 Sprint 整合 ADR-064 YAML 規則動態推斷;靜態 dict 為可接受中間狀態 | +| I3 | `resend_stale_ready_tokens()` 直接 Redis SCAN + DB 存取,輕微違反積木化 | Phase R 清理;decision_manager 全域已有類似模式,不單獨修 | +| I4 | BUG-006 outcome 寫入非 atomic | 已足夠改善(從「永遠 null」到「大部分有值」),完整 atomic 待 Phase R | +| M3 | `alertname_to_type` dict 應抽至 constants 模組 | 下 Sprint 與 ADR-064 整合時一起重構 |