fix: 3 個飛輪沉默未打通節點 — 統帥截圖盤出

統帥截圖證據 (Telegram MEDIUM 告警仍走人工審核):
INC-20260411-A03B2E / A2BB29 顯示「[規則匹配]」+ action=unknown-service

節點 1: AutoApprovePolicy 擋下規則匹配 (飛輪主因)
- ADR-073 規則匹配 confidence=0.0 (防偽造)
- AutoApprovePolicy.min_confidence=0.50 → 擋下
- 結果: MEDIUM 規則匹配永遠人工審核，飛輪不轉

修復: auto_approve.py 加 _is_rule_based 判斷 (is_rule_based / source=expert_system / rule_id / matched_rule)
→ bypass min_confidence 檢查
→ 驗證: should_auto_approve=True ✅

節點 2: _is_bad_target 漏 unknown-service magic string
- _resolve_target_from_k8s fallback 產 unknown-service / unknown-pod
- GAP-A4 Phase 1/2 只擋 'unknown' 而非前綴

修復: alert_rule_engine.py 加 unknown-/none-/null-/undefined- 前綴黑名單
→ 驗證: 4 個 magic 全 bad ✅

節點 3: stale_ready_tokens_resend 無時效過濾
- 截圖是 2026-04-11 (4 天前) 告警
- 舊 labels 過期，重 process 也產不出新 target
- 壓爆 Ollama + 污染 Telegram 卡片

修復: decision_manager.py 跳過 > 3 天的 stale incident
→ skip + log stale_ready_token_skipped_too_old

回歸: 113/113

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-15 10:56:48 +08:00
parent e3d7c92100
commit 6c7f648b60
3 changed files with 38 additions and 1 deletions
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -143,6 +143,12 @@ def _is_bad_target(target: str, alertname: str) -> bool:
    """
    if not target or target in ("unknown", "none", "null", ""):
        return True
+    # 2026-04-15 Claude Sonnet 4.6 (GAP-A4 Phase 3): 擴充 fallback magic string
+    # 截圖實證：Telegram 卡顯示 target=unknown-service 通過 _is_bad_target
+    # _resolve_target_from_k8s fallback 也會產 unknown-pod/unknown-container
+    _BAD_MAGIC_PREFIXES = ("unknown-", "none-", "null-", "undefined-")
+    if any(target.startswith(p) for p in _BAD_MAGIC_PREFIXES):
+        return True
    if target == alertname:
        return True
    if any(c in target for c in (" ", ":", "(", ")", '"', "'", "<", ">", "{", "}")):
--- a/apps/api/src/services/auto_approve.py
+++ b/apps/api/src/services/auto_approve.py
@@ -298,7 +298,17 @@ class AutoApprovePolicy:
            )

        # 條件 4: AI 信心度
-        if confidence < self.config.min_confidence:
+        # 2026-04-15 Claude Sonnet 4.6 (飛輪沉默節點 1 修復):
+        # 規則匹配的 confidence 固定 0.0（ADR-073 防偽造），會被此條件擋下
+        # 但 YAML 規則是人工審核過的，應直接信任 → bypass min_confidence
+        # 改用「Playbook 成功率」或「規則 source」判斷可信度
+        _is_rule_based = (
+            proposal_data.get("is_rule_based") is True
+            or proposal_data.get("source") == "expert_system"
+            or (proposal_data.get("rule_id") or "") != ""
+            or (proposal_data.get("matched_rule") or "") != ""
+        )
+        if not _is_rule_based and confidence < self.config.min_confidence:
            return self._reject(
                reason=AutoApproveReason.LOW_TRUST,
                detail=f"Confidence {confidence:.0%} < {self.config.min_confidence:.0%}",
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -2150,6 +2150,27 @@ class DecisionManager:
                        if str(getattr(incident, "status", "")).lower() in ("resolved", "closed"):
                            continue

+                        # 2026-04-15 Claude Sonnet 4.6 (節點 3 修復):
+                        # 跳過 > 3 天的 stale incident — labels 已過時，重 process 無意義
+                        # 只會壓爆 Ollama + 污染 Telegram 卡片（截圖：4/11 的卡片今天還在彈）
+                        _STALE_DAYS = 3
+                        _created_at = getattr(incident, "created_at", None)
+                        if _created_at:
+                            from datetime import datetime as _dt, timedelta as _td, timezone as _tz
+                            _now = _dt.now(_tz.utc)
+                            _cutoff = _now - _td(days=_STALE_DAYS)
+                            # 確保 _created_at 有時區
+                            if _created_at.tzinfo is None:
+                                _created_at = _created_at.replace(tzinfo=_tz.utc)
+                            if _created_at < _cutoff:
+                                logger.debug(
+                                    "stale_ready_token_skipped_too_old",
+                                    incident_id=incident_id,
+                                    age_days=(_now - _created_at).days,
+                                    cutoff_days=_STALE_DAYS,
+                                )
+                                continue
+
                        proposal_data = data.get("proposal_data") or {}
                        if not proposal_data:
                            continue