From f6b698c873fa735e3be026f0ab5d0169bbdf8742 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 4 May 2026 14:31:53 +0800 Subject: [PATCH] =?UTF-8?q?fix(aiops):=20Critic=20=E4=BF=AE=E5=BE=A9=20?= =?UTF-8?q?=E2=80=94=20PromQL=20=E6=B3=A8=E5=85=A5=E9=98=B2=E7=B7=9A=20+?= =?UTF-8?q?=20flag=3DFalse=20escalation=20bug=20+=20=E8=A8=88=E6=95=B8?= =?UTF-8?q?=E8=99=9B=E5=A0=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 1 (drift.py): DRIFT_AUTO_ADOPT_ENABLED=false 時仍設 auto_block_reason → 導致 escalation 被觸發,把「停用」誤判為「阻擋事故」 修法: flag=False 不設 auto_block_reason,視為靜默停用 Bug 2 (coverage_evaluator_job.py): asset name/host/namespace/ip 直接 f-string 進 PromQL,無白名單驗證 → 髒資料可生成語意污染規則或讓 Prometheus reload 失敗 修法: 加 _safe_label_val 正規表達式白名單(^[a-zA-Z0-9._\-]+$), 不合法直接 skip + debug log Bug 3 (coverage_evaluator_job.py): ON CONFLICT DO NOTHING 衝突時 created 仍 +1 → stats["rules_auto_created"] 計數虛高,Redis 冷卻被誤設 修法: 改用 INSERT ... RETURNING rule_name,fetchone() 確認實際插入才計數和設冷卻 附加: Redis RuntimeError 單獨 catch + log(不再靜默 pass) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/drift.py | 5 +- apps/api/src/jobs/coverage_evaluator_job.py | 59 +++++++++++++-------- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/apps/api/src/api/v1/drift.py b/apps/api/src/api/v1/drift.py index 219317b8..ad8b7be2 100644 --- a/apps/api/src/api/v1/drift.py +++ b/apps/api/src/api/v1/drift.py @@ -186,9 +186,8 @@ async def _analyze_and_notify(report: DriftReport) -> None: auto_adopted = False auto_block_reason = "" from src.core.config import get_settings as _gs - _drift_auto_enabled = getattr(_gs(), "DRIFT_AUTO_ADOPT_ENABLED", True) - if not _drift_auto_enabled: - auto_block_reason = "DRIFT_AUTO_ADOPT_ENABLED=false,功能已停用" + _drift_auto_enabled = _gs().DRIFT_AUTO_ADOPT_ENABLED + # flag=False 視為「停用」,不設 auto_block_reason 避免誤觸 escalation try: if _drift_auto_enabled: adopt_svc = get_drift_adopt_service() diff --git a/apps/api/src/jobs/coverage_evaluator_job.py b/apps/api/src/jobs/coverage_evaluator_job.py index 87e74fb7..e08011e3 100644 --- a/apps/api/src/jobs/coverage_evaluator_job.py +++ b/apps/api/src/jobs/coverage_evaluator_job.py @@ -808,6 +808,9 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: ) assets = rows.fetchall() + # PromQL 值安全性:只允許合法 hostname/IP/k8s name 字元,防止 PromQL 語意污染 + _safe_label_val = re.compile(r'^[a-zA-Z0-9._\-]+$') + for asset in assets: asset_key = str(asset.asset_key or "") asset_type = str(asset.asset_type or "") @@ -825,26 +828,34 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: if already: logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key) continue + except RuntimeError as e: + logger.warning("coverage_auto_rule_redis_unavailable", asset_key=asset_key, error=str(e)) except Exception: - pass # Redis 不可用,繼續建規則 + pass - # 建立 PromQL 規則 + # 建立 PromQL 規則(所有代入值必須通過白名單驗證) safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60] if asset_type == "host": ip_for_match = internal_ip or host - if not ip_for_match: + if not ip_for_match or not _safe_label_val.match(ip_for_match): + logger.debug("coverage_auto_rule_skip_unsafe_ip", asset_key=asset_key, ip=ip_for_match) continue rule_name = f"CoverageAuto_HostDown_{safe_key}" expr = f'up{{instance=~"{ip_for_match}:.*"}} == 0' severity = "warning" - labels = {"host": host or ip_for_match, "layer": "infrastructure", "source": "coverage_auto"} + display_host = host if _safe_label_val.match(host) else ip_for_match + labels = {"host": display_host, "layer": "infrastructure", "source": "coverage_auto"} annotations = { - "summary": f"主機 {host or ip_for_match} 無 Prometheus 探測響應", + "summary": f"主機 {display_host} 無 Prometheus 探測響應", "description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve", } duration_seconds = 120 elif asset_type == "k8s_workload": - if not name: + if not name or not _safe_label_val.match(name): + logger.debug("coverage_auto_rule_skip_unsafe_name", asset_key=asset_key, name=name) + continue + if namespace and not _safe_label_val.match(namespace): + logger.debug("coverage_auto_rule_skip_unsafe_ns", asset_key=asset_key, namespace=namespace) continue rule_name = f"CoverageAuto_WorkloadDown_{safe_key}" ns_selector = f',namespace="{namespace}"' if namespace else "" @@ -860,9 +871,10 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: continue # UPSERT 進 alert_rule_catalog(source='ai_generated') + # 用 RETURNING 判斷是否實際插入(ON CONFLICT DO NOTHING 衝突時無 RETURNING row) try: async with get_db_context() as db: - await db.execute( + row = await db.execute( _sql(""" INSERT INTO alert_rule_catalog ( rule_name, source, expr, duration_seconds, @@ -876,6 +888,7 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: NOW(), NOW() ) ON CONFLICT (rule_name) DO NOTHING + RETURNING rule_name """), { "rname": rule_name[:200], @@ -886,21 +899,25 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: "ann": _j.dumps(annotations, ensure_ascii=False), }, ) - created += 1 - logger.info( - "coverage_auto_rule_created", - rule_name=rule_name, - asset_key=asset_key, - asset_type=asset_type, - ) + actually_inserted = row.fetchone() is not None - # 設置 Redis 冷卻 - try: - from src.core.redis_client import get_redis - redis = get_redis() - await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC) - except Exception: - pass + if actually_inserted: + created += 1 + logger.info( + "coverage_auto_rule_created", + rule_name=rule_name, + asset_key=asset_key, + asset_type=asset_type, + ) + # 設置 Redis 冷卻(僅實際插入才設) + try: + from src.core.redis_client import get_redis + redis = get_redis() + await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC) + except Exception: + pass + else: + logger.debug("coverage_auto_rule_conflict_skip", rule_name=rule_name) except Exception as e: logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e))