fix(aiops): Critic 修復 — PromQL 注入防線 + flag=False escalation bug + 計數虛報
All checks were successful
Code Review / ai-code-review (push) Successful in 53s
All checks were successful
Code Review / ai-code-review (push) Successful in 53s
Bug 1 (drift.py): DRIFT_AUTO_ADOPT_ENABLED=false 時仍設 auto_block_reason
→ 導致 escalation 被觸發,把「停用」誤判為「阻擋事故」
修法: flag=False 不設 auto_block_reason,視為靜默停用
Bug 2 (coverage_evaluator_job.py): asset name/host/namespace/ip 直接 f-string
進 PromQL,無白名單驗證
→ 髒資料可生成語意污染規則或讓 Prometheus reload 失敗
修法: 加 _safe_label_val 正規表達式白名單(^[a-zA-Z0-9._\-]+$),
ip/name/namespace 不合法直接 skip + debug log;host 不合法時改以 ip 作為顯示值
Bug 3 (coverage_evaluator_job.py): ON CONFLICT DO NOTHING 衝突時 created 仍 +1
→ stats["rules_auto_created"] 計數虛高,Redis 冷卻被誤設
修法: 改用 INSERT ... RETURNING rule_name,fetchone() 確認實際插入才計數和設冷卻
附加: 冷卻檢查的 Redis RuntimeError 單獨 catch + warning log(此例外不再靜默 pass;其餘 Exception 仍維持靜默略過,繼續建規則)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -186,9 +186,8 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
auto_adopted = False
|
||||
auto_block_reason = ""
|
||||
from src.core.config import get_settings as _gs
|
||||
_drift_auto_enabled = getattr(_gs(), "DRIFT_AUTO_ADOPT_ENABLED", True)
|
||||
if not _drift_auto_enabled:
|
||||
auto_block_reason = "DRIFT_AUTO_ADOPT_ENABLED=false,功能已停用"
|
||||
_drift_auto_enabled = _gs().DRIFT_AUTO_ADOPT_ENABLED
|
||||
# flag=False 視為「停用」,不設 auto_block_reason 避免誤觸 escalation
|
||||
try:
|
||||
if _drift_auto_enabled:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
|
||||
@@ -808,6 +808,9 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
|
||||
)
|
||||
assets = rows.fetchall()
|
||||
|
||||
# PromQL 值安全性:只允許合法 hostname/IP/k8s name 字元,防止 PromQL 語意污染
|
||||
_safe_label_val = re.compile(r'^[a-zA-Z0-9._\-]+$')
|
||||
|
||||
for asset in assets:
|
||||
asset_key = str(asset.asset_key or "")
|
||||
asset_type = str(asset.asset_type or "")
|
||||
@@ -825,26 +828,34 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
|
||||
if already:
|
||||
logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key)
|
||||
continue
|
||||
except RuntimeError as e:
|
||||
logger.warning("coverage_auto_rule_redis_unavailable", asset_key=asset_key, error=str(e))
|
||||
except Exception:
|
||||
pass # Redis 不可用,繼續建規則
|
||||
pass
|
||||
|
||||
# 建立 PromQL 規則
|
||||
# 建立 PromQL 規則(所有代入值必須通過白名單驗證)
|
||||
safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60]
|
||||
if asset_type == "host":
|
||||
ip_for_match = internal_ip or host
|
||||
if not ip_for_match:
|
||||
if not ip_for_match or not _safe_label_val.match(ip_for_match):
|
||||
logger.debug("coverage_auto_rule_skip_unsafe_ip", asset_key=asset_key, ip=ip_for_match)
|
||||
continue
|
||||
rule_name = f"CoverageAuto_HostDown_{safe_key}"
|
||||
expr = f'up{{instance=~"{ip_for_match}:.*"}} == 0'
|
||||
severity = "warning"
|
||||
labels = {"host": host or ip_for_match, "layer": "infrastructure", "source": "coverage_auto"}
|
||||
display_host = host if _safe_label_val.match(host) else ip_for_match
|
||||
labels = {"host": display_host, "layer": "infrastructure", "source": "coverage_auto"}
|
||||
annotations = {
|
||||
"summary": f"主機 {host or ip_for_match} 無 Prometheus 探測響應",
|
||||
"summary": f"主機 {display_host} 無 Prometheus 探測響應",
|
||||
"description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve",
|
||||
}
|
||||
duration_seconds = 120
|
||||
elif asset_type == "k8s_workload":
|
||||
if not name:
|
||||
if not name or not _safe_label_val.match(name):
|
||||
logger.debug("coverage_auto_rule_skip_unsafe_name", asset_key=asset_key, name=name)
|
||||
continue
|
||||
if namespace and not _safe_label_val.match(namespace):
|
||||
logger.debug("coverage_auto_rule_skip_unsafe_ns", asset_key=asset_key, namespace=namespace)
|
||||
continue
|
||||
rule_name = f"CoverageAuto_WorkloadDown_{safe_key}"
|
||||
ns_selector = f',namespace="{namespace}"' if namespace else ""
|
||||
@@ -860,9 +871,10 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
|
||||
continue
|
||||
|
||||
# UPSERT 進 alert_rule_catalog(source='ai_generated')
|
||||
# 用 RETURNING 判斷是否實際插入(ON CONFLICT DO NOTHING 衝突時無 RETURNING row)
|
||||
try:
|
||||
async with get_db_context() as db:
|
||||
await db.execute(
|
||||
row = await db.execute(
|
||||
_sql("""
|
||||
INSERT INTO alert_rule_catalog (
|
||||
rule_name, source, expr, duration_seconds,
|
||||
@@ -876,6 +888,7 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
|
||||
NOW(), NOW()
|
||||
)
|
||||
ON CONFLICT (rule_name) DO NOTHING
|
||||
RETURNING rule_name
|
||||
"""),
|
||||
{
|
||||
"rname": rule_name[:200],
|
||||
@@ -886,21 +899,25 @@ async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
|
||||
"ann": _j.dumps(annotations, ensure_ascii=False),
|
||||
},
|
||||
)
|
||||
created += 1
|
||||
logger.info(
|
||||
"coverage_auto_rule_created",
|
||||
rule_name=rule_name,
|
||||
asset_key=asset_key,
|
||||
asset_type=asset_type,
|
||||
)
|
||||
actually_inserted = row.fetchone() is not None
|
||||
|
||||
# 設置 Redis 冷卻
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC)
|
||||
except Exception:
|
||||
pass
|
||||
if actually_inserted:
|
||||
created += 1
|
||||
logger.info(
|
||||
"coverage_auto_rule_created",
|
||||
rule_name=rule_name,
|
||||
asset_key=asset_key,
|
||||
asset_type=asset_type,
|
||||
)
|
||||
# 設置 Redis 冷卻(僅實際插入才設)
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC)
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
logger.debug("coverage_auto_rule_conflict_skip", rule_name=rule_name)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e))
|
||||
|
||||
Reference in New Issue
Block a user