feat(coverage_evaluator): 擴充 4 維 — playbook/remediation/rule_matching/rule_creation
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Review 盲點: coverage 7 維中原只實作 3 維 (monitoring/alerting/km),其餘 4 維永遠 unknown
v2 擴充:
+ auto_playbook: asset.name 出現在 playbooks.symptom_pattern/description (approved 狀態) → green
沒對應 playbook 但 type='k8s_workload' → yellow
+ auto_remediation: 過去 30d remediation_events.target_resource ILIKE asset.name → green
沒 target 但 k8s_workload/container → red (應有修復能力但沒)
+ auto_rule_matching: 過去 30d incidents.affected_services ILIKE asset.name
或 incidents.alertname match alert_rule.labels.host/namespace → green
沒觸發 → yellow (可能沒問題也可能沒覆蓋)
+ auto_rule_creation: alert_rule_catalog source='ai_generated' match asset → green
目前全 yaml_hardcoded → 全 red (表示尚未由 AI 主動建規則)
未來 Hermes 產出 AI rule 後會變 green
解鎖: coverage 7 維完整 SLO KPI (MASTER §7.1)
- red count = 真正的治理缺口
- green ratio = 自動化成熟度
- AI 可主動推薦 red asset 的補覆蓋動作
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -64,9 +64,20 @@ async def run_coverage_evaluator_loop() -> None:
|
|||||||
|
|
||||||
|
|
||||||
async def evaluate_once() -> dict[str, int]:
|
async def evaluate_once() -> dict[str, int]:
|
||||||
"""針對最新 asset_discovery_run 的 coverage_snapshot 升級 status."""
|
"""針對最新 asset_discovery_run 的 coverage_snapshot 升級 status.
|
||||||
|
|
||||||
|
2026-04-19 v2 擴充 4 維 (原 3 維 monitoring/alerting/km):
|
||||||
|
+ auto_playbook: asset.name 出現在 playbooks.symptom_pattern 或 description
|
||||||
|
+ auto_remediation: remediation_events 過去 30d 有 target match asset.name
|
||||||
|
+ auto_rule_matching: incidents 過去 30d 有 asset match (alertname+affected_services)
|
||||||
|
+ auto_rule_creation: alert_rule_catalog source='ai_generated' 覆蓋 asset
|
||||||
|
"""
|
||||||
started_ms = _time.time()
|
started_ms = _time.time()
|
||||||
stats = {"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0}
|
stats = {
|
||||||
|
"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0,
|
||||||
|
"playbook_updated": 0, "remediation_updated": 0,
|
||||||
|
"rule_matching_updated": 0, "rule_creation_updated": 0,
|
||||||
|
}
|
||||||
error_msg: str | None = None
|
error_msg: str | None = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -75,15 +86,17 @@ async def evaluate_once() -> dict[str, int]:
|
|||||||
logger.info("coverage_evaluator_no_run_yet")
|
logger.info("coverage_evaluator_no_run_yet")
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
# 1. auto_monitoring: Prometheus targets
|
# 原 3 維
|
||||||
stats["monitoring_updated"] = await _evaluate_monitoring(run_id)
|
stats["monitoring_updated"] = await _evaluate_monitoring(run_id)
|
||||||
|
|
||||||
# 2. auto_alerting: alert_rule_catalog labels match
|
|
||||||
stats["alerting_updated"] = await _evaluate_alerting(run_id)
|
stats["alerting_updated"] = await _evaluate_alerting(run_id)
|
||||||
|
|
||||||
# 3. auto_km_creation: knowledge_entries 覆蓋
|
|
||||||
stats["km_updated"] = await _evaluate_km_coverage(run_id)
|
stats["km_updated"] = await _evaluate_km_coverage(run_id)
|
||||||
|
|
||||||
|
# v2 新增 4 維
|
||||||
|
stats["playbook_updated"] = await _evaluate_playbook_coverage(run_id)
|
||||||
|
stats["remediation_updated"] = await _evaluate_remediation_coverage(run_id)
|
||||||
|
stats["rule_matching_updated"] = await _evaluate_rule_matching_coverage(run_id)
|
||||||
|
stats["rule_creation_updated"] = await _evaluate_rule_creation_coverage(run_id)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"{type(e).__name__}: {e}"[:1000]
|
error_msg = f"{type(e).__name__}: {e}"[:1000]
|
||||||
logger.exception("coverage_evaluate_once_failed", error=error_msg)
|
logger.exception("coverage_evaluate_once_failed", error=error_msg)
|
||||||
@@ -291,6 +304,182 @@ async def _evaluate_km_coverage(run_id: str) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# v2 新增 4 維 evaluator
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def _evaluate_playbook_coverage(run_id: str) -> int:
    """Grade the ``auto_playbook`` dimension for the k8s workloads of one run.

    A snapshot row becomes ``green`` when the asset name occurs, ignoring
    case, in the description or in the text form of ``symptom_pattern`` of
    at least one playbook whose status is ``approved``; otherwise it becomes
    ``yellow``. Snapshot rows of other asset types are excluded by the WHERE
    clause and are therefore left untouched.

    Args:
        run_id: UUID (as a string) of the discovery run whose snapshot rows
            are graded.

    Returns:
        The row count reported for the UPDATE, or 0 when the statement
        failed (the failure is logged as a warning, not raised).
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context

    # strpos() over lower-cased text is a literal substring test. The former
    # ILIKE '%' || name || '%' pattern treated '_' and '%' inside an asset
    # name as wildcards, and an empty name matched every approved playbook,
    # both of which produced false 'green' results.
    statement = _sql("""
        UPDATE asset_coverage_snapshot cs
        SET coverage_status = CASE
                WHEN EXISTS (
                    SELECT 1 FROM playbooks pb
                    WHERE pb.status = 'approved'
                      AND ai.name <> ''
                      AND (strpos(lower(pb.description), lower(ai.name)) > 0
                           OR strpos(lower(pb.symptom_pattern::text), lower(ai.name)) > 0)
                ) THEN 'green'
                ELSE 'yellow'
            END,
            evidence = jsonb_build_object(
                'source', 'playbooks_symptom_pattern_or_description_match',
                'asset_name', ai.name
            )
        FROM asset_inventory ai
        WHERE cs.asset_id = ai.asset_id
          AND cs.run_id = CAST(:rid AS uuid)
          AND cs.dimension = 'auto_playbook'
          AND ai.asset_type = 'k8s_workload'
    """)

    try:
        async with get_db_context() as db:
            result = await db.execute(statement, {"rid": run_id})
            return result.rowcount or 0
    except Exception as e:
        # Best effort: one failing dimension must not abort the whole pass.
        logger.warning("evaluate_playbook_coverage_failed", error=str(e))
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def _evaluate_remediation_coverage(run_id: str) -> int:
    """Grade the ``auto_remediation`` dimension for workloads and containers.

    A snapshot row becomes ``green`` when a ``remediation_events`` row
    created within the last 30 days has a ``target_resource`` containing the
    asset name (case-insensitive); otherwise it becomes ``red``, meaning the
    asset is expected to have remediation but none was observed. Only assets
    of type ``k8s_workload`` or ``container`` are touched.

    Args:
        run_id: UUID (as a string) of the discovery run whose snapshot rows
            are graded.

    Returns:
        The row count reported for the UPDATE, or 0 when the statement
        failed (the failure is logged as a warning, not raised).
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context

    # Literal substring test instead of ILIKE: container names frequently
    # contain '_' (a single-character LIKE wildcard), and an empty name would
    # otherwise match every remediation event.
    statement = _sql("""
        UPDATE asset_coverage_snapshot cs
        SET coverage_status = CASE
                WHEN EXISTS (
                    SELECT 1 FROM remediation_events re
                    WHERE ai.name <> ''
                      AND strpos(lower(re.target_resource), lower(ai.name)) > 0
                      AND re.created_at > NOW() - INTERVAL '30 days'
                ) THEN 'green'
                ELSE 'red'
            END,
            evidence = jsonb_build_object(
                'source', 'remediation_events_target_match_30d',
                'asset_name', ai.name
            )
        FROM asset_inventory ai
        WHERE cs.asset_id = ai.asset_id
          AND cs.run_id = CAST(:rid AS uuid)
          AND cs.dimension = 'auto_remediation'
          AND ai.asset_type IN ('k8s_workload', 'container')
    """)

    try:
        async with get_db_context() as db:
            result = await db.execute(statement, {"rid": run_id})
            return result.rowcount or 0
    except Exception as e:
        # Best effort: one failing dimension must not abort the whole pass.
        logger.warning("evaluate_remediation_coverage_failed", error=str(e))
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def _evaluate_rule_matching_coverage(run_id: str) -> int:
    """Grade the ``auto_rule_matching`` dimension from recent incidents.

    A snapshot row becomes ``green`` when an incident created within the
    last 30 days relates to the asset through either of:

    * ``affected_services`` (as text) containing the asset name,
      case-insensitively; or
    * an ``alertname`` equal to an ``alert_rule_catalog.rule_name`` whose
      ``host`` label equals the asset host or whose ``namespace`` label
      equals the asset namespace.

    Otherwise the row becomes ``yellow``: no incident may mean either a
    healthy asset or missing coverage. Only ``host``, ``k8s_workload`` and
    ``container`` assets are touched.

    Args:
        run_id: UUID (as a string) of the discovery run whose snapshot rows
            are graded.

    Returns:
        The row count reported for the UPDATE, or 0 when the statement
        failed (the failure is logged as a warning, not raised).
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context

    # The name test is a literal substring match (strpos) rather than ILIKE,
    # so '_' / '%' in an asset name are not wildcards and an empty name does
    # not match every incident.
    # NOTE(review): the namespace-label branch marks every asset in a
    # namespace green once any rule for that namespace fired — confirm this
    # granularity is intended.
    statement = _sql("""
        UPDATE asset_coverage_snapshot cs
        SET coverage_status = CASE
                WHEN EXISTS (
                    SELECT 1 FROM incidents i
                    WHERE i.created_at > NOW() - INTERVAL '30 days'
                      AND (
                          (ai.name <> ''
                           AND strpos(lower(i.affected_services::text), lower(ai.name)) > 0)
                          OR (i.alertname IS NOT NULL AND EXISTS (
                              SELECT 1 FROM alert_rule_catalog arc
                              WHERE arc.rule_name = i.alertname
                                AND (arc.labels->>'host' = ai.host
                                     OR arc.labels->>'namespace' = ai.namespace)
                          ))
                      )
                ) THEN 'green'
                ELSE 'yellow'
            END,
            evidence = jsonb_build_object(
                'source', 'incidents_match_30d',
                'asset_name', ai.name
            )
        FROM asset_inventory ai
        WHERE cs.asset_id = ai.asset_id
          AND cs.run_id = CAST(:rid AS uuid)
          AND cs.dimension = 'auto_rule_matching'
          AND ai.asset_type IN ('host', 'k8s_workload', 'container')
    """)

    try:
        async with get_db_context() as db:
            result = await db.execute(statement, {"rid": run_id})
            return result.rowcount or 0
    except Exception as e:
        # Best effort: one failing dimension must not abort the whole pass.
        logger.warning("evaluate_rule_matching_coverage_failed", error=str(e))
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def _evaluate_rule_creation_coverage(run_id: str) -> int:
    """Grade the ``auto_rule_creation`` dimension from AI-generated rules.

    A snapshot row becomes ``green`` when ``alert_rule_catalog`` holds a rule
    with ``source = 'ai_generated'`` whose ``host`` label equals the asset
    host or whose ``namespace`` label equals the asset namespace; otherwise
    it becomes ``red``. Only ``host``, ``k8s_workload`` and ``container``
    assets are touched.

    The explanatory ``note`` in the evidence is attached to ``red`` rows
    only; writing it on ``green`` rows (as before) contradicted the status.

    Args:
        run_id: UUID (as a string) of the discovery run whose snapshot rows
            are graded.

    Returns:
        The row count reported for the UPDATE, or 0 when the statement
        failed (the failure is logged as a warning, not raised).
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context

    # The derived table computes the match once per asset so that the status
    # and the evidence are guaranteed to agree with each other.
    statement = _sql("""
        UPDATE asset_coverage_snapshot cs
        SET coverage_status = CASE WHEN m.ai_covered THEN 'green' ELSE 'red' END,
            evidence = jsonb_build_object(
                'source', 'alert_rule_catalog_ai_generated_match',
                'asset_name', m.name
            ) || CASE
                WHEN m.ai_covered THEN '{}'::jsonb
                ELSE jsonb_build_object(
                    'note', 'AI 自主建規則尚未啟用,後續 Hermes 產出後此欄變 green'
                )
            END
        FROM (
            SELECT ai.asset_id,
                   ai.name,
                   EXISTS (
                       SELECT 1 FROM alert_rule_catalog arc
                       WHERE arc.source = 'ai_generated'
                         AND (arc.labels->>'host' = ai.host
                              OR arc.labels->>'namespace' = ai.namespace)
                   ) AS ai_covered
            FROM asset_inventory ai
            WHERE ai.asset_type IN ('host', 'k8s_workload', 'container')
        ) m
        WHERE cs.asset_id = m.asset_id
          AND cs.run_id = CAST(:rid AS uuid)
          AND cs.dimension = 'auto_rule_creation'
    """)

    try:
        async with get_db_context() as db:
            result = await db.execute(statement, {"rid": run_id})
            return result.rowcount or 0
    except Exception as e:
        # Best effort: one failing dimension must not abort the whole pass.
        logger.warning("evaluate_rule_creation_coverage_failed", error=str(e))
        return 0
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# AOL
|
# AOL
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user