diff --git a/apps/api/src/jobs/coverage_evaluator_job.py b/apps/api/src/jobs/coverage_evaluator_job.py
index 6a18ddec..b4b80c28 100644
--- a/apps/api/src/jobs/coverage_evaluator_job.py
+++ b/apps/api/src/jobs/coverage_evaluator_job.py
@@ -64,9 +64,20 @@ async def run_coverage_evaluator_loop() -> None:
 
 
 async def evaluate_once() -> dict[str, int]:
-    """針對最新 asset_discovery_run 的 coverage_snapshot 升級 status."""
+    """針對最新 asset_discovery_run 的 coverage_snapshot 升級 status.
+
+    2026-04-19 v2 擴充 4 維 (原 3 維 monitoring/alerting/km):
+      auto_playbook: asset.name 出現在 playbooks.symptom_pattern 或 description
+      auto_remediation: remediation_events 過去 30d 有 target match asset.name
+      auto_rule_matching: incidents 過去 30d 有 asset match (alertname+affected_services)
+      auto_rule_creation: alert_rule_catalog source='ai_generated' 覆蓋 asset
+    """
     started_ms = _time.time()
-    stats = {"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0}
+    stats = {
+        "monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0,
+        "playbook_updated": 0, "remediation_updated": 0,
+        "rule_matching_updated": 0, "rule_creation_updated": 0,
+    }
     error_msg: str | None = None
 
     try:
@@ -75,15 +86,17 @@ async def evaluate_once() -> dict[str, int]:
             logger.info("coverage_evaluator_no_run_yet")
             return stats
 
-        # 1. auto_monitoring: Prometheus targets
+        # 原 3 維
         stats["monitoring_updated"] = await _evaluate_monitoring(run_id)
-
-        # 2. auto_alerting: alert_rule_catalog labels match
         stats["alerting_updated"] = await _evaluate_alerting(run_id)
-
-        # 3. auto_km_creation: knowledge_entries 覆蓋
         stats["km_updated"] = await _evaluate_km_coverage(run_id)
 
+        # v2 新增 4 維
+        stats["playbook_updated"] = await _evaluate_playbook_coverage(run_id)
+        stats["remediation_updated"] = await _evaluate_remediation_coverage(run_id)
+        stats["rule_matching_updated"] = await _evaluate_rule_matching_coverage(run_id)
+        stats["rule_creation_updated"] = await _evaluate_rule_creation_coverage(run_id)
+
     except Exception as e:
         error_msg = f"{type(e).__name__}: {e}"[:1000]
         logger.exception("coverage_evaluate_once_failed", error=error_msg)
@@ -291,6 +304,207 @@ async def _evaluate_km_coverage(run_id: str) -> int:
         return 0
 
 
+# ============================================================================
+# v2 evaluators: four additional coverage dimensions
+# ============================================================================
+
+async def _apply_coverage_update(statement: str, run_id: str, failure_event: str) -> int:
+    """Execute one coverage UPDATE statement and return its affected row count.
+
+    Args:
+        statement: SQL text with a single ``:rid`` bind parameter.
+        run_id: asset_discovery_run id bound to ``:rid``.
+        failure_event: log event name used when the statement fails.
+
+    Returns:
+        The UPDATE's rowcount, or 0 when it is falsy or execution failed.
+
+    Errors are logged and reported as 0 rather than raised, so the caller
+    keeps evaluating the remaining dimensions.
+    """
+    from sqlalchemy import text as _sql
+    from src.db.base import get_db_context
+
+    try:
+        async with get_db_context() as db:
+            result = await db.execute(_sql(statement), {"rid": run_id})
+            return result.rowcount or 0
+    except Exception as e:
+        logger.warning(failure_event, error=str(e))
+        return 0
+
+
+async def _evaluate_playbook_coverage(run_id: str) -> int:
+    """Grade the auto_playbook dimension for k8s_workload assets.
+
+    green:  an approved playbook mentions the asset name in its description
+            or in symptom_pattern (compared as text).
+    yellow: no approved playbook mentions the asset.
+    Other asset types are excluded by the WHERE clause and stay untouched.
+
+    The name test is a case-insensitive literal substring match (strpos), so
+    "_" and "%" in an asset name are not LIKE wildcards, and an empty name
+    never matches (with ILIKE, '%' || '' || '%' matched every playbook).
+
+    Returns the number of snapshot rows updated (0 on failure).
+    """
+    return await _apply_coverage_update(
+        """
+        UPDATE asset_coverage_snapshot cs
+        SET coverage_status = CASE
+                WHEN ai.name <> '' AND EXISTS (
+                    SELECT 1 FROM playbooks pb
+                    WHERE pb.status = 'approved'
+                      AND (strpos(lower(pb.description), lower(ai.name)) > 0
+                           OR strpos(lower(pb.symptom_pattern::text), lower(ai.name)) > 0)
+                ) THEN 'green'
+                ELSE 'yellow'
+            END,
+            evidence = jsonb_build_object(
+                'source', 'playbooks_symptom_pattern_or_description_match',
+                'asset_name', ai.name
+            )
+        FROM asset_inventory ai
+        WHERE cs.asset_id = ai.asset_id
+          AND cs.run_id = CAST(:rid AS uuid)
+          AND cs.dimension = 'auto_playbook'
+          AND ai.asset_type = 'k8s_workload'
+        """,
+        run_id,
+        "evaluate_playbook_coverage_failed",
+    )
+
+
+async def _evaluate_remediation_coverage(run_id: str) -> int:
+    """Grade the auto_remediation dimension for k8s_workload/container assets.
+
+    green: a remediation_events row created in the last 30 days has a
+           target_resource containing the asset name.
+    red:   no such event (the asset should be remediable but is not).
+    Other asset types are excluded by the WHERE clause and stay untouched.
+
+    The name test is a case-insensitive literal substring match (strpos) that
+    ignores empty names, so LIKE wildcards in a name cannot cause false greens.
+
+    Returns the number of snapshot rows updated (0 on failure).
+    """
+    return await _apply_coverage_update(
+        """
+        UPDATE asset_coverage_snapshot cs
+        SET coverage_status = CASE
+                WHEN ai.name <> '' AND EXISTS (
+                    SELECT 1 FROM remediation_events re
+                    WHERE strpos(lower(re.target_resource), lower(ai.name)) > 0
+                      AND re.created_at > NOW() - INTERVAL '30 days'
+                ) THEN 'green'
+                ELSE 'red'
+            END,
+            evidence = jsonb_build_object(
+                'source', 'remediation_events_target_match_30d',
+                'asset_name', ai.name
+            )
+        FROM asset_inventory ai
+        WHERE cs.asset_id = ai.asset_id
+          AND cs.run_id = CAST(:rid AS uuid)
+          AND cs.dimension = 'auto_remediation'
+          AND ai.asset_type IN ('k8s_workload', 'container')
+        """,
+        run_id,
+        "evaluate_remediation_coverage_failed",
+    )
+
+
+async def _evaluate_rule_matching_coverage(run_id: str) -> int:
+    """Grade the auto_rule_matching dimension for host/k8s_workload/container assets.
+
+    green:  an incident created in the last 30 days relates to the asset, i.e.
+            its affected_services (as text) contains the asset name, or its
+            alertname equals an alert_rule_catalog rule_name whose host or
+            namespace label equals the asset's host or namespace.
+    yellow: no such incident (neutral: nothing fired, or nothing is covered).
+    Other asset types are excluded by the WHERE clause and stay untouched.
+
+    The name test is a case-insensitive literal substring match (strpos) that
+    ignores empty names, so LIKE wildcards in a name cannot cause false greens.
+
+    Returns the number of snapshot rows updated (0 on failure).
+    """
+    return await _apply_coverage_update(
+        """
+        UPDATE asset_coverage_snapshot cs
+        SET coverage_status = CASE
+                WHEN EXISTS (
+                    SELECT 1 FROM incidents i
+                    WHERE i.created_at > NOW() - INTERVAL '30 days'
+                      AND ((ai.name <> ''
+                            AND strpos(lower(i.affected_services::text), lower(ai.name)) > 0)
+                           OR (i.alertname IS NOT NULL AND EXISTS (
+                                SELECT 1 FROM alert_rule_catalog arc
+                                WHERE arc.rule_name = i.alertname
+                                  AND (arc.labels->>'host' = ai.host
+                                       OR arc.labels->>'namespace' = ai.namespace)
+                           )))
+                ) THEN 'green'
+                ELSE 'yellow'
+            END,
+            evidence = jsonb_build_object(
+                'source', 'incidents_match_30d',
+                'asset_name', ai.name
+            )
+        FROM asset_inventory ai
+        WHERE cs.asset_id = ai.asset_id
+          AND cs.run_id = CAST(:rid AS uuid)
+          AND cs.dimension = 'auto_rule_matching'
+          AND ai.asset_type IN ('host', 'k8s_workload', 'container')
+        """,
+        run_id,
+        "evaluate_rule_matching_coverage_failed",
+    )
+
+
+async def _evaluate_rule_creation_coverage(run_id: str) -> int:
+    """Grade the auto_rule_creation dimension for host/k8s_workload/container assets.
+
+    green: an alert_rule_catalog row with source = 'ai_generated' has a host
+           or namespace label equal to the asset's host or namespace.
+    red:   no AI-generated rule covers the asset.
+    Other asset types are excluded by the WHERE clause and stay untouched.
+
+    Per the original author's note, every rule is currently
+    source = 'yaml_hardcoded', so all rows are expected to be red until Hermes
+    starts producing AI-generated rules. The evidence "note" is written as-is
+    for both outcomes.
+
+    Returns the number of snapshot rows updated (0 on failure).
+    """
+    return await _apply_coverage_update(
+        """
+        UPDATE asset_coverage_snapshot cs
+        SET coverage_status = CASE
+                WHEN EXISTS (
+                    SELECT 1 FROM alert_rule_catalog arc
+                    WHERE arc.source = 'ai_generated'
+                      AND (arc.labels->>'host' = ai.host
+                           OR arc.labels->>'namespace' = ai.namespace)
+                ) THEN 'green'
+                ELSE 'red'
+            END,
+            evidence = jsonb_build_object(
+                'source', 'alert_rule_catalog_ai_generated_match',
+                'asset_name', ai.name,
+                'note', 'AI 自主建規則尚未啟用,後續 Hermes 產出後此欄變 green'
+            )
+        FROM asset_inventory ai
+        WHERE cs.asset_id = ai.asset_id
+          AND cs.run_id = CAST(:rid AS uuid)
+          AND cs.dimension = 'auto_rule_creation'
+          AND ai.asset_type IN ('host', 'k8s_workload', 'container')
+        """,
+        run_id,
+        "evaluate_rule_creation_coverage_failed",
+    )
+
+
 # ============================================================================
 # AOL
 # ============================================================================