"""Stable repeat identity for Config Drift reports. The drift scanner emits a fresh ``report_id`` for every run. Operators need a stable identity that answers whether two reports describe the same drift, not just whether they have the same HIGH/MEDIUM/INFO counts. """ from __future__ import annotations import hashlib import json from datetime import datetime, timedelta, timezone from typing import Any SCHEMA_VERSION = "drift_repeat_state_v1" FINGERPRINT_VERSION = "drift_fingerprint_v1" SEMANTIC_FINGERPRINT_VERSION = "drift_fingerprint_v2" VALUE_AWARE_MATCHING_STRATEGY = "namespace_and_stable_items_v1" SEMANTIC_MATCHING_STRATEGY = "namespace_resource_field_level_v2" def _get(obj: Any, key: str, default: Any = None) -> Any: if isinstance(obj, dict): return obj.get(key, default) return getattr(obj, key, default) def _enum_value(value: Any) -> Any: return getattr(value, "value", value) def _jsonable(value: Any) -> Any: value = _enum_value(value) if isinstance(value, dict): return {str(k): _jsonable(v) for k, v in value.items()} if isinstance(value, list): return [_jsonable(v) for v in value] if isinstance(value, tuple): return [_jsonable(v) for v in value] if isinstance(value, datetime): return value.isoformat() return value def _canonical_json(value: Any) -> str: return json.dumps( _jsonable(value), ensure_ascii=False, sort_keys=True, separators=(",", ":"), default=str, ) def _parse_datetime(value: Any) -> datetime | None: if value is None: return None if isinstance(value, datetime): parsed = value if parsed.tzinfo is not None: return parsed.astimezone(timezone.utc).replace(tzinfo=None) return parsed if isinstance(value, str): try: parsed = datetime.fromisoformat(value.replace("Z", "+00:00")) if parsed.tzinfo is not None: return parsed.astimezone(timezone.utc).replace(tzinfo=None) return parsed except ValueError: return None return None def _iso(value: Any) -> str | None: parsed = _parse_datetime(value) return parsed.isoformat() if parsed else None def drift_item_identity(item: Any, *, include_values: bool = True) -> dict[str, Any]: """Return the stable fields that define one drift item.""" identity = { "resource_kind": str(_get(item, "resource_kind", "")), "resource_name": str(_get(item, "resource_name", "")), "namespace": str(_get(item, "namespace", "")), "field_path": str(_get(item, "field_path", "")), "drift_level": str(_enum_value(_get(item, "drift_level", ""))), "is_allowlisted": bool(_get(item, "is_allowlisted", False)), } if include_values: identity["git_value"] = _jsonable(_get(item, "git_value")) identity["actual_value"] = _jsonable(_get(item, "actual_value")) return identity def build_drift_fingerprint( namespace: str, items: list[Any], *, include_values: bool = True, ) -> str: """Build a deterministic fingerprint from namespace + sorted drift items.""" identities = [ drift_item_identity(item, include_values=include_values) for item in items ] identities.sort(key=_canonical_json) payload = { "version": FINGERPRINT_VERSION if include_values else SEMANTIC_FINGERPRINT_VERSION, "namespace": namespace, "items": identities, } digest = hashlib.sha256(_canonical_json(payload).encode("utf-8")).hexdigest() return f"dfp_{digest[:16]}" def _report_identity(report: Any, *, include_values: bool = True) -> dict[str, Any]: items = _get(report, "items", []) or [] namespace = str(_get(report, "namespace", "")) strict_fingerprint = build_drift_fingerprint( namespace, list(items), include_values=True, ) return { "report_id": _get(report, "report_id"), "namespace": namespace, "status": str(_enum_value(_get(report, "status", ""))), "scanned_at": _get(report, "scanned_at"), "created_at": _get(report, "created_at"), "fingerprint": build_drift_fingerprint( namespace, list(items), include_values=include_values, ), "strict_fingerprint": strict_fingerprint, } def build_drift_repeat_state( report: Any, recent_reports: list[Any], *, include_values: bool = True, window_hours: int = 12, max_reports: int = 20, ) -> dict[str, Any]: """Summarize repeat state for one drift report using stable fingerprints.""" current = _report_identity(report, include_values=include_values) current_time = ( _parse_datetime(current.get("scanned_at")) or _parse_datetime(current.get("created_at")) or datetime.now() ) cutoff = current_time - timedelta(hours=window_hours) by_id: dict[str, dict[str, Any]] = {} for candidate in [report, *recent_reports]: identity = _report_identity(candidate, include_values=include_values) report_id = str(identity.get("report_id") or "") if not report_id: continue candidate_time = ( _parse_datetime(identity.get("scanned_at")) or _parse_datetime(identity.get("created_at")) ) if candidate_time is not None and candidate_time < cutoff: continue if identity["fingerprint"] != current["fingerprint"]: continue by_id[report_id] = identity matches = sorted( by_id.values(), key=lambda row: ( _parse_datetime(row.get("scanned_at")) or _parse_datetime(row.get("created_at")) or datetime.min ), ) first = matches[0] if matches else current last = matches[-1] if matches else current status = current.get("status") or "unknown" operator_stage = "pending_human" if status == "pending" else str(status) return { "schema_version": SCHEMA_VERSION, "fingerprint": current["fingerprint"], "strict_fingerprint": current["strict_fingerprint"], "matching_strategy": ( VALUE_AWARE_MATCHING_STRATEGY if include_values else SEMANTIC_MATCHING_STRATEGY ), "window_hours": window_hours, "occurrences_12h": len(matches), "first_scanned_at": _iso(first.get("scanned_at") or first.get("created_at")), "last_scanned_at": _iso(last.get("scanned_at") or last.get("created_at")), "operator_stage": operator_stage, "reports": [ { "report_id": row.get("report_id"), "scanned_at": _iso(row.get("scanned_at")), "created_at": _iso(row.get("created_at")), "status": row.get("status"), "strict_fingerprint": row.get("strict_fingerprint"), } for row in reversed(matches[-max_reports:]) ], }