# awoooi/apps/api/src/services/agent_market_watch.py

"""
Agent market watch service
==========================

Builds a read-only report from primary Agent framework sources. This service
does not call LLMs, install SDKs, mutate production systems, or approve
integration. It only detects version/source changes and recommends the next
AWOOOI replay gate.
"""

from __future__ import annotations

import hashlib
import html
import json
import re
from collections.abc import Callable
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen

# Pluggable fetcher signature: called as ``fetcher(url, timeout_seconds)`` and
# expected to return a ``FetchedSource``. ``fetch_url`` is the default used
# when the caller supplies none.
FetchSource = Callable[[str, int], "FetchedSource"]


@dataclass(frozen=True)
class FetchedSource:
    """HTTP fetch result for one primary source.

    Immutable value produced by a ``FetchSource`` callable. Only ``status`` is
    required; the remaining fields default to "nothing received".
    """

    # Outcome label; ``_fetch_url`` sets "ok" on success and "error" otherwise.
    status: str
    # HTTP status code when a response (or HTTP error) was received, else None.
    http_status: int | None = None
    # Raw response bytes; ``_fetch_url`` also stores the body of HTTP error responses.
    body: bytes = b""
    # Short failure description (``_fetch_url`` uses "http_<code>" or the
    # stringified exception/reason); None when no error was recorded.
    error: str | None = None


def run_agent_market_watch(
    registry: dict[str, Any],
    *,
    registry_path: str,
    mode: str = "live",
    previous_report: dict[str, Any] | None = None,
    timeout_seconds: int = 12,
    fetcher: FetchSource | None = None,
    generated_at: str | None = None,
) -> dict[str, Any]:
    """Assemble the market watch report for every entry in ``registry``.

    Args:
        registry: Source registry; its ``candidates`` and ``discovery_sources``
            lists are evaluated, and ``schema_version``, ``updated_at``,
            ``cadence`` and ``policy`` are echoed into the report.
        registry_path: Path recorded in the report's ``registry`` section.
        mode: ``"live"`` fetches sources and discovery feeds; ``"offline"``
            skips fetching and skips discovery entirely.
        previous_report: Earlier report used as the comparison baseline.
        timeout_seconds: Timeout handed to the fetcher for each request.
        fetcher: Fetch callable; ``fetch_url`` is used when omitted.
        generated_at: Timestamp to record; defaults to the current UTC time.

    Returns:
        The report dict (schema ``agent_market_watch_report_v1``).

    Raises:
        ValueError: If ``mode`` is neither ``"live"`` nor ``"offline"``.
    """
    valid_modes = {"live", "offline"}
    if mode not in valid_modes:
        raise ValueError("mode must be 'live' or 'offline'")

    active_fetcher = fetch_url if fetcher is None else fetcher
    baseline = _previous_source_map(previous_report or {})

    evaluated: list[dict[str, Any]] = []
    queue: list[dict[str, Any]] = []
    failures: list[str] = []

    for entry in registry.get("candidates") or []:
        outcome = _evaluate_candidate(
            entry,
            mode=mode,
            timeout_seconds=timeout_seconds,
            fetcher=active_fetcher,
            previous_sources=baseline,
        )
        evaluated.append(outcome)
        # One failure line per source that reported an error.
        for src in outcome["sources"]:
            if src.get("error"):
                failures.append(
                    f"{outcome['candidate_id']}:{src['source_id']}:{src['error']}"
                )
        if outcome["changed"]:
            queue.append(_integration_queue_item(entry, outcome))

    # Discovery feeds are only consulted when fetching is allowed.
    discoveries: list[dict[str, Any]] = []
    if mode == "live":
        for feed in registry.get("discovery_sources") or []:
            found = _fetch_discovery_source(feed, active_fetcher, timeout_seconds)
            discoveries.append(found)
            if found.get("error"):
                failures.append(f"{feed.get('source_id')}:{found['error']}")

    changed_total = sum(1 for outcome in evaluated if outcome["changed"])
    watch_only_total = len(evaluated) - changed_total
    source_total = sum(len(outcome["sources"]) for outcome in evaluated)

    registry_section = {
        "path": registry_path,
        "schema_version": str(registry.get("schema_version", "")),
        "updated_at": str(registry.get("updated_at", "")),
    }
    summary_section = {
        "candidate_count": len(evaluated),
        "source_count": source_total,
        "changed_candidates": changed_total,
        "watch_only_candidates": watch_only_total,
        "integration_queue_count": len(queue),
        "failure_count": len(failures),
    }
    return {
        "schema_version": "agent_market_watch_report_v1",
        "generated_at": generated_at or datetime.now(timezone.utc).isoformat(),  # noqa: UP017
        "mode": mode,
        "registry": registry_section,
        "cadence": dict(registry.get("cadence") or {}),
        "policy": dict(registry.get("policy") or {}),
        "summary": summary_section,
        "candidates": evaluated,
        "integration_queue": queue,
        "new_candidate_discovery": discoveries,
        "failures": failures,
    }


def fetch_url(url: str, timeout_seconds: int) -> FetchedSource:
    """Default fetcher: retrieve ``url`` through the stdlib-only ``_fetch_url``.

    Args:
        url: Address to fetch.
        timeout_seconds: Per-request timeout forwarded to ``_fetch_url``.

    Returns:
        The ``FetchedSource`` produced by ``_fetch_url``.
    """
    # Budget of manually followed redirect hops for a single fetch.
    redirect_budget = 3
    return _fetch_url(url, timeout_seconds, redirect_budget)


def _fetch_url(url: str, timeout_seconds: int, redirects_remaining: int) -> FetchedSource:
    """Fetch ``url`` once and describe the outcome as a ``FetchedSource``.

    Redirect responses that reach this function as ``HTTPError`` are followed
    manually, but only to ``http``/``https`` targets and only while
    ``redirects_remaining`` is positive.

    Args:
        url: Address to fetch.
        timeout_seconds: Timeout passed to ``urlopen``.
        redirects_remaining: How many more redirect hops may be followed.

    Returns:
        ``status="ok"`` with the HTTP status and body on success. Otherwise
        ``status="error"`` with ``error`` set to ``"http_<code>"`` for HTTP
        errors, or to the stringified reason/exception for other failures
        (including a malformed ``url``).
    """
    try:
        # Request() raises ValueError for a URL without a scheme (for example
        # an empty registry entry). Building it inside the ``try`` turns that
        # into an error result instead of aborting the whole report run.
        request = Request(
            url,
            headers={
                "User-Agent": "awoooi-agent-market-watch/1.0",
                "Accept": "application/json,text/html,text/plain,*/*",
            },
        )
        with urlopen(request, timeout=timeout_seconds) as response:  # noqa: S310
            return FetchedSource(
                status="ok",
                http_status=int(response.status),
                body=response.read(),
            )
    except HTTPError as exc:
        if exc.code in {301, 302, 303, 307, 308} and redirects_remaining > 0:
            location = exc.headers.get("Location")
            if location:
                try:
                    target = urljoin(url, location)
                except ValueError:
                    # A malformed Location header must not crash the run.
                    target = ""
                # The Location header is server-controlled. urllib refuses
                # redirects to non-web schemes by raising HTTPError; blindly
                # re-opening the target here would bypass that safeguard
                # (e.g. a redirect to ``file://``). Follow web URLs only.
                if target.lower().startswith(("http://", "https://")):
                    return _fetch_url(
                        target,
                        timeout_seconds,
                        redirects_remaining - 1,
                    )
        body = exc.read() if hasattr(exc, "read") else b""
        return FetchedSource(
            status="error",
            http_status=int(exc.code),
            body=body,
            error=f"http_{exc.code}",
        )
    except URLError as exc:
        return FetchedSource(status="error", error=str(exc.reason))
    except Exception as exc:
        # Best-effort boundary: any other failure (timeouts during read,
        # malformed URLs, ...) is reported rather than raised.
        return FetchedSource(status="error", error=str(exc))


def _evaluate_candidate(
    candidate: dict[str, Any],
    *,
    mode: str,
    timeout_seconds: int,
    fetcher: FetchSource,
    previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
    """Evaluate all sources of one registry candidate and pick its decision.

    A candidate is "changed" when any of its sources reports
    ``changed_since_reference``; a change takes precedence over source
    errors, which in turn take precedence over the no-change outcome.

    Args:
        candidate: Registry entry for the candidate.
        mode: Evaluation mode forwarded to ``_evaluate_source``.
        timeout_seconds: Fetch timeout forwarded to ``_evaluate_source``.
        fetcher: Fetch callable forwarded to ``_evaluate_source``.
        previous_sources: Baseline source results keyed by
            ``(candidate_id, source_id)``.

    Returns:
        Candidate result dict with per-source results, the ``changed`` flag,
        the ``decision`` label and the ``recommended_actions`` list.
    """
    cid = str(candidate.get("candidate_id", "")).strip()

    evaluated: list[dict[str, Any]] = []
    for entry in candidate.get("sources") or []:
        evaluated.append(
            _evaluate_source(
                cid,
                entry,
                mode=mode,
                timeout_seconds=timeout_seconds,
                fetcher=fetcher,
                previous_sources=previous_sources,
            )
        )

    has_change = any(item.get("changed_since_reference") for item in evaluated)
    if has_change:
        verdict = "changed_requires_replay_readiness_review"
        next_steps = [
            "refresh_market_capability_evidence",
            "refresh_or_create_no_cost_adapter",
            "run_offline_replay_before_shadow",
            "do_not_promote_without_promotion_gate",
        ]
    elif any(item.get("error") for item in evaluated):
        verdict = "watch_with_source_failures"
        next_steps = ["retry_source_fetch", "do_not_change_integration_status"]
    else:
        verdict = "watch_only_no_change"
        next_steps = ["keep_current_integration_status"]

    needs_cost = bool(candidate.get("requires_cost_approval", False))
    needs_dependency = bool(candidate.get("requires_dependency_approval", False))
    return {
        "candidate_id": cid,
        "display_name": str(candidate.get("display_name", cid)),
        "evaluation_priority": str(candidate.get("evaluation_priority", "watch")),
        "recommended_role": str(candidate.get("recommended_role", "")),
        "requires_cost_approval": needs_cost,
        "requires_dependency_approval": needs_dependency,
        "sources": evaluated,
        "changed": has_change,
        "decision": verdict,
        "recommended_actions": next_steps,
    }


def _evaluate_source(
    candidate_id: str,
    source: dict[str, Any],
    *,
    mode: str,
    timeout_seconds: int,
    fetcher: FetchSource,
    previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
    """Fetch and evaluate one source of a candidate.

    In ``"offline"`` mode nothing is fetched: the source is reported as
    ``skipped_offline`` with its ``reference_version`` echoed as the version
    and ``changed_since_reference`` set to False.

    Otherwise the source is fetched and, if the fetch succeeded, its body is
    parsed for a version and hashed, then compared with the registry
    reference and the previous report entry.

    Args:
        candidate_id: Identifier of the owning candidate (baseline lookup key).
        source: Registry entry with ``source_id``, ``type``, ``url`` and an
            optional ``reference_version``.
        mode: ``"offline"`` to skip fetching; any other value fetches.
        timeout_seconds: Timeout handed to ``fetcher``.
        fetcher: Callable invoked as ``fetcher(url, timeout_seconds)``.
        previous_sources: Baseline source results keyed by
            ``(candidate_id, source_id)``.

    Returns:
        Source result dict including status, detected version, content hash,
        the ``changed_since_reference`` flag and any fetch error.
    """
    source_id = str(source.get("source_id", "")).strip()
    source_type = str(source.get("type", "docs")).strip()
    url = str(source.get("url", "")).strip()
    reference_version = source.get("reference_version")
    if mode == "offline":
        return {
            "source_id": source_id,
            "type": source_type,
            "url": url,
            "status": "skipped_offline",
            "http_status": None,
            "version": reference_version,
            "published_at": None,
            "content_hash": None,
            "changed_since_reference": False,
            "reference_version": reference_version,
            "error": None,
        }

    fetched = fetcher(url, timeout_seconds)
    # Only a successful response describes the source. The body of a failed
    # fetch (e.g. a 404/500 error page) must not be parsed or hashed:
    # otherwise an outage is reported as a content change and the error
    # page's hash becomes the baseline for the next run.
    usable_body = fetched.body if fetched.status == "ok" else b""
    parsed = _parse_source(source_type, usable_body) if usable_body else {}
    content_hash = _content_hash(usable_body, source_type) if usable_body else None
    previous = previous_sources.get((candidate_id, source_id), {})
    version = parsed.get("version")
    published_at = parsed.get("published_at")
    changed = _changed_since_reference(
        version=version,
        reference_version=reference_version,
        content_hash=content_hash,
        previous=previous,
    )
    return {
        "source_id": source_id,
        "type": source_type,
        "url": url,
        "status": fetched.status,
        "http_status": fetched.http_status,
        "version": version,
        "published_at": published_at,
        "content_hash": content_hash,
        "changed_since_reference": changed,
        "reference_version": reference_version,
        "error": fetched.error,
    }


def _parse_source(source_type: str, body: bytes) -> dict[str, str | None]:
    """Extract the latest version and its publish time from a source body.

    Understands three JSON payload shapes, selected by ``source_type``:
    ``"pypi"`` (``info.version`` plus the first file of the matching
    ``releases`` entry), ``"npm"`` (``dist-tags.latest`` plus the matching
    ``time`` entry) and ``"github_release"`` (``tag_name`` or ``name`` plus
    ``published_at``). Any other type, or a payload of the wrong shape,
    yields no version.

    Args:
        source_type: Registry source type.
        body: Raw response bytes.

    Returns:
        Dict with ``version`` and ``published_at``; each is None when the
        value could not be determined.
    """
    if source_type == "pypi":
        payload = _loads_json(body)
        info = payload.get("info") if isinstance(payload, dict) else {}
        raw_version = info.get("version") if isinstance(info, dict) else None
        # A JSON ``null`` version must stay "unknown"; str(None) would turn it
        # into the bogus version string "None".
        version = "" if raw_version is None else str(raw_version)
        releases = payload.get("releases") if isinstance(payload, dict) else {}
        files = releases.get(version) if isinstance(releases, dict) else None
        published_at = None
        # Index only into a real, non-empty list of file records; a dict or
        # scalar here would otherwise raise on ``[0]``.
        if isinstance(files, list) and files and isinstance(files[0], dict):
            published_at = files[0].get("upload_time_iso_8601")
        return {"version": version or None, "published_at": published_at}
    if source_type == "npm":
        payload = _loads_json(body)
        latest = None
        published_at = None
        if isinstance(payload, dict):
            dist_tags = payload.get("dist-tags") or {}
            latest = dist_tags.get("latest") if isinstance(dist_tags, dict) else None
            times = payload.get("time") or {}
            published_at = times.get(str(latest)) if isinstance(times, dict) and latest else None
        return {"version": str(latest) if latest else None, "published_at": published_at}
    if source_type == "github_release":
        payload = _loads_json(body)
        if isinstance(payload, dict):
            version = payload.get("tag_name") or payload.get("name")
            published_at = payload.get("published_at")
            return {
                "version": str(version) if version else None,
                "published_at": str(published_at) if published_at else None,
            }
    # Unknown source type, or a github_release payload that is not an object.
    return {"version": None, "published_at": None}


def _fetch_discovery_source(
    source: dict[str, Any],
    fetcher: FetchSource,
    timeout_seconds: int,
) -> dict[str, Any]:
    """Fetch one discovery feed and summarise its leading entries.

    The feed body is only inspected when the fetch status is ``"ok"`` and a
    body is present. From a JSON object with an ``items`` list, the first five
    entries are considered and every dict among them is reduced to
    ``full_name``, ``html_url``, ``stargazers_count`` and ``updated_at``.

    Args:
        source: Registry discovery entry (``source_id``, ``type``, ``url``).
        fetcher: Callable invoked as ``fetcher(url, timeout_seconds)``.
        timeout_seconds: Timeout handed to ``fetcher``.

    Returns:
        Discovery result dict; ``items`` stays empty when nothing usable was
        returned.
    """
    feed_id = str(source.get("source_id", "")).strip()
    address = str(source.get("url", "")).strip()
    response = fetcher(address, timeout_seconds)

    summary: dict[str, Any] = {
        "source_id": feed_id,
        "type": source.get("type"),
        "url": address,
        "status": response.status,
        "http_status": response.http_status,
        "items": [],
        "error": response.error,
    }

    if response.status == "ok" and response.body:
        decoded = _loads_json(response.body)
        entries = (decoded.get("items") or []) if isinstance(decoded, dict) else None
        if isinstance(entries, list):
            wanted = ("full_name", "html_url", "stargazers_count", "updated_at")
            picked = []
            # Slice first, then filter: non-dict entries among the first five
            # are dropped without being replaced by later ones.
            for entry in entries[:5]:
                if isinstance(entry, dict):
                    picked.append({field: entry.get(field) for field in wanted})
            summary["items"] = picked
    return summary


def _integration_queue_item(
    candidate: dict[str, Any],
    candidate_result: dict[str, Any],
) -> dict[str, Any]:
    """Build the integration-queue entry for a candidate that changed.

    Args:
        candidate: Registry entry; supplies the two approval flags.
        candidate_result: Evaluated candidate; supplies ``candidate_id``.

    Returns:
        Queue item with a fixed reason, the required next gate and the
        approval flags coerced to bool.
    """
    queued: dict[str, Any] = {
        "candidate_id": candidate_result["candidate_id"],
        "reason": "primary_source_version_or_content_changed",
        "required_next_gate": "refresh_market_scorecard_then_offline_replay",
    }
    # Approval flags are copied from the registry entry, defaulting to False.
    for flag in ("requires_cost_approval", "requires_dependency_approval"):
        queued[flag] = bool(candidate.get(flag, False))
    return queued


def _previous_source_map(report: dict[str, Any]) -> dict[tuple[str, str], dict[str, Any]]:
    """Index the sources of an earlier report by ``(candidate_id, source_id)``.

    Both identifiers are stringified and stripped; a source is skipped when
    either identifier ends up empty. Later duplicates overwrite earlier ones.

    Args:
        report: Previous report (may be empty).

    Returns:
        Mapping from ``(candidate_id, source_id)`` to the stored source dict.
    """
    owned_sources = (
        (str(entry.get("candidate_id", "")).strip(), stored)
        for entry in report.get("candidates") or []
        for stored in entry.get("sources") or []
    )
    index: dict[tuple[str, str], dict[str, Any]] = {}
    for owner_id, stored in owned_sources:
        stored_id = str(stored.get("source_id", "")).strip()
        if not (owner_id and stored_id):
            continue
        index[(owner_id, stored_id)] = stored
    return index


def _changed_since_reference(
    *,
    version: str | None,
    reference_version: Any,
    content_hash: str | None,
    previous: dict[str, Any],
) -> bool:
    """Decide whether a source changed relative to its baselines.

    When a version was detected, only versions are compared: first against
    the registry ``reference_version``, then against the previous report's
    version; the content hash is ignored. Without a detected version, the
    content hash is compared with the previous report's hash.

    Args:
        version: Version detected in the current fetch, if any.
        reference_version: Version pinned in the registry, if any.
        content_hash: Hash of the current body, if any.
        previous: Source entry from the previous report (may be empty).

    Returns:
        True when a comparison that could be made shows a difference.
    """
    if version:
        current = str(version)
        if reference_version and str(reference_version) != current:
            return True
        earlier_version = previous.get("version")
        if not earlier_version:
            # Nothing further to compare the detected version against.
            return False
        return str(earlier_version) != current

    earlier_hash = previous.get("content_hash")
    if not (earlier_hash and content_hash):
        return False
    return str(earlier_hash) != str(content_hash)


def _content_hash(body: bytes, source_type: str) -> str:
    """Return a 24-hex-character SHA-256 fingerprint of a source body.

    ``"docs"`` bodies are first reduced to normalized text via
    ``_normalized_docs_text``; every other type is hashed byte-for-byte.

    Args:
        body: Raw response bytes.
        source_type: Registry source type.

    Returns:
        The first 24 hex characters of the SHA-256 digest.
    """
    material = _normalized_docs_text(body).encode("utf-8") if source_type == "docs" else body
    digest = hashlib.sha256()
    digest.update(material)
    return digest.hexdigest()[:24]


def _normalized_docs_text(body: bytes) -> str:
    """Reduce an HTML document to lower-cased, whitespace-collapsed text.

    The body is decoded as UTF-8 (undecodable bytes are replaced), then
    comments, ``script``/``style``/``noscript``/``svg`` elements and all
    remaining tags are replaced by spaces. HTML entities are unescaped and
    runs of whitespace collapse to a single space.

    Args:
        body: Raw document bytes.

    Returns:
        The normalized text, stripped and lower-cased.
    """
    markup = body.decode("utf-8", errors="replace")

    block_flags = re.DOTALL | re.IGNORECASE
    # Order matters: whole elements are removed before the generic tag rule.
    strip_rules = (
        (r"<!--.*?-->", re.DOTALL),
        (r"<script\b[^>]*>.*?</script>", block_flags),
        (r"<style\b[^>]*>.*?</style>", block_flags),
        (r"<noscript\b[^>]*>.*?</noscript>", block_flags),
        (r"<svg\b[^>]*>.*?</svg>", block_flags),
        (r"<[^>]+>", 0),
    )
    for pattern, flags in strip_rules:
        markup = re.sub(pattern, " ", markup, flags=flags)

    plain = html.unescape(markup)
    collapsed = re.sub(r"\s+", " ", plain)
    return collapsed.strip().lower()


def _loads_json(body: bytes) -> Any:
    """Decode ``body`` as UTF-8 JSON, returning ``{}`` on any failure.

    Args:
        body: Raw bytes expected to hold a JSON document.

    Returns:
        The parsed JSON value, or an empty dict when decoding or parsing
        raises.
    """
    try:
        text = body.decode("utf-8")
        parsed = json.loads(text)
    except Exception:
        # Deliberately best-effort: callers treat ``{}`` as "nothing usable".
        return {}
    return parsed