# File listing metadata: 404 lines, 14 KiB, Python.
"""
|
|
Agent market watch service
|
|
==========================
|
|
|
|
Builds a read-only report from primary Agent framework sources. This service
|
|
does not call LLMs, install SDKs, mutate production systems, or approve
|
|
integration. It only detects version/source changes and recommends the next
|
|
AWOOOI replay gate.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import html
|
|
import json
|
|
import re
|
|
from collections.abc import Callable
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
from urllib.error import HTTPError, URLError
|
|
from urllib.parse import urljoin
|
|
from urllib.request import Request, urlopen
|
|
|
|
# Signature of a fetcher callable: (url, timeout_seconds) -> FetchedSource.
# ``fetch_url`` is used when the caller does not inject its own fetcher.
FetchSource = Callable[[str, int], "FetchedSource"]
|
|
|
|
|
|
@dataclass(frozen=True)
class FetchedSource:
    """Immutable outcome of fetching a single primary-source URL.

    Only ``status`` is required; every other field defaults to "nothing
    recorded", so a failure can be described with just a status and an error
    string.
    """

    # Outcome label; the fetcher in this module produces "ok" or "error".
    status: str
    # HTTP status code of the response, or None when no response was received.
    http_status: int | None = None
    # Raw response payload; empty when nothing was read.
    body: bytes = b""
    # Short failure description, or None when the fetch succeeded.
    error: str | None = None
|
|
|
|
|
|
def run_agent_market_watch(
    registry: dict[str, Any],
    *,
    registry_path: str,
    mode: str = "live",
    previous_report: dict[str, Any] | None = None,
    timeout_seconds: int = 12,
    fetcher: FetchSource | None = None,
    generated_at: str | None = None,
) -> dict[str, Any]:
    """Assemble the market watch report for every candidate in ``registry``.

    Args:
        registry: Source registry; its ``candidates``, ``discovery_sources``,
            ``cadence``, ``policy``, ``schema_version`` and ``updated_at``
            entries are read.
        registry_path: Path recorded verbatim in the report.
        mode: ``"live"`` fetches sources; ``"offline"`` skips all fetching.
        previous_report: Earlier report used as the comparison baseline.
        timeout_seconds: Timeout handed to the fetcher for each request.
        fetcher: Fetch callable; ``fetch_url`` is used when omitted.
        generated_at: Timestamp to record; defaults to the current UTC time.

    Returns:
        The report dictionary (schema ``agent_market_watch_report_v1``).

    Raises:
        ValueError: If ``mode`` is neither ``"live"`` nor ``"offline"``.
    """
    if mode not in {"live", "offline"}:
        raise ValueError("mode must be 'live' or 'offline'")

    fetch = fetch_url if fetcher is None else fetcher
    baseline = _previous_source_map(previous_report or {})

    evaluated: list[dict[str, Any]] = []
    queue: list[dict[str, Any]] = []
    failures: list[str] = []

    for entry in registry.get("candidates") or []:
        outcome = _evaluate_candidate(
            entry,
            mode=mode,
            timeout_seconds=timeout_seconds,
            fetcher=fetch,
            previous_sources=baseline,
        )
        evaluated.append(outcome)
        # One failure line per source that reported an error.
        for src in outcome["sources"]:
            if src.get("error"):
                failures.append(
                    f"{outcome['candidate_id']}:{src['source_id']}:{src['error']}"
                )
        if outcome["changed"]:
            queue.append(_integration_queue_item(entry, outcome))

    # Discovery sources are only consulted when fetching is allowed.
    discovered: list[dict[str, Any]] = []
    discovery_sources = (registry.get("discovery_sources") or []) if mode == "live" else []
    for discovery_source in discovery_sources:
        found = _fetch_discovery_source(discovery_source, fetch, timeout_seconds)
        discovered.append(found)
        if found.get("error"):
            failures.append(f"{discovery_source.get('source_id')}:{found['error']}")

    source_total = sum(len(outcome["sources"]) for outcome in evaluated)
    changed_total = sum(1 for outcome in evaluated if outcome["changed"])
    unchanged_total = len(evaluated) - changed_total

    registry_info = {
        "path": registry_path,
        "schema_version": str(registry.get("schema_version", "")),
        "updated_at": str(registry.get("updated_at", "")),
    }
    summary = {
        "candidate_count": len(evaluated),
        "source_count": source_total,
        "changed_candidates": changed_total,
        "watch_only_candidates": unchanged_total,
        "integration_queue_count": len(queue),
        "failure_count": len(failures),
    }
    return {
        "schema_version": "agent_market_watch_report_v1",
        "generated_at": generated_at or datetime.now(timezone.utc).isoformat(),  # noqa: UP017
        "mode": mode,
        "registry": registry_info,
        "cadence": dict(registry.get("cadence") or {}),
        "policy": dict(registry.get("policy") or {}),
        "summary": summary,
        "candidates": evaluated,
        "integration_queue": queue,
        "new_candidate_discovery": discovered,
        "failures": failures,
    }
|
|
|
|
|
|
def fetch_url(url: str, timeout_seconds: int) -> FetchedSource:
    """Fetch ``url`` via stdlib urllib, with a budget of three manual redirect hops."""
    redirect_budget = 3
    return _fetch_url(url, timeout_seconds, redirects_remaining=redirect_budget)
|
|
|
|
|
|
def _fetch_url(url: str, timeout_seconds: int, redirects_remaining: int) -> FetchedSource:
    """Fetch ``url`` and report the outcome as a ``FetchedSource``.

    Args:
        url: Absolute URL to request.
        timeout_seconds: Timeout passed to ``urlopen``.
        redirects_remaining: How many more manual redirect hops may be taken.

    Returns:
        ``status="ok"`` with the HTTP status and body on success. Otherwise
        ``status="error"`` with ``error`` set to ``http_<code>`` for HTTP
        errors, or to the failure text for anything else (including a URL
        that ``Request`` rejects).
    """
    try:
        # Built inside the ``try`` so an empty or malformed URL (``Request``
        # raises ``ValueError``) becomes a failed fetch for this one source
        # instead of aborting the whole report.
        request = Request(
            url,
            headers={
                "User-Agent": "awoooi-agent-market-watch/1.0",
                "Accept": "application/json,text/html,text/plain,*/*",
            },
        )
        with urlopen(request, timeout=timeout_seconds) as response:  # noqa: S310
            return FetchedSource(
                status="ok",
                http_status=int(response.status),
                body=response.read(),
            )
    except HTTPError as exc:
        if exc.code in {301, 302, 303, 307, 308} and redirects_remaining > 0:
            headers = exc.headers
            location = headers.get("Location") if headers is not None else None
            if location:
                target = urljoin(url, location)
                # urlopen's default handler follows ordinary redirects itself,
                # so a redirect status surfacing here generally means it
                # declined to follow. Only retry web targets: following e.g. a
                # ``file://`` Location would let a remote server make this
                # service read local resources.
                if target.lower().startswith(("http://", "https://")):
                    return _fetch_url(target, timeout_seconds, redirects_remaining - 1)
        body = exc.read() if hasattr(exc, "read") else b""
        return FetchedSource(
            status="error",
            http_status=int(exc.code),
            body=body,
            error=f"http_{exc.code}",
        )
    except URLError as exc:
        return FetchedSource(status="error", error=str(exc.reason))
    except Exception as exc:
        # Deliberate best-effort: any other failure is reported, not raised.
        return FetchedSource(status="error", error=str(exc))
|
|
|
|
|
|
def _evaluate_candidate(
    candidate: dict[str, Any],
    *,
    mode: str,
    timeout_seconds: int,
    fetcher: FetchSource,
    previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
    """Evaluate every source of one candidate and derive its watch decision.

    A change on any source takes precedence over source errors; errors without
    a change yield a retry recommendation; otherwise the candidate stays on
    watch with its current integration status.
    """
    candidate_id = str(candidate.get("candidate_id", "")).strip()

    source_results: list[dict[str, Any]] = []
    for raw_source in candidate.get("sources") or []:
        source_results.append(
            _evaluate_source(
                candidate_id,
                raw_source,
                mode=mode,
                timeout_seconds=timeout_seconds,
                fetcher=fetcher,
                previous_sources=previous_sources,
            )
        )

    changed = any(result.get("changed_since_reference") for result in source_results)
    has_errors = any(result.get("error") for result in source_results)

    if not changed and not has_errors:
        decision = "watch_only_no_change"
        actions = ["keep_current_integration_status"]
    elif not changed:
        decision = "watch_with_source_failures"
        actions = ["retry_source_fetch", "do_not_change_integration_status"]
    else:
        decision = "changed_requires_replay_readiness_review"
        actions = [
            "refresh_market_capability_evidence",
            "refresh_or_create_no_cost_adapter",
            "run_offline_replay_before_shadow",
            "do_not_promote_without_promotion_gate",
        ]

    needs_cost_approval = bool(candidate.get("requires_cost_approval", False))
    needs_dependency_approval = bool(candidate.get("requires_dependency_approval", False))
    return {
        "candidate_id": candidate_id,
        "display_name": str(candidate.get("display_name", candidate_id)),
        "evaluation_priority": str(candidate.get("evaluation_priority", "watch")),
        "recommended_role": str(candidate.get("recommended_role", "")),
        "requires_cost_approval": needs_cost_approval,
        "requires_dependency_approval": needs_dependency_approval,
        "sources": source_results,
        "changed": changed,
        "decision": decision,
        "recommended_actions": actions,
    }
|
|
|
|
|
|
def _evaluate_source(
    candidate_id: str,
    source: dict[str, Any],
    *,
    mode: str,
    timeout_seconds: int,
    fetcher: FetchSource,
    previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
    """Fetch one source and decide whether it changed since its reference.

    Args:
        candidate_id: Identifier of the owning candidate (baseline lookup key).
        source: Registry entry; ``source_id``, ``type`` (default ``"docs"``),
            ``url`` and ``reference_version`` are read.
        mode: ``"offline"`` skips fetching and echoes the reference version.
        timeout_seconds: Timeout handed to ``fetcher``.
        fetcher: Callable used to retrieve the URL in live mode.
        previous_sources: Baseline source entries keyed by
            ``(candidate_id, source_id)``.

    Returns:
        A per-source result dictionary with the fetch status, the parsed
        version and publish time, a content hash, and the change flag.
    """
    source_id = str(source.get("source_id", "")).strip()
    source_type = str(source.get("type", "docs")).strip()
    url = str(source.get("url", "")).strip()
    reference_version = source.get("reference_version")

    if mode == "offline":
        return {
            "source_id": source_id,
            "type": source_type,
            "url": url,
            "status": "skipped_offline",
            "http_status": None,
            "version": reference_version,
            "published_at": None,
            "content_hash": None,
            "changed_since_reference": False,
            "reference_version": reference_version,
            "error": None,
        }

    fetched = fetcher(url, timeout_seconds)
    # Only a successful response carries source content. A failed fetch may
    # still have a body (an HTTP error page or an API error payload); parsing
    # or hashing that would compare error text against the baseline and could
    # report a change that never happened.
    content = fetched.body if fetched.status == "ok" else b""
    parsed = _parse_source(source_type, content) if content else {}
    content_hash = _content_hash(content, source_type) if content else None
    previous = previous_sources.get((candidate_id, source_id), {})
    version = parsed.get("version")
    published_at = parsed.get("published_at")
    changed = _changed_since_reference(
        version=version,
        reference_version=reference_version,
        content_hash=content_hash,
        previous=previous,
    )
    return {
        "source_id": source_id,
        "type": source_type,
        "url": url,
        "status": fetched.status,
        "http_status": fetched.http_status,
        "version": version,
        "published_at": published_at,
        "content_hash": content_hash,
        "changed_since_reference": changed,
        "reference_version": reference_version,
        "error": fetched.error,
    }
|
|
|
|
|
|
def _parse_source(source_type: str, body: bytes) -> dict[str, str | None]:
    """Extract ``version`` and ``published_at`` from a fetched source body.

    Handles the ``pypi``, ``npm`` and ``github_release`` source types; every
    other type (and any payload that is not shaped as expected) yields
    ``None`` for the affected fields. Payloads come from remote servers, so
    unexpected shapes are tolerated rather than raised.

    Args:
        source_type: Registry source type.
        body: Raw response body.

    Returns:
        A dictionary with ``version`` and ``published_at``, each a string or
        ``None``.
    """
    if source_type == "pypi":
        payload = _loads_json(body)
        info = payload.get("info") if isinstance(payload, dict) else None
        raw_version = info.get("version") if isinstance(info, dict) else None
        # A JSON null must not turn into the truthy string "None".
        version = "" if raw_version is None else str(raw_version)
        releases = payload.get("releases") if isinstance(payload, dict) else None
        files = releases.get(version) if isinstance(releases, dict) else None
        published_at = None
        # Index only a real, non-empty list; any other shape has no file entry.
        if isinstance(files, list) and files and isinstance(files[0], dict):
            published_at = files[0].get("upload_time_iso_8601")
        return {
            "version": version or None,
            "published_at": str(published_at) if published_at else None,
        }

    if source_type == "npm":
        payload = _loads_json(body)
        latest = None
        published_at = None
        if isinstance(payload, dict):
            dist_tags = payload.get("dist-tags")
            latest = dist_tags.get("latest") if isinstance(dist_tags, dict) else None
            times = payload.get("time")
            if latest and isinstance(times, dict):
                published_at = times.get(str(latest))
        return {
            "version": str(latest) if latest else None,
            "published_at": str(published_at) if published_at else None,
        }

    if source_type == "github_release":
        payload = _loads_json(body)
        if isinstance(payload, dict):
            version = payload.get("tag_name") or payload.get("name")
            published_at = payload.get("published_at")
            return {
                "version": str(version) if version else None,
                "published_at": str(published_at) if published_at else None,
            }

    return {"version": None, "published_at": None}
|
|
|
|
|
|
def _fetch_discovery_source(
    source: dict[str, Any],
    fetcher: FetchSource,
    timeout_seconds: int,
) -> dict[str, Any]:
    """Fetch one discovery source and summarise its leading ``items``.

    The result always carries the fetch status and error. ``items`` stays
    empty unless the fetch succeeded with a body that is a JSON object holding
    an ``items`` list; in that case the dict entries among its first five
    elements are kept, reduced to four fields each.
    """
    source_id = str(source.get("source_id", "")).strip()
    url = str(source.get("url", "")).strip()
    fetched = fetcher(url, timeout_seconds)

    top_items: list[dict[str, Any]] = []
    if fetched.status == "ok" and fetched.body:
        payload = _loads_json(fetched.body)
        raw_items = (payload.get("items") or []) if isinstance(payload, dict) else None
        if isinstance(raw_items, list):
            for entry in raw_items[:5]:
                if not isinstance(entry, dict):
                    continue
                top_items.append(
                    {
                        "full_name": entry.get("full_name"),
                        "html_url": entry.get("html_url"),
                        "stargazers_count": entry.get("stargazers_count"),
                        "updated_at": entry.get("updated_at"),
                    }
                )

    return {
        "source_id": source_id,
        "type": source.get("type"),
        "url": url,
        "status": fetched.status,
        "http_status": fetched.http_status,
        "items": top_items,
        "error": fetched.error,
    }
|
|
|
|
|
|
def _integration_queue_item(
    candidate: dict[str, Any],
    candidate_result: dict[str, Any],
) -> dict[str, Any]:
    """Build the integration-queue entry for a candidate flagged as changed.

    The candidate id comes from the evaluation result; the two approval flags
    are read from the registry entry and coerced to ``bool``.
    """
    item: dict[str, Any] = {"candidate_id": candidate_result["candidate_id"]}
    item["reason"] = "primary_source_version_or_content_changed"
    item["required_next_gate"] = "refresh_market_scorecard_then_offline_replay"
    for flag in ("requires_cost_approval", "requires_dependency_approval"):
        item[flag] = bool(candidate.get(flag, False))
    return item
|
|
|
|
|
|
def _previous_source_map(report: dict[str, Any]) -> dict[tuple[str, str], dict[str, Any]]:
    """Index a previous report's source entries by ``(candidate_id, source_id)``.

    Entries whose candidate id or source id is blank after stripping are left
    out; when a key repeats, the later entry wins.
    """
    index: dict[tuple[str, str], dict[str, Any]] = {}
    for prior_candidate in report.get("candidates") or []:
        cid = str(prior_candidate.get("candidate_id", "")).strip()
        for prior_source in prior_candidate.get("sources") or []:
            sid = str(prior_source.get("source_id", "")).strip()
            if not (cid and sid):
                continue
            index[(cid, sid)] = prior_source
    return index
|
|
|
|
|
|
def _changed_since_reference(
    *,
    version: str | None,
    reference_version: Any,
    content_hash: str | None,
    previous: dict[str, Any],
) -> bool:
    """Report whether a source differs from its reference or previous state.

    When a version was parsed, only versions are compared: first against the
    registry reference, then against the previous report. The content hash is
    consulted only when no version is available, and it signals a change only
    if both the previous and the current hash exist and differ.
    """
    if version:
        current = str(version)
        if reference_version and str(reference_version) != current:
            return True
        prior_version = previous.get("version")
        if prior_version:
            return str(prior_version) != current
        return False

    prior_hash = previous.get("content_hash")
    if not prior_hash or not content_hash:
        return False
    return str(prior_hash) != str(content_hash)
|
|
|
|
|
|
def _content_hash(body: bytes, source_type: str) -> str:
    """Return the first 24 hex characters of a SHA-256 digest of ``body``.

    ``docs`` bodies are reduced to normalized text before hashing; every other
    source type is hashed byte-for-byte.
    """
    digest_length = 24
    if source_type == "docs":
        material = _normalized_docs_text(body).encode("utf-8")
    else:
        material = body
    return hashlib.sha256(material).hexdigest()[:digest_length]
|
|
|
|
|
|
def _normalized_docs_text(body: bytes) -> str:
    """Reduce an HTML document to lower-cased, whitespace-collapsed text.

    Comments and ``script``/``style``/``noscript``/``svg`` elements are
    dropped together with their content, remaining tags are replaced by
    spaces, HTML entities are unescaped, and runs of whitespace collapse to a
    single space.
    """
    element_flags = re.DOTALL | re.IGNORECASE
    # Applied in order: content-bearing removals first, generic tags last.
    removals = (
        (r"<!--.*?-->", re.DOTALL),
        (r"<script\b[^>]*>.*?</script>", element_flags),
        (r"<style\b[^>]*>.*?</style>", element_flags),
        (r"<noscript\b[^>]*>.*?</noscript>", element_flags),
        (r"<svg\b[^>]*>.*?</svg>", element_flags),
        (r"<[^>]+>", 0),
    )
    text = body.decode("utf-8", errors="replace")
    for pattern, flags in removals:
        text = re.sub(pattern, " ", text, flags=flags)
    collapsed = re.sub(r"\s+", " ", html.unescape(text))
    return collapsed.strip().lower()
|
|
|
|
|
|
def _loads_json(body: bytes) -> Any:
    """Decode ``body`` as UTF-8 JSON, returning ``{}`` when that fails.

    Deliberately best-effort: any exception raised while decoding or parsing
    is swallowed so callers can treat a bad payload as "no data".
    """
    try:
        text = body.decode("utf-8")
        parsed = json.loads(text)
    except Exception:
        return {}
    return parsed
|