Files
awoooi/apps/api/src/services/agent_market_watch.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

404 lines
14 KiB
Python

"""
Agent market watch service
==========================
Builds a read-only report from primary Agent framework sources. This service
does not call LLMs, install SDKs, mutate production systems, or approve
integration. It only detects version/source changes and recommends the next
AWOOOI replay gate.
"""
from __future__ import annotations
import hashlib
import html
import json
import re
from collections.abc import Callable
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen
# Shape of an injectable fetcher: called as fetcher(url, timeout_seconds) and
# expected to hand back a FetchedSource.
FetchSource = Callable[[str, int], "FetchedSource"]


@dataclass(frozen=True)
class FetchedSource:
    """Immutable outcome of fetching a single primary source over HTTP."""

    # Outcome label; the built-in fetcher in this module emits "ok" or "error".
    status: str
    # HTTP status code when a response was received, otherwise None.
    http_status: int | None = None
    # Raw response payload; stays empty when nothing was read.
    body: bytes = b""
    # Short failure description, or None when no failure was recorded.
    error: str | None = None
def run_agent_market_watch(
    registry: dict[str, Any],
    *,
    registry_path: str,
    mode: str = "live",
    previous_report: dict[str, Any] | None = None,
    timeout_seconds: int = 12,
    fetcher: FetchSource | None = None,
    generated_at: str | None = None,
) -> dict[str, Any]:
    """Assemble the Agent market watch report for a source registry.

    Args:
        registry: Parsed registry; ``candidates``, ``discovery_sources``,
            ``cadence``, ``policy``, ``schema_version`` and ``updated_at``
            are read from it.
        registry_path: Path echoed back in the report's ``registry`` section.
        mode: ``"live"`` fetches sources; ``"offline"`` skips every fetch and
            also skips discovery sources.
        previous_report: Earlier report whose per-source version/hash values
            serve as the comparison baseline.
        timeout_seconds: Timeout passed to the fetcher for each request.
        fetcher: Replacement for ``fetch_url``; used as-is when supplied.
        generated_at: Timestamp to stamp on the report; defaults to the
            current UTC time in ISO-8601 form.

    Returns:
        The ``agent_market_watch_report_v1`` dictionary.

    Raises:
        ValueError: If ``mode`` is neither ``"live"`` nor ``"offline"``.
    """
    if mode not in {"live", "offline"}:
        raise ValueError("mode must be 'live' or 'offline'")

    fetch = fetch_url if fetcher is None else fetcher
    baseline = _previous_source_map(previous_report or {})

    evaluated: list[dict[str, Any]] = []
    queue: list[dict[str, Any]] = []
    failures: list[str] = []
    for entry in registry.get("candidates") or []:
        outcome = _evaluate_candidate(
            entry,
            mode=mode,
            timeout_seconds=timeout_seconds,
            fetcher=fetch,
            previous_sources=baseline,
        )
        evaluated.append(outcome)
        # One failure line per source that reported an error.
        for src in outcome["sources"]:
            if src.get("error"):
                failures.append(
                    f"{outcome['candidate_id']}:{src['source_id']}:{src['error']}"
                )
        if outcome["changed"]:
            queue.append(_integration_queue_item(entry, outcome))

    # Discovery of new candidates only happens when fetching is allowed.
    discoveries = []
    if mode == "live":
        for feed in registry.get("discovery_sources") or []:
            found = _fetch_discovery_source(feed, fetch, timeout_seconds)
            discoveries.append(found)
            if found.get("error"):
                failures.append(f"{feed.get('source_id')}:{found['error']}")

    changed_total = sum(1 for item in evaluated if item["changed"])
    summary = {
        "candidate_count": len(evaluated),
        "source_count": sum(len(item["sources"]) for item in evaluated),
        "changed_candidates": changed_total,
        "watch_only_candidates": len(evaluated) - changed_total,
        "integration_queue_count": len(queue),
        "failure_count": len(failures),
    }
    registry_info = {
        "path": registry_path,
        "schema_version": str(registry.get("schema_version", "")),
        "updated_at": str(registry.get("updated_at", "")),
    }
    stamp = generated_at or datetime.now(timezone.utc).isoformat()  # noqa: UP017
    return {
        "schema_version": "agent_market_watch_report_v1",
        "generated_at": stamp,
        "mode": mode,
        "registry": registry_info,
        "cadence": dict(registry.get("cadence") or {}),
        "policy": dict(registry.get("policy") or {}),
        "summary": summary,
        "candidates": evaluated,
        "integration_queue": queue,
        "new_candidate_discovery": discoveries,
        "failures": failures,
    }
def fetch_url(url: str, timeout_seconds: int) -> FetchedSource:
    """Default fetcher: retrieve ``url`` with stdlib urllib only.

    Delegates to ``_fetch_url`` with a budget of three manually followed
    redirects.
    """
    redirect_budget = 3
    return _fetch_url(url, timeout_seconds, redirects_remaining=redirect_budget)
def _fetch_url(url: str, timeout_seconds: int, redirects_remaining: int) -> FetchedSource:
    """GET ``url`` and report the outcome as a ``FetchedSource``.

    Failures are folded into the returned value rather than raised, so one
    bad source cannot abort the whole report.

    Args:
        url: Absolute URL to request.
        timeout_seconds: Timeout handed to ``urlopen``.
        redirects_remaining: How many more 3xx responses that surface as
            ``HTTPError`` may still be followed manually.

    Returns:
        ``status="ok"`` with the HTTP status and body on success; otherwise
        ``status="error"`` with ``error`` set to ``http_<code>`` for HTTP
        failures or to the exception text for anything else.
    """
    try:
        # Request() itself raises ValueError for a blank or scheme-less URL,
        # so it is built inside the try: a malformed registry entry becomes an
        # error result instead of an uncaught exception.
        request = Request(
            url,
            headers={
                "User-Agent": "awoooi-agent-market-watch/1.0",
                "Accept": "application/json,text/html,text/plain,*/*",
            },
        )
        with urlopen(request, timeout=timeout_seconds) as response:  # noqa: S310
            return FetchedSource(
                status="ok",
                http_status=int(response.status),
                body=response.read(),
            )
    except HTTPError as exc:
        redirect_target = None
        body = b""
        try:
            if exc.code in {301, 302, 303, 307, 308} and redirects_remaining > 0:
                headers = exc.headers
                location = headers.get("Location") if headers is not None else None
                if location:
                    joined = urljoin(url, location)
                    # Location is server-controlled: only follow web URLs so a
                    # redirect cannot steer the fetcher to file:// or similar.
                    if joined.lower().startswith(("http://", "https://")):
                        redirect_target = joined
            if redirect_target is None and hasattr(exc, "read"):
                body = exc.read()
        except Exception:
            # The status code is the signal; the error body is best-effort.
            body = b""
        finally:
            # HTTPError wraps the response object; release it explicitly
            # rather than waiting for garbage collection.
            if hasattr(exc, "close"):
                exc.close()
        if redirect_target is not None:
            return _fetch_url(
                redirect_target,
                timeout_seconds,
                redirects_remaining - 1,
            )
        return FetchedSource(
            status="error",
            http_status=int(exc.code),
            body=body,
            error=f"http_{exc.code}",
        )
    except URLError as exc:
        return FetchedSource(status="error", error=str(exc.reason))
    except Exception as exc:
        return FetchedSource(status="error", error=str(exc))
def _evaluate_candidate(
candidate: dict[str, Any],
*,
mode: str,
timeout_seconds: int,
fetcher: FetchSource,
previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
candidate_id = str(candidate.get("candidate_id", "")).strip()
source_results = [
_evaluate_source(
candidate_id,
source,
mode=mode,
timeout_seconds=timeout_seconds,
fetcher=fetcher,
previous_sources=previous_sources,
)
for source in candidate.get("sources") or []
]
changed = any(source.get("changed_since_reference") for source in source_results)
source_errors = [source for source in source_results if source.get("error")]
if changed:
decision = "changed_requires_replay_readiness_review"
actions = [
"refresh_market_capability_evidence",
"refresh_or_create_no_cost_adapter",
"run_offline_replay_before_shadow",
"do_not_promote_without_promotion_gate",
]
elif source_errors:
decision = "watch_with_source_failures"
actions = ["retry_source_fetch", "do_not_change_integration_status"]
else:
decision = "watch_only_no_change"
actions = ["keep_current_integration_status"]
return {
"candidate_id": candidate_id,
"display_name": str(candidate.get("display_name", candidate_id)),
"evaluation_priority": str(candidate.get("evaluation_priority", "watch")),
"recommended_role": str(candidate.get("recommended_role", "")),
"requires_cost_approval": bool(candidate.get("requires_cost_approval", False)),
"requires_dependency_approval": bool(candidate.get("requires_dependency_approval", False)),
"sources": source_results,
"changed": changed,
"decision": decision,
"recommended_actions": actions,
}
def _evaluate_source(
candidate_id: str,
source: dict[str, Any],
*,
mode: str,
timeout_seconds: int,
fetcher: FetchSource,
previous_sources: dict[tuple[str, str], dict[str, Any]],
) -> dict[str, Any]:
source_id = str(source.get("source_id", "")).strip()
source_type = str(source.get("type", "docs")).strip()
url = str(source.get("url", "")).strip()
reference_version = source.get("reference_version")
if mode == "offline":
return {
"source_id": source_id,
"type": source_type,
"url": url,
"status": "skipped_offline",
"http_status": None,
"version": reference_version,
"published_at": None,
"content_hash": None,
"changed_since_reference": False,
"reference_version": reference_version,
"error": None,
}
fetched = fetcher(url, timeout_seconds)
parsed = _parse_source(source_type, fetched.body) if fetched.body else {}
content_hash = _content_hash(fetched.body, source_type) if fetched.body else None
previous = previous_sources.get((candidate_id, source_id), {})
version = parsed.get("version")
published_at = parsed.get("published_at")
changed = _changed_since_reference(
version=version,
reference_version=reference_version,
content_hash=content_hash,
previous=previous,
)
return {
"source_id": source_id,
"type": source_type,
"url": url,
"status": fetched.status,
"http_status": fetched.http_status,
"version": version,
"published_at": published_at,
"content_hash": content_hash,
"changed_since_reference": changed,
"reference_version": reference_version,
"error": fetched.error,
}
def _parse_source(source_type: str, body: bytes) -> dict[str, str | None]:
if source_type == "pypi":
payload = _loads_json(body)
info = payload.get("info") if isinstance(payload, dict) else {}
version = str(info.get("version", "")) if isinstance(info, dict) else ""
releases = payload.get("releases") if isinstance(payload, dict) else {}
published_at = None
if isinstance(releases, dict) and version in releases and releases[version]:
first_file = releases[version][0]
if isinstance(first_file, dict):
published_at = first_file.get("upload_time_iso_8601")
return {"version": version or None, "published_at": published_at}
if source_type == "npm":
payload = _loads_json(body)
latest = None
published_at = None
if isinstance(payload, dict):
dist_tags = payload.get("dist-tags") or {}
latest = dist_tags.get("latest") if isinstance(dist_tags, dict) else None
times = payload.get("time") or {}
published_at = times.get(str(latest)) if isinstance(times, dict) and latest else None
return {"version": str(latest) if latest else None, "published_at": published_at}
if source_type == "github_release":
payload = _loads_json(body)
if isinstance(payload, dict):
version = payload.get("tag_name") or payload.get("name")
published_at = payload.get("published_at")
return {
"version": str(version) if version else None,
"published_at": str(published_at) if published_at else None,
}
return {"version": None, "published_at": None}
def _fetch_discovery_source(
source: dict[str, Any],
fetcher: FetchSource,
timeout_seconds: int,
) -> dict[str, Any]:
source_id = str(source.get("source_id", "")).strip()
url = str(source.get("url", "")).strip()
fetched = fetcher(url, timeout_seconds)
result: dict[str, Any] = {
"source_id": source_id,
"type": source.get("type"),
"url": url,
"status": fetched.status,
"http_status": fetched.http_status,
"items": [],
"error": fetched.error,
}
if fetched.status != "ok" or not fetched.body:
return result
payload = _loads_json(fetched.body)
if not isinstance(payload, dict):
return result
items = payload.get("items") or []
if not isinstance(items, list):
return result
result["items"] = [
{
"full_name": item.get("full_name"),
"html_url": item.get("html_url"),
"stargazers_count": item.get("stargazers_count"),
"updated_at": item.get("updated_at"),
}
for item in items[:5]
if isinstance(item, dict)
]
return result
def _integration_queue_item(
candidate: dict[str, Any],
candidate_result: dict[str, Any],
) -> dict[str, Any]:
return {
"candidate_id": candidate_result["candidate_id"],
"reason": "primary_source_version_or_content_changed",
"required_next_gate": "refresh_market_scorecard_then_offline_replay",
"requires_cost_approval": bool(candidate.get("requires_cost_approval", False)),
"requires_dependency_approval": bool(candidate.get("requires_dependency_approval", False)),
}
def _previous_source_map(report: dict[str, Any]) -> dict[tuple[str, str], dict[str, Any]]:
mapped: dict[tuple[str, str], dict[str, Any]] = {}
for candidate in report.get("candidates") or []:
candidate_id = str(candidate.get("candidate_id", "")).strip()
for source in candidate.get("sources") or []:
source_id = str(source.get("source_id", "")).strip()
if candidate_id and source_id:
mapped[(candidate_id, source_id)] = source
return mapped
def _changed_since_reference(
*,
version: str | None,
reference_version: Any,
content_hash: str | None,
previous: dict[str, Any],
) -> bool:
if reference_version and version and str(reference_version) != str(version):
return True
previous_version = previous.get("version")
if previous_version and version:
return str(previous_version) != str(version)
if version:
return False
previous_hash = previous.get("content_hash")
if previous_hash and content_hash and str(previous_hash) != str(content_hash):
return True
return False
def _content_hash(body: bytes, source_type: str) -> str:
if source_type == "docs":
normalized = _normalized_docs_text(body)
return hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:24]
return hashlib.sha256(body).hexdigest()[:24]
def _normalized_docs_text(body: bytes) -> str:
text = body.decode("utf-8", errors="replace")
text = re.sub(r"<!--.*?-->", " ", text, flags=re.DOTALL)
text = re.sub(r"<script\b[^>]*>.*?</script>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<style\b[^>]*>.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<noscript\b[^>]*>.*?</noscript>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<svg\b[^>]*>.*?</svg>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<[^>]+>", " ", text)
text = html.unescape(text)
text = re.sub(r"\s+", " ", text)
return text.strip().lower()
def _loads_json(body: bytes) -> Any:
try:
return json.loads(body.decode("utf-8"))
except Exception:
return {}