From ceb61c3c8ec788c4d6691b97cc9f006339bb0cc5 Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 19 Apr 2026 20:06:34 +0800 Subject: [PATCH] =?UTF-8?q?feat(asset=5Fscanner):=20Gap=201=20=E4=BF=AE=20?= =?UTF-8?q?=E2=80=94=20Prometheus=20targets=20=E8=A3=9C=E9=BD=8A=20host-in?= =?UTF-8?q?stall=20services?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit 發現 asset_inventory 只涵蓋 K8s (mon=120, mon1=121 共 2 node+78 pods), 完全漏 110 (Harbor/Gitea/監控) + 112 (security) + 188 (PG/Redis/Ollama) + 125 (mon backup/standby) 這 4 主機的 host-install services. 用戶 4 主機架構 (110/112/120/121/188) 只覆蓋 2/5 = 40%. 新增 _collect_prometheus_targets: GET /api/v1/targets?state=active → 自動發現全部被監控的: - host_service (IP 形式 target → postgres-110/redis-110/minio-188/node-exporter 等) - third_party_service (非 IP 如 alertmanager/argocd-server) - host (每個 unique IP 建 asset_type='host') - target → host 的 depends_on relationship 預期新增 asset_inventory: - host: 6 個 (110/112/120/121/125/188,Prometheus 看到的 blackbox-icmp 全覆蓋) - host_service: ~15 個 (postgres/redis/minio/node-exporter/cadvisor 等) - third_party_service: ~5 個 (alertmanager/argocd/prometheus/velero 等) 解鎖: - 110/112/188 host-install services 進入 asset_inventory - coverage_evaluator 可評估這些 asset (monitoring/alerting/playbook 等 7 維) - blast_radius_calculator 可查「110 PostgreSQL 影響哪些 service」 - Hermes/forecaster 建議範圍擴大到非 K8s 服務 對齊統帥鐵律: 朝 AI 自主化 — 不硬編主機清單,動態從 Prometheus 發現 Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/src/jobs/asset_scanner_job.py | 114 +++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/apps/api/src/jobs/asset_scanner_job.py b/apps/api/src/jobs/asset_scanner_job.py index be1d8d5f..f666a576 100644 --- a/apps/api/src/jobs/asset_scanner_job.py +++ b/apps/api/src/jobs/asset_scanner_job.py @@ -477,6 +477,120 @@ async def _collect_all_k8s_assets() -> tuple[list[dict[str, Any]], list[dict[str except Exception as e: logger.warning("collect_configmaps_failed", error=str(e)) + # 6. 
def _literal_ip(candidate: str) -> str:
    """Return the IP literal that *candidate* starts with, or "" if there is none.

    Recognised shapes: "10.0.0.1", "10.0.0.1:9100", "[::1]:9100" and a bare
    IPv6 literal. Anything else (DNS names, URLs, empty strings) yields "".
    """
    import ipaddress

    text = candidate.strip()
    if text.startswith("["):
        # Bracketed IPv6, optionally followed by ":port".
        host = text[1:].partition("]")[0]
    elif text.count(":") == 1:
        # Exactly one colon means host:port; more than one is a bare IPv6 literal.
        host = text.partition(":")[0]
    else:
        host = text
    try:
        # Validating with ipaddress rejects look-alikes such as "1234" or
        # "999.999.1" that a digits-and-dots check would let through.
        return str(ipaddress.ip_address(host))
    except ValueError:
        return ""


def _assets_from_prometheus_targets(
    payload: Any,
) -> tuple[list[dict[str, Any]], list[dict[str, str]], set[str]]:
    """Turn a decoded /api/v1/targets response into asset and relationship dicts.

    Args:
        payload: The decoded JSON body. Anything that is not shaped like
            {"data": {"activeTargets": [...]}} produces empty results.

    Returns:
        (assets, relationships, hosts) where *hosts* is the set of distinct
        host IPs for which a ``host`` asset was emitted.
    """
    assets: list[dict[str, Any]] = []
    relationships: list[dict[str, str]] = []
    seen_hosts: set[str] = set()
    seen_keys: set[str] = set()

    body = payload.get("data") if isinstance(payload, dict) else None
    targets = body.get("activeTargets") if isinstance(body, dict) else None

    for target in targets or []:
        if not isinstance(target, dict):
            continue
        labels = target.get("labels") or {}
        if not isinstance(labels, dict):
            continue
        instance = labels.get("instance", "")
        job = labels.get("job", "")
        if not instance or not job:
            continue

        asset_key = f"prometheus_target/{job}/{instance}"
        if asset_key in seen_keys:
            # The same job/instance pair can show up more than once; emit a
            # single asset per key so the key stays unique within the batch.
            continue
        seen_keys.add(asset_key)

        # Prefer the explicit `host` label; if it is absent or not an IP,
        # fall back to the host part of `instance` (with or without a port).
        host_ip = _literal_ip(str(labels.get("host") or "")) or _literal_ip(str(instance))

        metadata = {
            "job": job,
            "instance": instance,
            "scrape_url": target.get("scrapeUrl"),
            "health": target.get("health"),
            "labels": labels,
        }

        if not host_ip:
            # Non-IP target (e.g. a DNS-named service): record it as a
            # third-party service with no host attached.
            assets.append({
                "asset_key": asset_key,
                "asset_type": "third_party_service",
                "host": None,
                "namespace": None,
                "name": f"{job}:{instance}",
                "metadata": metadata,
                "tags": [f"job:{job}", "source:prometheus_target"],
            })
            continue

        # IP-addressed target: a service installed on a host.
        assets.append({
            "asset_key": asset_key,
            "asset_type": "host_service",
            "host": host_ip,
            "namespace": None,
            "name": f"{job}@{host_ip}",
            "metadata": metadata,
            "tags": [f"job:{job}", f"host:{host_ip}", "source:prometheus_target"],
        })

        host_key = f"host/{host_ip}"
        if host_ip not in seen_hosts:
            # First time this IP is seen: emit the host asset itself.
            seen_hosts.add(host_ip)
            assets.append({
                "asset_key": host_key,
                "asset_type": "host",
                "host": host_ip,
                "namespace": None,
                "name": host_ip,
                "metadata": {
                    "discovered_by": "prometheus_targets",
                    "source": "blackbox_icmp_or_node_exporter",
                },
                "tags": [f"ip:{host_ip}", "source:prometheus"],
            })

        # Link the service to the host it runs on.
        relationships.append({
            "from_key": asset_key,
            "to_key": host_key,
            "relationship_type": "depends_on",
        })

    return assets, relationships, seen_hosts


async def _collect_prometheus_targets() -> tuple[list[dict[str, Any]], list[dict[str, str]]]:
    """Discover monitored services and hosts from Prometheus /api/v1/targets.

    Every active target that carries both a ``job`` and an ``instance`` label
    becomes one asset:

    * ``host_service`` when an IP can be read from the ``host`` label or from
      ``instance`` ("ip", "ip:port" or "[ipv6]:port");
    * ``third_party_service`` otherwise, with ``host`` set to ``None``.

    Each distinct IP additionally yields one ``host`` asset, and every
    ``host_service`` gets a ``depends_on`` relationship to its host.

    Returns:
        (assets, relationships). Both lists are empty when the Prometheus
        request fails; the failure is logged as a warning, not raised.
    """
    import httpx
    from src.core.config import settings

    url = f"{settings.PROMETHEUS_URL.rstrip('/')}/api/v1/targets"
    try:
        async with httpx.AsyncClient(timeout=10.0, trust_env=False) as client:
            resp = await client.get(url, params={"state": "active"})
            resp.raise_for_status()
            data = resp.json()
    except Exception as e:
        # Deliberately broad: discovery is best-effort and must not abort the
        # surrounding asset scan when Prometheus is unreachable or misbehaves.
        logger.warning("prometheus_targets_fetch_failed", error=str(e))
        return [], []

    assets, relationships, hosts = _assets_from_prometheus_targets(data)
    logger.info("prometheus_targets_collected", count=len(assets), hosts=len(hosts))
    return assets, relationships