Files
awoooi/ops/monitoring/discover_docker.py

315 lines
9.1 KiB
Python
Executable File

#!/usr/bin/env python3
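"""
AWOOOI Docker container auto-discovery
======================================
ADR-037 Wave C.3: scan the Docker hosts and automatically update service-registry.yaml
Features:
1. Connect to the Docker hosts over SSH (188, 110)
2. List the running containers
3. Compare them against service-registry.yaml
4. Print a diff report (new / missing / not monitored)
Usage:
python ops/monitoring/discover_docker.py
python ops/monitoring/discover_docker.py --update # auto-update the registry
python ops/monitoring/discover_docker.py --json # JSON output
python ops/monitoring/discover_docker.py --host 192.168.0.188 # scan a single host only
Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""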
import argparse
import json
import subprocess
import sys
from pathlib import Path
import yaml
# Configuration: the registry file lives next to this script.
SCRIPT_DIR = Path(__file__).parent
REGISTRY_FILE = SCRIPT_DIR / "service-registry.yaml"
# Docker hosts to scan. Each entry provides the SSH target ("host",
# "ssh_user") plus the "name" and "role" labels that are copied onto every
# container discovered on that host.
DOCKER_HOSTS = [
{
"host": "192.168.0.188",
"name": "pg",
"role": "ai-web",
"ssh_user": "ogt",
},
{
"host": "192.168.0.110",
"name": "harbor",
"role": "devops",
"ssh_user": "wooo",
},
]
# Containers to ignore (system containers). discover_containers() treats
# each entry as a name prefix (str.startswith), not as an exact name.
IGNORE_CONTAINERS = {
"k3s-agent",
"k3s",
"rancher-",
"portainer",
}
def run_ssh_command(host: str, user: str, command: str) -> tuple[bool, str]:
    """Run a command on a remote host over SSH.

    The connection is non-interactive (``BatchMode=yes``), gives up
    connecting after 5 seconds, accepts previously unknown host keys
    (``StrictHostKeyChecking=accept-new``), and the whole call is limited
    to 30 seconds.

    Args:
        host: Host name or IP address to connect to.
        user: SSH login name.
        command: Command line passed to ``ssh`` for remote execution.

    Returns:
        ``(True, stdout)`` when ``ssh`` exits with status 0. Otherwise
        ``(False, reason)`` where ``reason`` is the captured stderr
        (falling back to stdout, then to the exit status),
        ``"SSH timeout"`` when the 30-second limit is hit, or the text of
        the error that prevented ``ssh`` from running at all.
    """
    ssh_argv = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        "ConnectTimeout=5",
        "-o",
        "StrictHostKeyChecking=accept-new",
        f"{user}@{host}",
        command,
    ]
    try:
        result = subprocess.run(
            ssh_argv,
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return False, "SSH timeout"
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # e.g. the ssh binary is missing or its output cannot be decoded.
        return False, str(e)

    if result.returncode == 0:
        return True, result.stdout

    # ssh and the remote command report failures on stderr; returning
    # stdout here would usually hand the caller an empty error message.
    reason = (
        result.stderr.strip()
        or result.stdout.strip()
        or f"ssh exited with status {result.returncode}"
    )
    return False, reason
def _first_published_port(ports: str) -> "int | None":
    """Return the first parseable host port in a ``docker ps`` Ports string."""
    # Expected shape: "0.0.0.0:8080->8080/tcp, :::8080->8080/tcp, 9000/tcp"
    for mapping in ports.split(","):
        host_side, arrow, _ = mapping.partition("->")
        if not arrow:
            continue  # exposed but not published to the host
        try:
            return int(host_side.split(":")[-1])
        except ValueError:
            # Not a single port number (e.g. a range such as "8000-8010");
            # keep looking instead of giving up on the whole container.
            continue
    return None


def discover_containers(host_info: dict) -> list[dict]:
    """List the running Docker containers on one host.

    Runs ``docker ps --format "{{json .}}"`` on the host through
    ``run_ssh_command`` and parses one JSON object per output line.
    Containers whose name starts with an entry of ``IGNORE_CONTAINERS`` are
    skipped, as are lines that are not a JSON object.

    Args:
        host_info: Host description with the keys ``host``, ``ssh_user``,
            ``name`` and ``role``.

    Returns:
        One dict per container with the keys ``name``, ``image``,
        ``status``, ``port`` (first published host port, or ``None``),
        ``host``, ``host_name`` and ``role``. An empty list is returned,
        after printing an error line, when the SSH command fails.
    """
    host = host_info["host"]
    user = host_info["ssh_user"]
    # Ask docker for one JSON document per container.
    cmd = 'docker ps --format "{{json .}}"'
    success, output = run_ssh_command(host, user, cmd)
    if not success:
        print(f" ❌ 無法連線 {host}: {output}")
        return []

    ignored_prefixes = tuple(IGNORE_CONTAINERS)
    containers = []
    for line in output.strip().split("\n"):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue  # not JSON at all (e.g. a shell banner line)
        if not isinstance(data, dict):
            continue  # valid JSON, but not a container record

        name = data.get("Names", "")
        # Filter out system containers.
        if name.startswith(ignored_prefixes):
            continue

        ports = data.get("Ports", "")
        containers.append({
            "name": name,
            "image": data.get("Image", ""),
            "status": data.get("Status", ""),
            "port": _first_published_port(ports) if ports else None,
            "host": host,
            "host_name": host_info["name"],
            "role": host_info["role"],
        })
    return containers
def load_registry() -> dict:
    """Load the service registry from ``REGISTRY_FILE``.

    Returns:
        The parsed registry mapping. The ``services`` key is always present
        and is a list: a missing file, an empty file, a missing ``services``
        key and an empty ``services:`` value all yield ``[]`` for it.

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    if not REGISTRY_FILE.exists():
        return {"services": []}

    # Read as UTF-8 explicitly: the registry is written with
    # allow_unicode=True, so relying on the locale encoding is fragile.
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        registry = yaml.safe_load(f) or {}

    if not isinstance(registry, dict):
        raise ValueError(
            f"{REGISTRY_FILE}: expected a mapping at the top level, "
            f"got {type(registry).__name__}"
        )
    # "services:" with no value loads as None; normalise so callers can
    # iterate over and append to the list.
    if registry.get("services") is None:
        registry["services"] = []
    return registry
def compare_with_registry(containers: list[dict], registry: dict) -> dict:
    """Classify discovered containers against the service registry.

    Only registry entries with ``type == "docker"`` take part. Containers
    and entries are matched on the pair (host, name).

    Args:
        containers: Discovered containers; each needs ``host`` and ``name``.
        registry: Parsed registry mapping. A missing or empty ``services``
            value is treated as no services, and a missing or empty
            ``monitoring`` value as Prometheus being disabled.

    Returns:
        A dict with four lists:
        ``monitored`` - containers registered with Prometheus enabled;
        ``new`` - containers that are not in the registry;
        ``missing`` - registry entries (``name``, ``host``, ``port``) with
        no matching container;
        ``no_prometheus`` - containers registered without Prometheus.
        Entries in ``monitored`` and ``no_prometheus`` are the container
        dict plus a ``registry_name`` key.
    """
    # Index the docker services by "host:name"; a later duplicate wins.
    registry_services = {
        f"{svc.get('host', '')}:{svc.get('name', '')}": svc
        for svc in registry.get("services") or []
        if svc.get("type") == "docker"
    }

    result = {
        "monitored": [],      # registered and scraped by Prometheus
        "new": [],            # discovered but not in the registry
        "missing": [],        # in the registry but not running
        "no_prometheus": [],  # registered, Prometheus not enabled
    }

    discovered_keys = set()
    for container in containers:
        key = f"{container['host']}:{container['name']}"
        discovered_keys.add(key)

        svc = registry_services.get(key)
        if svc is None:
            result["new"].append(container)
            continue

        # "monitoring:" with no value loads from YAML as None.
        monitoring = svc.get("monitoring") or {}
        bucket = "monitored" if monitoring.get("prometheus") else "no_prometheus"
        result[bucket].append({**container, "registry_name": svc.get("name", "")})

    # Registry entries for which no running container was seen.
    result["missing"].extend(
        {
            "name": svc.get("name", ""),
            "host": svc.get("host", ""),
            "port": svc.get("port"),
        }
        for key, svc in registry_services.items()
        if key not in discovered_keys
    )
    return result
def generate_service_entry(container: dict) -> dict:
    """Build a service-registry.yaml entry for a discovered container.

    Args:
        container: Discovered container; ``name``, ``host`` and ``role``
            are required, ``port`` is optional and may be ``None``.

    Returns:
        A registry entry of type ``docker`` with Prometheus monitoring
        enabled, auto-repair disabled, criticality ``P2`` and the owner
        ``"<role>-team"``. The port falls back to 8080 when the container
        has no known port.
    """
    # Discovered containers always carry a "port" key, possibly set to
    # None, so a plain .get("port", 8080) would never apply the default.
    port = container.get("port")
    if port is None:
        port = 8080

    return {
        "name": container["name"],
        "type": "docker",
        "host": container["host"],
        "port": port,
        "health_endpoint": "/health",
        "monitoring": {
            "prometheus": True,
            "sentry": False,
            "otel": False,
        },
        "alerts": ["service_down"],
        "auto_repair": {
            "enabled": False,
        },
        "owner": f"{container['role']}-team",
        "criticality": "P2",
    }
def print_report(comparison: dict) -> bool:
    """Print the human-readable discovery report to stdout.

    Args:
        comparison: Result of ``compare_with_registry`` with the lists
            ``monitored``, ``new``, ``no_prometheus`` and ``missing``.

    Returns:
        ``True`` when no new (unregistered) containers were found,
        ``False`` otherwise.
    """
    def port_label(container: dict):
        # The "port" key is normally present but may hold None, so a
        # .get() default alone would print "None" instead of "N/A".
        port = container.get("port")
        return "N/A" if port is None else port

    rule = "=" * 60
    print("\n" + rule)
    print(" AWOOOI Docker Container Discovery Report")
    print(rule)

    print(f"\n ✅ 已監控容器 ({len(comparison['monitored'])})")
    for c in comparison["monitored"]:
        print(f" - {c['name']} @ {c['host']}:{port_label(c)}")

    if comparison["new"]:
        print(f"\n 🆕 新發現容器 ({len(comparison['new'])})")
        for c in comparison["new"]:
            print(f" - {c['name']} @ {c['host']}:{port_label(c)}")
            print(f" Image: {c['image']}")

    if comparison["no_prometheus"]:
        print(f"\n ⚠️ 未啟用 Prometheus ({len(comparison['no_prometheus'])})")
        for c in comparison["no_prometheus"]:
            print(f" - {c['name']} @ {c['host']}")

    if comparison["missing"]:
        print(f"\n ❌ Registry 有但容器不存在 ({len(comparison['missing'])})")
        for c in comparison["missing"]:
            print(f" - {c['name']} @ {c['host']}")

    print("\n" + rule)

    # Statistics: only running containers count towards the total, so
    # "missing" registry entries are excluded.
    monitored = len(comparison["monitored"])
    total = monitored + len(comparison["new"]) + len(comparison["no_prometheus"])
    coverage = round(100 * monitored / total, 1) if total > 0 else 0
    print(f"\n 總容器數: {total}")
    print(f" 已監控: {monitored}")
    print(f" 覆蓋率: {coverage}%")

    # No newly discovered containers means the check passes.
    return len(comparison["new"]) == 0
def main():
    """Command-line entry point: scan the hosts, compare and report.

    Options:
        --update  Append every newly discovered container to
                  service-registry.yaml.
        --json    Print the comparison as JSON on stdout; progress and
                  error messages go to stderr so stdout stays parseable.
                  No report is printed and the registry is not updated.
        --host    Scan only the entry of ``DOCKER_HOSTS`` with this host.

    Outside ``--json`` mode the process exits with status 1 whenever new
    containers were discovered, including when ``--update`` has just
    registered them.
    """
    import contextlib  # only needed here, to keep --json output clean

    parser = argparse.ArgumentParser(description="AWOOOI Docker Container Discovery")
    parser.add_argument("--update", action="store_true", help="Auto-update service-registry.yaml")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--host", type=str, help="Only scan specific host")
    args = parser.parse_args()

    # In --json mode nothing but the JSON document may reach stdout, so
    # everything printed while scanning is diverted to stderr.
    progress_stream = sys.stderr if args.json else sys.stdout

    # Discover containers on every selected host.
    all_containers = []
    with contextlib.redirect_stdout(progress_stream):
        print("Discovering Docker containers...")
        for host_info in DOCKER_HOSTS:
            if args.host and host_info["host"] != args.host:
                continue
            print(f" Scanning {host_info['name']} ({host_info['host']})...")
            containers = discover_containers(host_info)
            print(f" Found {len(containers)} containers")
            all_containers.extend(containers)

    # Load the registry and compare.
    registry = load_registry()
    comparison = compare_with_registry(all_containers, registry)

    if args.json:
        print(json.dumps(comparison, indent=2, default=str))
        return

    all_good = print_report(comparison)

    # Optionally register the newly discovered containers.
    if args.update and comparison["new"]:
        print("\nUpdating service-registry.yaml...")
        # The registry may lack "services" or hold None for it.
        services = registry.get("services")
        if services is None:
            services = registry["services"] = []
        for container in comparison["new"]:
            entry = generate_service_entry(container)
            services.append(entry)
            print(f" Added: {entry['name']}")
        # allow_unicode=True emits raw non-ASCII text, so the file must be
        # written as UTF-8 rather than in the locale's encoding.
        with open(REGISTRY_FILE, "w", encoding="utf-8") as f:
            yaml.dump(registry, f, default_flow_style=False, allow_unicode=True)
        print(f"Updated: {REGISTRY_FILE}")

    if not all_good:
        print("\n⚠️ 有新發現的容器未加入監控,請檢查上方報告")
        sys.exit(1)


if __name__ == "__main__":
    main()