# AWOOOI 監控自動整合架構

> **版本**: v1.0
> **建立日期**: 2026-03-29
> **目標**: 新服務/功能自動獲得監控,零遺漏

---

## 核心原則

```
┌─────────────────────────────────────────────────────────────────┐
│  🎯 監控即代碼 (Monitoring as Code)                             │
│                                                                 │
│  • 所有監控配置存放於 Git                                       │
│  • CI/CD 自動驗證監控覆蓋率                                     │
│  • 新服務必須通過監控檢查才能部署                               │
│  • 服務註冊表自動同步監控規則                                   │
└─────────────────────────────────────────────────────────────────┘
```

---

## 一、服務註冊表 (Service Registry)

### 1.1 註冊表結構

```yaml
# /ops/monitoring/service-registry.yaml
# Single source of truth for every monitored service

services:
  # === K8s workloads ===
  - name: awoooi-api
    type: k8s-deployment
    namespace: awoooi-prod
    port: 8000
    health_endpoint: /api/v1/health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true  # makes LLM calls
    alerts:
      - pod_crash
      - high_error_rate
      - slow_response
    owner: backend-team

  - name: awoooi-web
    type: k8s-deployment
    namespace: awoooi-prod
    port: 3000
    health_endpoint: /
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: false
    alerts:
      - pod_crash
      - slow_page_load
    owner: frontend-team

  - name: awoooi-worker
    type: k8s-deployment
    namespace: awoooi-prod
    health_endpoint: /tmp/worker-healthy  # exec probe (no port, not an HTTP check)
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - worker_stuck
      - queue_backlog
    owner: backend-team

  # === Docker containers (host 192.168.0.188) ===
  - name: ollama
    type: docker
    host: 192.168.0.188
    port: 11434
    health_endpoint: /api/tags
    monitoring:
      prometheus: true
      sentry: false  # external service
      otel: false
    alerts:
      - service_down
      - inference_timeout
    owner: ai-team

  - name: openclaw
    type: docker
    host: 192.168.0.188
    port: 8089
    health_endpoint: /health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - service_down
      - analysis_timeout
    owner: ai-team

  - name: redis
    type: docker
    host: 192.168.0.188
    port: 6380
    health_endpoint: redis-cli ping
    monitoring:
      prometheus: true
    alerts:
      - service_down
      - memory_high
    owner: infra-team

  - name: postgres
    type: docker
    host: 192.168.0.188
    port: 5432
    health_endpoint: pg_isready
    monitoring:
      prometheus: true
    alerts:
      - service_down
      - connection_pool_exhausted
      - slow_query
    owner: infra-team

  # === Docker containers (host 192.168.0.110) ===
  - name: harbor
    type: docker
    host: 192.168.0.110
    port: 5000
    health_endpoint: /api/v2.0/health
    monitoring:
      prometheus: true
    alerts:
      - service_down
      - storage_full
    owner: devops-team

  - name: sentry
    type: docker
    host: 192.168.0.110
    port: 9000
    health_endpoint: /_health/
    monitoring:
      prometheus: true
    alerts:
      - service_down
    owner: devops-team

  - name: langfuse
    type: docker
    host: 192.168.0.110
    port: 3100
    health_endpoint: /api/public/health
    monitoring:
      prometheus: true
    alerts:
      - service_down
    owner: ai-team

  - name: github-runner
    type: systemd
    host: 192.168.0.110
    service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
    monitoring:
      prometheus: true
    alerts:
      - runner_offline
    owner: devops-team

# === Frontend pages ===
pages:
  - path: /
    name: Dashboard
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - js_error

  - path: /authorizations
    name: 授權管理
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - api_error

  - path: /action-logs
    name: 行動日誌
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

  - path: /errors
    name: 錯誤追蹤
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

# === API endpoints ===
api_endpoints:
  - path: /api/v1/health
    method: GET
    critical: true
    slo_latency_ms: 100
    slo_availability: 99.99

  - path: /api/v1/approvals
    method: GET
    critical: true
    slo_latency_ms: 500
    slo_availability: 99.9

  - path: /api/v1/analyze
    method: POST
    critical: true
    slo_latency_ms: 30000  # 30s (LLM)
    slo_availability: 95

  - path: /api/v1/webhooks/alertmanager
    method: POST
    critical: true
    slo_latency_ms: 5000
    slo_availability: 99.9
```

### 1.2 自動生成工具

```python
# /ops/monitoring/generate_monitoring.py
"""
Generate monitoring configuration from service-registry.yaml:

1. Prometheus scrape configs
2. Prometheus alert rules (delivered through Alertmanager)
3. Grafana dashboards (TODO: not generated by this script yet)
4. Blackbox exporter targets
"""
import copy
import sys
from pathlib import Path

import yaml

REGISTRY_PATH = Path('ops/monitoring/service-registry.yaml')
OUTPUT_DIR = Path('ops/monitoring/generated')


def generate_prometheus_config(registry: dict) -> str:
    """Render the Prometheus ``scrape_configs`` section as a YAML string.

    Only services with ``monitoring.prometheus`` set are included.
    K8s deployments use pod service discovery in their namespace; every
    other service needs both ``host`` and ``port`` and is scraped as a
    static target. Services that provide neither are skipped with a
    warning instead of raising ``KeyError``.
    """
    scrape_configs = []

    for service in registry.get('services', []):
        if not service.get('monitoring', {}).get('prometheus'):
            continue

        config = {
            'job_name': service['name'],
            'metrics_path': '/metrics',
            'scrape_interval': '15s',
        }

        if service['type'] == 'k8s-deployment':
            config['kubernetes_sd_configs'] = [{
                'role': 'pod',
                'namespaces': {'names': [service['namespace']]},
            }]
            # Without a filter this job would scrape every pod in the
            # namespace. Assumes pods carry the label app=<service name>,
            # the same label kubernetes-sd.yaml uses as job name --
            # TODO confirm against the deployment manifests.
            config['relabel_configs'] = [{
                'source_labels': ['__meta_kubernetes_pod_label_app'],
                'action': 'keep',
                'regex': service['name'],
            }]
        elif service.get('host') and service.get('port'):
            config['static_configs'] = [{
                'targets': [f"{service['host']}:{service['port']}"]
            }]
        else:
            # e.g. systemd units, which declare no port of their own
            print(
                f"Warning: '{service['name']}' has no host/port, "
                "no scrape config generated",
                file=sys.stderr,
            )
            continue

        scrape_configs.append(config)

    return yaml.dump({'scrape_configs': scrape_configs})


def generate_alert_rules(registry: dict) -> str:
    """Render Prometheus alert rule groups (one group per service) as YAML.

    Alert names that have no entry in ``ALERT_TEMPLATES`` are skipped with
    a warning; emitting them would produce rules without ``alert``/``expr``
    that Prometheus rejects.

    NOTE(review): the template expressions are not scoped to the service,
    so the same expression is emitted once per service that lists the
    alert -- consider adding a per-service label matcher.
    """
    groups = []

    for service in registry.get('services', []):
        rules = []

        for alert in service.get('alerts', []):
            template = ALERT_TEMPLATES.get(alert)
            if template is None:
                print(
                    f"Warning: no alert template for '{alert}' "
                    f"(service '{service['name']}'), rule skipped",
                    file=sys.stderr,
                )
                continue

            # Deep copy: a shallow copy shares the nested ``annotations``
            # dict between rules, which PyYAML serialises as anchors/aliases.
            rule = copy.deepcopy(template)
            rule['labels'] = {
                'service': service['name'],
                'owner': service.get('owner', 'unknown'),
                'severity': 'critical' if alert in CRITICAL_ALERTS else 'warning',
            }
            rules.append(rule)

        if rules:
            groups.append({
                'name': f"{service['name']}_alerts",
                'rules': rules,
            })

    return yaml.dump({'groups': groups})


def _probe_host(service: dict):
    """Return the hostname a health probe should use, or None if unknown."""
    if service['type'] == 'k8s-deployment':
        namespace = service.get('namespace')
        if not namespace:
            return None
        # Assumes a K8s Service named after the deployment exists --
        # TODO confirm; otherwise probe through the ingress instead.
        return f"{service['name']}.{namespace}.svc.cluster.local"
    return service.get('host')


def generate_blackbox_targets(registry: dict) -> list:
    """Build Blackbox Exporter HTTP health-check targets.

    A target is produced only for docker / k8s-deployment services that
    have a port, a resolvable host and an HTTP path as ``health_endpoint``.
    Command-style checks (``redis-cli ping``, ``pg_isready``) and exec
    probes without a port cannot be expressed as a URL and are skipped.
    """
    targets = []

    for service in registry.get('services', []):
        if service['type'] not in ('docker', 'k8s-deployment'):
            continue

        endpoint = service.get('health_endpoint')
        port = service.get('port')
        host = _probe_host(service)
        if not endpoint or not endpoint.startswith('/') or not port or not host:
            continue

        targets.append({
            'targets': [f"http://{host}:{port}{endpoint}"],
            'labels': {
                'service': service['name'],
                'type': service['type'],
            }
        })

    return targets


# Alert templates, keyed by the alert names used in the registry
ALERT_TEMPLATES = {
    'pod_crash': {
        'alert': 'PodCrashLoopBackOff',
        'expr': 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0',
        'for': '2m',
        'annotations': {
            'summary': 'Pod {{ $labels.pod }} is crash looping',
            'auto_repair': 'restart_pod',
        }
    },
    'high_error_rate': {
        'alert': 'HighErrorRate',
        # Aggregate both sides first: dividing the raw vectors only matches
        # series with identical labels, so the 5xx series would be divided
        # by themselves and the ratio would always be 1.
        'expr': (
            'sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) '
            '/ sum by (job) (rate(http_requests_total[5m])) > 0.01'
        ),
        'for': '5m',
        'annotations': {
            'summary': 'High error rate on {{ $labels.service }}',
            'auto_repair': 'restart_pod',
        }
    },
    'service_down': {
        'alert': 'ServiceDown',
        'expr': 'probe_success == 0',
        'for': '1m',
        'annotations': {
            'summary': '{{ $labels.service }} is down',
            'auto_repair': 'restart_container',
        }
    },
    'slow_response': {
        'alert': 'SlowResponse',
        # histogram_quantile needs per-second bucket rates aggregated by
        # ``le``; raw cumulative buckets give a lifetime quantile.
        'expr': (
            'histogram_quantile(0.95, sum by (le, job) '
            '(rate(http_request_duration_seconds_bucket[5m]))) > 2'
        ),
        'for': '5m',
        'annotations': {
            'summary': 'Slow response on {{ $labels.service }}',
            'auto_repair': 'scale_up',
        }
    },
    'memory_high': {
        'alert': 'MemoryHigh',
        'expr': 'container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9',
        'for': '5m',
        'annotations': {
            'summary': 'High memory usage on {{ $labels.service }}',
            'auto_repair': 'analyze_memory_leak',
        }
    },
    'runner_offline': {
        'alert': 'GitHubRunnerOffline',
        'expr': 'github_runner_status == 0',
        'for': '5m',
        'annotations': {
            'summary': 'GitHub Runner is offline',
            'auto_repair': 'restart_runner_service',
        }
    },
}

CRITICAL_ALERTS = {'pod_crash', 'service_down', 'runner_offline'}


def main() -> None:
    """Load the registry and write the generated files to OUTPUT_DIR."""
    with REGISTRY_PATH.open(encoding='utf-8') as f:
        registry = yaml.safe_load(f)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    outputs = {
        'prometheus-scrape-configs.yaml': generate_prometheus_config(registry),
        'prometheus-rules.yaml': generate_alert_rules(registry),
        'blackbox-targets.yaml': yaml.dump(generate_blackbox_targets(registry)),
    }
    for filename, content in outputs.items():
        (OUTPUT_DIR / filename).write_text(content, encoding='utf-8')
        print(f"Wrote {OUTPUT_DIR / filename}")


if __name__ == '__main__':
    main()
```

---

## 二、CI/CD 整合 (自動監控)

### 2.1 新服務自動監控流程

```yaml
# .github/workflows/cd.yaml -- additional job

jobs:
  monitoring-validation:
    name: "🔍 Monitoring Coverage Check"
    runs-on: self-hosted
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Validate Service Registry
        run: |
          # Check that every K8s Deployment is present in the registry
          python ops/monitoring/validate_coverage.py

      - name: Generate Monitoring Configs
        run: |
          # Generate the latest monitoring configs from the registry
          python ops/monitoring/generate_monitoring.py

      - name: Check for Drift
        run: |
          # Fail when the generated configs differ from the active ones
          diff -r ops/monitoring/generated/ ops/monitoring/active/

      - name: Apply Monitoring Configs
        if: github.ref == 'refs/heads/main'
        run: |
          # Deploy the monitoring configs
          kubectl apply -f ops/monitoring/generated/prometheus-rules.yaml
          # NOTE: alertmanager-config.yaml is not produced by
          # generate_monitoring.py yet; it has to be provided separately.
          kubectl apply -f ops/monitoring/generated/alertmanager-config.yaml
```

### 2.2 新服務檢測腳本

```python
# /ops/monitoring/validate_coverage.py
"""
CI check: make sure every running service is listed in the service registry.
"""
import subprocess
import sys

import yaml

REGISTRY_PATH = 'ops/monitoring/service-registry.yaml'
SYSTEM_NAMESPACES = ('kube-system', 'kube-public', 'kube-node-lease')
IGNORED_CONTAINER_PARTS = ('k3s', 'pause', 'coredns')
COMMAND_TIMEOUT_SECONDS = 30


def get_k8s_deployments(exclude_namespaces=SYSTEM_NAMESPACES) -> list[str]:
    """Return the names of all K8s Deployments outside system namespaces.

    Raises ``subprocess.CalledProcessError`` / ``TimeoutExpired`` when
    kubectl fails, so a broken cluster connection cannot be mistaken for
    "no deployments" and let the check pass.
    """
    result = subprocess.run(
        ['kubectl', 'get', 'deployments', '-A', '-o',
         r'jsonpath={range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\n"}{end}'],
        capture_output=True, text=True, check=True,
        timeout=COMMAND_TIMEOUT_SECONDS,
    )
    names = []
    for line in result.stdout.splitlines():
        namespace, _, name = line.partition('\t')
        if name and namespace not in exclude_namespaces:
            names.append(name)
    return names


def get_docker_containers(host: str) -> list[str]:
    """Return the names of the Docker containers running on ``host``.

    Raises ``subprocess.CalledProcessError`` / ``TimeoutExpired`` when the
    ssh or docker command fails.
    """
    result = subprocess.run(
        ['ssh', host, 'docker', 'ps', '--format', '{{.Names}}'],
        capture_output=True, text=True, check=True,
        timeout=COMMAND_TIMEOUT_SECONDS,
    )
    return [name for name in result.stdout.splitlines() if name.strip()]


def load_registry() -> dict:
    """Load the service registry."""
    # Explicit encoding: the registry contains non-ASCII text.
    with open(REGISTRY_PATH, encoding='utf-8') as f:
        return yaml.safe_load(f)


def main():
    registry = load_registry()
    registered_services = {s['name'] for s in registry['services']}

    errors = []

    # Check K8s Deployments. Fail closed: if the cluster cannot be
    # queried, coverage cannot be confirmed.
    try:
        k8s_deployments = get_k8s_deployments()
    except (subprocess.SubprocessError, OSError) as e:
        print(f"❌ Cannot list K8s Deployments: {e}")
        sys.exit(1)

    for deploy in k8s_deployments:
        if deploy not in registered_services and not deploy.startswith('kube-'):
            errors.append(f"❌ K8s Deployment '{deploy}' 未在 service-registry.yaml 中註冊")

    # Check Docker containers (188, 110). Best effort: an unreachable
    # host only produces a warning.
    for host in ['192.168.0.188', '192.168.0.110']:
        try:
            containers = get_docker_containers(host)
        except (subprocess.SubprocessError, OSError) as e:
            print(f"Warning: Cannot check {host}: {e}")
            continue

        for container in containers:
            if container in registered_services:
                continue
            # Ignore system containers
            if any(part in container for part in IGNORED_CONTAINER_PARTS):
                continue
            errors.append(f"⚠️ Docker 容器 '{container}' on {host} 未在 registry 中")

    if errors:
        print("\n".join(errors))
        print(f"\n❌ 發現 {len(errors)} 個未監控的服務!")
        print("請更新 ops/monitoring/service-registry.yaml")
        sys.exit(1)

    print("✅ 所有服務都已註冊監控")
    sys.exit(0)


if __name__ == '__main__':
    main()
```

### 2.3 新 API 端點自動監控

```python
# /apps/api/src/core/auto_monitoring.py
"""
Automatic monitoring for FastAPI routes.

- Records Prometheus request count and latency for every endpoint that is
  registered on a router after ``auto_monitor(router)`` was called.
- Sentry tracing and health-check registration are planned but are not
  implemented in this module.
"""
import inspect
import time
from functools import wraps

from fastapi import APIRouter
from fastapi.concurrency import run_in_threadpool
from prometheus_client import Counter, Histogram
from starlette.exceptions import HTTPException

# Prometheus metrics (created at import time)
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'path', 'status']
)

REQUEST_LATENCY = Histogram(
    'http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'path'],
    buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
)


def auto_monitor(router: APIRouter):
    """Instrument every route added to ``router`` from now on.

    Usage (call it before declaring routes)::

        router = APIRouter(prefix="/api/v1/new-feature")
        auto_monitor(router)

    Returns the same router so the call can be chained.

    The wrapper does not require the endpoint to declare a ``Request``
    parameter: the method label comes from the route definition and the
    path label is the route template, which keeps label cardinality low.
    """
    original_add_api_route = router.add_api_route

    def monitored_add_api_route(path, endpoint, **kwargs):
        # Route-level facts are known at registration time.
        method = ",".join(sorted(kwargs.get('methods') or ['GET']))
        default_status = kwargs.get('status_code') or 200
        is_async = inspect.iscoroutinefunction(endpoint)

        # ``wraps`` sets __wrapped__, so FastAPI still resolves the
        # original endpoint's parameters for dependency injection.
        @wraps(endpoint)
        async def monitored_endpoint(*args, **inner_kwargs):
            start_time = time.perf_counter()
            status = 500  # any unexpected exception counts as a 500
            try:
                if is_async:
                    response = await endpoint(*args, **inner_kwargs)
                else:
                    # Keep sync endpoints off the event loop, as FastAPI does.
                    response = await run_in_threadpool(endpoint, *args, **inner_kwargs)
                status = getattr(response, 'status_code', default_status)
                return response
            except HTTPException as exc:
                # Deliberate HTTP errors (404, 422, ...) are not server errors.
                status = exc.status_code
                raise
            finally:
                # Record metrics on success and failure alike
                REQUEST_COUNT.labels(
                    method=method,
                    path=path,
                    status=status
                ).inc()
                REQUEST_LATENCY.labels(
                    method=method,
                    path=path
                ).observe(time.perf_counter() - start_time)

        return original_add_api_route(path, monitored_endpoint, **kwargs)

    router.add_api_route = monitored_add_api_route
    return router
```

---

## 三、前端自動監控

### 3.1 頁面自動埋點

```typescript
// /apps/web/src/lib/auto-monitoring.ts
/**
 * Automatic frontend monitoring
 * - collects Web Vitals
 * - reports page errors
 * - traces API calls made through fetch
 */

import * as Sentry from '@sentry/nextjs';
import type { Metric } from 'web-vitals';

let initialized = false;

// Call once from _app.tsx
export function initAutoMonitoring() {
  // Browser only, and idempotent: a second call must not stack another
  // fetch wrapper or duplicate the listeners.
  if (typeof window === 'undefined' || initialized) {
    return;
  }
  initialized = true;

  // 1. Web Vitals
  // onFID is intentionally not used: FID was replaced by INP and the
  // export was removed in web-vitals v4.
  import('web-vitals').then(({ onCLS, onLCP, onTTFB, onINP }) => {
    onCLS(sendToAnalytics);
    onLCP(sendToAnalytics);
    onTTFB(sendToAnalytics);
    onINP(sendToAnalytics);
  });

  // 2. Global error handling
  window.addEventListener('error', (event) => {
    Sentry.captureException(event.error);
  });

  window.addEventListener('unhandledrejection', (event) => {
    Sentry.captureException(event.reason);
  });

  // 3. API call tracing
  patchFetch();
}

function sendToAnalytics(metric: Metric) {
  // Send to Sentry Performance
  Sentry.metrics.distribution(
    `web_vitals.${metric.name}`,
    metric.value,
    {
      tags: {
        page: window.location.pathname,
      },
    }
  );
}

function patchFetch() {
  const originalFetch = window.fetch.bind(window);

  window.fetch = async function (input: RequestInfo | URL, init?: RequestInit) {
    // input may be a string, a URL object or a Request
    const url =
      typeof input === 'string' ? input : input instanceof URL ? input.href : input.url;
    const method = init?.method ?? (input instanceof Request ? input.method : 'GET');

    // startSpan takes a callback and ends the span when it settles
    return Sentry.startSpan(
      { name: `${method} ${url}`, op: 'http.client' },
      async () => {
        try {
          const response = await originalFetch(input, init);

          // Record API errors
          if (!response.ok) {
            Sentry.captureMessage(`API Error: ${method} ${url} - ${response.status}`, {
              level: response.status >= 500 ? 'error' : 'warning',
              extra: {
                status: response.status,
                statusText: response.statusText,
              },
            });
          }

          return response;
        } catch (error) {
          Sentry.captureException(error);
          throw error;
        }
      }
    );
  };
}
```

### 3.2 新頁面自動檢測

```typescript
// /apps/web/src/middleware.ts
import { NextResponse } from 'next/server';
import type { NextRequest } from 'next/server';

// Known pages (to be kept in sync with service-registry)
const KNOWN_PAGES = new Set([
  '/',
  '/authorizations',
  '/action-logs',
  '/errors',
  '/settings',
  '/knowledge-base',
]);

export function middleware(request: NextRequest) {
  const path = request.nextUrl.pathname;

  // Detect pages that are not registered
  if (!KNOWN_PAGES.has(path) && !path.startsWith('/api') && !path.startsWith('/_next')) {
    // Report to the monitoring system
    console.warn(`[MONITORING] 新頁面被訪問但未註冊: ${path}`);

    // TODO: send to Sentry or the backend API
  }

  return NextResponse.next();
}
```

---

## 四、自動發現機制

### 4.1 K8s 服務自動發現

```yaml
# /ops/monitoring/prometheus/kubernetes-sd.yaml
# Prometheus service discovery for K8s

scrape_configs:
  # Discover all pods
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods with the annotation prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the pod's "app" label as job name
      - source_labels: [__meta_kubernetes_pod_label_app]
        target_label: job
      # Expose the namespace as a label
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace
```

### 4.2 Docker 容器自動發現

```python
# /ops/monitoring/discover_docker.py
"""
Periodically scan the Docker hosts and report containers that are not
listed in the service registry.
"""
import json
import subprocess

import yaml

HOSTS = ['192.168.0.188', '192.168.0.110']
REGISTRY_PATH = 'ops/monitoring/service-registry.yaml'
# Same system-container filter as validate_coverage.py
IGNORED_CONTAINER_PARTS = ('k3s', 'pause', 'coredns')


def discover_containers():
    """Return ``docker ps`` entries (dicts) from all hosts, tagged with 'host'.

    A host that cannot be scanned is reported and skipped.
    """
    all_containers = []

    for host in HOSTS:
        try:
            # check=True: a failing ssh/docker command must not look like
            # "this host runs no containers".
            result = subprocess.run(
                ['ssh', host, 'docker', 'ps', '--format', '{{json .}}'],
                capture_output=True, text=True, timeout=10, check=True
            )
        except (subprocess.SubprocessError, OSError) as e:
            print(f"Error scanning {host}: {e}")
            continue

        for line in result.stdout.splitlines():
            if not line.strip():
                continue
            try:
                container = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error scanning {host}: {e}")
                continue
            container['host'] = host
            all_containers.append(container)

    return all_containers


def check_new_containers(containers: list, registry: dict):
    """Return the containers whose name is not registered as a service."""
    registered = {s['name'] for s in registry['services']}
    new_containers = []

    for c in containers:
        name = c['Names']
        if name in registered:
            continue
        if any(part in name for part in IGNORED_CONTAINER_PARTS):
            continue
        new_containers.append({
            'name': name,
            'host': c['host'],
            'image': c['Image'],
            'created': c['CreatedAt'],
        })

    return new_containers


def alert_new_containers(new_containers: list):
    """Print an alert that lists the unmonitored containers."""
    if not new_containers:
        return

    lines = [f"🆕 發現 {len(new_containers)} 個未監控的容器:", ""]
    lines.extend(f"• {c['name']} on {c['host']} ({c['image']})" for c in new_containers)
    lines.extend(["", "請更新 service-registry.yaml"])
    message = "\n".join(lines)

    # TODO: send a Telegram alert
    print(message)


def main():
    with open(REGISTRY_PATH, encoding='utf-8') as f:
        registry = yaml.safe_load(f)

    containers = discover_containers()
    new_containers = check_new_containers(containers, registry)

    if new_containers:
        alert_new_containers(new_containers)


if __name__ == '__main__':
    # Intended to run as an hourly cron job
    main()
```

---

## 五、監控覆蓋率儀表板

### 5.1 覆蓋率計算

```python
# /ops/monitoring/coverage_report.py
"""
Compute monitoring coverage from the service registry and render a report.
"""
from datetime import datetime

import yaml

REGISTRY_PATH = 'ops/monitoring/service-registry.yaml'

# Every metric the report template refers to
REPORT_METRICS = (
    'prometheus', 'sentry', 'otel', 'langfuse', 'alerts',
    'page_sentry', 'page_vitals', 'api_slo',
)

REPORT_TEMPLATE = """
# AWOOOI 監控覆蓋率報告

生成時間: {timestamp}

## 服務監控覆蓋率

| 監控類型 | 覆蓋率 | 狀態 |
|----------|--------|------|
| Prometheus Metrics | {prometheus} | {prometheus_status} |
| Sentry 錯誤追蹤 | {sentry} | {sentry_status} |
| OTEL Traces | {otel} | {otel_status} |
| Langfuse LLM | {langfuse} | {langfuse_status} |
| Alert Rules | {alerts} | {alerts_status} |

## 前端監控覆蓋率

| 監控類型 | 覆蓋率 | 狀態 |
|----------|--------|------|
| Sentry Session | {page_sentry} | {page_sentry_status} |
| Web Vitals | {page_vitals} | {page_vitals_status} |

## API SLO 覆蓋率

| 類型 | 覆蓋率 | 狀態 |
|------|--------|------|
| SLO 定義 | {api_slo} | {api_slo_status} |

---

總體健康度: **{overall:.0%}**
"""


def _ratio(items: list, predicate) -> float:
    """Share of ``items`` for which ``predicate`` is truthy; 0.0 if empty."""
    if not items:
        return 0.0
    return sum(1 for item in items if predicate(item)) / len(items)


def calculate_coverage(registry: dict) -> dict:
    """Compute the coverage ratio (0.0-1.0) for each monitoring dimension.

    The page and API keys are only present when the registry defines
    ``pages`` / ``api_endpoints``.
    """
    services = registry.get('services') or []

    coverage = {
        tool: _ratio(services, lambda s, tool=tool: s.get('monitoring', {}).get(tool))
        for tool in ('prometheus', 'sentry', 'otel', 'langfuse')
    }
    coverage['alerts'] = _ratio(services, lambda s: s.get('alerts'))

    # Page coverage
    pages = registry.get('pages', [])
    if pages:
        coverage['page_sentry'] = _ratio(
            pages, lambda p: p.get('monitoring', {}).get('sentry_session'))
        coverage['page_vitals'] = _ratio(
            pages, lambda p: p.get('monitoring', {}).get('web_vitals'))

    # API SLO coverage
    endpoints = registry.get('api_endpoints', [])
    if endpoints:
        coverage['api_slo'] = _ratio(endpoints, lambda e: e.get('slo_latency_ms'))

    return coverage


def _status_icon(value: float) -> str:
    """Map a coverage ratio to the status icon used in the report."""
    if value >= 0.9:
        return "✅"
    if value >= 0.7:
        return "⚠️"
    return "❌"


def generate_report(coverage: dict) -> str:
    """Render the coverage report as Markdown.

    Metrics missing from ``coverage`` (for example when the registry has
    no pages) are shown as ``N/A`` instead of raising ``KeyError``.
    """
    fields = {}
    for key in REPORT_METRICS:
        value = coverage.get(key)
        fields[key] = 'N/A' if value is None else f'{value:.0%}'
        fields[f'{key}_status'] = '➖' if value is None else _status_icon(value)

    overall = sum(coverage.values()) / len(coverage) if coverage else 0.0

    return REPORT_TEMPLATE.format(
        timestamp=datetime.now().isoformat(),
        overall=overall,
        **fields,
    )


if __name__ == '__main__':
    # Intended to run as a daily cron job
    with open(REGISTRY_PATH, encoding='utf-8') as f:
        print(generate_report(calculate_coverage(yaml.safe_load(f))))
```

---

## 六、整合流程圖

```
┌─────────────────────────────────────────────────────────────────────┐
│                        開發者新增服務                               │
│       (新 K8s Deployment / Docker 容器 / API 端點 / 前端頁面)       │
└────────────────────────────┬────────────────────────────────────────┘
                             │
                             ▼
┌─────────────────────────────────────────────────────────────────────┐
│  Step 1: 更新 service-registry.yaml                                 │
│  ┌───────────────────────────────────────────────────────────────┐  │
│  │ services:                                                     │  │
│  │   - name: new-service                                         │  │
│  │     type: k8s-deployment                                      │  │
│  │     monitoring:                                               │  │
│  │       prometheus: true                                        │  │
│  │       sentry: true                                            │  │
│  │     alerts:                                                   │  │
│  │       - pod_crash                                             │  │
│  │       - high_error_rate                                       │  │
│  └───────────────────────────────────────────────────────────────┘  │
└────────────────────────────┬────────────────────────────────────────┘
                             │
                             ▼
┌─────────────────────────────────────────────────────────────────────┐
│  Step 2: git push → CI/CD Pipeline                                  │
│  ┌───────────────────────────────────────────────────────────────┐  │
│  │ 1. validate_coverage.py → 檢查所有服務都在 registry           │  │
│  │ 2. generate_monitoring.py → 生成 Prometheus/Alertmanager 配置 │  │
│  │ 3. kubectl apply → 部署監控配置                               │  │
│  │ 4. 部署新服務                                                 │  │
│  └───────────────────────────────────────────────────────────────┘  │
└────────────────────────────┬────────────────────────────────────────┘
                             │
                             ▼
┌─────────────────────────────────────────────────────────────────────┐
│  Step 3: 監控自動生效                                               │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌────────────┐  │
│  │ Prometheus  │  │ Alertmanager│  │   Sentry    │  │   SigNoz   │  │
│  │開始抓Metrics│  │ 開始監控告警│  │ 開始追蹤錯誤│  │開始收Traces│  │
│  └─────────────┘  └─────────────┘  └─────────────┘  └────────────┘  │
└────────────────────────────┬────────────────────────────────────────┘
                             │
                             ▼
┌─────────────────────────────────────────────────────────────────────┐
│  Step 4: 異常發生 → AI 自動修復                                     │
│  ┌───────────────────────────────────────────────────────────────┐  │
│  │ 1. Prometheus 觸發告警                                        │  │
│  │ 2. Alertmanager → AWOOOI Webhook                              │  │
│  │ 3. OpenClaw AI 分析                                           │  │
│  │ 4. 自動/人工修復                                              │  │
│  │ 5. 結果回饋 → Playbook 更新                                   │  │
│  └───────────────────────────────────────────────────────────────┘  │
└─────────────────────────────────────────────────────────────────────┘
```

---

## 七、實作清單

### 7.1 需要建立的檔案

| 檔案 | 用途 | 優先級 |
|------|------|--------|
| `ops/monitoring/service-registry.yaml` | 服務註冊表 | P0 |
| `ops/monitoring/generate_monitoring.py` | 配置生成器 | P0 |
| `ops/monitoring/validate_coverage.py` | 覆蓋率檢查 | P0 |
| `ops/monitoring/discover_docker.py` | 容器發現 | P1 |
| `apps/api/src/core/auto_monitoring.py` | API 自動監控 | P1 |
| `apps/web/src/lib/auto-monitoring.ts` | 前端自動監控 | P1 |

### 7.2 CI/CD 修改

| 修改 | 用途 | 優先級 |
|------|------|--------|
| 新增 `monitoring-validation` job | 檢查覆蓋率 | P0 |
| 新增 `monitoring-deploy` job | 部署配置 | P0 |

### 7.3 Cron Jobs

| 任務 | 頻率 | 用途 |
|------|------|------|
| `discover_docker.py` | 每小時 | 發現新容器 |
| `coverage_report.py` | 每日 | 生成報告 |

---

**文件結束**