# AWOOOI Service Registry (Single Source of Truth)
# =============================================================================
# Version: v1.0
# Created: 2026-03-29
# Purpose: unified list of every monitored service
#
# When adding a service:
#   1. Add an entry to this file
#   2. CI/CD automatically generates the matching monitoring configuration
#   3. Monitoring takes effect automatically after deployment
#
# Layout of this document:
#   services            - monitored workloads (K8s deployments, Docker, systemd)
#   nodes               - host machines
#   pages               - frontend pages and their web-vitals SLOs
#   api_endpoints       - critical API routes and their latency/availability SLOs
#   ai_services         - LLM providers (rate limits, fallbacks, cost tracking)
#   alert_templates     - PromQL rule templates keyed by the alert names used above
#   auto_repair_actions - commands keyed by the action names used above
# =============================================================================
---
# =============================================================================
# K8s workloads (awoooi-prod namespace)
# =============================================================================
services:
  # --- API backend ---
  - name: awoooi-api
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 2
    port: 8000
    health_endpoint: /api/v1/health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - pod_crash
      - high_error_rate
      - slow_response
      - memory_high
    auto_repair:
      enabled: true
      actions:
        - restart_pod
        - scale_up
    owner: backend-team
    criticality: P0

  # --- Web frontend ---
  - name: awoooi-web
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 2
    port: 3000
    health_endpoint: /
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: false
    alerts:
      - pod_crash
      - slow_page_load
    auto_repair:
      enabled: true
      actions:
        - restart_pod
    owner: frontend-team
    criticality: P0

  # --- Signal Worker ---
  # No port: health is checked via a file path with health_type exec_mtime.
  - name: awoooi-worker
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 1
    health_endpoint: /tmp/worker-healthy
    health_type: exec_mtime
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - worker_stuck
      - queue_backlog
    auto_repair:
      enabled: true
      actions:
        - restart_pod
    owner: backend-team
    criticality: P1

  # --- ArgoCD ---
  - name: argocd-server
    type: k8s-deployment
    namespace: argocd
    port: 443
    health_endpoint: /healthz
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
      - sync_failed
    owner: devops-team
    criticality: P1

  # --- Prometheus ---
  - name: prometheus
    type: k8s-deployment
    namespace: monitoring
    port: 9090
    health_endpoint: /-/ready
    monitoring:
      prometheus: false  # monitoring itself would be circular
      sentry: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P0

  # --- Alertmanager ---
  - name: alertmanager
    type: k8s-deployment
    namespace: monitoring
    port: 9093
    health_endpoint: /-/ready
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P0

  # --- Remaining ArgoCD components (Phase O-6 2026-04-10) ---
  - name: argocd-applicationset-controller
    type: k8s-deployment
    namespace: argocd
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  - name: argocd-dex-server
    type: k8s-deployment
    namespace: argocd
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  - name: argocd-notifications-controller
    type: k8s-deployment
    namespace: argocd
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  - name: argocd-redis
    type: k8s-deployment
    namespace: argocd
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  - name: argocd-repo-server
    type: k8s-deployment
    namespace: argocd
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  # --- AWOOOI dev environment ---
  # Same name as the prod API entry; the namespace is what tells them apart.
  - name: awoooi-api
    type: k8s-deployment
    namespace: awoooi-dev
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - pod_crash
    owner: backend-team
    criticality: P3

  # --- kube-state-metrics ---
  - name: kube-state-metrics
    type: k8s-deployment
    namespace: kube-state-metrics
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P1

  # --- OTEL Event Exporter ---
  - name: event-exporter
    type: k8s-deployment
    namespace: observability
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P1

  # --- Velero backup ---
  - name: velero
    type: k8s-deployment
    namespace: velero
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
      - backup_failed
    owner: devops-team
    criticality: P1

  # ===========================================================================
  # Docker containers (192.168.0.188 - AI/Web hub)
  # ===========================================================================

  # --- Ollama LLM ---
  - name: ollama
    type: docker
    host: 192.168.0.188
    port: 11434
    health_endpoint: /api/tags
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
      - inference_timeout
      - model_load_failed
    auto_repair:
      enabled: true
      actions:
        - restart_container
    owner: ai-team
    criticality: P0

  # --- OpenClaw AI decision center ---
  - name: openclaw
    type: docker
    host: 192.168.0.188
    port: 8089
    health_endpoint: /health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - service_down
      - analysis_timeout
      - high_error_rate
    auto_repair:
      enabled: true
      actions:
        - restart_container
    owner: ai-team
    criticality: P0

  # --- Redis Stack ---
  - name: redis
    type: docker
    host: 192.168.0.188
    port: 6380
    health_endpoint: redis-cli ping
    health_type: exec
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - memory_high
      - connection_rejected
    auto_repair:
      enabled: false  # data store: handle with care
    owner: infra-team
    criticality: P0

  # --- PostgreSQL ---
  - name: postgres
    type: docker
    host: 192.168.0.188
    port: 5432
    health_endpoint: pg_isready
    health_type: exec
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - connection_pool_exhausted
      - slow_query
      - replication_lag
    auto_repair:
      enabled: false  # database: handle with care
    owner: infra-team
    criticality: P0

  # --- SignOz OTEL Collector ---
  - name: signoz-collector
    type: docker
    host: 192.168.0.188
    port: 24317
    health_endpoint: grpc_health
    health_type: grpc
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - trace_dropped
    owner: devops-team
    criticality: P1

  # --- SignOz UI ---
  - name: signoz-ui
    type: docker
    host: 192.168.0.188
    port: 3301
    health_endpoint: /
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  # --- ClickHouse (SignOz backend) ---
  - name: clickhouse
    type: docker
    host: 192.168.0.188
    port: 8123
    health_endpoint: /ping
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - disk_space_low
      - query_timeout
    owner: devops-team
    criticality: P1

  # ===========================================================================
  # Docker containers (192.168.0.110 - DevOps hub)
  # ===========================================================================

  # --- Harbor Registry ---
  - name: harbor
    type: docker
    host: 192.168.0.110
    port: 5000
    health_endpoint: /api/v2.0/health
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - storage_full
      - push_failed
    owner: devops-team
    criticality: P0

  # --- Sentry ---
  - name: sentry
    type: docker
    host: 192.168.0.110
    port: 9000
    health_endpoint: /_health/
    monitoring:
      prometheus: true
      sentry: false  # monitoring itself would be circular
    alerts:
      - service_down
    owner: devops-team
    criticality: P1

  # --- Langfuse LLMOps ---
  - name: langfuse
    type: docker
    host: 192.168.0.110
    port: 3100
    health_endpoint: /api/public/health
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - trace_lost
    owner: ai-team
    criticality: P2

  # --- GitHub Actions Runner ---
  # systemd unit rather than a container: identified by service_name, no port.
  - name: github-runner
    type: systemd
    host: 192.168.0.110
    service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - runner_offline
      - job_stuck
    auto_repair:
      enabled: true
      actions:
        - restart_service
    owner: devops-team
    criticality: P0

# =============================================================================
# Host nodes
# =============================================================================
nodes:
  - name: mon
    ip: 192.168.0.120
    role: k3s-master
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
      - etcd_latency_high
    owner: infra-team

  - name: mon1
    ip: 192.168.0.121
    role: k3s-worker
    alerts:
      - node_down
      - node_not_ready
      - cpu_high
      - memory_high
      - disk_space_low
    owner: infra-team

  - name: harbor
    ip: 192.168.0.110
    role: devops
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
    owner: devops-team

  - name: pg
    ip: 192.168.0.188
    role: ai-web
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
      - gpu_utilization_high
    owner: ai-team

  - name: kali
    ip: 192.168.0.112
    port: 8080
    role: security
    monitoring:
      blackbox_tcp: true
      prometheus_scrape: false  # isolated environment: TCP probe only
    alerts:
      - node_down
      - service_down
    owner: security-team

  # Phase O-1.3 2026-04-02: MinIO backup storage (Phase O completion)
  - name: minio
    ip: 192.168.0.188
    port: 9000
    role: storage
    monitoring:
      prometheus_scrape: true
      metrics_path: /minio/v2/metrics/cluster
    alerts:
      - service_down
      - disk_space_low
    criticality: P1
    owner: devops-team

# =============================================================================
# Frontend pages
# =============================================================================
pages:
  - path: /
    name: Dashboard
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - js_error
    slo:
      lcp_ms: 2500
      fid_ms: 100
      cls: 0.1

  - path: /authorizations
    name: 授權管理
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - api_error
    slo:
      lcp_ms: 2000

  - path: /action-logs
    name: 行動日誌
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load

  - path: /errors
    name: 錯誤追蹤
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load

  - path: /settings
    name: 設定
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

  - path: /knowledge-base
    name: 知識庫
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

# =============================================================================
# API endpoints (critical)
# =============================================================================
api_endpoints:
  - path: /api/v1/health
    method: GET
    critical: true
    slo:
      latency_p95_ms: 100
      availability: 99.99

  - path: /api/v1/approvals
    method: GET
    critical: true
    slo:
      latency_p95_ms: 500
      availability: 99.9

  - path: /api/v1/approvals/{id}/sign
    method: POST
    critical: true
    slo:
      latency_p95_ms: 1000
      availability: 99.9

  - path: /api/v1/incidents
    method: GET
    critical: true
    slo:
      latency_p95_ms: 500
      availability: 99.9

  - path: /api/v1/analyze
    method: POST
    critical: true
    slo:
      latency_p95_ms: 30000  # 30s (LLM analysis)
      availability: 95

  - path: /api/v1/webhooks/alertmanager
    method: POST
    critical: true
    slo:
      latency_p95_ms: 5000
      availability: 99.9

  - path: /api/v1/webhooks/sentry/error
    method: POST
    critical: true
    slo:
      latency_p95_ms: 5000
      availability: 99.9

  - path: /api/v1/execute
    method: POST
    critical: true
    slo:
      latency_p95_ms: 10000
      availability: 99

# =============================================================================
# AI services (special monitoring)
# =============================================================================
ai_services:
  - name: gemini-api
    type: external
    rate_limit:
      requests_per_minute: 60
      tokens_per_minute: 100000
    alerts:
      - rate_limit_hit
      - budget_exceeded
    fallback: ollama
    cost_tracking: true

  - name: claude-api
    type: external
    rate_limit:
      requests_per_minute: 50
      tokens_per_minute: 100000
    alerts:
      - rate_limit_hit
      - budget_exceeded
    fallback: gemini
    cost_tracking: true

  - name: ollama-local
    type: local
    models:
      - qwen2.5:7b
      - llama3.2:3b
    alerts:
      - model_load_failed
      - inference_timeout
    cost_tracking: false

  # --- NVIDIA Nemotron Tool Calling (Phase 20) ---
  # 2026-03-29 ogt: added per ADR-036
  - name: nvidia-nemotron
    type: external
    endpoint: https://integrate.api.nvidia.com/v1
    model: nvidia/llama-3.1-nemotron-70b-instruct
    rate_limit:
      requests_per_minute: 100
      tokens_per_minute: 200000
    features:
      - tool_calling
      - function_calling
    monitoring:
      prometheus: true
      langfuse: true
      otel: true
      metrics:
        - nvidia_tool_call_requests_total
        - nvidia_tool_call_latency_seconds
        - nvidia_circuit_breaker_state_changes_total
    alerts:
      - circuit_breaker_open
      - tool_calling_timeout
      # Points at the NVIDIA-specific template in alert_templates. The generic
      # high_error_rate template watches http_requests_total and restarts a
      # pod, neither of which applies to this external API.
      - nvidia_high_error_rate
      - rate_limit_hit
    fallback: gemini
    cost_tracking: true
    owner: ai-team
    criticality: P0

# =============================================================================
# Alert templates
# Keyed by the alert names referenced in the `alerts:` lists above. Not every
# alert name used above has a template in this section.
# =============================================================================
alert_templates:
  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  high_error_rate:
    # Both sides drop the `status` label before dividing. Without that, binary
    # matching pairs each 5xx series only with itself and the ratio is always 1.
    expr: >-
      sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m]))
      > 0.01
    for: 5m
    severity: critical
    auto_repair: restart_pod

  slow_response:
    # rate() over the buckets gives the P95 of the last 5 minutes; raw bucket
    # counters would give the P95 since the process started.
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  service_down:
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    auto_repair: restart_container

  memory_high:
    # `> 0` filters out containers without a memory limit (limit reported as
    # 0), which would otherwise divide to +Inf and always fire.
    expr: 'container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service

  # --- NVIDIA Nemotron alerts (Phase 20) ---
  # 2026-03-29 ogt: added per ADR-036
  circuit_breaker_open:
    # increase() makes this fire on recent transitions only; a bare
    # `counter > 0` stays true forever after the first transition to open.
    expr: 'increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0'
    for: 1m
    severity: warning
    auto_repair: fallback_to_gemini
    annotations:
      summary: 'NVIDIA Circuit Breaker 已斷路,切換至備援'
      runbook: 'docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md'

  tool_calling_timeout:
    # Windowed P95, same reasoning as slow_response.
    expr: 'histogram_quantile(0.95, rate(nvidia_tool_call_latency_seconds_bucket[5m])) > 45'
    for: 5m
    severity: warning
    auto_repair: switch_model
    annotations:
      summary: 'NVIDIA Tool Calling P95 延遲 > 45s'

  nvidia_high_error_rate:
    # Aggregates away `status` on both sides, same reasoning as high_error_rate.
    expr: >-
      sum without (status) (rate(nvidia_tool_call_requests_total{status="error"}[5m]))
      / sum without (status) (rate(nvidia_tool_call_requests_total[5m]))
      > 0.1
    for: 5m
    severity: critical
    auto_repair: fallback_to_gemini
    annotations:
      summary: 'NVIDIA Tool Calling 錯誤率 > 10%'

# =============================================================================
# Auto-repair actions
# Keyed by the names used in `auto_repair.actions` and in the templates'
# `auto_repair` field. `{...}` placeholders appear to be filled from the
# service entry - TODO confirm against the executor.
# =============================================================================
auto_repair_actions:
  restart_pod:
    command: 'kubectl rollout restart deployment/{name} -n {namespace}'
    risk: low
    cooldown_minutes: 10

  scale_up:
    # NOTE(review): kubectl's --replicas takes an absolute count, so "+1"
    # likely does not mean "one more than current" - confirm how the executor
    # expands this before relying on it.
    command: 'kubectl scale deployment/{name} --replicas=+1 -n {namespace}'
    risk: low
    max_replicas: 5
    cooldown_minutes: 15

  restart_container:
    command: 'ssh {host} docker restart {container}'
    risk: low
    cooldown_minutes: 10

  restart_service:
    command: 'ssh {host} sudo systemctl restart {service_name}'
    risk: low
    cooldown_minutes: 10

  switch_model:
    command: 'internal:switch_to_smaller_model'
    risk: low
    cooldown_minutes: 5

  cleanup_logs:
    command: 'ssh {host} find /var/log -name "*.log" -mtime +7 -delete'
    risk: low
    cooldown_minutes: 60

  analyze_memory_leak:
    command: 'internal:trigger_memory_analysis'
    risk: low
    cooldown_minutes: 30

  # --- NVIDIA Nemotron auto-repair (Phase 20) ---
  # 2026-03-29 ogt: added per ADR-036
  fallback_to_gemini:
    command: 'internal:switch_provider_to_gemini'
    risk: low
    cooldown_minutes: 5
    description: 'NVIDIA API 失敗時自動切換至 Gemini'

  fallback_to_ollama:
    command: 'internal:switch_provider_to_ollama'
    risk: low
    cooldown_minutes: 5
    description: 'Cloud API 失敗時自動切換至本地 Ollama'