Files
awoooi/ops/monitoring/service-registry.yaml
OG T ab3e266a23 fix(monitoring): Phase O-6.2 service-registry 補齊 9 個缺失 K8s 部署
新增:
- argocd 5個元件 (applicationset/dex/notifications/redis/repo-server)
- awoooi-dev/awoooi-api
- kube-state-metrics
- observability/event-exporter
- velero/velero

結果: prometheus 覆蓋率 94%→96%, errors 9→0

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 10:44:36 +08:00

831 lines
18 KiB
YAML

# AWOOOI 服務註冊表 (Single Source of Truth)
# ===========================================
# 版本: v1.0
# 建立日期: 2026-03-29
# 用途: 所有受監控服務的統一清單
#
# 新增服務時:
# 1. 在此檔案新增 entry
# 2. CI/CD 會自動生成對應的監控配置
# 3. 部署後監控自動生效
# =============================================================================
# Kubernetes workloads (awoooi-prod plus the argocd, monitoring, awoooi-dev,
# kube-state-metrics, observability and velero namespaces)
# =============================================================================
services:
# --- API backend (production) ---
- name: awoooi-api
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 8000
  health_endpoint: /api/v1/health
  # Per-backend monitoring switches.
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - pod_crash
  - high_error_rate
  - slow_response
  - memory_high
  auto_repair:
    enabled: true
    actions:
    - restart_pod
    - scale_up
  owner: backend-team
  criticality: P0
# --- Web frontend ---
- name: awoooi-web
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 3000
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: false
  alerts:
  - pod_crash
  - slow_page_load
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: frontend-team
  criticality: P0
# --- Signal worker ---
# No port is declared for this entry; health is described by a file path
# together with health_type exec_mtime.
# NOTE(review): presumably the checker inspects the file's mtime - confirm.
- name: awoooi-worker
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 1
  health_endpoint: /tmp/worker-healthy
  health_type: exec_mtime
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - worker_stuck
  - queue_backlog
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: backend-team
  criticality: P1
# --- ArgoCD API/UI server ---
- name: argocd-server
  type: k8s-deployment
  namespace: argocd
  port: 443
  health_endpoint: /healthz
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - sync_failed
  owner: devops-team
  criticality: P1
# --- Prometheus ---
- name: prometheus
  type: k8s-deployment
  namespace: monitoring
  port: 9090
  health_endpoint: /-/ready
  monitoring:
    prometheus: false  # monitoring itself would be circular
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0
# --- Alertmanager ---
- name: alertmanager
  type: k8s-deployment
  namespace: monitoring
  port: 9093
  health_endpoint: /-/ready
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0
# --- Remaining ArgoCD components (Phase O-6, 2026-04-10) ---
# Five deployments in the argocd namespace sharing one monitoring profile:
# Prometheus only, a single service_down alert, criticality P2.
- name: argocd-applicationset-controller
  type: k8s-deployment
  namespace: argocd
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

- name: argocd-dex-server
  type: k8s-deployment
  namespace: argocd
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

- name: argocd-notifications-controller
  type: k8s-deployment
  namespace: argocd
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

- name: argocd-redis
  type: k8s-deployment
  namespace: argocd
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

- name: argocd-repo-server
  type: k8s-deployment
  namespace: argocd
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2
# --- AWOOOI dev environment ---
# Shares its name with the production API entry; the namespace is what
# distinguishes the two.
- name: awoooi-api
  type: k8s-deployment
  namespace: awoooi-dev
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - pod_crash
  owner: backend-team
  criticality: P3
# --- kube-state-metrics (deployed in a namespace of the same name) ---
- name: kube-state-metrics
  type: k8s-deployment
  namespace: kube-state-metrics
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P1
# --- OTEL event exporter ---
- name: event-exporter
  type: k8s-deployment
  namespace: observability
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P1
# --- Velero backup ---
- name: velero
  type: k8s-deployment
  namespace: velero
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - backup_failed
  owner: devops-team
  criticality: P1
# =============================================================================
# Docker containers (192.168.0.188 - AI/Web hub)
# =============================================================================
# --- Ollama LLM ---
- name: ollama
  type: docker
  host: 192.168.0.188
  port: 11434
  health_endpoint: /api/tags
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - inference_timeout
  - model_load_failed
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0
# --- OpenClaw AI decision center ---
- name: openclaw
  type: docker
  host: 192.168.0.188
  port: 8089
  health_endpoint: /health
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - service_down
  - analysis_timeout
  - high_error_rate
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0
# --- Redis Stack ---
# Health is checked by running a command (health_type: exec), not over HTTP.
- name: redis
  type: docker
  host: 192.168.0.188
  port: 6380
  health_endpoint: redis-cli ping
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - memory_high
  - connection_rejected
  auto_repair:
    enabled: false  # data store: handle with care, no automatic repair
  owner: infra-team
  criticality: P0
# --- PostgreSQL ---
# Health is checked by running a command (health_type: exec), not over HTTP.
- name: postgres
  type: docker
  host: 192.168.0.188
  port: 5432
  health_endpoint: pg_isready
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - connection_pool_exhausted
  - slow_query
  - replication_lag
  auto_repair:
    enabled: false  # database: handle with care, no automatic repair
  owner: infra-team
  criticality: P0
# --- SignOz OTEL collector (gRPC health check) ---
- name: signoz-collector
  type: docker
  host: 192.168.0.188
  port: 24317
  health_endpoint: grpc_health
  health_type: grpc
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_dropped
  owner: devops-team
  criticality: P1
# --- SignOz UI ---
- name: signoz-ui
  type: docker
  host: 192.168.0.188
  port: 3301
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2
# --- ClickHouse (SignOz backend) ---
- name: clickhouse
  type: docker
  host: 192.168.0.188
  port: 8123
  health_endpoint: /ping
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - disk_space_low
  - query_timeout
  owner: devops-team
  criticality: P1
# =============================================================================
# Docker containers (192.168.0.110 - DevOps hub)
# =============================================================================
# --- Harbor registry ---
- name: harbor
  type: docker
  host: 192.168.0.110
  port: 5000
  health_endpoint: /api/v2.0/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - storage_full
  - push_failed
  owner: devops-team
  criticality: P0
# --- Sentry ---
- name: sentry
  type: docker
  host: 192.168.0.110
  port: 9000
  health_endpoint: /_health/
  monitoring:
    prometheus: true
    sentry: false  # monitoring itself would be circular
  alerts:
  - service_down
  owner: devops-team
  criticality: P1
# --- Langfuse LLMOps ---
- name: langfuse
  type: docker
  host: 192.168.0.110
  port: 3100
  health_endpoint: /api/public/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_lost
  owner: ai-team
  criticality: P2
# --- GitHub Actions runner ---
# The only systemd-type entry; it is identified by service_name rather than
# by a port or health endpoint.
- name: github-runner
  type: systemd
  host: 192.168.0.110
  service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - runner_offline
  - job_stuck
  auto_repair:
    enabled: true
    actions:
    - restart_service
  owner: devops-team
  criticality: P0
# =============================================================================
# Host nodes
# =============================================================================
nodes:
# k3s control-plane node.
- name: mon
  ip: 192.168.0.120
  role: k3s-master
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - etcd_latency_high
  owner: infra-team

# k3s worker node.
- name: mon1
  ip: 192.168.0.121
  role: k3s-worker
  alerts:
  - node_down
  - node_not_ready
  - cpu_high
  - memory_high
  - disk_space_low
  owner: infra-team

# DevOps host (same IP as the 192.168.0.110 Docker containers).
- name: harbor
  ip: 192.168.0.110
  role: devops
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  owner: devops-team

# AI/Web host (same IP as the 192.168.0.188 Docker containers).
- name: pg
  ip: 192.168.0.188
  role: ai-web
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - gpu_utilization_high
  owner: ai-team

# Security host, probed over TCP only.
- name: kali
  ip: 192.168.0.112
  port: 8080
  role: security
  monitoring:
    blackbox_tcp: true
    prometheus_scrape: false  # isolated environment; TCP probe only
  alerts:
  - node_down
  - service_down
  owner: security-team

# Phase O-1.3, 2026-04-02: MinIO backup storage (Phase O completion).
# NOTE(review): metrics_path is placed under monitoring next to
# prometheus_scrape - confirm against the registry consumer.
- name: minio
  ip: 192.168.0.188
  port: 9000
  role: storage
  monitoring:
    prometheus_scrape: true
    metrics_path: /minio/v2/metrics/cluster
  alerts:
  - service_down
  - disk_space_low
  criticality: P1
  owner: devops-team
# =============================================================================
# Frontend pages
# =============================================================================
# Each page is keyed by its route. `slo` is optional and only set on some
# pages; `web_vitals` is likewise omitted on the last two entries.
pages:
- path: /
  name: Dashboard
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - js_error
  slo:
    lcp_ms: 2500
    fid_ms: 100
    cls: 0.1

- path: /authorizations
  name: 授權管理
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - api_error
  slo:
    lcp_ms: 2000

- path: /action-logs
  name: 行動日誌
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /errors
  name: 錯誤追蹤
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /settings
  name: 設定
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load

- path: /knowledge-base
  name: 知識庫
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load
# =============================================================================
# API endpoints (critical)
# =============================================================================
# Every entry carries an SLO with a p95 latency budget in milliseconds and an
# availability figure (written as a percentage, e.g. 99.9).
api_endpoints:
- path: /api/v1/health
  method: GET
  critical: true
  slo:
    latency_p95_ms: 100
    availability: 99.99

- path: /api/v1/approvals
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/approvals/{id}/sign
  method: POST
  critical: true
  slo:
    latency_p95_ms: 1000
    availability: 99.9

- path: /api/v1/incidents
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/analyze
  method: POST
  critical: true
  slo:
    latency_p95_ms: 30000  # 30s (LLM analysis)
    availability: 95

- path: /api/v1/webhooks/alertmanager
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/webhooks/sentry/error
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/execute
  method: POST
  critical: true
  slo:
    latency_p95_ms: 10000
    availability: 99
# =============================================================================
# AI services (special monitoring)
# =============================================================================
ai_services:
# External provider with per-minute request/token limits and cost tracking.
- name: gemini-api
  type: external
  rate_limit:
    requests_per_minute: 60
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: ollama
  cost_tracking: true

# External provider with per-minute request/token limits and cost tracking.
- name: claude-api
  type: external
  rate_limit:
    requests_per_minute: 50
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: gemini
  cost_tracking: true

# Local models; no rate limit and no cost tracking.
- name: ollama-local
  type: local
  models:
  - qwen2.5:7b
  - llama3.2:3b
  alerts:
  - model_load_failed
  - inference_timeout
  cost_tracking: false
# --- NVIDIA Nemotron tool calling (Phase 20) ---
# 2026-03-29 ogt: added per ADR-036
- name: nvidia-nemotron
  type: external
  endpoint: https://integrate.api.nvidia.com/v1
  model: nvidia/llama-3.1-nemotron-70b-instruct
  rate_limit:
    requests_per_minute: 100
    tokens_per_minute: 200000
  features:
  - tool_calling
  - function_calling
  monitoring:
    prometheus: true
    langfuse: true
    otel: true
  # NOTE(review): metrics is kept at entry level like the other lists
  # (alerts, features) - confirm against the registry consumer.
  metrics:
  - nvidia_tool_call_requests_total
  - nvidia_tool_call_latency_seconds
  - nvidia_circuit_breaker_state_changes_total
  alerts:
  - circuit_breaker_open
  - tool_calling_timeout
  # Uses the NVIDIA-specific template from alert_templates. The generic
  # high_error_rate template queries http_requests_total and repairs with
  # restart_pod, neither of which applies to this external API.
  - nvidia_high_error_rate
  - rate_limit_hit
  fallback: gemini
  cost_tracking: true
  owner: ai-team
  criticality: P0
# =============================================================================
# Alert templates
# =============================================================================
# Keys match the names used in the `alerts:` lists of this registry (not every
# listed alert has a template here). Each template carries a PromQL `expr`,
# a `for` duration, a `severity`, and an `auto_repair` value naming a key in
# auto_repair_actions.
alert_templates:
  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  high_error_rate:
    # Both sides drop the `status` label so the ratio is 5xx / all requests
    # per remaining label set. A bare division matches series on identical
    # labels, pairing each 5xx series only with itself (ratio always 1).
    expr: >-
      sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m])) > 0.01
    for: 5m
    severity: critical
    auto_repair: restart_pod

  slow_response:
    # Buckets are cumulative counters, so they go through rate() before
    # histogram_quantile() to yield the p95 of the last 5 minutes.
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  service_down:
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    auto_repair: restart_container

  memory_high:
    # The `> 0` filter skips containers without a memory limit, whose limit
    # metric is 0 and would turn the ratio into +Inf.
    expr: 'container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service

  # --- NVIDIA Nemotron alerts (Phase 20) ---
  # 2026-03-29 ogt: added per ADR-036
  circuit_breaker_open:
    # increase() looks at recent transitions only; comparing the raw counter
    # with 0 would keep the alert firing forever after the first trip.
    expr: 'increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0'
    for: 1m
    severity: warning
    auto_repair: fallback_to_gemini
    annotations:
      summary: 'NVIDIA Circuit Breaker 已斷路,切換至備援'
      runbook: 'docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md'

  tool_calling_timeout:
    # rate() over the buckets, for the same reason as slow_response.
    expr: 'histogram_quantile(0.95, rate(nvidia_tool_call_latency_seconds_bucket[5m])) > 45'
    for: 5m
    severity: warning
    auto_repair: switch_model
    annotations:
      summary: 'NVIDIA Tool Calling P95 延遲 > 45s'

  nvidia_high_error_rate:
    # Aggregated without `status` on both sides, for the same reason as
    # high_error_rate.
    expr: >-
      sum without (status) (rate(nvidia_tool_call_requests_total{status="error"}[5m]))
      / sum without (status) (rate(nvidia_tool_call_requests_total[5m])) > 0.1
    for: 5m
    severity: critical
    auto_repair: fallback_to_gemini
    annotations:
      summary: 'NVIDIA Tool Calling 錯誤率 > 10%'
# =============================================================================
# Auto-repair actions
# =============================================================================
# Keyed by action name, as referenced from `auto_repair` in services and
# alert templates. `command` contains {placeholders} such as {name},
# {namespace}, {host}, {container} and {service_name}; commands prefixed with
# `internal:` are not shell commands.
# NOTE(review): placeholder substitution and cooldown/max_replicas enforcement
# are presumably done by the executor - confirm.
auto_repair_actions:
  restart_pod:
    command: 'kubectl rollout restart deployment/{name} -n {namespace}'
    risk: low
    cooldown_minutes: 10

  scale_up:
    # kubectl has no relative replica syntax: `--replicas=+1` is read as the
    # absolute count 1 and would scale a larger deployment down. Read the
    # current count and add one instead (needs a POSIX shell for `$(( ))`).
    # custom-columns is used for the lookup because a jsonpath expression
    # would introduce braces that clash with the {placeholder} syntax.
    command: >-
      kubectl scale deployment/{name} -n {namespace}
      --replicas=$(( $(kubectl get deployment/{name} -n {namespace}
      --no-headers -o custom-columns=REPLICAS:.spec.replicas) + 1 ))
    risk: low
    max_replicas: 5
    cooldown_minutes: 15

  restart_container:
    command: 'ssh {host} docker restart {container}'
    risk: low
    cooldown_minutes: 10

  restart_service:
    command: 'ssh {host} sudo systemctl restart {service_name}'
    risk: low
    cooldown_minutes: 10

  switch_model:
    command: 'internal:switch_to_smaller_model'
    risk: low
    cooldown_minutes: 5

  cleanup_logs:
    # The remote command is passed to ssh as one quoted argument so that
    # "*.log" reaches find intact instead of being glob-expanded by the
    # remote shell.
    command: >-
      ssh {host} 'find /var/log -name "*.log" -mtime +7 -delete'
    risk: low
    cooldown_minutes: 60

  analyze_memory_leak:
    command: 'internal:trigger_memory_analysis'
    risk: low
    cooldown_minutes: 30

  # --- NVIDIA Nemotron auto-repair (Phase 20) ---
  # 2026-03-29 ogt: added per ADR-036
  fallback_to_gemini:
    command: 'internal:switch_provider_to_gemini'
    risk: low
    cooldown_minutes: 5
    description: 'NVIDIA API 失敗時自動切換至 Gemini'

  fallback_to_ollama:
    command: 'internal:switch_provider_to_ollama'
    risk: low
    cooldown_minutes: 5
    description: 'Cloud API 失敗時自動切換至本地 Ollama'