# awoooi/ops/monitoring/service-registry.yaml

# AWOOOI 服務註冊表 (Single Source of Truth)
# ===========================================
# 版本: v1.0
# 建立日期: 2026-03-29
# 用途: 所有受監控服務的統一清單
#
# 新增服務時:
# 1. 在此檔案新增 entry
# 2. CI/CD 會自動生成對應的監控配置
# 3. 部署後監控自動生效

# =============================================================================
# K8s workloads (namespaces used below: awoooi-prod, argocd, monitoring)
# =============================================================================
# `services` is a single flat list; the Docker and systemd entries in the
# following sections belong to the same list.
# Names under `alerts` and `auto_repair.actions` presumably refer to keys of
# `alert_templates` and `auto_repair_actions` further down -- TODO confirm
# against the CI/CD generator.
# NOTE(review): several alert names used by services (e.g. slow_page_load,
# worker_stuck, queue_backlog, sync_failed) have no entry in `alert_templates`
# in this file.
services:
  # --- API backend ---
  - name: awoooi-api
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 2
    port: 8000
    health_endpoint: /api/v1/health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - pod_crash
      - high_error_rate
      - slow_response
      - memory_high
    auto_repair:
      enabled: true
      actions:
        - restart_pod
        - scale_up
    owner: backend-team
    criticality: P0

  # --- Web frontend ---
  - name: awoooi-web
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 2
    port: 3000
    health_endpoint: /
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: false
    alerts:
      - pod_crash
      - slow_page_load
    auto_repair:
      enabled: true
      actions:
        - restart_pod
    owner: frontend-team
    criticality: P0

  # --- Signal Worker ---
  - name: awoooi-worker
    type: k8s-deployment
    namespace: awoooi-prod
    replicas: 1
    # With health_type: exec_mtime the health_endpoint is a file path, not an
    # HTTP route; presumably the probe checks that file's modification time --
    # TODO confirm.
    health_endpoint: /tmp/worker-healthy
    health_type: exec_mtime
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - worker_stuck
      - queue_backlog
    auto_repair:
      enabled: true
      actions:
        - restart_pod
    owner: backend-team
    criticality: P1

  # --- ArgoCD ---
  - name: argocd-server
    type: k8s-deployment
    namespace: argocd
    port: 443
    health_endpoint: /healthz
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
      - sync_failed
    owner: devops-team
    criticality: P1

  # --- Prometheus ---
  - name: prometheus
    type: k8s-deployment
    namespace: monitoring
    port: 9090
    health_endpoint: /-/ready
    monitoring:
      prometheus: false  # monitoring itself would be circular
      sentry: false
    # NOTE(review): service_down is still listed; its template in this file is
    # a PromQL expression, so confirm what evaluates it when Prometheus itself
    # is down.
    alerts:
      - service_down
    owner: devops-team
    criticality: P0

  # --- Alertmanager ---
  - name: alertmanager
    type: k8s-deployment
    namespace: monitoring
    port: 9093
    health_endpoint: /-/ready
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P0

# =============================================================================
# Docker containers (192.168.0.188 - AI/Web hub)
# =============================================================================

  # --- Ollama LLM ---
  - name: ollama
    type: docker
    host: 192.168.0.188
    port: 11434
    health_endpoint: /api/tags
    monitoring:
      prometheus: true
      sentry: false
      otel: false
    alerts:
      - service_down
      - inference_timeout
      - model_load_failed
    auto_repair:
      enabled: true
      actions:
        - restart_container
    owner: ai-team
    criticality: P0

  # --- OpenClaw AI decision center ---
  - name: openclaw
    type: docker
    host: 192.168.0.188
    port: 8089
    health_endpoint: /health
    monitoring:
      prometheus: true
      sentry: true
      otel: true
      langfuse: true
    alerts:
      - service_down
      - analysis_timeout
      - high_error_rate
    auto_repair:
      enabled: true
      actions:
        - restart_container
    owner: ai-team
    criticality: P0

  # --- Redis Stack ---
  - name: redis
    type: docker
    host: 192.168.0.188
    port: 6380
    # With health_type: exec the health_endpoint holds a command instead of an
    # HTTP path.
    # NOTE(review): `redis-cli ping` carries no -p option while `port` is 6380;
    # confirm where the command runs and which port it reaches.
    health_endpoint: redis-cli ping
    health_type: exec
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - memory_high
      - connection_rejected
    auto_repair:
      enabled: false  # database: handle with caution
    owner: infra-team
    criticality: P0

  # --- PostgreSQL ---
  - name: postgres
    type: docker
    host: 192.168.0.188
    port: 5432
    # health_type: exec -> health_endpoint is a command, as for redis above.
    health_endpoint: pg_isready
    health_type: exec
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - connection_pool_exhausted
      - slow_query
      - replication_lag
    auto_repair:
      enabled: false  # database: handle with caution
    owner: infra-team
    criticality: P0

  # --- SignOz OTEL Collector ---
  - name: signoz-collector
    type: docker
    host: 192.168.0.188
    port: 24317
    # health_type: grpc; `grpc_health` presumably selects a gRPC health check
    # rather than naming an HTTP path -- TODO confirm.
    health_endpoint: grpc_health
    health_type: grpc
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - trace_dropped
    owner: devops-team
    criticality: P1

  # --- SignOz UI ---
  - name: signoz-ui
    type: docker
    host: 192.168.0.188
    port: 3301
    health_endpoint: /
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
    owner: devops-team
    criticality: P2

  # --- ClickHouse (SignOz backend) ---
  - name: clickhouse
    type: docker
    host: 192.168.0.188
    port: 8123
    health_endpoint: /ping
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - disk_space_low
      - query_timeout
    owner: devops-team
    criticality: P1

# =============================================================================
# Docker containers (192.168.0.110 - DevOps hub)
# =============================================================================
# The last entry in this section (github-runner) is `type: systemd`, not
# docker; it is identified by `service_name` and has no port or
# health_endpoint.

  # --- Harbor Registry ---
  - name: harbor
    type: docker
    host: 192.168.0.110
    port: 5000
    health_endpoint: /api/v2.0/health
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - storage_full
      - push_failed
    owner: devops-team
    criticality: P0

  # --- Sentry ---
  - name: sentry
    type: docker
    host: 192.168.0.110
    port: 9000
    health_endpoint: /_health/
    monitoring:
      prometheus: true
      sentry: false  # monitoring itself would be circular
    alerts:
      - service_down
    owner: devops-team
    criticality: P1

  # --- Langfuse LLMOps ---
  - name: langfuse
    type: docker
    host: 192.168.0.110
    port: 3100
    health_endpoint: /api/public/health
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - service_down
      - trace_lost
    owner: ai-team
    criticality: P2

  # --- GitHub Actions Runner ---
  - name: github-runner
    type: systemd
    host: 192.168.0.110
    # systemd unit name; matches the {service_name} placeholder used by the
    # restart_service action.
    service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
    monitoring:
      prometheus: true
      sentry: false
    alerts:
      - runner_offline
      - job_stuck
    auto_repair:
      enabled: true
      actions:
        - restart_service
    owner: devops-team
    criticality: P0

# =============================================================================
# Host nodes
# =============================================================================
# Machines to monitor. 192.168.0.110 and 192.168.0.188 are the `host` values
# used by the docker/systemd entries in `services`; the k3s-master /
# k3s-worker roles presumably identify the cluster nodes -- TODO confirm.
# NOTE(review): the node named `harbor` shares its name with the `harbor`
# service entry; confirm consumers keep node and service names apart.
nodes:
  - name: mon
    ip: 192.168.0.120
    role: k3s-master
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
      - etcd_latency_high
    owner: infra-team

  - name: mon1
    ip: 192.168.0.121
    role: k3s-worker
    alerts:
      - node_down
      - node_not_ready
      - cpu_high
      - memory_high
      - disk_space_low
    owner: infra-team

  - name: harbor
    ip: 192.168.0.110
    role: devops
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
    owner: devops-team

  - name: pg
    ip: 192.168.0.188
    role: ai-web
    alerts:
      - node_down
      - cpu_high
      - memory_high
      - disk_space_low
      - gpu_utilization_high
    owner: ai-team

  # Only node_down is tracked for this host.
  - name: kali
    ip: 192.168.0.112
    role: security
    alerts:
      - node_down
    owner: security-team

# =============================================================================
# Frontend pages
# =============================================================================
# Per-route monitoring switches and alerts. `slo` is optional: only `/` and
# `/authorizations` define one. lcp_ms / fid_ms / cls are presumably Web
# Vitals thresholds (LCP and FID in milliseconds, CLS unitless) -- TODO
# confirm. /settings and /knowledge-base do not set `web_vitals`.
pages:
  - path: /
    name: Dashboard
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - js_error
    slo:
      lcp_ms: 2500
      fid_ms: 100
      cls: 0.1

  - path: /authorizations
    name: 授權管理
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load
      - api_error
    slo:
      lcp_ms: 2000

  - path: /action-logs
    name: 行動日誌
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load

  - path: /errors
    name: 錯誤追蹤
    monitoring:
      sentry_session: true
      web_vitals: true
    alerts:
      - slow_page_load

  - path: /settings
    name: 設定
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

  - path: /knowledge-base
    name: 知識庫
    monitoring:
      sentry_session: true
    alerts:
      - slow_page_load

# =============================================================================
# API endpoints (critical)
# =============================================================================
# Each entry is a method + path with an SLO. latency_p95_ms is in
# milliseconds (per the key name); availability is presumably a percentage --
# TODO confirm. Every entry listed here sets `critical: true`.
api_endpoints:
  - path: /api/v1/health
    method: GET
    critical: true
    slo:
      latency_p95_ms: 100
      availability: 99.99

  - path: /api/v1/approvals
    method: GET
    critical: true
    slo:
      latency_p95_ms: 500
      availability: 99.9

  - path: /api/v1/approvals/{id}/sign
    method: POST
    critical: true
    slo:
      latency_p95_ms: 1000
      availability: 99.9

  - path: /api/v1/incidents
    method: GET
    critical: true
    slo:
      latency_p95_ms: 500
      availability: 99.9

  - path: /api/v1/analyze
    method: POST
    critical: true
    slo:
      latency_p95_ms: 30000  # 30s (LLM analysis)
      availability: 95

  - path: /api/v1/webhooks/alertmanager
    method: POST
    critical: true
    slo:
      latency_p95_ms: 5000
      availability: 99.9

  - path: /api/v1/webhooks/sentry/error
    method: POST
    critical: true
    slo:
      latency_p95_ms: 5000
      availability: 99.9

  - path: /api/v1/execute
    method: POST
    critical: true
    slo:
      latency_p95_ms: 10000
      availability: 99

# =============================================================================
# AI services (special monitoring)
# =============================================================================
# LLM backends tracked by the registry. `type: external` entries carry a
# rate_limit and cost tracking; the `type: local` entry lists its models.
# `fallback` names another entry of this list by its `name`, so the chain
# claude-api -> gemini-api -> ollama-local resolves entirely inside this list.
# NOTE(review): the previous values (`gemini`, `ollama`) matched no `name` in
# this list; confirm the consumer resolves fallbacks by entry name.
ai_services:
  - name: gemini-api
    type: external
    rate_limit:
      requests_per_minute: 60
      tokens_per_minute: 100000
    alerts:
      - rate_limit_hit
      - budget_exceeded
    fallback: ollama-local
    cost_tracking: true

  - name: claude-api
    type: external
    rate_limit:
      requests_per_minute: 50
      tokens_per_minute: 100000
    alerts:
      - rate_limit_hit
      - budget_exceeded
    fallback: gemini-api
    cost_tracking: true

  # Last link of the fallback chain: no `fallback` and no rate_limit.
  - name: ollama-local
    type: local
    models:
      # Quoted so the `name:tag` form is always read as a plain string.
      - 'qwen2.5:7b'
      - 'llama3.2:3b'
    alerts:
      - model_load_failed
      - inference_timeout
    cost_tracking: false

# =============================================================================
# Alert templates
# =============================================================================
# Keyed by the alert names that services and nodes list under `alerts`.
#   expr         PromQL condition
#   for          duration the condition must hold (presumably copied into the
#                generated Prometheus alerting rule -- TODO confirm)
#   severity     critical | warning
#   auto_repair  key of `auto_repair_actions` to run for this alert
alert_templates:
  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  high_error_rate:
    # `status` is aggregated away on both sides. Without that, the 5xx-only
    # series on the left can only match the identical series on the right, so
    # the ratio is 1 whenever any 5xx occurs and the 1% threshold is
    # meaningless. All other labels are preserved.
    expr: >-
      sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m]))
      > 0.01
    for: 5m
    severity: critical
    auto_repair: restart_pod

  slow_response:
    # rate() over the buckets makes this the p95 of the last 5 minutes.
    # histogram_quantile over the raw cumulative counters would give the p95
    # since process start, which barely reacts to a new slowdown.
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  service_down:
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    # NOTE(review): restart_container is an ssh/docker action, yet
    # service_down is also listed by k8s-deployment services; confirm how the
    # consumer picks the action for those.
    auto_repair: restart_container

  memory_high:
    # `> 0` drops containers without a memory limit: their limit is reported
    # as 0, which would turn the ratio into +Inf and fire permanently.
    expr: 'container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    # Fires when less than 15% of a filesystem is available.
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service

# =============================================================================
# Auto-repair actions
# =============================================================================
# Keyed by the action names used in `auto_repair.actions` of services and in
# `auto_repair` of alert templates.
#   command           command template; {name}, {namespace}, {host} and
#                     {service_name} correspond to keys of a service entry
#   risk              risk label of the action
#   cooldown_minutes  presumably the minimum gap between two runs of the same
#                     action -- TODO confirm
# Commands prefixed with `internal:` are not shell commands; presumably they
# are dispatched inside the repair engine -- TODO confirm.
auto_repair_actions:
  restart_pod:
    command: 'kubectl rollout restart deployment/{name} -n {namespace}'
    risk: low
    cooldown_minutes: 10

  scale_up:
    # kubectl treats --replicas as an absolute count, so the former
    # `--replicas=+1` set the deployment to ONE replica (a scale-down for the
    # 2-replica services) instead of adding one. Read the current count and
    # add 1. custom-columns is used rather than jsonpath so the command
    # contains no braces besides the template placeholders.
    # Requires the command to be run through a shell (`$(( ))`, `$( )`).
    # NOTE(review): max_replicas is not enforced by the command itself;
    # confirm the executor applies it.
    command: >-
      kubectl scale deployment/{name} -n {namespace}
      --replicas=$(( $(kubectl get deployment/{name} -n {namespace}
      --no-headers -o custom-columns=REPLICAS:.spec.replicas) + 1 ))
    risk: low
    max_replicas: 5
    cooldown_minutes: 15

  restart_container:
    # NOTE(review): service entries define `name` but no `container` key;
    # confirm the executor supplies {container}.
    command: 'ssh {host} docker restart {container}'
    risk: low
    cooldown_minutes: 10

  restart_service:
    command: 'ssh {host} sudo systemctl restart {service_name}'
    risk: low
    cooldown_minutes: 10

  switch_model:
    command: 'internal:switch_to_smaller_model'
    risk: low
    cooldown_minutes: 5

  cleanup_logs:
    # Deletes *.log files under /var/log older than 7 days on the target host.
    # NOTE(review): if a local shell strips the double quotes, the remote
    # shell receives an unquoted *.log and may glob-expand it; confirm how the
    # executor quotes ssh commands.
    command: 'ssh {host} find /var/log -name "*.log" -mtime +7 -delete'
    risk: low
    cooldown_minutes: 60

  analyze_memory_leak:
    command: 'internal:trigger_memory_analysis'
    risk: low
    cooldown_minutes: 30