Files
awoooi/ops/monitoring/service-registry.yaml
OG T 40163a51b5 feat(monitoring): 完整監控策略與自動整合架構
新增:
1. MONITORING_COMPLETE_STRATEGY.md - 完整監控策略
   - 5 主機 × 60+ 服務監控矩陣
   - P0/P1/P2 告警規則清單
   - AI 自動修復閉環流程
   - 安全護欄配置

2. MONITORING_INTEGRATION_ARCHITECTURE.md - 自動整合架構
   - 服務註冊表 (Single Source of Truth)
   - CI/CD 自動驗證監控覆蓋率
   - 新服務自動獲得監控

3. ops/monitoring/service-registry.yaml - 服務清單
   - K8s 工作負載 (API/Web/Worker/ArgoCD)
   - Docker 容器 (Ollama/OpenClaw/Redis/Postgres)
   - 前端頁面 SLO
   - API 端點 SLO
   - 告警模板與自動修復動作

4. ops/monitoring/validate_coverage.py - 覆蓋率驗證
   - CI 階段執行
   - 檢測未監控服務
   - 生成覆蓋率報告

設計原則:
- 監控即代碼 (Monitoring as Code)
- 新服務必須在 registry 註冊才能部署
- 自動發現機制防止遺漏

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 01:52:08 +08:00

627 lines
14 KiB
YAML

# AWOOOI 服務註冊表 (Single Source of Truth)
# ===========================================
# 版本: v1.0
# 建立日期: 2026-03-29
# 用途: 所有受監控服務的統一清單
#
# 新增服務時:
# 1. 在此檔案新增 entry
# 2. CI/CD 會自動生成對應的監控配置
# 3. 部署後監控自動生效
# =============================================================================
# K8s workloads (awoooi-prod, argocd and monitoring namespaces)
# =============================================================================
# Every entry names its alerts by key; the per-entry `monitoring` map toggles
# the individual observability integrations for that service.
services:

# --- API backend ---
- name: awoooi-api
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 8000
  health_endpoint: /api/v1/health
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - pod_crash
  - high_error_rate
  - slow_response
  - memory_high
  auto_repair:
    enabled: true
    actions:
    - restart_pod
    - scale_up
  owner: backend-team
  criticality: P0

# --- Web frontend ---
- name: awoooi-web
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 3000
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: false
  alerts:
  - pod_crash
  - slow_page_load
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: frontend-team
  criticality: P0

# --- Signal worker ---
# No port is listed for this entry; health_endpoint is a file path.
# NOTE(review): health_type exec_mtime presumably means the probe checks the
# modification time of that file - confirm against the consumer.
- name: awoooi-worker
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 1
  health_endpoint: /tmp/worker-healthy
  health_type: exec_mtime
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - worker_stuck
  - queue_backlog
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: backend-team
  criticality: P1

# --- ArgoCD ---
- name: argocd-server
  type: k8s-deployment
  namespace: argocd
  port: 443
  health_endpoint: /healthz
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - sync_failed
  owner: devops-team
  criticality: P1

# --- Prometheus ---
- name: prometheus
  type: k8s-deployment
  namespace: monitoring
  port: 9090
  health_endpoint: /-/ready
  monitoring:
    prometheus: false  # monitoring itself would be circular
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0

# --- Alertmanager ---
- name: alertmanager
  type: k8s-deployment
  namespace: monitoring
  port: 9093
  health_endpoint: /-/ready
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0
# =============================================================================
# Docker containers (192.168.0.188 - AI/Web hub)
# =============================================================================
# Continuation of the `services` list: every entry here is type docker on host
# 192.168.0.188.

# --- Ollama LLM ---
- name: ollama
  type: docker
  host: 192.168.0.188
  port: 11434
  health_endpoint: /api/tags
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - inference_timeout
  - model_load_failed
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0

# --- OpenClaw AI decision center ---
- name: openclaw
  type: docker
  host: 192.168.0.188
  port: 8089
  health_endpoint: /health
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - service_down
  - analysis_timeout
  - high_error_rate
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0

# --- Redis Stack ---
# health_endpoint holds a command rather than a URL path (health_type: exec).
- name: redis
  type: docker
  host: 192.168.0.188
  port: 6380
  health_endpoint: redis-cli ping
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - memory_high
  - connection_rejected
  auto_repair:
    enabled: false  # data stores are handled with caution
  owner: infra-team
  criticality: P0

# --- PostgreSQL ---
# health_endpoint holds a command rather than a URL path (health_type: exec).
- name: postgres
  type: docker
  host: 192.168.0.188
  port: 5432
  health_endpoint: pg_isready
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - connection_pool_exhausted
  - slow_query
  - replication_lag
  auto_repair:
    enabled: false  # data stores are handled with caution
  owner: infra-team
  criticality: P0

# --- SigNoz OTEL collector ---
- name: signoz-collector
  type: docker
  host: 192.168.0.188
  port: 24317
  health_endpoint: grpc_health
  health_type: grpc
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_dropped
  owner: devops-team
  criticality: P1

# --- SigNoz UI ---
- name: signoz-ui
  type: docker
  host: 192.168.0.188
  port: 3301
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

# --- ClickHouse (SigNoz backend) ---
- name: clickhouse
  type: docker
  host: 192.168.0.188
  port: 8123
  health_endpoint: /ping
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - disk_space_low
  - query_timeout
  owner: devops-team
  criticality: P1
# =============================================================================
# Docker containers and systemd units (192.168.0.110 - DevOps hub)
# =============================================================================
# Continuation of the `services` list: every entry here runs on host
# 192.168.0.110.

# --- Harbor registry ---
- name: harbor
  type: docker
  host: 192.168.0.110
  port: 5000
  health_endpoint: /api/v2.0/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - storage_full
  - push_failed
  owner: devops-team
  criticality: P0

# --- Sentry ---
- name: sentry
  type: docker
  host: 192.168.0.110
  port: 9000
  health_endpoint: /_health/
  monitoring:
    prometheus: true
    sentry: false  # monitoring itself would be circular
  alerts:
  - service_down
  owner: devops-team
  criticality: P1

# --- Langfuse LLMOps ---
- name: langfuse
  type: docker
  host: 192.168.0.110
  port: 3100
  health_endpoint: /api/public/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_lost
  owner: ai-team
  criticality: P2

# --- GitHub Actions runner ---
# The only systemd-typed entry: it carries service_name instead of
# port/health_endpoint.
- name: github-runner
  type: systemd
  host: 192.168.0.110
  service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - runner_offline
  - job_stuck
  auto_repair:
    enabled: true
    actions:
    - restart_service
  owner: devops-team
  criticality: P0
# =============================================================================
# Host nodes
# =============================================================================
# One entry per machine: address, role, the alert keys that apply to it and
# the owning team.
nodes:

- name: mon
  ip: 192.168.0.120
  role: k3s-master
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - etcd_latency_high
  owner: infra-team

- name: mon1
  ip: 192.168.0.121
  role: k3s-worker
  alerts:
  - node_down
  - node_not_ready
  - cpu_high
  - memory_high
  - disk_space_low
  owner: infra-team

# Same address as the `host` of the harbor/sentry/langfuse/github-runner
# service entries.
- name: harbor
  ip: 192.168.0.110
  role: devops
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  owner: devops-team

# Same address as the `host` of the ollama/openclaw/redis/postgres/SigNoz
# service entries.
- name: pg
  ip: 192.168.0.188
  role: ai-web
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - gpu_utilization_high
  owner: ai-team

- name: kali
  ip: 192.168.0.112
  role: security
  alerts:
  - node_down
  owner: security-team
# =============================================================================
# Frontend pages
# =============================================================================
# Per-route front-end monitoring. `slo` is optional and only present on the
# first two routes; /settings and /knowledge-base do not set web_vitals.
pages:

- path: /
  name: Dashboard
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - js_error
  slo:
    lcp_ms: 2500
    fid_ms: 100
    cls: 0.1

- path: /authorizations
  name: 授權管理
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - api_error
  slo:
    lcp_ms: 2000

- path: /action-logs
  name: 行動日誌
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /errors
  name: 錯誤追蹤
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /settings
  name: 設定
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load

- path: /knowledge-base
  name: 知識庫
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load
# =============================================================================
# API endpoints (critical)
# =============================================================================
# Per-endpoint SLO targets: p95 latency in milliseconds and an availability
# figure. Every endpoint listed here is flagged critical.
# NOTE(review): availability looks like a percentage (95 .. 99.99) - confirm
# the unit expected by the consumer.
api_endpoints:

- path: /api/v1/health
  method: GET
  critical: true
  slo:
    latency_p95_ms: 100
    availability: 99.99

- path: /api/v1/approvals
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/approvals/{id}/sign
  method: POST
  critical: true
  slo:
    latency_p95_ms: 1000
    availability: 99.9

- path: /api/v1/incidents
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/analyze
  method: POST
  critical: true
  slo:
    latency_p95_ms: 30000  # 30s (LLM analysis)
    availability: 95

- path: /api/v1/webhooks/alertmanager
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/webhooks/sentry/error
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/execute
  method: POST
  critical: true
  slo:
    latency_p95_ms: 10000
    availability: 99
# =============================================================================
# AI services (special monitoring)
# =============================================================================
# External LLM APIs carry a rate limit, a fallback and cost tracking; the
# local entry lists its models instead.
# NOTE(review): the fallback values (`ollama`, `gemini`) do not match any
# `name` in this list (`ollama-local`, `gemini-api`) - confirm which
# identifier the consumer resolves them against.
ai_services:

- name: gemini-api
  type: external
  rate_limit:
    requests_per_minute: 60
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: ollama
  cost_tracking: true

- name: claude-api
  type: external
  rate_limit:
    requests_per_minute: 50
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: gemini
  cost_tracking: true

- name: ollama-local
  type: local
  models:
  - qwen2.5:7b
  - llama3.2:3b
  alerts:
  - model_load_failed
  - inference_timeout
  cost_tracking: false
# =============================================================================
# Alert templates
# =============================================================================
# Keyed by the alert names used in the `alerts:` lists of this file.
#   expr        - PromQL condition for the alert
#   for         - duration string attached to the condition
#   severity    - critical | warning
#   auto_repair - key of the entry in auto_repair_actions to run
# NOTE(review): several alert names referenced elsewhere in this file (for
# example slow_page_load, worker_stuck, sync_failed) have no template in this
# section - confirm whether they are defined elsewhere.
alert_templates:

  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  # Both sides are aggregated over `status` before dividing. Without that the
  # binary operator only pairs series with identical label sets, so each 5xx
  # series is divided by itself and the ratio is always 1.
  high_error_rate:
    expr: >-
      sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m]))
      > 0.01
    for: 5m
    severity: critical
    auto_repair: restart_pod

  # histogram_quantile needs per-second bucket rates; feeding it the raw
  # cumulative counters yields a quantile over the whole process lifetime.
  slow_response:
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  # NOTE(review): this template is also referenced by k8s-deployment entries
  # and by entries whose auto_repair is disabled, while its action is
  # restart_container - confirm how the consumer resolves that.
  service_down:
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    auto_repair: restart_container

  # The `> 0` filter drops containers without a memory limit; their limit is
  # reported as 0, which would turn the ratio into +Inf and fire the alert.
  memory_high:
    expr: '(container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0)) > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service
# =============================================================================
# Auto-repair actions
# =============================================================================
# Keyed by the action names used in `auto_repair.actions` and in
# alert_templates. Commands contain {placeholder} fields such as {name},
# {namespace}, {host}, {container} and {service_name}; values prefixed with
# `internal:` are not shell commands.
# NOTE(review): how placeholders are substituted and whether commands run
# through a shell is not visible here - confirm against the executor.
auto_repair_actions:

  restart_pod:
    command: 'kubectl rollout restart deployment/{name} -n {namespace}'
    risk: low
    cooldown_minutes: 10

  # NOTE(review): kubectl's --replicas flag takes an absolute replica count,
  # so "+1" is unlikely to mean "one more than now" - confirm that the
  # executor rewrites it before running, otherwise this may scale down.
  scale_up:
    command: 'kubectl scale deployment/{name} --replicas=+1 -n {namespace}'
    risk: low
    max_replicas: 5
    cooldown_minutes: 15

  restart_container:
    command: 'ssh {host} docker restart {container}'
    risk: low
    cooldown_minutes: 10

  restart_service:
    command: 'ssh {host} sudo systemctl restart {service_name}'
    risk: low
    cooldown_minutes: 10

  switch_model:
    command: 'internal:switch_to_smaller_model'
    risk: low
    cooldown_minutes: 5

  # Deletes *.log files under /var/log older than 7 days on the target host.
  # NOTE(review): if this string goes through a local shell, the double quotes
  # are consumed before ssh and the remote shell may expand *.log itself -
  # confirm the quoting survives.
  cleanup_logs:
    command: 'ssh {host} find /var/log -name "*.log" -mtime +7 -delete'
    risk: low
    cooldown_minutes: 60

  analyze_memory_leak:
    command: 'internal:trigger_memory_analysis'
    risk: low
    cooldown_minutes: 30