新增: 1. MONITORING_COMPLETE_STRATEGY.md - 完整監控策略 - 5 主機 × 60+ 服務監控矩陣 - P0/P1/P2 告警規則清單 - AI 自動修復閉環流程 - 安全護欄配置 2. MONITORING_INTEGRATION_ARCHITECTURE.md - 自動整合架構 - 服務註冊表 (Single Source of Truth) - CI/CD 自動驗證監控覆蓋率 - 新服務自動獲得監控 3. ops/monitoring/service-registry.yaml - 服務清單 - K8s 工作負載 (API/Web/Worker/ArgoCD) - Docker 容器 (Ollama/OpenClaw/Redis/Postgres) - 前端頁面 SLO - API 端點 SLO - 告警模板與自動修復動作 4. ops/monitoring/validate_coverage.py - 覆蓋率驗證 - CI 階段執行 - 檢測未監控服務 - 生成覆蓋率報告 設計原則: - 監控即代碼 (Monitoring as Code) - 新服務必須在 registry 註冊才能部署 - 自動發現機制防止遺漏 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
627 lines
14 KiB
YAML
627 lines
14 KiB
YAML
# AWOOOI 服務註冊表 (Single Source of Truth)
|
|
# ===========================================
|
|
# 版本: v1.0
|
|
# 建立日期: 2026-03-29
|
|
# 用途: 所有受監控服務的統一清單
|
|
#
|
|
# 新增服務時:
|
|
# 1. 在此檔案新增 entry
|
|
# 2. CI/CD 會自動生成對應的監控配置
|
|
# 3. 部署後監控自動生效
|
|
|
|
# =============================================================================
|
|
# K8s 工作負載 (awoooi-prod namespace)
|
|
# =============================================================================
|
|
# Registry of monitored services. Every entry lists the alerts it subscribes
# to and, where present, the auto-repair actions allowed for it.
services:
# --- API backend ---
- name: awoooi-api
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 8000
  health_endpoint: /api/v1/health
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - pod_crash
  - high_error_rate
  - slow_response
  - memory_high
  auto_repair:
    enabled: true
    actions:
    - restart_pod
    - scale_up
  owner: backend-team
  criticality: P0

# --- Web frontend ---
- name: awoooi-web
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 2
  port: 3000
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: false
  alerts:
  - pod_crash
  - slow_page_load
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: frontend-team
  criticality: P0

# --- Signal worker ---
# No port: health is a file path checked with health_type exec_mtime.
- name: awoooi-worker
  type: k8s-deployment
  namespace: awoooi-prod
  replicas: 1
  health_endpoint: /tmp/worker-healthy
  health_type: exec_mtime
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - worker_stuck
  - queue_backlog
  auto_repair:
    enabled: true
    actions:
    - restart_pod
  owner: backend-team
  criticality: P1

# --- ArgoCD ---
- name: argocd-server
  type: k8s-deployment
  namespace: argocd
  port: 443
  health_endpoint: /healthz
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - sync_failed
  owner: devops-team
  criticality: P1

# --- Prometheus ---
- name: prometheus
  type: k8s-deployment
  namespace: monitoring
  port: 9090
  health_endpoint: /-/ready
  monitoring:
    prometheus: false  # monitoring itself would be circular
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0

# --- Alertmanager ---
- name: alertmanager
  type: k8s-deployment
  namespace: monitoring
  port: 9093
  health_endpoint: /-/ready
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P0
|
|
|
|
# =============================================================================
|
|
# Docker 容器 (192.168.0.188 - AI/Web 中心)
|
|
# =============================================================================
|
|
|
|
# --- Ollama LLM ---
- name: ollama
  type: docker
  host: 192.168.0.188
  port: 11434
  health_endpoint: /api/tags
  monitoring:
    prometheus: true
    sentry: false
    otel: false
  alerts:
  - service_down
  - inference_timeout
  - model_load_failed
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0

# --- OpenClaw AI decision center ---
- name: openclaw
  type: docker
  host: 192.168.0.188
  port: 8089
  health_endpoint: /health
  monitoring:
    prometheus: true
    sentry: true
    otel: true
    langfuse: true
  alerts:
  - service_down
  - analysis_timeout
  - high_error_rate
  auto_repair:
    enabled: true
    actions:
    - restart_container
  owner: ai-team
  criticality: P0

# --- Redis Stack ---
# health_endpoint holds a command here (health_type: exec), not a URL path.
- name: redis
  type: docker
  host: 192.168.0.188
  port: 6380
  health_endpoint: redis-cli ping
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - memory_high
  - connection_rejected
  auto_repair:
    enabled: false  # data store: handle with care
  owner: infra-team
  criticality: P0

# --- PostgreSQL ---
# health_endpoint holds a command here (health_type: exec), not a URL path.
- name: postgres
  type: docker
  host: 192.168.0.188
  port: 5432
  health_endpoint: pg_isready
  health_type: exec
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - connection_pool_exhausted
  - slow_query
  - replication_lag
  auto_repair:
    enabled: false  # database: handle with care
  owner: infra-team
  criticality: P0

# --- SigNoz OTEL collector ---
- name: signoz-collector
  type: docker
  host: 192.168.0.188
  port: 24317
  health_endpoint: grpc_health
  health_type: grpc
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_dropped
  owner: devops-team
  criticality: P1

# --- SigNoz UI ---
- name: signoz-ui
  type: docker
  host: 192.168.0.188
  port: 3301
  health_endpoint: /
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  owner: devops-team
  criticality: P2

# --- ClickHouse (SigNoz backend) ---
- name: clickhouse
  type: docker
  host: 192.168.0.188
  port: 8123
  health_endpoint: /ping
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - disk_space_low
  - query_timeout
  owner: devops-team
  criticality: P1
|
|
|
|
# =============================================================================
|
|
# Docker 容器 (192.168.0.110 - DevOps 中心)
|
|
# =============================================================================
|
|
|
|
# --- Harbor registry ---
- name: harbor
  type: docker
  host: 192.168.0.110
  port: 5000
  health_endpoint: /api/v2.0/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - storage_full
  - push_failed
  owner: devops-team
  criticality: P0

# --- Sentry ---
- name: sentry
  type: docker
  host: 192.168.0.110
  port: 9000
  health_endpoint: /_health/
  monitoring:
    prometheus: true
    sentry: false  # monitoring itself would be circular
  alerts:
  - service_down
  owner: devops-team
  criticality: P1

# --- Langfuse LLMOps ---
- name: langfuse
  type: docker
  host: 192.168.0.110
  port: 3100
  health_endpoint: /api/public/health
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - service_down
  - trace_lost
  owner: ai-team
  criticality: P2

# --- GitHub Actions runner ---
# systemd unit rather than a container: identified by service_name, with no
# port or health_endpoint.
- name: github-runner
  type: systemd
  host: 192.168.0.110
  service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service
  monitoring:
    prometheus: true
    sentry: false
  alerts:
  - runner_offline
  - job_stuck
  auto_repair:
    enabled: true
    actions:
    - restart_service
  owner: devops-team
  criticality: P0
|
|
|
|
# =============================================================================
|
|
# 主機節點
|
|
# =============================================================================
|
|
# Host machines: one entry per node with its IP, role, subscribed alerts and
# owning team.
nodes:
- name: mon
  ip: 192.168.0.120
  role: k3s-master
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - etcd_latency_high
  owner: infra-team

- name: mon1
  ip: 192.168.0.121
  role: k3s-worker
  alerts:
  - node_down
  - node_not_ready
  - cpu_high
  - memory_high
  - disk_space_low
  owner: infra-team

- name: harbor
  ip: 192.168.0.110
  role: devops
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  owner: devops-team

- name: pg
  ip: 192.168.0.188
  role: ai-web
  alerts:
  - node_down
  - cpu_high
  - memory_high
  - disk_space_low
  - gpu_utilization_high
  owner: ai-team

- name: kali
  ip: 192.168.0.112
  role: security
  alerts:
  - node_down
  owner: security-team
|
|
|
|
# =============================================================================
|
|
# 前端頁面
|
|
# =============================================================================
|
|
# Frontend pages: per-route monitoring switches, subscribed alerts and,
# where given, SLO targets (lcp_ms / fid_ms / cls).
pages:
- path: /
  name: Dashboard
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - js_error
  slo:
    lcp_ms: 2500
    fid_ms: 100
    cls: 0.1

- path: /authorizations
  name: 授權管理
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load
  - api_error
  slo:
    lcp_ms: 2000

- path: /action-logs
  name: 行動日誌
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /errors
  name: 錯誤追蹤
  monitoring:
    sentry_session: true
    web_vitals: true
  alerts:
  - slow_page_load

- path: /settings
  name: 設定
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load

- path: /knowledge-base
  name: 知識庫
  monitoring:
    sentry_session: true
  alerts:
  - slow_page_load
|
|
|
|
# =============================================================================
|
|
# API 端點 (關鍵)
|
|
# =============================================================================
|
|
# Critical API endpoints with their SLO targets: p95 latency in milliseconds
# and an availability figure (presumably a percentage — confirm with the
# consumer).
api_endpoints:
- path: /api/v1/health
  method: GET
  critical: true
  slo:
    latency_p95_ms: 100
    availability: 99.99

- path: /api/v1/approvals
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/approvals/{id}/sign
  method: POST
  critical: true
  slo:
    latency_p95_ms: 1000
    availability: 99.9

- path: /api/v1/incidents
  method: GET
  critical: true
  slo:
    latency_p95_ms: 500
    availability: 99.9

- path: /api/v1/analyze
  method: POST
  critical: true
  slo:
    latency_p95_ms: 30000  # 30s (LLM analysis)
    availability: 95

- path: /api/v1/webhooks/alertmanager
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/webhooks/sentry/error
  method: POST
  critical: true
  slo:
    latency_p95_ms: 5000
    availability: 99.9

- path: /api/v1/execute
  method: POST
  critical: true
  slo:
    latency_p95_ms: 10000
    availability: 99
|
|
|
|
# =============================================================================
|
|
# AI 服務 (特殊監控)
|
|
# =============================================================================
|
|
# AI providers: external APIs carry rate limits, a fallback and cost
# tracking; the local provider lists the models it serves.
# NOTE(review): the fallback values `ollama` and `gemini` do not match the
# names used in this list (`ollama-local`, `gemini-api`) — confirm how the
# consumer resolves a fallback.
ai_services:
- name: gemini-api
  type: external
  rate_limit:
    requests_per_minute: 60
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: ollama
  cost_tracking: true

- name: claude-api
  type: external
  rate_limit:
    requests_per_minute: 50
    tokens_per_minute: 100000
  alerts:
  - rate_limit_hit
  - budget_exceeded
  fallback: gemini
  cost_tracking: true

- name: ollama-local
  type: local
  models:
  - qwen2.5:7b
  - llama3.2:3b
  alerts:
  - model_load_failed
  - inference_timeout
  cost_tracking: false
|
|
|
|
# =============================================================================
|
|
# 告警模板 (Alert Templates)
|
|
# =============================================================================
|
|
# Alert templates, keyed by the alert names that registry entries list under
# `alerts`. Each template carries:
#   expr         PromQL condition
#   for          duration value attached to the condition
#   severity     alert severity
#   auto_repair  name of an entry in auto_repair_actions
# The field names follow Prometheus alerting-rule conventions; presumably CI
# renders these into rules — confirm against the generator.
# NOTE(review): several alert names used elsewhere in this file (for example
# slow_page_load, worker_stuck, node_down) have no template in this mapping;
# confirm whether they are defined somewhere else.
alert_templates:
  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  high_error_rate:
    # Both sides are aggregated without `status` so their label sets match.
    # Dividing the raw vectors pairs each 5xx series only with itself, which
    # makes the ratio 1 as soon as any 5xx traffic exists.
    expr: >-
      sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m])) > 0.01
    for: 5m
    severity: critical
    auto_repair: restart_pod

  slow_response:
    # histogram_quantile needs per-second bucket rates; raw bucket counters
    # are cumulative and would describe all traffic since process start.
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  service_down:
    # NOTE(review): this template maps to restart_container, but it is also
    # subscribed to by k8s-deployment entries (which have no `host`) and by
    # redis/postgres, which set auto_repair.enabled: false. Confirm that the
    # executor honours the per-service auto_repair settings.
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    auto_repair: restart_container

  memory_high:
    # Containers without a memory limit report a limit of 0; filtering those
    # out avoids a division by zero (+Inf) that would always fire.
    expr: 'container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service
|
|
|
|
# =============================================================================
|
|
# 自動修復動作 (Auto-Repair Actions)
|
|
# =============================================================================
|
|
# Auto-repair actions, referenced by name from `auto_repair.actions` on
# registry entries and from `auto_repair` on alert templates.
# `{...}` placeholders in `command` are presumably filled in from the target
# entry by the executor — confirm against the executor.
# NOTE(review): restart_container uses `{container}`, but registry entries
# define `name`, not `container`; confirm how that placeholder is resolved.
auto_repair_actions:
  restart_pod:
    command: 'kubectl rollout restart deployment/{name} -n {namespace}'
    risk: low
    cooldown_minutes: 10

  scale_up:
    # kubectl reads --replicas as an absolute count ("+1" parses as 1, which
    # would scale a 2-replica deployment DOWN), so the target is computed
    # from the current spec. custom-columns is used instead of jsonpath to
    # keep literal braces out of the placeholder template.
    # NOTE(review): this requires the executor to run commands through a
    # shell, and the command itself does not cap at max_replicas — confirm
    # both against the executor.
    command: >-
      kubectl scale deployment/{name} -n {namespace}
      --replicas=$(( $(kubectl get deployment/{name} -n {namespace}
      --no-headers -o custom-columns=REPLICAS:.spec.replicas) + 1 ))
    risk: low
    max_replicas: 5
    cooldown_minutes: 15

  restart_container:
    command: 'ssh {host} docker restart {container}'
    risk: low
    cooldown_minutes: 10

  restart_service:
    command: 'ssh {host} sudo systemctl restart {service_name}'
    risk: low
    cooldown_minutes: 10

  switch_model:
    command: 'internal:switch_to_smaller_model'
    risk: low
    cooldown_minutes: 5

  cleanup_logs:
    # The remote command is passed to ssh as one single-quoted argument so
    # that "*.log" reaches find intact instead of being glob-expanded by the
    # remote shell in its working directory.
    command: >-
      ssh {host} 'find /var/log -name "*.log" -mtime +7 -delete'
    risk: low
    cooldown_minutes: 60

  analyze_memory_leak:
    command: 'internal:trigger_memory_analysis'
    risk: low
    cooldown_minutes: 30
|