diff --git a/docs/MONITORING_COMPLETE_STRATEGY.md b/docs/MONITORING_COMPLETE_STRATEGY.md new file mode 100644 index 00000000..06379fe3 --- /dev/null +++ b/docs/MONITORING_COMPLETE_STRATEGY.md @@ -0,0 +1,575 @@ +# AWOOOI 完整監控與 AI 自動修復策略 + +> **版本**: v1.0 +> **建立日期**: 2026-03-29 +> **負責人**: 首席架構師 (Claude Code) +> **目標**: 100% 覆蓋率監控 + AI 驅動自動修復 + +--- + +## 執行摘要 + +本文件定義 AWOOOI 全棧監控策略,涵蓋: +- **5 大主機** × **60+ 服務** × **4 層監控** +- **三層可觀測性**: Sentry (錯誤) + SignOz (追蹤) + Prometheus (指標) +- **AI 自動修復閉環**: 異常 → OpenClaw 分析 → 自動執行/人工審核 + +--- + +## 一、監控覆蓋矩陣 + +### 1.1 主機層 (Infrastructure) + +| 主機 | IP | 角色 | 監控項目 | 告警規則 | +|------|----|----|----------|----------| +| **mon (K3s Master)** | 192.168.0.120 | K3s Server + keepalived | CPU/MEM/Disk/etcd | NodeDown, etcdHighLatency | +| **mon1 (K3s Worker)** | 192.168.0.121 | K3s Worker + keepalived | CPU/MEM/Disk/kubelet | NodeNotReady, DiskPressure | +| **harbor (DevOps)** | 192.168.0.110 | Harbor/Sentry/Langfuse/Runner | CPU/MEM/Disk/Docker | HarborDown, RunnerOffline | +| **pg (AI/Web)** | 192.168.0.188 | PostgreSQL/Redis/Ollama/SignOz | CPU/MEM/Disk/GPU | DBConnectionFailed, OllamaTimeout | +| **kali (Security)** | 192.168.0.112 | Kali Scanner | CPU/MEM/Disk | ScannerOffline | + +### 1.2 服務層 (Services) + +#### A. Kubernetes 工作負載 (K3s) + +| 命名空間 | Deployment/StatefulSet | 副本數 | 健康檢查 | 告警條件 | +|----------|------------------------|--------|----------|----------| +| **awoooi-prod** | awoooi-api | 2 | HTTP /api/v1/health | PodCrashLoopBackOff, ReplicasUnavailable | +| **awoooi-prod** | awoooi-web | 2 | HTTP / | HighErrorRate, SlowResponse | +| **awoooi-prod** | awoooi-worker | 1 | Exec mtime | WorkerStuck, QueueBacklog | +| **argocd** | argocd-server | 1 | HTTP /healthz | ArgoCDDown | +| **monitoring** | prometheus-server | 1 | HTTP /-/ready | PrometheusDown | +| **monitoring** | alertmanager | 1 | HTTP /-/ready | AlertmanagerDown | +| **velero** | velero | 1 | - | BackupFailed | + +#### B. 
容器服務 (Docker on 188/110) + +| 主機 | 容器 | 端口 | 健康檢查 | 告警條件 | +|------|------|------|----------|----------| +| 188 | ollama | 11434 | GET /api/tags | OllamaUnresponsive, ModelLoadFailed | +| 188 | openclaw | 8089 | GET /health | OpenClawDown, AnalysisTimeout | +| 188 | signoz-collector | 24317/24318 | gRPC health | TraceDropped | +| 188 | signoz-ui | 3301 | HTTP / | SignOzUIDown | +| 188 | redis-stack | 6380 | redis-cli ping | RedisDown, MemoryExhausted | +| 188 | postgres | 5432 | pg_isready | PostgresDown, ConnectionPoolExhausted | +| 110 | harbor-core | 5000 | GET /api/v2.0/health | HarborDown | +| 110 | sentry-web | 9000 | GET /_health/ | SentryDown | +| 110 | langfuse | 3100 | GET /api/public/health | LangfuseDown | +| 110 | actions-runner | - | systemctl status | RunnerOffline | + +### 1.3 應用層 (Application) + +#### A. API 端點監控 + +| 端點 | 方法 | 預期回應 | SLO | 告警 | +|------|------|----------|-----|------| +| /api/v1/health | GET | 200 | 99.9% | APIHealthCheckFailed | +| /api/v1/approvals/pending | GET | 200 | 99% | ApprovalsAPIError | +| /api/v1/incidents | GET | 200 | 99% | IncidentsAPIError | +| /api/v1/analyze | POST | 200/202 | 95% | AnalysisTimeout (>30s) | +| /api/v1/execute | POST | 200 | 99% | ExecutionFailed | + +#### B. 錯誤率監控 (Sentry) + +| 類型 | 閾值 | 告警 | 自動修復 | +|------|------|------|----------| +| Unhandled Exception | >0 in 5min | SentryNewError | AI 分析 + Playbook 匹配 | +| HTTP 5xx | >1% | HighErrorRate | Pod 重啟 | +| HTTP 4xx | >10% | ClientErrorSpike | 告警 + 日誌分析 | +| Slow Transaction | P95 >2s | SlowTransaction | 資源擴展建議 | + +#### C. 
前端監控 + +| 指標 | 來源 | 閾值 | 告警 | +|------|------|------|------| +| Page Load Time | Sentry Performance | >3s | SlowPageLoad | +| JS Error Rate | Sentry Issues | >0.1% | FrontendError | +| API Call Failures | Sentry Breadcrumbs | >1% | APICallFailed | +| Web Vitals (LCP/FID/CLS) | Sentry | Google 標準 | PoorWebVitals | + +### 1.4 資料層 (Data) + +| 資料庫 | 監控項目 | 告警條件 | +|--------|----------|----------| +| **PostgreSQL** | 連線數、QPS、慢查詢、WAL 延遲、Disk I/O | ConnectionPoolExhausted (>90%), SlowQuery (>5s), ReplicationLag (>30s) | +| **Redis** | 記憶體使用、命中率、延遲、Key 數量 | MemoryHigh (>80%), HitRateLow (<90%), SlowCommands | +| **ClickHouse** | 磁碟使用、查詢延遲、插入速率 | DiskFull (>85%), QueryTimeout | + +### 1.5 AI/LLM 層 + +| 服務 | 監控項目 | 告警條件 | 自動修復 | +|------|----------|----------|----------| +| **Ollama** | 推理延遲、模型載入狀態、GPU 使用 | InferenceTimeout (>60s), ModelLoadFailed | 容器重啟 | +| **OpenClaw** | 分析成功率、回應時間、Token 使用 | AnalysisFailed (>10%), HighTokenCost | Fallback to Gemini | +| **Gemini API** | Rate Limit、錯誤率、成本 | RateLimitHit, BudgetExceeded | 降級到 Ollama | +| **Claude API** | Rate Limit、錯誤率、成本 | RateLimitHit, BudgetExceeded | 降級到 Gemini | +| **Langfuse** | Trace 記錄成功率 | TraceLost (>1%) | Reconnect | + +### 1.6 CI/CD 層 + +| 元件 | 監控項目 | 告警條件 | +|------|----------|----------| +| **GitHub Actions** | Workflow 狀態、Runner 健康、Job 延遲 | WorkflowFailed, RunnerOffline, JobStuck (>30min) | +| **Harbor** | 映像推送/拉取成功率、儲存空間 | PushFailed, PullFailed, StorageFull | +| **ArgoCD** | Sync 狀態、Application 健康 | SyncFailed, AppDegraded | + +--- + +## 二、告警規則完整清單 + +### 2.1 P0 - Critical (5 分鐘回應) + +```yaml +# === 基礎設施層 === +- alert: NodeDown + expr: up{job="node-exporter"} == 0 + for: 1m + severity: critical + auto_repair: false # 需人工介入 + +- alert: K3sAPIServerDown + expr: up{job="kubernetes-apiservers"} == 0 + for: 1m + severity: critical + auto_repair: false + +- alert: PostgreSQLDown + expr: pg_up == 0 + for: 30s + severity: critical + auto_repair: restart_container + +- alert: RedisDown + expr: redis_up == 0 + for: 
30s + severity: critical + auto_repair: restart_container + +# === 應用層 === +- alert: AWOOOIAPIDown + expr: probe_success{job="awoooi-api"} == 0 + for: 1m + severity: critical + auto_repair: restart_pod + +- alert: OpenClawDown + expr: probe_success{job="openclaw"} == 0 + for: 2m + severity: critical + auto_repair: restart_container + +- alert: PodCrashLoopBackOff + expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0 + for: 2m + severity: critical + auto_repair: collect_logs_and_rollback + +# === CI/CD 層 === +- alert: GitHubRunnerOffline + expr: github_runner_status == 0 + for: 5m + severity: critical + auto_repair: restart_runner_service +``` + +### 2.2 P1 - High (15 分鐘回應) + +```yaml +# === 效能告警 === +- alert: HighCPUUsage + expr: node_cpu_usage_percent > 90 + for: 5m + severity: high + auto_repair: scale_up_if_possible + +- alert: HighMemoryUsage + expr: node_memory_usage_percent > 90 + for: 5m + severity: high + auto_repair: investigate_memory_leak + +- alert: APIHighLatency + expr: histogram_quantile(0.95, http_request_duration_seconds_bucket) > 2 + for: 5m + severity: high + auto_repair: analyze_slow_endpoints + +- alert: HighErrorRate + expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.01 + for: 5m + severity: high + auto_repair: restart_pod + +- alert: OllamaSlowInference + expr: ollama_inference_duration_seconds > 60 + for: 3m + severity: high + auto_repair: switch_to_smaller_model + +# === 資源告警 === +- alert: DiskSpaceLow + expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15 + for: 10m + severity: high + auto_repair: cleanup_old_logs + +- alert: PostgreSQLConnectionPoolHigh + expr: pg_stat_activity_count / pg_settings_max_connections > 0.8 + for: 5m + severity: high + auto_repair: analyze_connection_leaks +``` + +### 2.3 P2 - Medium (1 小時回應) + +```yaml +- alert: CertificateExpiringSoon + expr: ssl_cert_not_after - time() < 14 * 24 * 3600 + severity: medium + auto_repair: 
renew_certificate + +- alert: BackupNotSuccessful + expr: velero_backup_success_total < 1 in 24h + severity: medium + auto_repair: trigger_backup + +- alert: LangfuseTraceLoss + expr: langfuse_trace_drop_rate > 0.01 + severity: medium + auto_repair: reconnect_langfuse +``` + +--- + +## 三、AI 自動修復閉環 + +### 3.1 修復流程圖 + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 異常發生 │ +│ (Prometheus Alert / Sentry Issue / SignOz Anomaly) │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Alertmanager 路由 │ +│ ┌───────────────┬───────────────┬───────────────┐ │ +│ │ route: awoooi │ route: infra │ route: aiops │ │ +│ └───────┬───────┴───────┬───────┴───────┬───────┘ │ +└──────────┼───────────────┼───────────────┼──────────────────────────┘ + │ │ │ + └───────────────┼───────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ AWOOOI API: /api/v1/webhooks/alertmanager │ +│ 1. 接收告警 → 2. 去重 (10min fingerprint) → 3. 建立 Incident │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ OpenClaw AI 分析引擎 │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ 輸入: │ │ +│ │ - Alert 內容 (labels, annotations) │ │ +│ │ - K8s 上下文 (Pod logs, events, metrics) │ │ +│ │ - 歷史 Playbook (相似案例) │ │ +│ │ - SignOz Traces (相關 Span) │ │ +│ │ - Sentry Issues (相關錯誤) │ │ +│ ├───────────────────────────────────────────────────────────────┤ │ +│ │ 輸出: │ │ +│ │ - suggested_action: RESTART_POD | DELETE_POD | SCALE_UP | ... 
│ │ +│ │ - confidence: 0.0-1.0 │ │ +│ │ - risk_level: LOW | MEDIUM | CRITICAL │ │ +│ │ - blast_radius: {affected_pods, estimated_downtime} │ │ +│ │ - kubectl_command: 具體指令 │ │ +│ │ - reasoning: 決策理由 (繁體中文) │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ┌──────────────┴──────────────┐ + │ │ + ▼ ▼ +┌─────────────────────────┐ ┌─────────────────────────┐ +│ confidence >= 0.85 │ │ confidence < 0.85 │ +│ risk_level = LOW │ │ OR risk = CRITICAL │ +│ ↓ │ │ ↓ │ +│ 自動執行 │ │ 人工審核 │ +└───────────┬─────────────┘ └───────────┬─────────────┘ + │ │ + │ ▼ + │ ┌─────────────────────────┐ + │ │ Telegram 推送審核卡片 │ + │ │ [✅ 簽核] [❌ 拒絕] │ + │ │ [⏰ 稍後] [🔕 靜默] │ + │ └───────────┬─────────────┘ + │ │ + │ ┌───────────┴───────────┐ + │ │ │ + │ ▼ ▼ + │ ┌────────────┐ ┌────────────┐ + │ │ 人工批准 │ │ 人工拒絕 │ + │ └─────┬──────┘ └─────┬──────┘ + │ │ │ + └──────────────┼───────────────────────┤ + │ │ + ▼ ▼ + ┌─────────────────────────┐ ┌────────────────┐ + │ K8s Executor 執行 │ │ 記錄拒絕原因 │ + │ kubectl $command │ │ 更新 Playbook │ + └───────────┬─────────────┘ └────────────────┘ + │ + ▼ + ┌─────────────────────────┐ + │ 執行結果驗證 │ + │ - 健康檢查通過? │ + │ - 錯誤率下降? │ + │ - 延遲恢復正常? │ + └───────────┬─────────────┘ + │ + ┌───────────┴───────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ 修復成功 │ │ 修復失敗 │ + │ - 關閉 Incident│ │ - 升級告警 │ + │ - 更新 Playbook│ │ - 記錄失敗 │ + │ - Telegram 通知│ │ - 人工介入 │ + └─────────────────┘ └─────────────────┘ +``` + +### 3.2 自動修復動作清單 + +| 動作 | 觸發條件 | 執行指令 | 風險等級 | 自動執行? 
| +|------|----------|----------|----------|-----------| +| `RESTART_POD` | PodCrashLoop, HighErrorRate | `kubectl rollout restart deployment/{name}` | LOW | ✅ 可自動 | +| `DELETE_POD` | PodStuck, OOMKilled | `kubectl delete pod {name} --grace-period=30` | LOW | ✅ 可自動 | +| `SCALE_UP` | HighCPU, HighMemory, SlowResponse | `kubectl scale deployment/{name} --replicas={current+1}` | LOW | ✅ 可自動 | +| `SCALE_DOWN` | ResourceWaste | `kubectl scale deployment/{name} --replicas={current-1}` | MEDIUM | ❌ 需審核 | +| `ROLLBACK` | DeploymentFailed, VersionDrift | `kubectl rollout undo deployment/{name}` | MEDIUM | ❌ 需審核 | +| `RESTART_CONTAINER` | ContainerUnhealthy | `docker restart {container}` | LOW | ✅ 可自動 | +| `CLEAR_CACHE` | RedisMemoryHigh, StaleCache | `redis-cli FLUSHDB` | MEDIUM | ❌ 需審核 | +| `VACUUM_DB` | TableBloat, SlowQuery | `VACUUM ANALYZE {table}` | MEDIUM | ❌ 需審核 | +| `RENEW_CERT` | CertExpiring | `certbot renew` | LOW | ✅ 可自動 | +| `CLEANUP_LOGS` | DiskSpaceLow | `find /var/log -mtime +7 -delete` | LOW | ✅ 可自動 | +| `SWITCH_MODEL` | OllamaTimeout | 切換到更小模型 | LOW | ✅ 可自動 | +| `FALLBACK_AI` | GeminiRateLimit | Gemini → Ollama | LOW | ✅ 可自動 | + +### 3.3 安全護欄 + +```python +# === 自動修復安全限制 === +SAFETY_GUARDRAILS = { + # 頻率限制 + "max_repairs_per_hour": 5, # 每小時最多 5 次自動修復 + "max_repairs_per_resource": 3, # 同一資源每小時最多 3 次 + "cooldown_after_failure": 600, # 失敗後冷卻 10 分鐘 + + # 風險限制 + "auto_approve_max_risk": "LOW", # 自動批准僅限 LOW 風險 + "auto_approve_min_confidence": 0.85, # 最低信心度 85% + + # 影響範圍限制 + "max_affected_pods": 3, # 最多影響 3 個 Pod + "min_healthy_replicas": 1, # 至少保留 1 個健康副本 + + # 禁止自動執行 + "blacklist_actions": [ + "DROP_DATABASE", + "DELETE_NAMESPACE", + "FORCE_DELETE_PVC", + "DELETE_SECRET", + ], + + # 白名單命名空間 + "allowed_namespaces": [ + "awoooi-prod", + "monitoring", + ], +} +``` + +--- + +## 四、監控資料流整合 + +### 4.1 Sentry → OpenClaw + +```python +# /api/v1/webhooks/sentry - Sentry Issue Alert Webhook +async def handle_sentry_webhook(payload: dict): + """ + 1. 解析 Sentry Issue + 2. 
去重檢查 (10 分鐘 TTL) + 3. 建立 Incident + 4. 觸發 OpenClaw 分析 + 5. 推送 Telegram + """ + issue_id = payload["data"]["issue"]["id"] + + # 去重 + if await redis.get(f"sentry_dedup:{issue_id}"): + return {"status": "deduplicated"} + await redis.setex(f"sentry_dedup:{issue_id}", 600, "1") + + # 建立 Incident + incident = await incident_service.create_from_sentry(payload) + + # AI 分析 + analysis = await openclaw.analyze_error( + error_title=payload["data"]["issue"]["title"], + stack_trace=payload["data"]["issue"]["culprit"], + sentry_url=payload["data"]["issue"]["web_url"], + trace_id=extract_trace_id(payload), + ) + + # Telegram 通知 + await telegram.send_error_alert( + incident_id=incident.id, + analysis=analysis, + sentry_url=payload["data"]["issue"]["web_url"], + ) +``` + +### 4.2 Alertmanager → OpenClaw + +```yaml +# alertmanager.yml +route: + receiver: awoooi-api + routes: + - match: + namespace: awoooi-prod + receiver: awoooi-api + - match: + severity: critical + receiver: awoooi-api + +receivers: + - name: awoooi-api + webhook_configs: + - url: http://192.168.0.125:32334/api/v1/webhooks/alertmanager + send_resolved: true + http_config: + basic_auth: + username: alertmanager + password_file: /etc/alertmanager/secrets/webhook-password +``` + +### 4.3 SignOz → OpenClaw + +```python +# 透過 ClickHouse 查詢異常 Span +async def detect_signoz_anomalies(): + """ + 定期查詢 SignOz ClickHouse 偵測: + - Error Rate 異常上升 + - Latency P99 異常 + - Trace 數量驟降 (服務可能掛了) + """ + anomalies = await clickhouse.query(""" + SELECT + serviceName, + count(*) as error_count, + avg(durationNano) / 1e6 as avg_latency_ms + FROM signoz_traces.signoz_index_v2 + WHERE timestamp > now() - INTERVAL 5 MINUTE + AND statusCode = 'STATUS_CODE_ERROR' + GROUP BY serviceName + HAVING error_count > 10 + """) + + for anomaly in anomalies: + await openclaw.analyze_trace_anomaly( + service=anomaly["serviceName"], + error_count=anomaly["error_count"], + avg_latency=anomaly["avg_latency_ms"], + ) +``` + +--- + +## 五、實作優先級 + +### Phase 1 
(本週 - P0) + +| 項目 | 狀態 | 負責 | 說明 | +|------|------|------|------| +| Alertmanager → AWOOOI Webhook | ⬜ TODO | Claude Code | 配置 webhook + 測試告警 | +| Sentry Webhook → Telegram | ⬜ TODO | Claude Code | 錯誤直接推送 + AI 分析 | +| Secrets 自動注入 (CD) | ⬜ TODO | Claude Code | kubectl patch secret | +| 告警去重驗證 | ⬜ TODO | Claude Code | 10min fingerprint 測試 | + +### Phase 2 (下週 - P1) + +| 項目 | 狀態 | 負責 | 說明 | +|------|------|------|------| +| SignOz 告警規則 | ⬜ TODO | Claude Code | Error Rate, Latency P99 | +| 自動修復動作擴展 | ⬜ TODO | Claude Code | SCALE_UP, ROLLBACK | +| Playbook 自動萃取 | ⬜ TODO | Claude Code | 成功修復 → Playbook | +| 告警升級機制 | ⬜ TODO | Claude Code | SLA Engine | + +### Phase 3 (兩週後 - P2) + +| 項目 | 狀態 | 負責 | 說明 | +|------|------|------|------| +| Grafana 儀表板 | ⬜ TODO | Claude Code | 監控總覽 | +| SLO/SLI 定義 | ⬜ TODO | Claude Code | 99.9% 可用性目標 | +| 告警噪音抑制 | ⬜ TODO | Claude Code | ML 異常偵測 | +| 容量預測 | ⬜ TODO | Claude Code | 資源趨勢預測 | + +--- + +## 六、附錄 + +### A. 環境變數清單 + +```bash +# === Alertmanager === +ALERTMANAGER_WEBHOOK_URL=http://192.168.0.125:32334/api/v1/webhooks/alertmanager +ALERTMANAGER_WEBHOOK_SECRET= + +# === Sentry === +SENTRY_DSN=http://@192.168.0.110:9000/ +SENTRY_WEBHOOK_SECRET= +SENTRY_DEDUP_TTL=600 + +# === SignOz === +SIGNOZ_CLICKHOUSE_URL=http://192.168.0.188:8123 +SIGNOZ_ANOMALY_THRESHOLD_ERROR_COUNT=10 + +# === 自動修復 === +AUTO_REPAIR_ENABLED=true +AUTO_REPAIR_MAX_PER_HOUR=5 +AUTO_REPAIR_MIN_CONFIDENCE=0.85 +AUTO_REPAIR_DRY_RUN=false +``` + +### B. 
告警模板 + +```markdown +🚨 **CRITICAL | awoooi-api** +━━━━━━━━━━━━━━━━━━━ +📋 INC-20260329-0001 +🎯 Pod: awoooi-api-7d4b8c9f5-abc12 +━━━━━━━━━━━━━━━━━━━ +🤖 **AI 分析** +👥 責任: BE (後端) +📊 信心: 🟢 92% +💡 原因: OOM Killed - Memory limit exceeded +━━━━━━━━━━━━━━━━━━━ +🔧 建議: DELETE_POD + SCALE_UP +⏱️ 停機: ~30s +💰 Tokens: 1,234 / $0.0012 +━━━━━━━━━━━━━━━━━━━ +🔗 [SignOz Trace](http://192.168.0.188:3301/trace/abc123) +🔗 [Sentry Issue](http://192.168.0.110:9000/issues/456) + +[✅ 簽核] [❌ 拒絕] [⏰ 稍後] [🔕 靜默] +``` + +--- + +**文件結束** +**下一步**: 執行 Phase 1 任務 diff --git a/docs/MONITORING_INTEGRATION_ARCHITECTURE.md b/docs/MONITORING_INTEGRATION_ARCHITECTURE.md new file mode 100644 index 00000000..eff72e39 --- /dev/null +++ b/docs/MONITORING_INTEGRATION_ARCHITECTURE.md @@ -0,0 +1,977 @@ +# AWOOOI 監控自動整合架構 + +> **版本**: v1.0 +> **建立日期**: 2026-03-29 +> **目標**: 新服務/功能自動獲得監控,零遺漏 + +--- + +## 核心原則 + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 🎯 監控即代碼 (Monitoring as Code) │ +│ │ +│ • 所有監控配置存放於 Git │ +│ • CI/CD 自動驗證監控覆蓋率 │ +│ • 新服務必須通過監控檢查才能部署 │ +│ • 服務註冊表自動同步監控規則 │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 一、服務註冊表 (Service Registry) + +### 1.1 註冊表結構 + +```yaml +# /ops/monitoring/service-registry.yaml +# 所有受監控服務的單一事實來源 (Single Source of Truth) + +services: + # === K8s 工作負載 === + - name: awoooi-api + type: k8s-deployment + namespace: awoooi-prod + port: 8000 + health_endpoint: /api/v1/health + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true # 有 LLM 呼叫 + alerts: + - pod_crash + - high_error_rate + - slow_response + owner: backend-team + + - name: awoooi-web + type: k8s-deployment + namespace: awoooi-prod + port: 3000 + health_endpoint: / + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: false + alerts: + - pod_crash + - slow_page_load + owner: frontend-team + + - name: awoooi-worker + type: k8s-deployment + namespace: awoooi-prod + health_endpoint: /tmp/worker-healthy # exec probe 
+ monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true + alerts: + - worker_stuck + - queue_backlog + owner: backend-team + + # === Docker 容器 (188) === + - name: ollama + type: docker + host: 192.168.0.188 + port: 11434 + health_endpoint: /api/tags + monitoring: + prometheus: true + sentry: false # 外部服務 + otel: false + alerts: + - service_down + - inference_timeout + owner: ai-team + + - name: openclaw + type: docker + host: 192.168.0.188 + port: 8089 + health_endpoint: /health + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true + alerts: + - service_down + - analysis_timeout + owner: ai-team + + - name: redis + type: docker + host: 192.168.0.188 + port: 6380 + health_endpoint: redis-cli ping + monitoring: + prometheus: true + alerts: + - service_down + - memory_high + owner: infra-team + + - name: postgres + type: docker + host: 192.168.0.188 + port: 5432 + health_endpoint: pg_isready + monitoring: + prometheus: true + alerts: + - service_down + - connection_pool_exhausted + - slow_query + owner: infra-team + + # === Docker 容器 (110) === + - name: harbor + type: docker + host: 192.168.0.110 + port: 5000 + health_endpoint: /api/v2.0/health + monitoring: + prometheus: true + alerts: + - service_down + - storage_full + owner: devops-team + + - name: sentry + type: docker + host: 192.168.0.110 + port: 9000 + health_endpoint: /_health/ + monitoring: + prometheus: true + alerts: + - service_down + owner: devops-team + + - name: langfuse + type: docker + host: 192.168.0.110 + port: 3100 + health_endpoint: /api/public/health + monitoring: + prometheus: true + alerts: + - service_down + owner: ai-team + + - name: github-runner + type: systemd + host: 192.168.0.110 + service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service + monitoring: + prometheus: true + alerts: + - runner_offline + owner: devops-team + +# === 前端頁面 === +pages: + - path: / + name: Dashboard + monitoring: + sentry_session: true + web_vitals: true + 
alerts: + - slow_page_load + - js_error + + - path: /authorizations + name: 授權管理 + monitoring: + sentry_session: true + web_vitals: true + alerts: + - slow_page_load + - api_error + + - path: /action-logs + name: 行動日誌 + monitoring: + sentry_session: true + alerts: + - slow_page_load + + - path: /errors + name: 錯誤追蹤 + monitoring: + sentry_session: true + alerts: + - slow_page_load + +# === API 端點 === +api_endpoints: + - path: /api/v1/health + method: GET + critical: true + slo_latency_ms: 100 + slo_availability: 99.99 + + - path: /api/v1/approvals + method: GET + critical: true + slo_latency_ms: 500 + slo_availability: 99.9 + + - path: /api/v1/analyze + method: POST + critical: true + slo_latency_ms: 30000 # 30s (LLM) + slo_availability: 95 + + - path: /api/v1/webhooks/alertmanager + method: POST + critical: true + slo_latency_ms: 5000 + slo_availability: 99.9 +``` + +### 1.2 自動生成工具 + +```python +# /ops/monitoring/generate_monitoring.py +""" +從 service-registry.yaml 自動生成: +1. Prometheus scrape configs +2. Prometheus alert rules +3. Grafana dashboards +4. 
Blackbox exporter targets +""" + +import yaml +from pathlib import Path + +def generate_prometheus_config(registry: dict) -> str: + """生成 Prometheus scrape_configs""" + scrape_configs = [] + + for service in registry['services']: + if service['monitoring'].get('prometheus'): + config = { + 'job_name': service['name'], + 'static_configs': [{ + 'targets': [f"{service['host']}:{service['port']}"] + }], + 'metrics_path': '/metrics', + 'scrape_interval': '15s', + } + + # 根據類型調整 + if service['type'] == 'k8s-deployment': + config['kubernetes_sd_configs'] = [{ + 'role': 'pod', + 'namespaces': {'names': [service['namespace']]} + }] + del config['static_configs'] + + scrape_configs.append(config) + + return yaml.dump({'scrape_configs': scrape_configs}) + + +def generate_alert_rules(registry: dict) -> str: + """生成 Prometheus alert rules""" + groups = [] + + for service in registry['services']: + rules = [] + + for alert in service.get('alerts', []): + rule = ALERT_TEMPLATES.get(alert, {}).copy() + rule['labels'] = { + 'service': service['name'], + 'owner': service['owner'], + 'severity': 'critical' if alert in CRITICAL_ALERTS else 'warning', + } + rules.append(rule) + + if rules: + groups.append({ + 'name': f"{service['name']}_alerts", + 'rules': rules, + }) + + return yaml.dump({'groups': groups}) + + +def generate_blackbox_targets(registry: dict) -> list: + """生成 Blackbox Exporter 健康檢查目標""" + targets = [] + + for service in registry['services']: + if service.get('health_endpoint'): + if service['type'] in ['docker', 'k8s-deployment']: + url = f"http://{service['host']}:{service['port']}{service['health_endpoint']}" + targets.append({ + 'targets': [url], + 'labels': { + 'service': service['name'], + 'type': service['type'], + } + }) + + return targets + + +# 告警模板 +ALERT_TEMPLATES = { + 'pod_crash': { + 'alert': 'PodCrashLoopBackOff', + 'expr': 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0', + 'for': '2m', + 'annotations': { + 'summary': 'Pod {{ 
$labels.pod }} is crash looping', + 'auto_repair': 'restart_pod', + } + }, + 'high_error_rate': { + 'alert': 'HighErrorRate', + 'expr': 'rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.01', + 'for': '5m', + 'annotations': { + 'summary': 'High error rate on {{ $labels.service }}', + 'auto_repair': 'restart_pod', + } + }, + 'service_down': { + 'alert': 'ServiceDown', + 'expr': 'probe_success == 0', + 'for': '1m', + 'annotations': { + 'summary': '{{ $labels.service }} is down', + 'auto_repair': 'restart_container', + } + }, + 'slow_response': { + 'alert': 'SlowResponse', + 'expr': 'histogram_quantile(0.95, http_request_duration_seconds_bucket) > 2', + 'for': '5m', + 'annotations': { + 'summary': 'Slow response on {{ $labels.service }}', + 'auto_repair': 'scale_up', + } + }, + 'memory_high': { + 'alert': 'MemoryHigh', + 'expr': 'container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9', + 'for': '5m', + 'annotations': { + 'summary': 'High memory usage on {{ $labels.service }}', + 'auto_repair': 'analyze_memory_leak', + } + }, + 'runner_offline': { + 'alert': 'GitHubRunnerOffline', + 'expr': 'github_runner_status == 0', + 'for': '5m', + 'annotations': { + 'summary': 'GitHub Runner is offline', + 'auto_repair': 'restart_runner_service', + } + }, +} + +CRITICAL_ALERTS = {'pod_crash', 'service_down', 'runner_offline'} +``` + +--- + +## 二、CI/CD 整合 (自動監控) + +### 2.1 新服務自動監控流程 + +```yaml +# .github/workflows/cd.yaml 新增步驟 + +jobs: + monitoring-validation: + name: "🔍 Monitoring Coverage Check" + runs-on: self-hosted + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Validate Service Registry + run: | + # 檢查所有 K8s Deployment 是否都在 registry 中 + python ops/monitoring/validate_coverage.py + + - name: Generate Monitoring Configs + run: | + # 從 registry 生成最新監控配置 + python ops/monitoring/generate_monitoring.py + + - name: Check for Drift + run: | + # 檢查生成的配置與現有配置是否一致 + diff -r ops/monitoring/generated/ 
ops/monitoring/active/ + + - name: Apply Monitoring Configs + if: github.ref == 'refs/heads/main' + run: | + # 部署監控配置 + kubectl apply -f ops/monitoring/generated/prometheus-rules.yaml + kubectl apply -f ops/monitoring/generated/alertmanager-config.yaml +``` + +### 2.2 新服務檢測腳本 + +```python +# /ops/monitoring/validate_coverage.py +""" +CI 檢查: 確保所有服務都有監控配置 +""" + +import yaml +import subprocess +import sys + +def get_k8s_deployments() -> list[str]: + """取得所有 K8s Deployments""" + result = subprocess.run( + ['kubectl', 'get', 'deployments', '-A', '-o', 'jsonpath={.items[*].metadata.name}'], + capture_output=True, text=True + ) + return result.stdout.split() + +def get_docker_containers(host: str) -> list[str]: + """取得主機上的 Docker 容器""" + result = subprocess.run( + ['ssh', host, 'docker', 'ps', '--format', '{{.Names}}'], + capture_output=True, text=True + ) + return result.stdout.strip().split('\n') + +def load_registry() -> dict: + """載入服務註冊表""" + with open('ops/monitoring/service-registry.yaml') as f: + return yaml.safe_load(f) + +def main(): + registry = load_registry() + registered_services = {s['name'] for s in registry['services']} + + errors = [] + + # 檢查 K8s Deployments + k8s_deployments = get_k8s_deployments() + for deploy in k8s_deployments: + if deploy not in registered_services and not deploy.startswith('kube-'): + errors.append(f"❌ K8s Deployment '{deploy}' 未在 service-registry.yaml 中註冊") + + # 檢查 Docker 容器 (188, 110) + for host in ['192.168.0.188', '192.168.0.110']: + try: + containers = get_docker_containers(host) + for container in containers: + if container and container not in registered_services: + # 忽略系統容器 + if not any(x in container for x in ['k3s', 'pause', 'coredns']): + errors.append(f"⚠️ Docker 容器 '{container}' on {host} 未在 registry 中") + except Exception as e: + print(f"Warning: Cannot check {host}: {e}") + + if errors: + print("\n".join(errors)) + print(f"\n❌ 發現 {len(errors)} 個未監控的服務!") + print("請更新 ops/monitoring/service-registry.yaml") + 
sys.exit(1) + + print("✅ 所有服務都已註冊監控") + sys.exit(0) + +if __name__ == '__main__': + main() +``` + +### 2.3 新 API 端點自動監控 + +```python +# /apps/api/src/core/auto_monitoring.py +""" +FastAPI 路由自動監控 +- 自動註冊所有端點到 Prometheus +- 自動設置 Sentry 追蹤 +- 自動建立健康檢查 +""" + +from functools import wraps +from fastapi import APIRouter, Request +from prometheus_client import Counter, Histogram +import time + +# Prometheus Metrics (自動建立) +REQUEST_COUNT = Counter( + 'http_requests_total', + 'Total HTTP requests', + ['method', 'path', 'status'] +) + +REQUEST_LATENCY = Histogram( + 'http_request_duration_seconds', + 'HTTP request latency', + ['method', 'path'], + buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] +) + +def auto_monitor(router: APIRouter): + """ + 裝飾器: 為 Router 的所有端點添加監控 + + 使用方式: + ```python + router = APIRouter(prefix="/api/v1/new-feature") + auto_monitor(router) # 自動添加監控 + ``` + """ + original_add_api_route = router.add_api_route + + def monitored_add_api_route(path, endpoint, **kwargs): + @wraps(endpoint) + async def monitored_endpoint(request: Request, *args, **inner_kwargs): + start_time = time.time() + + try: + response = await endpoint(request, *args, **inner_kwargs) + status = getattr(response, 'status_code', 200) + except Exception as e: + status = 500 + raise + finally: + # 記錄指標 + REQUEST_COUNT.labels( + method=request.method, + path=path, + status=status + ).inc() + + REQUEST_LATENCY.labels( + method=request.method, + path=path + ).observe(time.time() - start_time) + + return response + + return original_add_api_route(path, monitored_endpoint, **kwargs) + + router.add_api_route = monitored_add_api_route + return router +``` + +--- + +## 三、前端自動監控 + +### 3.1 頁面自動埋點 + +```typescript +// /apps/web/src/lib/auto-monitoring.ts +/** + * 前端頁面自動監控 + * - Web Vitals 自動收集 + * - 頁面錯誤自動上報 + * - API 呼叫自動追蹤 + */ + +import * as Sentry from '@sentry/nextjs'; + +// 自動初始化 (在 _app.tsx 中調用) +export function initAutoMonitoring() { + // 1. 
Web Vitals + if (typeof window !== 'undefined') { + import('web-vitals').then(({ onCLS, onFID, onLCP, onTTFB, onINP }) => { + onCLS(sendToAnalytics); + onFID(sendToAnalytics); + onLCP(sendToAnalytics); + onTTFB(sendToAnalytics); + onINP(sendToAnalytics); + }); + } + + // 2. 全局錯誤處理 + if (typeof window !== 'undefined') { + window.addEventListener('error', (event) => { + Sentry.captureException(event.error); + }); + + window.addEventListener('unhandledrejection', (event) => { + Sentry.captureException(event.reason); + }); + } + + // 3. API 呼叫自動追蹤 + patchFetch(); +} + +function sendToAnalytics(metric: any) { + // 發送到 Sentry Performance + Sentry.metrics.distribution( + `web_vitals.${metric.name}`, + metric.value, + { + tags: { + page: window.location.pathname, + }, + } + ); +} + +function patchFetch() { + const originalFetch = window.fetch; + + window.fetch = async function(input, init) { + const url = typeof input === 'string' ? input : input.url; + const method = init?.method || 'GET'; + + const span = Sentry.startSpan({ + name: `${method} ${url}`, + op: 'http.client', + }); + + try { + const response = await originalFetch(input, init); + + // 記錄 API 錯誤 + if (!response.ok) { + Sentry.captureMessage(`API Error: ${method} ${url} - ${response.status}`, { + level: response.status >= 500 ? 
'error' : 'warning', + extra: { + status: response.status, + statusText: response.statusText, + }, + }); + } + + return response; + } catch (error) { + Sentry.captureException(error); + throw error; + } finally { + span?.end(); + } + }; +} +``` + +### 3.2 新頁面自動檢測 + +```typescript +// /apps/web/src/middleware.ts +import { NextResponse } from 'next/server'; +import type { NextRequest } from 'next/server'; + +// 已知頁面清單 (從 service-registry 同步) +const KNOWN_PAGES = new Set([ + '/', + '/authorizations', + '/action-logs', + '/errors', + '/settings', + '/knowledge-base', +]); + +export function middleware(request: NextRequest) { + const path = request.nextUrl.pathname; + + // 檢測新頁面 + if (!KNOWN_PAGES.has(path) && !path.startsWith('/api') && !path.startsWith('/_next')) { + // 發送到監控系統 + console.warn(`[MONITORING] 新頁面被訪問但未註冊: ${path}`); + + // TODO: 發送到 Sentry 或後端 API + } + + return NextResponse.next(); +} +``` + +--- + +## 四、自動發現機制 + +### 4.1 K8s 服務自動發現 + +```yaml +# /ops/monitoring/prometheus/kubernetes-sd.yaml +# Prometheus 自動發現 K8s 服務 + +scrape_configs: + # 自動發現所有 Pod + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + # 只抓有 prometheus.io/scrape: "true" 標籤的 Pod + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + # 使用 Pod 標籤作為 job name + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: job + # 使用 namespace 標籤 + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace +``` + +### 4.2 Docker 容器自動發現 + +```python +# /ops/monitoring/discover_docker.py +""" +定期掃描 Docker 容器,發現未監控的新服務 +""" + +import subprocess +import json +from datetime import datetime + +HOSTS = ['192.168.0.188', '192.168.0.110'] + +def discover_containers(): + """發現所有 Docker 容器""" + all_containers = [] + + for host in HOSTS: + try: + result = subprocess.run( + ['ssh', host, 'docker', 'ps', '--format', '{{json .}}'], + capture_output=True, text=True, timeout=10 + ) + + for line in 
def check_new_containers(containers: list, registry: dict):
    """Return the containers whose name is not listed in the service registry.

    Args:
        containers: Container records; each is read through the keys
            ``'Names'``, ``'host'``, ``'Image'`` and ``'CreatedAt'``.
        registry: Registry mapping; ``registry['services']`` is iterated and
            each entry's ``'name'`` is treated as a registered name.

    Returns:
        A list of dicts with the keys ``name``, ``host``, ``image`` and
        ``created``, one per unregistered container, in input order.
    """
    known_names = set()
    for service in registry['services']:
        known_names.add(service['name'])

    return [
        {
            'name': entry['Names'],
            'host': entry['host'],
            'image': entry['Image'],
            'created': entry['CreatedAt'],
        }
        for entry in containers
        if entry['Names'] not in known_names
    ]
def generate_report(coverage: dict) -> str:
    """Render coverage ratios as a Markdown report.

    Args:
        coverage: Mapping of metric name to a coverage ratio, formatted as a
            percentage. The report has rows for ``prometheus``, ``sentry``,
            ``otel``, ``langfuse``, ``alerts``, ``page_sentry``,
            ``page_vitals`` and ``api_slo``. Any of them may be absent:
            ``calculate_coverage`` only adds the page and API entries when the
            registry defines pages / API endpoints.

    Returns:
        The report text. A missing metric is shown as ``N/A`` instead of
        raising ``KeyError``. The overall health is the mean of the values
        present in ``coverage`` (``0%`` when it is empty).
    """
    # Imported locally: the script as shown has no module-level datetime import.
    from datetime import datetime

    def status_icon(rate: float) -> str:
        """Map a ratio to the traffic-light icon used in the status column."""
        if rate >= 0.9:
            return "✅"
        if rate >= 0.7:
            return "⚠️"
        return "❌"

    report_metrics = (
        'prometheus', 'sentry', 'otel', 'langfuse', 'alerts',
        'page_sentry', 'page_vitals', 'api_slo',
    )

    # Pre-format every cell so an absent metric cannot break str.format().
    fields = {}
    for key in report_metrics:
        rate = coverage.get(key)
        if rate is None:
            fields[key] = "N/A"
            fields[f"{key}_status"] = "➖"
        else:
            fields[key] = f"{rate:.0%}"
            fields[f"{key}_status"] = status_icon(rate)

    # Guard the mean against an empty mapping.
    overall = sum(coverage.values()) / len(coverage) if coverage else 0.0

    template = """
# AWOOOI 監控覆蓋率報告
生成時間: {timestamp}

## 服務監控覆蓋率

| 監控類型 | 覆蓋率 | 狀態 |
|----------|--------|------|
| Prometheus Metrics | {prometheus} | {prometheus_status} |
| Sentry 錯誤追蹤 | {sentry} | {sentry_status} |
| OTEL Traces | {otel} | {otel_status} |
| Langfuse LLM | {langfuse} | {langfuse_status} |
| Alert Rules | {alerts} | {alerts_status} |

## 前端監控覆蓋率

| 監控類型 | 覆蓋率 | 狀態 |
|----------|--------|------|
| Sentry Session | {page_sentry} | {page_sentry_status} |
| Web Vitals | {page_vitals} | {page_vitals_status} |

## API SLO 覆蓋率

| 類型 | 覆蓋率 | 狀態 |
|------|--------|------|
| SLO 定義 | {api_slo} | {api_slo_status} |

---
總體健康度: **{overall:.0%}**
"""

    return template.format(
        timestamp=datetime.now().isoformat(),
        overall=overall,
        **fields,
    )
+│ │ alerts: │ │ +│ │ - pod_crash │ │ +│ │ - high_error_rate │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Step 2: git push → CI/CD Pipeline │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ 1. validate_coverage.py → 檢查所有服務都在 registry │ │ +│ │ 2. generate_monitoring.py → 生成 Prometheus/Alertmanager 配置 │ │ +│ │ 3. kubectl apply → 部署監控配置 │ │ +│ │ 4. 部署新服務 │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Step 3: 監控自動生效 │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌────────────┐ │ +│ │ Prometheus │ │ Alertmanager│ │ Sentry │ │ SignOz │ │ +│ │ 開始抓 Metrics│ │ 開始監控告警 │ │ 開始追蹤錯誤│ │ 開始收 Traces│ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ └────────────┘ │ +└────────────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Step 4: 異常發生 → AI 自動修復 │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ 1. Prometheus 觸發告警 │ │ +│ │ 2. Alertmanager → AWOOOI Webhook │ │ +│ │ 3. OpenClaw AI 分析 │ │ +│ │ 4. 自動/人工修復 │ │ +│ │ 5. 
結果回饋 → Playbook 更新 │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 七、實作清單 + +### 7.1 需要建立的檔案 + +| 檔案 | 用途 | 優先級 | +|------|------|--------| +| `ops/monitoring/service-registry.yaml` | 服務註冊表 | P0 | +| `ops/monitoring/generate_monitoring.py` | 配置生成器 | P0 | +| `ops/monitoring/validate_coverage.py` | 覆蓋率檢查 | P0 | +| `ops/monitoring/discover_docker.py` | 容器發現 | P1 | +| `apps/api/src/core/auto_monitoring.py` | API 自動監控 | P1 | +| `apps/web/src/lib/auto-monitoring.ts` | 前端自動監控 | P1 | + +### 7.2 CI/CD 修改 + +| 修改 | 用途 | 優先級 | +|------|------|--------| +| 新增 `monitoring-validation` job | 檢查覆蓋率 | P0 | +| 新增 `monitoring-deploy` job | 部署配置 | P0 | + +### 7.3 Cron Jobs + +| 任務 | 頻率 | 用途 | +|------|------|------| +| `discover_docker.py` | 每小時 | 發現新容器 | +| `coverage_report.py` | 每日 | 生成報告 | + +--- + +**文件結束** diff --git a/ops/monitoring/service-registry.yaml b/ops/monitoring/service-registry.yaml new file mode 100644 index 00000000..be420566 --- /dev/null +++ b/ops/monitoring/service-registry.yaml @@ -0,0 +1,626 @@ +# AWOOOI 服務註冊表 (Single Source of Truth) +# =========================================== +# 版本: v1.0 +# 建立日期: 2026-03-29 +# 用途: 所有受監控服務的統一清單 +# +# 新增服務時: +# 1. 在此檔案新增 entry +# 2. CI/CD 會自動生成對應的監控配置 +# 3. 
部署後監控自動生效 + +# ============================================================================= +# K8s 工作負載 (awoooi-prod namespace) +# ============================================================================= +services: + # --- API 後端 --- + - name: awoooi-api + type: k8s-deployment + namespace: awoooi-prod + replicas: 2 + port: 8000 + health_endpoint: /api/v1/health + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true + alerts: + - pod_crash + - high_error_rate + - slow_response + - memory_high + auto_repair: + enabled: true + actions: + - restart_pod + - scale_up + owner: backend-team + criticality: P0 + + # --- Web 前端 --- + - name: awoooi-web + type: k8s-deployment + namespace: awoooi-prod + replicas: 2 + port: 3000 + health_endpoint: / + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: false + alerts: + - pod_crash + - slow_page_load + auto_repair: + enabled: true + actions: + - restart_pod + owner: frontend-team + criticality: P0 + + # --- Signal Worker --- + - name: awoooi-worker + type: k8s-deployment + namespace: awoooi-prod + replicas: 1 + health_endpoint: /tmp/worker-healthy + health_type: exec_mtime + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true + alerts: + - worker_stuck + - queue_backlog + auto_repair: + enabled: true + actions: + - restart_pod + owner: backend-team + criticality: P1 + + # --- ArgoCD --- + - name: argocd-server + type: k8s-deployment + namespace: argocd + port: 443 + health_endpoint: /healthz + monitoring: + prometheus: true + sentry: false + otel: false + alerts: + - service_down + - sync_failed + owner: devops-team + criticality: P1 + + # --- Prometheus --- + - name: prometheus + type: k8s-deployment + namespace: monitoring + port: 9090 + health_endpoint: /-/ready + monitoring: + prometheus: false # 自己監控自己會循環 + sentry: false + alerts: + - service_down + owner: devops-team + criticality: P0 + + # --- Alertmanager --- + - name: alertmanager + type: 
k8s-deployment + namespace: monitoring + port: 9093 + health_endpoint: /-/ready + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + owner: devops-team + criticality: P0 + +# ============================================================================= +# Docker 容器 (192.168.0.188 - AI/Web 中心) +# ============================================================================= + + # --- Ollama LLM --- + - name: ollama + type: docker + host: 192.168.0.188 + port: 11434 + health_endpoint: /api/tags + monitoring: + prometheus: true + sentry: false + otel: false + alerts: + - service_down + - inference_timeout + - model_load_failed + auto_repair: + enabled: true + actions: + - restart_container + owner: ai-team + criticality: P0 + + # --- OpenClaw AI 決策中心 --- + - name: openclaw + type: docker + host: 192.168.0.188 + port: 8089 + health_endpoint: /health + monitoring: + prometheus: true + sentry: true + otel: true + langfuse: true + alerts: + - service_down + - analysis_timeout + - high_error_rate + auto_repair: + enabled: true + actions: + - restart_container + owner: ai-team + criticality: P0 + + # --- Redis Stack --- + - name: redis + type: docker + host: 192.168.0.188 + port: 6380 + health_endpoint: redis-cli ping + health_type: exec + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - memory_high + - connection_rejected + auto_repair: + enabled: false # 資料庫謹慎處理 + owner: infra-team + criticality: P0 + + # --- PostgreSQL --- + - name: postgres + type: docker + host: 192.168.0.188 + port: 5432 + health_endpoint: pg_isready + health_type: exec + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - connection_pool_exhausted + - slow_query + - replication_lag + auto_repair: + enabled: false # 資料庫謹慎處理 + owner: infra-team + criticality: P0 + + # --- SignOz OTEL Collector --- + - name: signoz-collector + type: docker + host: 192.168.0.188 + port: 24317 + health_endpoint: grpc_health + health_type: grpc 
+ monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - trace_dropped + owner: devops-team + criticality: P1 + + # --- SignOz UI --- + - name: signoz-ui + type: docker + host: 192.168.0.188 + port: 3301 + health_endpoint: / + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + owner: devops-team + criticality: P2 + + # --- ClickHouse (SignOz 後端) --- + - name: clickhouse + type: docker + host: 192.168.0.188 + port: 8123 + health_endpoint: /ping + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - disk_space_low + - query_timeout + owner: devops-team + criticality: P1 + +# ============================================================================= +# Docker 容器 (192.168.0.110 - DevOps 中心) +# ============================================================================= + + # --- Harbor Registry --- + - name: harbor + type: docker + host: 192.168.0.110 + port: 5000 + health_endpoint: /api/v2.0/health + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - storage_full + - push_failed + owner: devops-team + criticality: P0 + + # --- Sentry --- + - name: sentry + type: docker + host: 192.168.0.110 + port: 9000 + health_endpoint: /_health/ + monitoring: + prometheus: true + sentry: false # 自己監控自己會循環 + alerts: + - service_down + owner: devops-team + criticality: P1 + + # --- Langfuse LLMOps --- + - name: langfuse + type: docker + host: 192.168.0.110 + port: 3100 + health_endpoint: /api/public/health + monitoring: + prometheus: true + sentry: false + alerts: + - service_down + - trace_lost + owner: ai-team + criticality: P2 + + # --- GitHub Actions Runner --- + - name: github-runner + type: systemd + host: 192.168.0.110 + service_name: actions.runner.owenhytsai-awoooi.awoooi-110.service + monitoring: + prometheus: true + sentry: false + alerts: + - runner_offline + - job_stuck + auto_repair: + enabled: true + actions: + - restart_service + owner: devops-team + criticality: P0 
+ +# ============================================================================= +# 主機節點 +# ============================================================================= +nodes: + - name: mon + ip: 192.168.0.120 + role: k3s-master + alerts: + - node_down + - cpu_high + - memory_high + - disk_space_low + - etcd_latency_high + owner: infra-team + + - name: mon1 + ip: 192.168.0.121 + role: k3s-worker + alerts: + - node_down + - node_not_ready + - cpu_high + - memory_high + - disk_space_low + owner: infra-team + + - name: harbor + ip: 192.168.0.110 + role: devops + alerts: + - node_down + - cpu_high + - memory_high + - disk_space_low + owner: devops-team + + - name: pg + ip: 192.168.0.188 + role: ai-web + alerts: + - node_down + - cpu_high + - memory_high + - disk_space_low + - gpu_utilization_high + owner: ai-team + + - name: kali + ip: 192.168.0.112 + role: security + alerts: + - node_down + owner: security-team + +# ============================================================================= +# 前端頁面 +# ============================================================================= +pages: + - path: / + name: Dashboard + monitoring: + sentry_session: true + web_vitals: true + alerts: + - slow_page_load + - js_error + slo: + lcp_ms: 2500 + fid_ms: 100 + cls: 0.1 + + - path: /authorizations + name: 授權管理 + monitoring: + sentry_session: true + web_vitals: true + alerts: + - slow_page_load + - api_error + slo: + lcp_ms: 2000 + + - path: /action-logs + name: 行動日誌 + monitoring: + sentry_session: true + web_vitals: true + alerts: + - slow_page_load + + - path: /errors + name: 錯誤追蹤 + monitoring: + sentry_session: true + web_vitals: true + alerts: + - slow_page_load + + - path: /settings + name: 設定 + monitoring: + sentry_session: true + alerts: + - slow_page_load + + - path: /knowledge-base + name: 知識庫 + monitoring: + sentry_session: true + alerts: + - slow_page_load + +# ============================================================================= +# API 端點 (關鍵) +# 
alert_templates:
  pod_crash:
    expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0'
    for: 2m
    severity: critical
    auto_repair: restart_pod

  high_error_rate:
    # Both sides are aggregated per job before dividing. Without aggregation
    # the division matches series by identical label sets, so only the 5xx
    # series match themselves and the ratio is always 1.
    expr: 'sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (job) (rate(http_requests_total[5m])) > 0.01'
    for: 5m
    severity: critical
    auto_repair: restart_pod

  slow_response:
    # histogram_quantile needs per-second bucket rates aggregated by `le`;
    # raw cumulative buckets would describe latency since process start
    # rather than over the last 5 minutes.
    expr: 'histogram_quantile(0.95, sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))) > 2'
    for: 5m
    severity: warning
    auto_repair: scale_up

  service_down:
    expr: 'probe_success == 0'
    for: 1m
    severity: critical
    auto_repair: restart_container

  memory_high:
    expr: 'container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9'
    for: 5m
    severity: warning
    auto_repair: analyze_memory_leak

  disk_space_low:
    expr: 'node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.15'
    for: 10m
    severity: warning
    auto_repair: cleanup_logs

  inference_timeout:
    expr: 'ollama_inference_duration_seconds > 60'
    for: 3m
    severity: warning
    auto_repair: switch_model

  runner_offline:
    expr: 'github_runner_status == 0'
    for: 5m
    severity: critical
    auto_repair: restart_service
def get_k8s_deployments() -> list[dict]:
    """List every Deployment in the cluster via ``kubectl``.

    Returns:
        One ``{'namespace': ..., 'name': ...}`` dict per Deployment, parsed
        from the ``namespace/name`` lines that kubectl prints. An empty list
        is returned, after printing a warning, when kubectl cannot be run,
        times out, or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            [
                'kubectl', 'get', 'deployments', '-A',
                '-o', 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{\"\\n\"}{end}'
            ],
            capture_output=True, text=True, timeout=30
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # Best effort: a missing binary or a timeout must not abort validation.
        print(f"Warning: Cannot get K8s deployments: {e}")
        return []

    # subprocess.run does not raise on a non-zero exit. Without this check a
    # failing kubectl (no cluster access, bad kubeconfig) would look exactly
    # like a cluster with no deployments.
    if result.returncode != 0:
        print(
            "Warning: Cannot get K8s deployments: "
            f"kubectl exited with {result.returncode}: {result.stderr.strip()}"
        )
        return []

    deployments = []
    for line in result.stdout.strip().split('\n'):
        if line and '/' in line:
            # Split once: the namespace comes first, the rest is the name.
            ns, name = line.split('/', 1)
            deployments.append({'namespace': ns, 'name': name})
    return deployments
def check_docker_containers(host: str) -> list[str]:
    """List the names of running Docker containers on a remote host.

    Runs ``docker ps --format '{{.Names}}'`` over SSH.

    Args:
        host: Host name or address passed to ``ssh``.

    Returns:
        The non-empty lines printed by ``docker ps``. An empty list is
        returned, after printing a warning, when the command cannot be run,
        times out, or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['ssh', '-o', 'ConnectTimeout=5', host, 'docker', 'ps', '--format', '{{.Names}}'],
            capture_output=True, text=True, timeout=10
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # Best effort: an unreachable host must not abort validation.
        print(f"Warning: Cannot check Docker on {host}: {e}")
        return []

    # subprocess.run does not raise on a non-zero exit. Without this check an
    # SSH or docker failure would look exactly like a host with no containers.
    if result.returncode != 0:
        print(
            f"Warning: Cannot check Docker on {host}: "
            f"command exited with {result.returncode}: {result.stderr.strip()}"
        )
        return []

    return [c for c in result.stdout.strip().split('\n') if c]
檢查 Docker 容器 + # ========================================================================== + docker_hosts = ['192.168.0.188', '192.168.0.110'] + docker_services = { + s['name'] + for s in registry.get('services', []) + if s.get('type') == 'docker' + } + + ignored_containers = { + 'k3s', 'pause', 'registry', 'nginx', 'traefik', + # SignOz 相關容器群組 + 'signoz-alertmanager', 'signoz-query-service', 'signoz-otel-collector-metrics', + 'zookeeper', 'clickhouse', + # Sentry 相關容器群組 + 'sentry-web', 'sentry-worker', 'sentry-cron', 'sentry-kafka', 'sentry-redis', + 'sentry-postgres', 'sentry-zookeeper', 'sentry-snuba', + } + + for host in docker_hosts: + containers = check_docker_containers(host) + for container in containers: + if not container: + continue + + # 跳過已知系統容器 + if any(ignored in container for ignored in ignored_containers): + continue + + # 提取主要名稱 (去除 _1, -1 等後綴) + base_name = container.split('_')[0].split('-')[0] + + if container not in docker_services and base_name not in docker_services: + warnings.append(f"Docker 容器 '{container}' on {host} 未在 registry 中 (可能需要加入)") + + # ========================================================================== + # 3. 檢查 API 端點覆蓋 + # ========================================================================== + api_endpoints = registry.get('api_endpoints', []) + critical_endpoints = [e for e in api_endpoints if e.get('critical')] + + if len(critical_endpoints) < 5: + warnings.append(f"僅定義了 {len(critical_endpoints)} 個關鍵 API 端點,建議至少 5 個") + + # ========================================================================== + # 4. 檢查前端頁面覆蓋 + # ========================================================================== + pages = registry.get('pages', []) + if len(pages) < 3: + warnings.append(f"僅定義了 {len(pages)} 個前端頁面監控,建議至少 3 個") + + # ========================================================================== + # 5. 
def print_report(result: ValidationResult):
    """Print the validation summary to stdout.

    The output has four parts: a banner, one line per entry of
    ``result.coverage`` with a status icon, the error and warning lists
    (each only when non-empty), and a final pass/fail verdict.

    Args:
        result: Validation outcome; ``coverage``, ``errors``, ``warnings``
            and ``passed`` are read.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def badge(rate):
        # Icon thresholds: 80% and above is healthy, 50% and above is a warning.
        if rate >= 0.8:
            return "✅"
        if rate >= 0.5:
            return "⚠️"
        return "❌"

    # Banner
    print("\n" + heavy_rule)
    print("AWOOOI 監控覆蓋率驗證報告")
    print(heavy_rule)

    # Coverage per metric
    print("\n📊 覆蓋率:")
    for metric in result.coverage:
        rate = result.coverage[metric]
        print(f" {badge(rate)} {metric}: {rate:.0%}")

    # Errors first, then warnings; empty lists produce no section at all.
    sections = (
        ("❌ 錯誤", result.errors),
        ("⚠️ 警告", result.warnings),
    )
    for title, items in sections:
        if not items:
            continue
        print(f"\n{title} ({len(items)}):")
        for item in items:
            print(f" • {item}")

    # Verdict
    print("\n" + light_rule)
    if not result.passed:
        print("❌ 驗證失敗 - 請更新 ops/monitoring/service-registry.yaml")
    else:
        print("✅ 驗證通過 - 所有關鍵服務都已註冊監控")
    print(heavy_rule + "\n")