Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
35 KiB
35 KiB
WOOO AIOps Platform - 技術架構設計 V10.2
讓新應用 5 分鐘內無痛接入的技術實現
建立日期: 2026-02-14 版本: V10.2 (技術架構)
🏗️ 整體技術架構
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ WOOO AIOps Platform - 技術架構 │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ╔═════════════════════════════════════════════════════════════════════════════╗ │
│ ║ 🌐 Web Portal (React/Vue) ║ │
│ ╠═════════════════════════════════════════════════════════════════════════════╣ │
│ ║ 用戶登入 │ 應用管理 │ 監控儀表板 │ 告警中心 │ 設定管理 ║ │
│ ╚═════════════════════════════════════════════════════════════════════════════╝ │
│ │ │
│ ▼ │
│ ╔═════════════════════════════════════════════════════════════════════════════╗ │
│ ║ 🔌 AIOps API Gateway (Flask) ║ │
│ ╠═════════════════════════════════════════════════════════════════════════════╣ │
│ ║ /api/apps - 應用 CRUD ║ │
│ ║ /api/deploy - 部署管理 ║ │
│ ║ /api/monitor - 監控數據 ║ │
│ ║ /api/alerts - 告警管理 ║ │
│ ║ /api/repair - 自動修復 ║ │
│ ║ /api/templates - 應用模板 ║ │
│ ╚═════════════════════════════════════════════════════════════════════════════╝ │
│ │ │
│ ┌──────────────────────────────┼──────────────────────────────┐ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ╔═══════════════╗ ╔═══════════════╗ ╔═══════════════╗ │
│ ║ Deploy ║ ║ Monitor ║ ║ Repair ║ │
│ ║ Engine ║ ║ Engine ║ ║ Engine ║ │
│ ╠═══════════════╣ ╠═══════════════╣ ╠═══════════════╣ │
│ ║ • K8s 操作 ║ ║ • Prometheus ║ ║ • 故障診斷 ║ │
│ ║ • 模板渲染 ║ ║ • Grafana ║ ║ • 自動修復 ║ │
│ ║ • SSL 配置 ║ ║ • 告警路由 ║ ║ • 回滾機制 ║ │
│ ║ • DNS 管理 ║ ║ • 數據聚合 ║ ║ • 知識庫 ║ │
│ ╚═══════════════╝ ╚═══════════════╝ ╚═══════════════╝ │
│ │ │ │ │
│ └──────────────────────────────┼──────────────────────────────┘ │
│ ▼ │
│ ╔═════════════════════════════════════════════════════════════════════════════╗ │
│ ║ 🗄️ PostgreSQL (元資料庫) ║ │
│ ╠═════════════════════════════════════════════════════════════════════════════╣ │
│ ║ apps │ app_configs │ alerts │ repair_logs ║ │
│ ║ users │ templates │ metrics_cache │ audit_logs ║ │
│ ╚═════════════════════════════════════════════════════════════════════════════╝ │
│ │ │
│ ╔═════════════════════════════════════════════════════════════════════════════╗ │
│ ║ ☸️ Kubernetes Cluster (K3s) ║ │
│ ╠═════════════════════════════════════════════════════════════════════════════╣ │
│ ║ ║ │
│ ║ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ║ │
│ ║ │ Namespace: │ │ Namespace: │ │ Namespace: │ │ Namespace: │ ║ │
│ ║ │ aiops-core │ │ app-momo │ │ app-crm │ │ app-event │ ║ │
│ ║ │ (平台核心) │ │ (客戶應用1) │ │ (客戶應用2) │ │ (客戶應用3) │ ║ │
│ ║ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ ║ │
│ ║ ║ │
│ ║ ┌─────────────────────────────────────────────────────────────────────┐ ║ │
│ ║ │ monitoring Namespace │ ║ │
│ ║ │ Prometheus │ Grafana │ Alertmanager │ Loki │ n8n │ ║ │
│ ║ └─────────────────────────────────────────────────────────────────────┘ ║ │
│ ║ ║ │
│ ╚═════════════════════════════════════════════════════════════════════════════╝ │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
📦 新應用接入流程 - 技術細節
Step 1: 應用註冊
# aiops_api/routes/apps.py
@app.route('/api/apps', methods=['POST'])
def create_app():
    """
    Register a new application and queue its asynchronous deployment.

    Request body (JSON):
    {
        "name": "event-system",
        "display_name": "活動報名網站",
        "domain": "event.wooo.work",
        "source_type": "git",
        "source_url": "https://gitlab.com/wooo/event-system.git",
        "framework": "flask",   # optional; detected from the source when omitted
        "plan": "standard"
    }

    Response:
    {
        "app_id": "app-12345",
        "status": "initializing",
        "namespace": "app-event-system",
        "steps": [
            {"step": "create_namespace", "status": "pending"},
            {"step": "deploy_app", "status": "pending"},
            {"step": "configure_ingress", "status": "pending"},
            {"step": "setup_ssl", "status": "pending"},
            {"step": "configure_monitoring", "status": "pending"},
            {"step": "setup_auto_repair", "status": "pending"}
        ]
    }

    A 400 response is returned when the body is not a JSON object, when
    ``name`` or ``domain`` is missing, when the framework has to be detected
    but no ``source_url`` was supplied, or when DNS verification fails.
    """
    # Local import keeps the handler self-contained in this snippet.
    from flask import request

    # The payload has to be read from the request; without this `data`
    # is an undefined name and every call fails with NameError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}, 400

    missing = [field for field in ('name', 'domain') if not data.get(field)]
    if missing:
        return {"error": f"Missing required fields: {', '.join(missing)}"}, 400

    # 1. Verify the domain's DNS record.
    if not verify_dns(data['domain']):
        return {"error": "DNS 未指向正確 IP"}, 400

    # 2. Detect the framework when the caller did not specify one.
    if not data.get('framework'):
        if not data.get('source_url'):
            return {"error": "source_url is required to detect the framework"}, 400
        data['framework'] = detect_framework(data['source_url'])

    # 3. Create the application record.
    app_record = App.create(data)

    # 4. Start the asynchronous deployment task.
    deploy_task.delay(app_record.id)

    return app_record.to_dict()
Step 2: 自動部署
# aiops_api/engines/deploy_engine.py
class DeployEngine:
    """Application deployment engine."""

    # Resource settings per plan. The request values come from the original
    # table; the deployment template also needs limits, which were missing.
    # NOTE(review): limits are placeholders set to 2x the requests - confirm
    # the intended values per plan.
    PLAN_RESOURCES = {
        'basic': {
            'memory_request': '256Mi', 'cpu_request': '250m',
            'memory_limit': '512Mi', 'cpu_limit': '500m',
        },
        'standard': {
            'memory_request': '512Mi', 'cpu_request': '500m',
            'memory_limit': '1Gi', 'cpu_limit': '1000m',
        },
        'advanced': {
            'memory_request': '1Gi', 'cpu_request': '1000m',
            'memory_limit': '2Gi', 'cpu_limit': '2000m',
        },
        'professional': {
            'memory_request': '2Gi', 'cpu_request': '2000m',
            'memory_limit': '4Gi', 'cpu_limit': '4000m',
        },
    }

    def deploy(self, app: App):
        """
        Run the full deployment flow for ``app``.

        The steps run strictly in the order below; each one is a method on
        this engine.
        """
        # 1. Create the namespace.
        self.create_namespace(app)

        # 2. Pick the template for the app's framework.
        template = self.get_template(app.framework)

        # 3. Render the K8s YAML.
        k8s_manifests = self.render_manifests(template, app)

        # 4. Apply the manifests to K8s.
        self.apply_manifests(k8s_manifests)

        # 5. Configure Ingress + SSL.
        self.setup_ingress(app)
        self.setup_ssl(app)

        # 6. Wait for the pods to become ready.
        self.wait_for_ready(app)

        # 7. Configure monitoring.
        self.setup_monitoring(app)

        # 8. Configure auto-repair.
        self.setup_auto_repair(app)

        # 9. Send the success notification.
        self.notify_success(app)

    def render_manifests(self, template: str, app: App) -> str:
        """
        Render the K8s configuration with Jinja2.

        Args:
            template: Jinja2 template source text.
            app: application whose name, namespace, image, domain and plan
                fill the template.

        Returns:
            The rendered manifest text.

        Raises:
            KeyError: if ``app.plan`` is not a key of ``PLAN_RESOURCES``.
        """
        from jinja2 import Template

        try:
            resources = self.PLAN_RESOURCES[app.plan]
        except KeyError:
            known = ', '.join(sorted(self.PLAN_RESOURCES))
            raise KeyError(
                f"unknown plan {app.plan!r}; expected one of: {known}"
            ) from None

        context = {
            'app_name': app.name,
            'namespace': app.namespace,
            'image': app.image,
            'domain': app.domain,
            # The deployment template renders {{ plan }} into an annotation.
            'plan': app.plan,
            **resources,
        }
        return Template(template).render(**context)
Step 3: 自動監控配置
# aiops_api/engines/monitor_engine.py
class MonitorEngine:
    """Monitoring configuration engine."""

    def setup_monitoring(self, app: App):
        """
        Configure monitoring for ``app`` by running four setup steps in order.
        """
        # 1. Create the ServiceMonitor (scraped automatically by Prometheus).
        self.create_service_monitor(app)

        # 2. Create the alert rules (PrometheusRule).
        self.create_alert_rules(app)

        # 3. Configure Alertmanager routing.
        self.configure_alertmanager(app)

        # 4. Create the Grafana dashboard.
        self.create_grafana_dashboard(app)

    def create_service_monitor(self, app: App):
        """
        Build a Prometheus ServiceMonitor manifest for ``app`` and apply it.

        The manifest selects services labelled ``app: <app.name>`` in the
        app's namespace and scrapes ``/metrics`` on the ``http`` port every
        15 seconds.
        """
        # YAML nesting is significant, so the manifest is written fully
        # indented; the text starts at column 0 inside the string.
        manifest = f"""
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {app.name}
  namespace: {app.namespace}
  labels:
    app: {app.name}
    managed-by: wooo-aiops
spec:
  selector:
    matchLabels:
      app: {app.name}
  endpoints:
    - port: http
      interval: 15s
      path: /metrics
  namespaceSelector:
    matchNames:
      - {app.namespace}
"""
        self.k8s_apply(manifest)

    def create_alert_rules(self, app: App):
        """
        Build the standard PrometheusRule manifest for ``app`` and apply it.

        Four alerts are generated: service down, high memory usage,
        OOM kill and high 5xx rate. Each carries an ``auto_repair`` label
        naming the repair strategy to run.
        """
        # The OOM rule joins the restart counter with
        # kube_pod_container_status_last_terminated_reason: the restart
        # counter itself has no `reason` label, so filtering it on
        # reason="OOMKilled" never matches.
        manifest = f"""
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {app.name}-alerts
  namespace: monitoring
  labels:
    managed-by: wooo-aiops
    app: {app.name}
spec:
  groups:
    - name: {app.name}.rules
      rules:
        # 服務健康檢查
        - alert: {app.name.title()}Down
          expr: up{{namespace="{app.namespace}", job="{app.name}"}} == 0
          for: 1m
          labels:
            severity: critical
            app: {app.name}
            auto_repair: restart
          annotations:
            summary: "{app.display_name} 服務停止"
            description: "服務已停止運行超過 1 分鐘"
        # 高記憶體使用
        - alert: {app.name.title()}HighMemory
          expr: |
            (container_memory_usage_bytes{{namespace="{app.namespace}"}}
            / container_spec_memory_limit_bytes{{namespace="{app.namespace}"}}) > 0.9
          for: 5m
          labels:
            severity: warning
            app: {app.name}
            auto_repair: scale_memory
          annotations:
            summary: "{app.display_name} 記憶體使用過高"
            description: "記憶體使用率超過 90%"
        # OOM Kill
        - alert: {app.name.title()}OOMKilled
          expr: |
            increase(kube_pod_container_status_restarts_total{{
              namespace="{app.namespace}"
            }}[5m]) > 0
            and on (namespace, pod, container)
            kube_pod_container_status_last_terminated_reason{{
              namespace="{app.namespace}",
              reason="OOMKilled"
            }} == 1
          labels:
            severity: critical
            app: {app.name}
            auto_repair: oom_handler
          annotations:
            summary: "{app.display_name} OOM 被殺"
            description: "容器因記憶體不足被終止"
        # 5xx 錯誤率
        - alert: {app.name.title()}High5xxRate
          expr: |
            (sum(rate(http_requests_total{{
              namespace="{app.namespace}",
              status=~"5.."
            }}[5m]))
            / sum(rate(http_requests_total{{
              namespace="{app.namespace}"
            }}[5m]))) > 0.05
          for: 5m
          labels:
            severity: warning
            app: {app.name}
            auto_repair: rollback
          annotations:
            summary: "{app.display_name} 5xx 錯誤率過高"
            description: "5xx 錯誤率超過 5%"
"""
        self.k8s_apply(manifest)
Step 4: 自動修復引擎
# aiops_api/engines/repair_engine.py
class RepairEngine:
    """Auto-repair engine."""

    # Maps the alert's `auto_repair` label to the name of the handler method.
    REPAIR_STRATEGIES = {
        'restart': 'restart_deployment',
        'scale_memory': 'scale_memory_limit',
        'oom_handler': 'handle_oom',
        'rollback': 'rollback_deployment',
        'cleanup': 'cleanup_resources',
    }

    # Upper bound, in seconds, for `kubectl rollout status` to wait.
    ROLLOUT_TIMEOUT_SECONDS = 120

    def handle_alert(self, alert: dict):
        """
        Handle an alert and run the matching auto-repair strategy.

        The strategy is chosen from the alert's ``auto_repair`` label. When
        the ``app`` or ``auto_repair`` label is missing, the repair type is
        unknown, or the mapped handler is not implemented on this engine,
        the alert is handed to ``notify_manual_required`` instead of failing.
        """
        labels = alert.get('labels') or {}
        app_name = labels.get('app')
        repair_type = labels.get('auto_repair')

        strategy = self.REPAIR_STRATEGIES.get(repair_type) if repair_type else None
        handler = getattr(self, strategy, None) if strategy else None

        if not app_name or handler is None:
            self.notify_manual_required(alert)
            return

        # Run the matching repair strategy, then record and report it.
        result = handler(app_name, alert)
        self.log_repair(app_name, repair_type, result)
        self.notify_repair_result(app_name, repair_type, result)

    def _kubectl(self, *args: str) -> subprocess.CompletedProcess:
        """Run ``kubectl`` with ``args``, capturing its text output."""
        return subprocess.run(['kubectl', *args], capture_output=True, text=True)

    def _wait_for_rollout(self, app_name: str, namespace: str) -> subprocess.CompletedProcess:
        """Block until the deployment rollout finishes or the timeout expires."""
        return self._kubectl(
            'rollout', 'status', f'deployment/{app_name}',
            '-n', namespace,
            f'--timeout={self.ROLLOUT_TIMEOUT_SECONDS}s',
        )

    def restart_deployment(self, app_name: str, alert: dict) -> dict:
        """
        Restart the app's Deployment and verify its health afterwards.

        Returns:
            dict with ``action``, ``success`` and ``details``. ``success`` is
            False when kubectl fails or the health check is not ``healthy``.
        """
        namespace = self.get_app_namespace(app_name)

        restart = self._kubectl(
            'rollout', 'restart', f'deployment/{app_name}', '-n', namespace
        )
        if restart.returncode != 0:
            # A failed kubectl call must not be reported as a repair attempt
            # that merely left the app unhealthy.
            return {
                'action': 'restart',
                'success': False,
                'details': {'status': 'error', 'error': restart.stderr.strip()},
            }

        # Wait for the rollout instead of sleeping for a fixed period.
        self._wait_for_rollout(app_name, namespace)

        # Verify the repair.
        health = self.check_health(app_name)
        return {
            'action': 'restart',
            'success': health['status'] == 'healthy',
            'details': health,
        }

    def handle_oom(self, app_name: str, alert: dict) -> dict:
        """
        Handle an OOM kill by raising the container memory limit by 50%.

        Returns:
            dict with ``action``, ``old_memory``, ``new_memory`` and
            ``success``; ``success`` reflects the kubectl exit status and an
            ``error`` key is added when the patch fails.
        """
        namespace = self.get_app_namespace(app_name)

        # Read the current resource limits.
        current = self.get_resource_limits(app_name, namespace)

        # Compute the new limit (+50%).
        new_memory = self.increase_memory(current['memory'], 1.5)

        # Patch the Deployment's container limit.
        patch = {
            'spec': {
                'template': {
                    'spec': {
                        'containers': [{
                            'name': app_name,
                            'resources': {
                                'limits': {'memory': new_memory}
                            }
                        }]
                    }
                }
            }
        }
        patched = self._kubectl(
            'patch', 'deployment', app_name,
            '-n', namespace,
            '--patch', json.dumps(patch),
        )

        result = {
            'action': 'scale_memory',
            'old_memory': current['memory'],
            'new_memory': new_memory,
            'success': patched.returncode == 0,
        }
        if patched.returncode != 0:
            result['error'] = patched.stderr.strip()
        return result

    def rollback_deployment(self, app_name: str, alert: dict) -> dict:
        """
        Roll the Deployment back to its previous revision.

        Returns:
            dict with ``action``, ``success`` and ``output`` (kubectl stdout);
            an ``error`` key is added when the rollback command fails.
        """
        namespace = self.get_app_namespace(app_name)

        undo = self._kubectl(
            'rollout', 'undo', f'deployment/{app_name}', '-n', namespace
        )
        if undo.returncode != 0:
            return {
                'action': 'rollback',
                'success': False,
                'output': undo.stdout,
                'error': undo.stderr.strip(),
            }

        # Wait for the rollout instead of sleeping for a fixed period.
        self._wait_for_rollout(app_name, namespace)

        # Verify the repair.
        health = self.check_health(app_name)
        return {
            'action': 'rollback',
            'success': health['status'] == 'healthy',
            'output': undo.stdout,
        }
📋 應用模板系統
支援的框架模板
templates/
├── base/
│ ├── namespace.yaml.j2
│ ├── service.yaml.j2
│ ├── ingress.yaml.j2
│ └── ssl-certificate.yaml.j2
├── frameworks/
│ ├── flask/
│ │ ├── deployment.yaml.j2
│ │ ├── configmap.yaml.j2
│ │ └── healthcheck.yaml.j2
│ ├── django/
│ │ ├── deployment.yaml.j2
│ │ ├── configmap.yaml.j2
│ │ └── celery-worker.yaml.j2
│ ├── nodejs/
│ │ ├── deployment.yaml.j2
│ │ └── configmap.yaml.j2
│ ├── nextjs/
│ │ ├── deployment.yaml.j2
│ │ └── configmap.yaml.j2
│ └── static/
│ └── deployment.yaml.j2
└── addons/
  ├── postgresql.yaml.j2
  ├── redis.yaml.j2
  ├── mongodb.yaml.j2
  └── elasticsearch.yaml.j2
Flask 應用模板範例
# templates/frameworks/flask/deployment.yaml.j2
# Deployment for a Flask application; values in {{ ... }} are filled in by
# the Jinja2 renderer.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ app_name }}
  namespace: {{ namespace }}
  labels:
    app: {{ app_name }}
    framework: flask
    managed-by: wooo-aiops
  annotations:
    aiops.wooo.work/auto-repair: "enabled"
    aiops.wooo.work/plan: "{{ plan }}"
spec:
  replicas: {{ replicas | default(1) }}
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: {{ app_name }}
  template:
    metadata:
      labels:
        app: {{ app_name }}
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "80"
        prometheus.io/path: "/metrics"
    spec:
      containers:
        - name: {{ app_name }}
          image: {{ image }}
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 80
          # Resource requests and limits
          resources:
            requests:
              memory: "{{ memory_request }}"
              cpu: "{{ cpu_request }}"
            limits:
              memory: "{{ memory_limit }}"
              cpu: "{{ cpu_limit }}"
          # Health checks
          livenessProbe:
            httpGet:
              path: /health
              port: 80
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # Environment variables
          env:
            - name: FLASK_ENV
              value: "production"
            - name: GUNICORN_WORKERS
              value: "{{ gunicorn_workers | default(4) }}"
          envFrom:
            # Loaded from the ConfigMap
            - configMapRef:
                name: {{ app_name }}-config
            # Sensitive values loaded from the Secret
            - secretRef:
                name: {{ app_name }}-secrets
      # Service account
      serviceAccountName: {{ app_name }}
      # Anti-affinity: prefer spreading pods across different nodes
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: {{ app_name }}
                topologyKey: kubernetes.io/hostname
🔔 告警與通知系統
多渠道通知架構
# aiops_api/services/notification_service.py
class NotificationService:
    """Multi-channel notification service."""

    # Message templates keyed by event type. Placeholders are filled from
    # the event dict, with `app_name` and `domain` defaulting to the app.
    MESSAGE_TEMPLATES = {
        'deploy_success': (
            "\n🚀 *部署成功*\n"
            "應用: {app_name}\n"
            "域名: {domain}\n"
            "時間: {timestamp}\n"
            "您的應用已成功部署!\n"
            "訪問: https://{domain}\n"
        ),
        'alert_fired': (
            "\n🔴 *告警觸發*\n"
            "應用: {app_name}\n"
            "告警: {alert_name}\n"
            "嚴重度: {severity}\n"
            "時間: {timestamp}\n"
            "{description}\n"
        ),
        'auto_repair_success': (
            "\n🔧 *自動修復成功*\n"
            "應用: {app_name}\n"
            "問題: {problem}\n"
            "修復動作: {action}\n"
            "耗時: {duration}\n"
            "服務已自動恢復正常。\n"
        ),
        'auto_repair_failed': (
            "\n❌ *自動修復失敗*\n"
            "應用: {app_name}\n"
            "問題: {problem}\n"
            "嘗試: {attempts}\n"
            "狀態: 需要人工介入\n"
            "請盡快檢查系統。\n"
        ),
    }

    def __init__(self):
        self.channels = {
            'telegram': TelegramChannel(),
            'email': EmailChannel(),
            'line': LineChannel(),
            'slack': SlackChannel(),
        }

    def notify(self, app: App, event: dict, channels: list = None):
        """
        Send a notification for ``event`` to the given channels.

        Args:
            app: application the event belongs to.
            event: event dict; ``event['type']`` selects the template.
            channels: channel names to use; defaults to
                ``app.notification_channels`` when None.

        Returns:
            dict mapping each known channel name to that channel's ``send``
            result. Names not present in ``self.channels`` are skipped.
        """
        if channels is None:
            channels = app.notification_channels

        # Render the message once and reuse it for every channel.
        message = self.render_message(app, event)

        results = {}
        for channel_name in channels:
            channel = self.channels.get(channel_name)
            if channel:
                results[channel_name] = channel.send(app, message)
        return results

    def render_message(self, app: App, event: dict) -> dict:
        """
        Render the notification message for ``event``.

        Returns:
            dict with ``text`` (the formatted template) and ``html``
            (``self.to_html`` applied to the same text).

        Raises:
            ValueError: if ``event['type']`` has no template.
            KeyError: if ``event`` has no ``type`` or lacks a field the
                template needs.
        """
        event_type = event['type']
        template = self.MESSAGE_TEMPLATES.get(event_type)
        if template is None:
            # Previously this fell through to None.format(...) and raised an
            # unhelpful AttributeError.
            raise ValueError(f"Unknown notification event type: {event_type!r}")

        # Every template needs app_name (and deploy_success needs domain);
        # take them from the app unless the event supplies its own values.
        fields = {
            'app_name': getattr(app, 'name', None),
            'domain': getattr(app, 'domain', None),
            **event,
        }
        text = template.format(**fields)
        return {
            'text': text,
            'html': self.to_html(text),
        }
class TelegramChannel:
    """Telegram notification channel."""

    # Seconds to wait for the Telegram API; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    def send(self, app: App, message: dict):
        """
        Send ``message['text']`` to the app's Telegram chat.

        The bot token is read from the ``TELEGRAM_BOT_TOKEN`` environment
        variable and the chat from ``app.telegram_chat_id``.

        Returns:
            True when the Telegram API answered with a success status,
            False when the token or chat id is missing or the API returned
            an error status.
        """
        chat_id = app.telegram_chat_id
        bot_token = os.getenv('TELEGRAM_BOT_TOKEN')
        if not bot_token or not chat_id:
            # Without both values the request cannot succeed; skip the call
            # rather than posting to a ".../botNone/..." URL.
            return False

        response = requests.post(
            f"https://api.telegram.org/bot{bot_token}/sendMessage",
            json={
                'chat_id': chat_id,
                'text': message['text'],
                'parse_mode': 'Markdown'
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.ok
💾 資料庫設計
核心表結構
-- Users table.
-- Created first: apps.owner_id and deployments.deployed_by reference
-- users(id), and a foreign key cannot point at a table that does not
-- exist yet.
CREATE TABLE users (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    email VARCHAR(200) NOT NULL UNIQUE,
    password_hash VARCHAR(200),
    name VARCHAR(100),
    company VARCHAR(200),
    plan VARCHAR(20) DEFAULT 'free',
    telegram_chat_id VARCHAR(50),
    created_at TIMESTAMP DEFAULT NOW()
);

-- Applications table
CREATE TABLE apps (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL UNIQUE,
    display_name VARCHAR(200),
    domain VARCHAR(200) NOT NULL,
    namespace VARCHAR(100),
    framework VARCHAR(50),
    plan VARCHAR(20) DEFAULT 'standard',
    source_type VARCHAR(20), -- git, docker, upload
    source_url TEXT,
    status VARCHAR(20) DEFAULT 'initializing',
    owner_id UUID REFERENCES users(id),
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

-- Application configuration table
CREATE TABLE app_configs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    app_id UUID REFERENCES apps(id) ON DELETE CASCADE,
    config_key VARCHAR(100),
    config_value TEXT,
    is_secret BOOLEAN DEFAULT FALSE,
    created_at TIMESTAMP DEFAULT NOW()
);

-- Alert history table
CREATE TABLE alerts (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    app_id UUID REFERENCES apps(id),
    alert_name VARCHAR(200),
    severity VARCHAR(20),
    status VARCHAR(20), -- firing, resolved
    description TEXT,
    labels JSONB,
    annotations JSONB,
    fired_at TIMESTAMP,
    resolved_at TIMESTAMP,
    created_at TIMESTAMP DEFAULT NOW()
);

-- Repair history table
CREATE TABLE repair_logs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    app_id UUID REFERENCES apps(id),
    alert_id UUID REFERENCES alerts(id),
    repair_type VARCHAR(50),
    action_taken TEXT,
    result JSONB,
    success BOOLEAN,
    duration_seconds INTEGER,
    created_at TIMESTAMP DEFAULT NOW()
);

-- Deployment history table
CREATE TABLE deployments (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    app_id UUID REFERENCES apps(id),
    version VARCHAR(100),
    image VARCHAR(500),
    status VARCHAR(20), -- pending, deploying, success, failed, rolled_back
    rollback_version VARCHAR(100),
    deployed_by UUID REFERENCES users(id),
    deployed_at TIMESTAMP DEFAULT NOW(),
    completed_at TIMESTAMP
);
🔐 安全架構
多租戶隔離
# Each application runs in its own namespace and is isolated by this
# NetworkPolicy.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: {{ app_name }}-isolation
  namespace: {{ namespace }}
spec:
  podSelector:
    matchLabels:
      app: {{ app_name }}
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Only allow traffic from the ingress controller's namespace.
    # Namespaces are matched on kubernetes.io/metadata.name, which the API
    # server sets automatically; a plain `name` label only exists if
    # someone added it to the namespace by hand.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: ingress-nginx
    # Allow scraping from the Prometheus monitoring namespace.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
  egress:
    # Allow DNS lookups. DNS falls back to TCP for large responses, so
    # both protocols are opened on port 53.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    # Allow access to the database (same namespace).
    - to:
        - podSelector:
            matchLabels:
              app: {{ app_name }}-postgresql
API 認證
# aiops_api/middleware/auth.py
from functools import wraps
from flask import request, jsonify
import jwt
def require_auth(f):
    """
    Decorator that requires a valid JWT in the ``Authorization`` header.

    On success the matching ``User`` is stored on ``request.user`` and the
    wrapped view is called. A 401 JSON response is returned when the token
    is missing, expired, invalid, lacks a ``user_id`` claim, or refers to a
    user that no longer exists.
    """
    @wraps(f)
    def decorated(*args, **kwargs):
        header = request.headers.get('Authorization', '')
        # Strip the scheme only when it is the leading prefix.
        if header.startswith('Bearer '):
            header = header[len('Bearer '):]
        token = header.strip()
        if not token:
            return jsonify({'error': 'Missing token'}), 401

        try:
            payload = jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
        except jwt.ExpiredSignatureError:
            return jsonify({'error': 'Token expired'}), 401
        except jwt.InvalidTokenError:
            return jsonify({'error': 'Invalid token'}), 401

        # A validly signed token can still lack the claim or point at a
        # deleted user; both must be rejected here rather than surfacing
        # later as a KeyError or as `request.user` being None.
        user_id = payload.get('user_id')
        user = User.query.get(user_id) if user_id is not None else None
        if user is None:
            return jsonify({'error': 'Invalid token'}), 401

        request.user = user
        return f(*args, **kwargs)
    return decorated
def require_app_access(f):
    """
    Decorator that ensures the current user owns the requested application.

    Expects ``request.user`` to have been set by an authentication step.
    Returns 401 when no user is attached to the request, 404 when the app
    does not exist and 403 when the app belongs to another user.
    """
    @wraps(f)
    def decorated(app_id, *args, **kwargs):
        # Guard against the decorator being applied without authentication;
        # otherwise the ownership check below raises AttributeError.
        user = getattr(request, 'user', None)
        if user is None:
            return jsonify({'error': 'Missing token'}), 401

        app = App.query.get(app_id)
        if not app:
            return jsonify({'error': 'App not found'}), 404
        if app.owner_id != user.id:
            return jsonify({'error': 'Access denied'}), 403
        return f(app_id, *args, **kwargs)
    return decorated
📈 監控儀表板
Grafana Dashboard 自動生成
# aiops_api/services/grafana_service.py
class GrafanaService:
    """Grafana dashboard management service."""

    # Seconds to wait for the Grafana HTTP API; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    def create_app_dashboard(self, app: App):
        """
        Create a Grafana dashboard for ``app``.

        Posts the dashboard definition to ``{GRAFANA_URL}/api/dashboards/db``
        using ``GRAFANA_API_KEY`` as a bearer token.

        Returns:
            The decoded JSON body of Grafana's response.
        """
        response = requests.post(
            f"{GRAFANA_URL}/api/dashboards/db",
            headers={"Authorization": f"Bearer {GRAFANA_API_KEY}"},
            json=self._build_dashboard(app),
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    @staticmethod
    def _percent_field_config(steps, max_value=None) -> dict:
        """Build a percent-unit fieldConfig from (color, value) threshold pairs."""
        defaults = {"unit": "percent"}
        if max_value is not None:
            defaults["max"] = max_value
        defaults["thresholds"] = {
            "steps": [{"color": color, "value": value} for color, value in steps]
        }
        return {"defaults": defaults}

    def _build_dashboard(self, app: App) -> dict:
        """Build the dashboard payload; every query is scoped to the app's namespace."""
        ns = f'namespace="{app.namespace}"'
        requests_total = f'http_requests_total{{{ns}}}'

        panels = [
            # Service availability
            {
                "type": "stat",
                "title": "可用性",
                "gridPos": {"x": 0, "y": 0, "w": 6, "h": 4},
                "targets": [{
                    "expr": f'avg(up{{{ns}}}) * 100'
                }],
                "fieldConfig": self._percent_field_config(
                    [("red", 0), ("yellow", 95), ("green", 99)]
                ),
            },
            # Request rate
            {
                "type": "timeseries",
                "title": "請求量 (QPS)",
                "gridPos": {"x": 6, "y": 0, "w": 12, "h": 8},
                "targets": [{
                    "expr": f'sum(rate({requests_total}[1m]))'
                }],
            },
            # Response time. Buckets are summed by `le` so the panel shows a
            # single namespace-wide p95, matching the other aggregated panels,
            # instead of one series per scraped target.
            {
                "type": "timeseries",
                "title": "回應時間 (ms)",
                "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
                "targets": [{
                    "expr": (
                        'histogram_quantile(0.95, sum by (le) ('
                        f'rate(http_request_duration_seconds_bucket{{{ns}}}[5m])'
                        ')) * 1000'
                    ),
                    "legendFormat": "p95",
                }],
            },
            # Memory usage
            {
                "type": "gauge",
                "title": "記憶體使用",
                "gridPos": {"x": 12, "y": 8, "w": 6, "h": 8},
                "targets": [{
                    "expr": (
                        f'sum(container_memory_usage_bytes{{{ns}}})'
                        f' / sum(container_spec_memory_limit_bytes{{{ns}}}) * 100'
                    )
                }],
                "fieldConfig": self._percent_field_config(
                    [("green", 0), ("yellow", 70), ("red", 90)],
                    max_value=100,
                ),
            },
            # Error rate
            {
                "type": "stat",
                "title": "5xx 錯誤率",
                "gridPos": {"x": 18, "y": 8, "w": 6, "h": 8},
                "targets": [{
                    "expr": (
                        f'sum(rate(http_requests_total{{{ns}, status=~"5.."}}[5m]))'
                        f' / sum(rate({requests_total}[5m])) * 100'
                    )
                }],
                "fieldConfig": self._percent_field_config(
                    [("green", 0), ("yellow", 1), ("red", 5)]
                ),
            },
        ]

        return {
            "dashboard": {
                "title": f"{app.display_name} - 監控儀表板",
                "tags": ["aiops", app.name],
                "timezone": "Asia/Taipei",
                "panels": panels,
            }
        }
🚀 下一步行動
立即可做
- 提取現有代碼為模塊
  - 將 MOMO Pro 的監控、自動修復代碼抽取為獨立服務
  - 創建應用模板系統
- 開發 Web Portal
  - 用戶註冊/登入
  - 應用管理介面
  - 新增應用精靈
- 完善 API
  - RESTful API 設計
  - API 文檔 (Swagger)
  - SDK (Python, Node.js)
長期規劃
- 擴展框架支援 (Q2 2026)
  - Django, Node.js, Next.js, Go
- 多雲支援 (Q3 2026)
  - AWS EKS, GCP GKE, Azure AKS
- AI 增強 (Q4 2026)
  - 異常預測
  - 智慧容量規劃
  - 自動調優
「讓每一行代碼都有專業 DevOps 的守護」
— WOOO AIOps Platform