diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index cac21f29..6551d941 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -662,3 +662,103 @@ groups:
         annotations:
           summary: "SSL 憑證即將到期: {{ $labels.instance }}"
           description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
+
+# =============================================================================
+# Rule groups added for ADR-075 (2026-04-12 ogt)
+# =============================================================================
+
+  - name: awoooi_secops_alerts  # security group; its rule is labelled team=security
+    interval: 60s  # evaluate this group every 60 seconds
+    rules:
+      - alert: UnauthorizedSSHLogin  # NOTE(review): expr counts failed auth attempts, not successful logins; confirm the metric is SSH-specific
+        expr: increase(node_failed_auth_attempts_total[5m]) > 10  # counter grew by more than 10 within the trailing 5 minutes
+        for: 1m  # condition must hold for 1 minute before the alert fires
+        labels:
+          severity: critical
+          layer: systemd-188
+          team: security
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: secops
+        annotations:
+          summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
+          description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"
+
+  - name: awoooi_business_alerts  # business group; both rules are labelled team=finops
+    interval: 60s  # evaluate this group every 60 seconds
+    rules:
+      - alert: AITokenCostSpike
+        expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10  # counter grew by more than 10 within the trailing hour; presumably USD per the metric name - confirm
+        for: 5m  # condition must hold for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          layer: k8s
+          team: finops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: business
+        annotations:
+          summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
+          description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"
+      - alert: GeminiAPIErrorRateHigh  # 5m error rate divided by 5m request rate for provider="gemini", above 0.2
+        expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2  # NOTE(review): no aggregation, so both counters need identical label sets for the division to match - confirm
+        for: 10m  # condition must hold for 10 minutes before the alert fires
+        labels:
+          severity: warning
+          layer: k8s
+          team: finops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: business
+        annotations:
+          summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
+          description: "Gemini API 5 分鐘錯誤率超過 20%,AI 降級可能失效"
+
+  - name: awoooi_flywheel_meta_alerts  # flywheel health group; all rules are labelled team=aiops
+    interval: 60s  # evaluate this group every 60 seconds
+    rules:
+      - alert: FlywheelPlaybookZero
+        expr: awoooi_flywheel_playbook_count == 0  # NOTE(review): matches only while the series exists; a missing metric yields no alert - confirm that is intended
+        for: 1h  # count must stay at zero for 1 hour before the alert fires
+        labels:
+          severity: critical
+          layer: k8s
+          team: aiops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: flywheel_health
+        annotations:
+          summary: "飛輪 Playbook 數量為零,AI 修復完全依賴 LLM"
+          description: "Redis 中無任何已批准 Playbook,自動修復能力大幅降低"
+      - alert: FlywheelExecutionSuccessLow
+        expr: awoooi_flywheel_execution_success_rate < 0.1  # presumably a 0-1 ratio, since the summary formats it with humanizePercentage - confirm
+        for: 2h  # value must stay below the threshold for 2 hours before the alert fires
+        labels:
+          severity: warning
+          layer: k8s
+          team: aiops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: flywheel_health
+        annotations:
+          summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
+          description: "連續 2 小時執行成功率不足 10%,Playbook 可能已過時"
+      - alert: FlywheelKMVectorizationLow
+        expr: awoooi_flywheel_km_unvectorized_count > 10  # value above 10; the description ties it to knowledge_entries rows with a NULL embedding
+        for: 30m  # condition must hold for 30 minutes before the alert fires
+        labels:
+          severity: warning
+          layer: k8s
+          team: aiops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: flywheel_health
+        annotations:
+          summary: "{{ $value }} 筆 KM 未向量化,RAG 查詢命中率下降"
+          description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
+      - alert: FlywheelIncidentsStuck
+        expr: awoooi_flywheel_incidents_stuck > 5  # the 24h age cut-off quoted in the summary is not applied here; presumably the exporter applies it - confirm
+        for: 5m  # condition must hold for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          layer: k8s
+          team: aiops
+          auto_repair: "false"  # quoted so the label value is the string "false", not a YAML boolean
+          alert_category: flywheel_health
+        annotations:
+          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
+          description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"