Files
awoooi/apps/api/src/core/config.py
Your Name 1a72a2f664
All checks were successful
CD Pipeline / tests (push) Successful in 1m19s
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / build-and-deploy (push) Successful in 3m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m53s
fix(awooop): use ssh mcp transport for ansible check-mode
2026-05-31 14:15:11 +08:00

1052 lines
47 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI API Configuration
========================
Pydantic Settings + Environment Variables
ADR-005: BFF Architecture
ADR-006: AI Fallback Strategy (Ollama -> Gemini -> Claude)
Four Iron Laws:
1. Async-First
2. CORS Whitelist (NO wildcard)
3. Pydantic Config (this file)
4. structlog
"""
from functools import lru_cache
from typing import Literal
from pydantic import Field, HttpUrl, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""
Application settings from environment variables
All settings can be overridden via .env file or environment variables.
"""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=True,
extra="ignore",
)
# ==========================================================================
# Application
# ==========================================================================
VERSION: str = "1.0.0"
ENVIRONMENT: Literal["dev", "prod"] = "dev"
DEBUG: bool = False
LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
SYSTEM_NAME: str = "awoooi"
# ==========================================================================
# Mock Mode - 開發時模擬外部服務
# ==========================================================================
MOCK_MODE: bool = Field(
default=False,
description="Enable mock mode for external services (Redis, Ollama, OpenClaw, PostgreSQL, SigNoz)",
)
# ==========================================================================
# ==========================================================================
# Phase 24: AI Provider Registry (ADR-052)
# 2026-04-02 ogt: 絞殺者開關 — true=新 AIRouter, false=舊 openclaw.py if/else
# 回滾指令: kubectl set env deployment/awoooi-api USE_AI_ROUTER=false
# ==========================================================================
USE_AI_ROUTER: bool = Field(
default=False,
description="Phase 24: True=新 AIRouter 路由, False=舊 openclaw.py fallback chain",
)
ENABLE_OPENCLAW_AGENT_LOOP_SHADOW: bool = Field(
default=False,
description="ADR-105 P1: True=OpenClaw 生成 proposal 後用本地 Agent Loop 做 read-only shadow investigation, False=不執行",
)
OPENCLAW_AGENT_LOOP_MAX_ITERATIONS: int = Field(
default=3,
ge=1,
le=5,
description="ADR-105 P1: OpenClaw Agent Loop shadow 最大 tool_use 輪數",
)
# ==========================================================================
# W1 PR-P1: Playbook 匹配 Feature Flag (2026-04-28 ogt + Claude Sonnet 4.6)
# 修復飛輪斷鏈 C1 — proposal_service 填 matched_playbook_id → EWMA 更新
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_PLAYBOOK_MATCHING=false
# ==========================================================================
ENABLE_PLAYBOOK_MATCHING: bool = Field(
default=True,
description="W1 PR-P1: True=generate_proposal 時執行 Playbook RAG 匹配並填 matched_playbook_id, False=行為與修復前完全相同(回滾用)",
)
# ==========================================================================
# W1 PR-R1: 規則 → Playbook 遷移 Feature Flag (2026-04-28 ogt + Claude Sonnet 4.6)
# 將 alert_rules.yaml 25 條規則遷移為 DRAFT Playbook (飛輪 RAG 冷啟動)
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_RULE_MIGRATION_DRAFT=false
# ==========================================================================
ENABLE_RULE_MIGRATION_DRAFT: bool = Field(
default=True,
description="W1 PR-R1: True=允許 migrate_rules_to_playbooks CLI 寫入 DB, False=停用寫入(回滾用)",
)
# ==========================================================================
# P1-1: KMWriter 統一契約 (2026-04-28 ogt + Claude Sonnet 4.6)
# KM_WRITE_AWAIT=true → 強制 await asyncio.wait_for(timeout=KM_WRITE_TIMEOUT_SECONDS)
# KM_WRITE_AWAIT=false → 舊 fire-and-forget 行為(回滾用)
# 回滾指令: kubectl set env deployment/awoooi-api KM_WRITE_AWAIT=false
# ==========================================================================
KM_WRITE_AWAIT: bool = Field(
default=True,
description="P1-1: True=強制 await KM 寫入(可靠), False=緊急降級1次嘗試+DLQ回滾用",
)
KM_WRITE_TIMEOUT_SECONDS: float = Field(
default=5.0,
description="P1-1: KMWriter await timeout超時記錄 warning 但不阻塞主流程",
)
# C1 2026-04-28 ogt + Claude Sonnet 4.6: KM Backfill Reconciler
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_KM_BACKFILL_RECONCILER=false
ENABLE_KM_BACKFILL_RECONCILER: bool = Field(
default=True,
description="C1: True=啟用 km:backfill:dlq 補掃 job每 5 分鐘), False=停用",
)
# ==========================================================================
# W2 PR-R2: AOL → alert_rule_catalog Confidence EWMA Writeback
# ADR-091 Task T2 — 飛輪斷鏈 C2 修復:規則命中率回灌 catalog confidence
# default=false: 先寫 code,人工驗證 AOL 資料品質後再開啟
# 啟用: kubectl set env deployment/awoooi-api ENABLE_AOL_WRITEBACK_JOB=true
# 回滾: kubectl set env deployment/awoooi-api ENABLE_AOL_WRITEBACK_JOB=false
# ==========================================================================
ENABLE_AOL_WRITEBACK_JOB: bool = Field(
default=False,
description="W2 PR-R2: True=每小時從 AOL 聚合 alertname 成功率並 EWMA 更新 alert_rule_catalog.confidence, False=停用(預設)",
)
# ==========================================================================
# W2 PR-L1: KM → Playbook 互饋回路 (2026-04-28 ogt + Claude Sonnet 4.6)
# 飛輪斷鏈 C3 + C4 修復 — KM 與 Playbook 演化互饋
# 邏輯 1: promote/demote 觸發 → 寫 KM 演化條目path_type=playbook_evolution
# 邏輯 2: 同 symptom_pattern_hash 累積 N=5 條 KM → 標記 playbook.review_required=true
# 邏輯 3: DEPRECATED Playbook → 降低 alert_rule_catalog.confidence *= 0.5
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP=false
# ==========================================================================
ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP: bool = Field(
default=False,
description="W2 PR-L1: True=啟用 KM↔Playbook 互饋回路(飛輪 C3+C4 修復), False=停用default驗證後才開",
)
KM_PLAYBOOK_REVIEW_THRESHOLD: int = Field(
default=5,
description="W2 PR-L1: 同 symptom_pattern_hash 累積幾條 KM 後觸發 Playbook review_required 標記(預設 N=5",
)
# ==========================================================================
# ADR-104: LLM Playbook Generator
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
# 成本護欄:實作層只走 local provider (GCP-A → GCP-B → 111),不新增雲端 fallback。
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
# ==========================================================================
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
default=True,
description="ADR-104 T1: True=成功修復無 matched_playbook_id 時啟動本地 LLM 生成 Playbook, False=只用 deterministic extraction",
)
ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB: bool = Field(
default=True,
description="ADR-104 T2: True=定期治理 LLM Playbook DRAFT/REVIEW 晉級, False=停用",
)
PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS: int = Field(
default=3600,
ge=60,
description="ADR-104 T2: Playbook DRAFT governance job interval seconds",
)
# ==========================================================================
# aider-watch v2 integration (2026-04-20 ADR-091)
# 整合 Mac aider CLI 監控進 awoooi 飛輪: events → incident → ai_router feedback
# 回滾: kubectl set env deployment/awoooi-api USE_AIDER_FEEDBACK=false
# ==========================================================================
AIDER_WEBHOOK_SECRET: str = Field(
default="",
description="HMAC secret for /api/v1/aider/events webhook verification",
)
AIDER_EVENTS_STREAM_KEY: str = Field(
default="signals:aider:events",
description="Redis stream key for aider event ingestion",
)
AIDER_PATTERN_EXTRACT_INTERVAL_HOURS: float = Field(
default=24.0,
description="Aider event pattern extraction interval (future use)",
)
USE_AIDER_FEEDBACK: bool = Field(
default=False,
description="Phase 24 A8: True=ai_router.route() 讀 aider 成功率調權重, False=不讀(預設)",
)
# Phase 22: OpenClaw + Nemotron 協作 (ADR-044)
# 2026-03-31 Claude Code: 統帥批准實作
#
# 功能:
# - ENABLE_NEMOTRON_COLLABORATION: 啟用 OpenClaw + Nemotron 雙軌協作
# - NEMOTRON_TIMEOUT_SECONDS: Nemotron API 呼叫超時
# - NEMOTRON_ASYNC_UPDATE: 異步更新模式 (先推 OpenClaw後更新 Nemotron)
#
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false
# ==========================================================================
ENABLE_NEMOTRON_COLLABORATION: bool = Field(
default=True,
description="Phase 22: True=啟用 OpenClaw+Nemotron 協作, False=僅 OpenClaw",
)
NEMOTRON_TIMEOUT_SECONDS: int = Field(
default=45,
description="Phase 22: Nemotron API 呼叫超時 (秒)",
)
NEMOTRON_ASYNC_UPDATE: bool = Field(
default=True,
description="Phase 22: True=異步更新 (先推 OpenClaw), False=同步等待",
)
# 2026-04-05 Claude Code: Phase 25 P0 v4.3 — DIAGNOSE timeout 依實測修正
# 實測依據 (2026-04-05):
# NIM (nvidia/nemotron-mini-4b-instruct): 2.2s~27.3s,平均 10.6s → 60s timeout (27s * 2 + buffer)
# Ollama llama3.2:3b CPU-only: 238s 回 {"ok":true} → 不可用於生產timeout 保留但實際走 NIM
NEMOTRON_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
default=60,
description="Phase 25 P0: DIAGNOSE NIM timeout (秒),實測 2.2-27.3s avg 10.6s60s 含 buffer",
)
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
default=300,
description="Ollama diagnose timeout (秒)。GCP qwen3:14b CPU-only can exceed the old 120s proxy limit.",
)
# ==========================================================================
# Gitea — ADR-057 adopt() Gitea PR API (2026-04-05)
# ==========================================================================
GITEA_API_URL: str = Field(
default="http://192.168.0.110:3001",
description="Gitea 內網 API base URL",
)
GITEA_API_TOKEN: str = Field(
default="",
description="Gitea API Token需 write:repository scopeADR-057 adopt() 使用",
)
GITEA_REPO_OWNER: str = Field(default="wooo", description="Gitea repo owner")
GITEA_REPO_NAME: str = Field(default="awoooi", description="Gitea repo name")
# ==========================================================================
# CORS - 嚴格白名單 (無 UAT, 無 wildcard)
# ==========================================================================
CORS_ORIGINS: list[str] = Field(
default=[
"http://localhost:3000",
"http://localhost:3001",
"http://localhost:3002",
"http://localhost:3003",
"http://localhost:3333",
"http://192.168.0.168:3000", # 168 MacBook 本機開發
"http://192.168.0.188:3000", # 188 本機開發
"http://192.168.0.125:32335", # K3s VIP NodePort (staging/QA)
"http://192.168.0.120:32335", # K3s node-1 NodePort
"http://192.168.0.121:32335", # K3s node-2 NodePort
"https://awoooi.wooo.work",
],
description="Allowed CORS origins - NO wildcards allowed",
)
@field_validator("CORS_ORIGINS", mode="before")
@classmethod
def parse_cors_origins(cls, v: str | list[str]) -> list[str]:
    """Normalize CORS_ORIGINS and reject every wildcard origin.

    Runs before pydantic's own type validation (``mode="before"``).

    Args:
        v: Either a comma-separated string of origins or an already-parsed
            sequence of origins.

    Returns:
        The origins with surrounding whitespace removed and empty entries
        dropped. A value that is neither a string nor a list/tuple is
        returned untouched so that the regular ``list[str]`` validation
        reports the type error.

    Raises:
        ValueError: If any origin contains ``*``. This covers the bare
            ``"*"`` entry as well as patterns such as
            ``"https://*.example.com"``.
    """
    if isinstance(v, str):
        candidates = v.split(",")
    elif isinstance(v, (list, tuple)):
        candidates = list(v)
    else:
        return v

    origins = []
    for origin in candidates:
        if isinstance(origin, str):
            origin = origin.strip()
            # "a,,b" or a trailing comma would otherwise yield an empty origin.
            if not origin:
                continue
        origins.append(origin)

    # Security check: a wildcard anywhere in an origin violates the whitelist rule.
    if any(isinstance(origin, str) and "*" in origin for origin in origins):
        raise ValueError("Wildcard (*) is NOT allowed in CORS_ORIGINS")
    return origins
# ==========================================================================
# Database (PostgreSQL on 192.168.0.188)
# ==========================================================================
# 2026-04-22 ogt: 移除含 changeme 的 default改為必填。
# 來源: K8s Secret awoooi-secrets → DATABASE_URL
DATABASE_URL: str = Field(
description="PostgreSQL connection URL (必填,從 K8s Secret awoooi-secrets → DATABASE_URL 取得)",
)
# ==========================================================================
# Redis (192.168.0.188:6380, DB 0 - 與 OpenClaw 共用)
# ==========================================================================
REDIS_URL: str = Field(
default="redis://192.168.0.188:6380/0",
description="Redis connection URL (DB 0 shared with OpenClaw)",
)
# ==========================================================================
# External Services - Four Host Architecture
# ==========================================================================
# 2026-05-03 ogt: GCP 三層容災 (ADR-110): GCP-A → GCP-B → Local → Gemini
OLLAMA_URL: str = Field(
default="http://34.143.170.20:11434", # 2026-05-03 ogt: 切換至 GCP-A SSD 主機9x 載速 + 2x 推理)
description="Ollama LLM service URL (GCP-A Primary)",
)
# 2026-05-03 ogt: GCP-B SSD 備援ADR-110 三層容災第二層)
OLLAMA_SECONDARY_URL: str = Field(
default="http://34.21.145.224:11434", # 2026-05-03 ogt: GCP-B SSD 備援
description="Ollama LLM secondary URL (GCP-B Secondary)",
)
# 2026-05-03 ogt: Local HDD 最後防線(原 2026-04-08 M1 Pro 主機降為第三層)
OLLAMA_FALLBACK_URL: str = Field(
default="http://192.168.0.111:11434", # 2026-05-03 ogt: M1 Pro Local HDD 最後防線
description="Ollama local fallback URL (Local HDD, 最後防線)",
)
# 2026-04-27 Wave8-X2 by Claude — vuln #1 URL endpoint poisoning 修復
# 攻擊情境:攻擊者改 ConfigMap OLLAMA_FALLBACK_URL=http://attacker.com:11434
# → ai_router 盲信 → C&C 通道。修法:啟動時拒絕非私網/loopback 的外部 URL。
# 2026-05-03 ogt: 擴充 validator 覆蓋 OLLAMA_SECONDARY_URL新增 GCP IP 白名單ADR-110
@field_validator("OLLAMA_URL", "OLLAMA_SECONDARY_URL", "OLLAMA_FALLBACK_URL")
@classmethod
def _validate_ollama_url(cls, v: str) -> str:
    """Reject Ollama URLs that point outside the trusted network.

    Guards against endpoint poisoning: a tampered ConfigMap must not be able
    to redirect LLM traffic to an arbitrary external host.

    Accepted:
        - an empty string (setting not configured)
        - a hostname from the known Kubernetes Service / dev alias allow-list
        - an IP from the approved GCP public IP allow-list (ADR-110)
        - a private or loopback IP address

    Rejected:
        - public IPs that are not on the allow-list (e.g. 8.8.8.8)
        - link-local IPs (169.254.0.0/16, fe80::/10), which ``is_private``
          would otherwise accept, including the cloud metadata endpoint
          169.254.169.254
        - any hostname that is not on the allow-list (e.g. attacker.com)

    Args:
        v: The configured URL.

    Returns:
        ``v`` unchanged when it passes the checks.

    Raises:
        ValueError: If the URL cannot be parsed, has no hostname, or its
            host is not allowed.
    """
    if not v:
        return v

    import ipaddress
    from urllib.parse import urlparse

    try:
        host = urlparse(v).hostname or ""
    except ValueError as exc:
        raise ValueError(f"OLLAMA URL 格式無效:{v!r},錯誤:{exc}") from exc
    if not host:
        raise ValueError(f"OLLAMA URL 缺少 hostname:{v!r}")

    # Kubernetes Service hostnames (cluster DNS) plus development aliases.
    allowed_hostnames = {
        "localhost",
        "ollama",
        "ollama-svc",
        "ollama-fallback-svc",
        "ollama-111",
        "ollama-188",
    }
    # Approved GCP public IPs (ADR-110).
    allowed_public_ips = frozenset({
        "34.143.170.20",  # GCP-A Ollama Primary (SSD)
        "34.21.145.224",  # GCP-B Ollama Secondary (SSD)
    })
    if host in allowed_hostnames or host in allowed_public_ips:
        return v

    # Anything else must be a literal private or loopback IP address.
    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP and not an allow-listed hostname: treat as external domain.
        raise ValueError(
            f"OLLAMA URL host 不允許的外部域名:{host!r}(完整 URL:{v!r})"
            ",必須使用私網 IP 或已知 K8s Service hostname"
        ) from None

    # ``is_private`` is also true for link-local ranges, so exclude them
    # explicitly before the private/loopback check.
    if ip.is_link_local:
        raise ValueError(
            f"OLLAMA URL 不允許使用 link-local IP {host!r}({v!r})"
            ",可能是端點中毒攻擊"
        )
    if not (ip.is_private or ip.is_loopback):
        raise ValueError(
            "OLLAMA URL 必須是私網/loopback IP、已知 K8s SVC 或 GCP 白名單 IP,"
            f"收到公網 IP {host!r}({v!r}),可能是端點中毒攻擊"
        )
    return v
# 2026-05-05 Codex: health inference must stay on alert-fast model; qwen2.5
# keeps reloading a 7B model on CPU-only GCP and slows incident fallback.
OLLAMA_HEALTH_CHECK_MODEL: str = Field(
default="gemma3:4b",
description="OllamaHealthMonitor 推理測試使用模型P1.1",
)
OLLAMA_EMBEDDING_MODEL: str = Field(
default="bge-m3:latest",
description="Ollama embedding model. ADR-110 migrated embeddings from nomic-embed-text to bge-m3.",
)
# 2026-04-12 ogt: 心跳必須確認載入的 Ollama 模型清單
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 升級更新必要模型清單nomic→bge-m3 + 新增 qwen3:14b + hermes3
OLLAMA_REQUIRED_MODELS: list[str] = Field(
default=["bge-m3:latest", "qwen2.5:7b-instruct", "qwen3:14b", "deepseek-r1:14b", "hermes3:latest"],
description="HeartbeatReportService 探測必要模型是否載入",
)
# 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
# Gemini 帳單熔斷:每日呼叫上限,超過改走 188+Nemotron
# 超過上限後寫 Redis key ollama:gemini_daily_count:{date}TTL 86400s
GEMINI_DAILY_QUOTA: int = Field(
default=1000,
description="每日 Gemini 呼叫上限,超過切到 188+NemotronP1.1 帳單熔斷)",
)
# Deprecated: use OPENCLAW_URL instead
CLAWBOT_URL: str = Field(
default="http://192.168.0.188:8088", # 🔧 修正: OpenClaw 實際 port 是 8088
description="[Deprecated] Legacy OpenClaw URL - use OPENCLAW_URL",
)
KALI_SCANNER_URL: str = Field(
default="http://192.168.0.112:8080",
description="Kali security scanner URL",
)
SIGNOZ_URL: str = Field(
default="http://192.168.0.188:3301",
description="SigNoz observability URL",
)
CLICKHOUSE_URL: str = Field(
default="http://192.168.0.188:8123",
description="ClickHouse HTTP API URL (SignOz backend, direct query)",
)
# ==========================================================================
# Sentry Self-Hosted (Phase 10: Error Tracking + AI Analysis)
# 端點: http://192.168.0.110:9000 (DevOps 金庫)
# ==========================================================================
SENTRY_SELF_HOSTED_URL: str = Field(
default="http://192.168.0.110:9000",
description="Sentry Self-Hosted API URL",
)
SENTRY_ORG: str = Field(
default="sentry",
description="Sentry organization slug",
)
SENTRY_PROJECT: str = Field(
default="awoooi-api",
description="Sentry project slug",
)
SENTRY_AUTH_TOKEN: str = Field(
default="",
description="Sentry Auth Token for API access (from K8s Secret)",
)
# ==========================================================================
# OpenTelemetry (可觀測性鐵律)
# 四主機架構校驗: OTEL 預設指向 192.168.0.188 (AWOOOI 主站)
# ADR-121 + P0-08 修正: 改為 config-driven (OTEL_ALLOWED_ENDPOINTS / OTEL_FORBIDDEN_ENDPOINTS),允許 EwoooC 指向不同 host
# ==========================================================================
OTEL_ENABLED: bool = Field(
default=True,
description="Enable OpenTelemetry tracing (disable in MOCK_MODE)",
)
OTEL_EXPORTER_OTLP_ENDPOINT: str = Field(
default="192.168.0.188:24317",
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317) - NO http:// prefix for gRPC",
)
OTEL_ALLOWED_ENDPOINTS: list[str] = Field(
default=["192.168.0.188"],
description="允許的 OTEL endpoint host 列表(逗號分隔可用 env 覆寫。EwoooC 可設自己的 SigNoz host。",
)
OTEL_FORBIDDEN_ENDPOINTS: list[str] = Field(
default=["192.168.0.110", "192.168.0.112", "192.168.0.120", "192.168.0.121"],
description="明確禁止的 OTEL endpoint host 列表(不允許誤指向非 SigNoz 主機)",
)
AWOOOI_K8S_NAMESPACE: str = Field(
default="awoooi-prod",
description="K8s namespaceP0-13 修正不再硬碼EwoooC/Tsenyang 可設自己的 namespace",
)
OTEL_SERVICE_NAME: str = Field(
default="awoooi-api",
description="Service name for tracing",
)
OTEL_TRACES_SAMPLER_ARG: float = Field(
default=1.0,
description="Trace sampling rate (1.0 = 100%)",
)
# ==========================================================================
# Langfuse LLMOps (Phase 15.1)
# LLM 呼叫追蹤、成本監控、Prompt 版本管理
# 端點: http://192.168.0.110:3100 (DevOps 金庫)
# ==========================================================================
LANGFUSE_ENABLED: bool = Field(
default=True,
description="Enable Langfuse LLM observability",
)
LANGFUSE_URL: str = Field(
default="http://192.168.0.110:3100",
description="Langfuse self-hosted URL",
)
LANGFUSE_PUBLIC_KEY: str = Field(
default="",
description="Langfuse public key (from K8s Secret)",
)
LANGFUSE_SECRET_KEY: str = Field(
default="",
description="Langfuse secret key (from K8s Secret)",
)
# ==========================================================================
# AI Fallback Strategy (ADR-006 v1.3 + ADR-036)
# Order: Ollama (local) -> NVIDIA NIM -> Gemini (cloud) -> Claude (cloud)
# Tool Calling: Nemotron (專用) -> Gemini -> Claude
# 2026-04-28 ogt + Claude Opus 4.7: 補 nvidia 對齊 ConfigMap (k8s/awoooi-prod/04-configmap.yaml)
# 之前 default 缺 nvidia 與 ConfigMap driftfeedback_ai_fallback_order.md 鐵律
# ==========================================================================
AI_FALLBACK_ORDER: list[str] = Field(
default=["ollama", "nvidia", "gemini", "claude"],
description="AI provider fallback order",
)
GEMINI_API_KEY: str = Field(default="", description="Google Gemini API key")
CLAUDE_API_KEY: str = Field(default="", description="Anthropic Claude API key")
LOCAL_CODE_REVIEW_ALLOW_GEMINI_FALLBACK: bool = Field(
default=False,
description=(
"Allow LocalCodeReviewService to fall back to Gemini when the "
"local Ollama code-review lane fails. Default false to avoid "
"unexpected cloud spend from Gitea push/PR alerts."
),
)
ALERT_AI_ALLOW_CLOUD_FALLBACK: bool = Field(
default=True,
description=(
"Allow incident/alert OpenClaw analysis to use cloud fallback "
"providers after the GCP-A/GCP-B/111 Ollama lane is exhausted. "
"Default true so Gemini can act as the final backup, after the "
"ordered Ollama lane is exhausted."
),
)
ALERT_AI_ENFORCE_OLLAMA_FIRST: bool = Field(
default=True,
description=(
"Force incident/alert OpenClaw analysis to try GCP-A, then GCP-B, "
"then local 111 before cloud backup providers such as Gemini."
),
)
ALERT_OLLAMA_MODEL: str = Field(
default="qwen3:14b",
description=(
"Ollama model used for incident/alert deep diagnosis. Alert cards "
"may wait for this model; Gemini remains a backup after GCP-A, "
"GCP-B, and 111 fail."
),
)
INCIDENT_LLM_TIMEOUT_SECONDS: int = Field(
default=360,
description=(
"Outer timeout for incident OpenClaw proposal generation. This must "
"be long enough for the GCP-A/GCP-B/111 Ollama lane to complete "
"before Gemini backup is considered useful."
),
)
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
NVIDIA_API_KEY: str = Field(
default="",
description="NVIDIA NIM API key for Nemotron Tool Calling (ADR-036)",
)
# 2026-04-09 Claude Sonnet 4.6: Ollama Tool Calling — 替代 NVIDIA 雲端,本機推理
USE_OLLAMA_TOOL_CALLING: bool = Field(
default=True,
description="使用 Ollama 本機做 Tool Calling取代 NVIDIA NIM 雲端 (44s→5s)",
)
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 升級,改 hermes3:latest工具調用能力優於 llama3.1:8b
OLLAMA_TOOL_MODEL: str = Field(
default="hermes3:latest",
description="Ollama Tool Calling 模型 (支援 function calling 格式)",
)
@field_validator("AI_FALLBACK_ORDER", mode="before")
@classmethod
def parse_ai_fallback(cls, v: str | list[str]) -> list[str]:
    """Parse AI_FALLBACK_ORDER into a list of lowercase provider names.

    Runs before pydantic's own type validation (``mode="before"``).
    Three input formats are supported:

    1. JSON: ``'["gemini","ollama","claude"]'`` (the ConfigMap format)
    2. CSV:  ``'gemini,ollama,claude'``
    3. List: ``["gemini", "ollama", "claude"]``

    Args:
        v: The raw setting value in one of the formats above.

    Returns:
        Provider names stripped of whitespace and lowercased, with empty
        entries removed. Non-string items are passed through unchanged, and
        a value that is neither a string nor a list/tuple is returned as-is,
        so the regular ``list[str]`` validation reports the type error.
    """
    import json

    if isinstance(v, str):
        text = v.strip()
        items = None
        if text.startswith("["):
            try:
                items = json.loads(text)
            except json.JSONDecodeError:
                # Bracketed but not valid JSON (e.g. "[gemini, ollama]"):
                # drop the brackets and fall back to CSV parsing.
                text = text.strip("[]")
        if items is None:
            # CSV format; also tolerate stray quotes around a token.
            items = [token.strip().strip("\"'") for token in text.split(",")]
    else:
        items = v

    if not isinstance(items, (list, tuple)):
        return v

    providers = []
    for item in items:
        if isinstance(item, str):
            item = item.strip().lower()
            # An empty env value or "a,,b" must not produce a provider named "".
            if not item:
                continue
        providers.append(item)
    return providers
# ==========================================================================
# Kubernetes / K3s (CTO-201)
# ==========================================================================
KUBECONFIG_PATH: str = Field(
default="k3s-prod.yaml",
description="Path to kubeconfig file for K3s cluster (192.168.0.120)",
)
K8S_NAMESPACE_DEFAULT: str = Field(
default="default",
description="Default Kubernetes namespace for operations",
)
K8S_OPERATION_TIMEOUT: int = Field(
default=30,
description="Timeout for K8s operations in seconds",
)
K8S_API_KEY: str = Field(
default="",
description="API Key for K8s admin endpoints (X-K8s-Api-Key header)",
)
AWOOOP_OPERATOR_API_KEY: str = Field(
default="",
description=(
"API key for AwoooP operator mutation endpoints "
"(X-AwoooP-Operator-Key header)"
),
)
ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER: bool = Field(
default=False,
description=(
"True=consume ansible_candidate_matched AOL rows and run "
"ansible-playbook --check --diff only. Apply remains disabled."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS: int = Field(
default=300,
ge=60,
description="AwoooP Ansible check-mode worker polling interval.",
)
AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT: int = Field(
default=1,
ge=1,
le=5,
description="Maximum Ansible check-mode candidates claimed per worker tick.",
)
AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS: int = Field(
default=180,
ge=30,
le=600,
description="Timeout for one ansible-playbook --check --diff execution.",
)
AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS: int = Field(
default=120,
ge=0,
le=900,
description="Delay before the check-mode worker first tick after API startup.",
)
AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_PROFILE: str = Field(
default="ssh_mcp",
description=(
"SSH transport profile used by Ansible check-mode. Production uses "
"the existing ssh-mcp key so repair-bot forced-command remains reserved "
"for whitelist repairs."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_SSH_KEY_PATH: str = Field(
default="/run/secrets/ssh_mcp_key",
description="Private key path for Ansible check-mode SSH transport.",
)
AWOOOP_ANSIBLE_CHECK_MODE_KNOWN_HOSTS_PATH: str = Field(
default="/etc/ssh-mcp/known_hosts",
description="known_hosts path for Ansible check-mode SSH transport.",
)
AWOOOP_ANSIBLE_CHECK_MODE_CANDIDATE_MAX_AGE_HOURS: int = Field(
default=24,
ge=1,
le=168,
description=(
"Only recent Ansible candidate audit rows are eligible for automatic "
"check-mode claims; older backlog remains visible but is not drained as noise."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS: int = Field(
default=21_600,
ge=300,
le=86_400,
description=(
"Cooldown after transport-level check-mode blockers such as "
"forced-command repair SSH denial."
),
)
# ==========================================================================
# 統帥鐵律:禁止 SQLite (AWOOOI 憲法)
# ==========================================================================
# ❌ 已移除 SQLITE_DATABASE_URL - 違反 AWOOOI 憲法
# 所有持久化必須使用 PostgreSQL (DATABASE_URL)
# 審計日誌請使用 PostgreSQL audit_logs 表
# ==========================================================================
# ==========================================================================
# Cache TTL (seconds)
# ==========================================================================
CACHE_TTL_DASHBOARD: int = Field(default=300, description="Dashboard cache TTL (5 min)")
CACHE_TTL_HOST_STATUS: int = Field(default=30, description="Host status cache TTL (30 sec)")
CACHE_TTL_AI_RESPONSE: int = Field(default=3600, description="AI response cache TTL (1 hour)")
# ==========================================================================
# Health Check Timeouts (seconds)
# ==========================================================================
HEALTH_CHECK_TIMEOUT: float = Field(default=5.0, description="Health check timeout")
# ==========================================================================
# Phase 5: OpenClaw AI Engine (正名自 OpenClaw)
# Synced from models.json - Ollama First Strategy
# ==========================================================================
OPENCLAW_URL: str = Field(
default="http://192.168.0.188:8088", # 🔧 修正: OpenClaw 實際 port 是 8088
description="OpenClaw AI Agent service URL",
)
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 升級,改 qwen3:14bGCP-A SSD 算力RCA 推理更強)
OPENCLAW_DEFAULT_MODEL: str = Field(
default="qwen3:14b",
description="Default Ollama model for RCA analysis",
)
OPENCLAW_TIMEOUT: int = Field(
default=30, # 2026-04-14 Claude Sonnet 4.6: 從 120s 改 30s配合 ADR-052 GAP-B4
# 25s LLM hard timeout + 5s buffer。原 120s 違反 defense-in-depth 設計,
# 導致 Ollama 過載時 thread 飢餓 120s 才降級 fallback。
description="Timeout for OpenClaw AI calls (seconds, aligned with GAP-B4 25s)",
)
# ==========================================================================
# Phase 5: Telegram Gateway (繼承自 AIOPS)
# CISO 要求: Token 必須存放於 K8s Secret,此處為開發預設
# ==========================================================================
OPENCLAW_TG_BOT_TOKEN: str = Field(
default="",
description="Telegram Bot Token (from K8s Secret in prod)",
)
OPENCLAW_TG_CHAT_ID: str = Field(
default="",
description="Telegram Chat ID for notifications",
)
# 使用 str 避免 pydantic-settings 自動 JSON 解析
# Pydantic v2 禁止底線開頭的 Field 名稱
OPENCLAW_TG_USER_WHITELIST: str = Field(
default="",
description="Telegram user IDs allowed to sign approvals (comma-separated or JSON array)",
)
# 2026-03-23 架構修正 (遵循 C-Suite 決議)
# 鐵律: .188 為唯一大腦,禁止腦分裂
# OpenClaw (192.168.0.188) = 唯一 Telegram Gateway
# AWOOOI API (K8s) = Web API + Sensor不做 Polling
TELEGRAM_ENABLE_POLLING: bool = Field(
default=False,
description="Telegram Polling (False: OpenClaw handles it; True: only if OpenClaw unavailable)",
)
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate) — ADR-053
OPENCLAW_BOT_TOKEN: str = Field(
default="",
description="@OpenClawAwoooI_Bot Token — 群組內代表 OpenClaw AI 發言",
)
NEMOTRON_BOT_TOKEN: str = Field(
default="",
description="@NemoTronAwoooI_Bot Token — 群組內代表 NemoClaw AI 發言",
)
SRE_GROUP_CHAT_ID: str = Field(
default="",
description="AwoooI SRE 戰情室群組 Chat ID",
)
SSH_MCP_HOST_USERS: str = Field(
default="192.168.0.188=ollama",
description="Per-host SSH MCP username overrides, format host=user,host=user",
)
# ADR-093 灰階切流: True 時由 notification_matrix 控制所有路由
# False (預設) 時維持舊行為: TYPE-3/4/4D/8M 僅 DM
TG_GROUP_CUTOVER: bool = Field(
default=False,
description="ADR-093: True 時啟用 notification_matrix 路由矩陣,取代 telegram_gateway 硬碼",
)
# ADR-095 2026-04-25 ogt + Claude Sonnet 4.6: 12-Agent ConsensusEngine
ENABLE_12AGENT_CONSENSUS: bool = Field(
default=False,
description="ADR-095: 啟用 12-Agent ConsensusEngine weights預設關閉",
)
# ==========================================================================
# ADR-091 Task T1: AI 自學規則雙寫 alert_rule_catalog (2026-04-28 ogt + Claude Sonnet 4.6)
# True=auto_generate_rule() 成功後同步寫入 DB source='ai_generated'
# False=回滾開關,只寫 YAML不寫 DB
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_AI_RULE_CATALOG_WRITE=false
# ==========================================================================
ENABLE_AI_RULE_CATALOG_WRITE: bool = Field(
default=True,
description="ADR-091 T1: True=AI 自學規則雙寫 alert_rule_catalog DB, False=僅 YAML回滾用",
)
# ==========================================================================
# 2026-05-04 ogt + Claude Sonnet 4.6: Drift 自動採納開關
# 根因修復後啟用report.interpretation in-memory 未更新 bug 已修)
# 回滾指令: kubectl set env deployment/awoooi-api DRIFT_AUTO_ADOPT_ENABLED=false
# ==========================================================================
DRIFT_AUTO_ADOPT_ENABLED: bool = Field(
default=True,
description="2026-05-04: True=啟用 drift auto_adopt_if_safe 自動採納低風險漂移, False=回滾停用",
)
# ==========================================================================
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成
# evaluate_once() 末段:對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
# 回滾指令: kubectl set env deployment/awoooi-api COVERAGE_AUTO_RULE_ENABLED=false
# ==========================================================================
COVERAGE_AUTO_RULE_ENABLED: bool = Field(
default=True,
description="2026-05-04: True=coverage 缺口自動生成 alert_rule_catalogsource='ai_generated'review_status='pending_review', False=停用",
)
# 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator 信號分類層補 PDI
# 路徑 A 已啟用DA 只取 PDI 已收集的 raw 資料做業務邏輯分類OOMKilled/CrashLoop 等),
# 不重複呼叫 K8s/SignOz API純邏輯分類不打外部服務
# 啟用kubectl set env deployment/awoooi-api ENABLE_DIAGNOSIS_AGGREGATOR=true
ENABLE_DIAGNOSIS_AGGREGATOR: bool = Field(
default=True,
description="P3.1-T2-PathA: 啟用 DiagnosisAggregator 信號分類層補 PDI路徑 A不重複收集只分類已有 raw 資料)",
)
# ==========================================================================
# W2 PR-V1: SelfHealingValidator Feature Flag (2026-04-28 ogt + Claude Sonnet 4.6)
# 飛輪斷鏈 C6 修復 — 驗證層串接自愈品質評估
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_SELF_HEALING_VALIDATOR=false
# ==========================================================================
ENABLE_SELF_HEALING_VALIDATOR: bool = Field(
default=False,
description="W2 PR-V1: True=PostExecutionVerifier 執行後評估自愈品質分數score<0.5發Telegram警示, False=跳過(回滾用)",
)
def get_tg_user_whitelist(self) -> list[int]:
    """Return OPENCLAW_TG_USER_WHITELIST as a list of integer user IDs.

    The setting is stored as a string and may hold either a JSON array
    (``'[123, "456"]'``) or a comma-separated list (``'123, 456'``). An
    already-parsed list (e.g. set directly by a test) is accepted too.

    Returns:
        The user IDs as ``int``. An unset, empty, or whitespace-only value
        gives an empty list. Blank entries such as the one produced by a
        trailing comma are skipped.

    Raises:
        ValueError: If the JSON is malformed or an entry is not an integer
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    import json

    raw = self.OPENCLAW_TG_USER_WHITELIST
    if not raw:
        return []

    if isinstance(raw, str):
        # Strip first so that " [1, 2] " is still recognized as JSON.
        text = raw.strip()
        entries = json.loads(text) if text.startswith("[") else text.split(",")
    else:
        # Already a list: set directly by code or a test monkeypatch.
        entries = raw

    user_ids = []
    for entry in entries:
        if isinstance(entry, str):
            entry = entry.strip()
            if not entry:
                continue
        # Coerce in every branch so JSON string items also become int.
        user_ids.append(int(entry))
    return user_ids
# ==========================================================================
# Phase 5: Webhook Security (CISO requirement)
# HMAC-SHA256 signature verification + nonce-based replay protection
# ==========================================================================
WEBHOOK_HMAC_SECRET: str = Field(
default="",
description="HMAC secret for webhook signature verification",
)
# ADR-116 P0-05: HMAC secret for callback-nonce anti-forgery
# 2026-05-04 Claude Sonnet 4.6 (ADR-116): HMAC-SHA256[:16] appended to the end of the callback nonce
# Empty string → during the transition period verification is skipped and a warning is logged
CALLBACK_HMAC_SECRET: str = Field(
default="",
description="ADR-116: HMAC secret for callback nonce anti-forgery (HMAC-SHA256 appended to nonce)",
)
# 2026-04-24 Claude Sonnet 4.6 (ADR-094): Telegram Webhook Secret Token
# Must equal the secret_token passed to the setWebhook API call; empty string → verification is skipped in dev
TELEGRAM_WEBHOOK_SECRET: str = Field(
default="",
description="Telegram Webhook Secret TokensetWebhook 設定的同一值)",
)
# 2026-04-24 Claude Sonnet 4.6 (ADR-095 WS4): Hermes NL natural-language gateway
# false = disabled (default); true = enable @mention Q&A (requires ANTHROPIC_API_KEY)
HERMES_NL_ENABLED: bool = Field(
default=False,
description="Hermes NL 對話功能開關ADR-095",
)
# Bot username without the leading "@"; per the description it is used to recognise @mentions.
TELEGRAM_BOT_USERNAME: str = Field(
default="tsenyangbot",
description="Telegram Bot username不含 @),用於 @mention 識別",
)
# Nonce lifetime in seconds (default 300) for replay-attack prevention.
WEBHOOK_NONCE_TTL: int = Field(
default=300,
description="Nonce TTL in seconds for replay attack prevention",
)
# ==========================================================================
# Phase 5: Shadow Mode (physical disarmament)
# Commander strategy C: connect to real alerts, but physically neuter the AI's ability to cause damage
# ==========================================================================
# Both flags default to True, i.e. the safe setting is the default.
SHADOW_MODE_ENABLED: bool = Field(
default=True,
description="Shadow Mode: Force dry-run for all K8s operations (safe by default)",
)
SHADOW_MODE_LOG_ONLY: bool = Field(
default=True,
description="Shadow Mode: Only log operations without any K8s API calls",
)
# ==========================================================================
# Phase 5: Context Gatherer (chief architect requirement)
# Log cleansing: keep only ERROR/FATAL/CRITICAL
# ==========================================================================
# NOTE(review): the default below also includes WARN/WARNING, which does not match
# the "ERROR/FATAL/CRITICAL only" statement above — confirm which one is intended.
CONTEXT_LOG_LEVELS: list[str] = Field(
default=["ERROR", "FATAL", "CRITICAL", "WARN", "WARNING"],
description="Log levels to include in AI context (ERROR Only principle)",
)
# Upper bound on the number of log lines placed in the context (default 100).
CONTEXT_MAX_LINES: int = Field(
default=100,
description="Maximum log lines to include in context",
)
@field_validator("CONTEXT_LOG_LEVELS", mode="before")
@classmethod
def parse_log_levels(cls, v: str | list[str]) -> list[str]:
    """Normalize ``CONTEXT_LOG_LEVELS`` to a list of upper-case level names.

    Runs in ``mode="before"``, so it receives the raw value prior to
    pydantic's own type validation.

    Args:
        v: Either a comma-separated string (e.g. ``"error, warn"``) or a
            list of level names.

    Returns:
        Whitespace-trimmed, upper-cased level names. Blank entries (from an
        empty string, doubled commas or a trailing comma) are dropped so an
        empty level name never reaches the log filter.
    """
    candidates = v.split(",") if isinstance(v, str) else v
    # Trim and upper-case both input shapes the same way before filtering blanks.
    normalized = (level.strip().upper() for level in candidates)
    return [level for level in normalized if level]
# ==========================================================================
# Notification Plugins (leWOOOgo Output)
# Fail-fast: HttpUrl validation ensures configuration errors are caught at startup
# ==========================================================================
# Both webhook URLs default to the empty string.
DISCORD_WEBHOOK_URL: str = Field(
default="",
description="Discord webhook URL for sending execution reports",
)
SLACK_WEBHOOK_URL: str = Field(
default="",
description="Slack webhook URL for sending execution reports",
)
NOTIFICATION_ENABLED: bool = Field(
default=True,
description="Enable post-execution notifications",
)
@field_validator("DISCORD_WEBHOOK_URL", "SLACK_WEBHOOK_URL", mode="before")
@classmethod
def validate_webhook_url(cls, v: str | None) -> str:
    """Fail-fast validation for the Discord/Slack webhook URLs.

    - A missing, empty or whitespace-only value means "disabled" and is
      normalized to ``""``.
    - Any other value must be accepted by ``HttpUrl``; otherwise the error
      propagates and settings construction fails.

    Returns:
        ``""`` when disabled, otherwise the original value unchanged.
    """
    if v and v.strip():
        # Constructed only for its validation side effect; the parsed URL
        # object is discarded and the raw string is what gets stored.
        HttpUrl(v)
        return v
    return ""
# ==========================================================================
# Phase 23 (ADR-048): Sentry Webhook → OpenClaw AI Triage
# Signature verification for Sentry Issue Alert webhooks (sentry-hook-signature header)
# ==========================================================================
SENTRY_WEBHOOK_SECRET: str = Field(
default="",
description="Sentry Webhook secret for HMAC-SHA256 signature verification",
)
# ==========================================================================
# Phase 13.1: GitHub Webhook → OpenClaw integration
# Gitea PR/push events automatically trigger AI code review (ADR-059: GitHub → Gitea migration)
# ==========================================================================
GITEA_WEBHOOK_SECRET: str = Field(
default="",
description="Gitea Webhook secret for HMAC-SHA256 signature verification (X-Gitea-Signature)",
)
# Stored as a single comma-separated string; get_gitea_allowed_repos() splits it into a list.
GITEA_ALLOWED_REPOS: str = Field(
default="wooo/awoooi",
description="Comma-separated list of allowed Gitea repositories (e.g., 'wooo/awoooi')",
)
def get_gitea_allowed_repos(self) -> list[str]:
    """Split ``GITEA_ALLOWED_REPOS`` into a list of repository names.

    The setting is a comma-separated string. Each entry is trimmed and blank
    entries are dropped; an empty or whitespace-only setting yields ``[]``.
    """
    # 2026-04-05 Claude Code (ADR-059): GitHub → Gitea webhook migration
    configured = self.GITEA_ALLOWED_REPOS
    repos: list[str] = []
    if configured and configured.strip():
        for entry in configured.split(","):
            name = entry.strip()
            if name:
                repos.append(name)
    return repos
# ==========================================================================
# MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — Prometheus actually runs on host 110
# ConfigMap 04-configmap.yaml also uses 110; governance_agent / SLO check time out when pointed at 188
# This drift is one of the root causes of SPF-4 (governance_agent silently fail)
PROMETHEUS_URL: str = Field(
default="http://192.168.0.110:9090",
description="Prometheus server URL (DevOps 金庫主機)",
)
# Enables the Prometheus MCP provider (on by default).
PROMETHEUS_MCP_ENABLED: bool = Field(
default=True,
description="啟用 Prometheus MCP Provider",
)
# MCP Phase 2a: SSH MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# Off by default; per the description, enabling it requires the K8s Secret
# ssh-mcp-key to be mounted.
SSH_MCP_ENABLED: bool = Field(
default=False,
description="啟用 SSH MCP Provider需 K8s Secret ssh-mcp-key 掛載)",
)
# Comma-separated list of host IPs that SSH is allowed to reach.
SSH_MCP_ALLOWED_HOSTS: str = Field(
default="192.168.0.188,192.168.0.110,192.168.0.111",
description="允許 SSH 的主機 IP 清單(逗號分隔)",
)
# MCP Phase 3: ArgoCD MCP Server (2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — ArgoCD actually runs on host 121 (125 is the old location)
# ConfigMap 04-configmap.yaml already says 121 (T0 G3 drift was known); this change formally aligns the default
ARGOCD_URL: str = Field(
default="https://192.168.0.121:30443",
description="ArgoCD API Server URLK3s NodePort HTTPS",
)
# Empty by default; per the description the token is obtained from a K8s Secret.
ARGOCD_API_TOKEN: str = Field(
default="",
description="ArgoCD API Token從 K8s Secret 取得)",
)
# Enables the ArgoCD MCP provider (on by default); the description notes it needs ARGOCD_API_TOKEN.
ARGOCD_MCP_ENABLED: bool = Field(
default=True,
description="啟用 ArgoCD MCP Provider需 ARGOCD_API_TOKEN",
)
# MCP Phase 3: Sentry MCP Server (2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# Enables the Sentry MCP provider (on by default); the description notes it needs SENTRY_AUTH_TOKEN.
SENTRY_MCP_ENABLED: bool = Field(
default=True,
description="啟用 Sentry MCP Provider需 SENTRY_AUTH_TOKEN",
)
# ==========================================================================
# Phase 13.2: Grafana MCP Tool (#83)
# ==========================================================================
GRAFANA_URL: str = Field(
default="http://192.168.0.188:3000",
description="Grafana server URL",
)
# Empty by default; used as a Bearer token according to the description.
GRAFANA_API_KEY: str = Field(
default="",
description="Grafana API key for authentication (Bearer token)",
)
# ==========================================================================
# Computed Properties
# ==========================================================================
@property
def is_production(self) -> bool:
    """Report whether ``ENVIRONMENT`` is set to ``"prod"``."""
    current_env = self.ENVIRONMENT
    return current_env == "prod"
@property
def four_hosts(self) -> dict[str, str]:
    """Return the role → IP address mapping of the four-host architecture.

    The mapping is a hard-coded reference table; it does not read any
    setting from the instance.
    """
    role_to_ip = (
        ("devops", "192.168.0.110"),  # Harbor, GH Runner
        ("security", "192.168.0.112"),  # Kali Scanner
        ("k3s_master", "192.168.0.120"),  # K3s Master
        ("ai_web", "192.168.0.188"),  # Nginx, Postgres, Redis, SignOz
    )
    return dict(role_to_ip)
@lru_cache
def get_settings() -> Settings:
    """Build the ``Settings`` object once and reuse it on later calls.

    ``lru_cache`` memoizes the zero-argument call, so every caller receives
    the same instance.
    """
    instance = Settings()
    return instance
# Module-level singleton for direct import; created at import time through
# the cached get_settings(), so both access paths share one instance.
settings = get_settings()