Compare commits

...

7 Commits

Author SHA1 Message Date
Your Name
2d37383fc6 fix(monitoring): fix false positive NoAlertsReceived2Hours by filtering only alertmanager source 2026-05-28 15:33:17 +08:00
Your Name
3779f6f1e0 fix(metrics): 串入飛輪指標到 /metrics 主端點,修復 FlywheelExecutionRateMissing 死告警
INC-20260507-99ADF2 根因(feedback_full_chain_first_then_fix.md 全鏈分析):

【鏈路斷點】規則層(5/3 加)vs 指標層(5/6 改)vs scrape 層(從沒同步)
- 577250a6(5/3)「反消音化」commit 加了 FlywheelExecutionRateMissing
  rule,要求 110 Prom scrape 到 awoooi_flywheel_execution_success_rate;
- a2c4b3d4(5/6)Codex 改 FlywheelStatsService 用 auto_repair_executions
  作 source of truth(24h 樣本 1-9 筆回 None 給 W-3b watchdog 接管);
- 但 awoooi_flywheel_* 指標自始至終只在 /api/v1/stats/flywheel/metrics
  暴露,110 Prom awoooi-api job 抓的是 /metrics → absent() 永遠 1
  → 自 2026-05-06T04:14 UTC 起 firing 26h+ 屬 dead alert

【修法】只動 awoooi-api 一處,不碰 Codex 設計、不碰 110 Prom 配置:
- main.py /metrics endpoint 改 async,在 generate_latest() 後串入
  FlywheelStatsService.compute() → to_prometheus_lines()。
- 既有 awoooi-api scrape job 自動拿到飛輪指標。
- 完全保留 Codex a2c4b3d4 設計:1-9 筆回 None 讓 W-3b watchdog 雙保險。

【不碰的部分】
- flywheel_stats_service.py 不動:Codex 5/6 LOGBOOK 已明確說明
  「Redis playbook counter 失準 → 用 auto_repair_executions 為唯一信任源」,
  1-9 筆 return None 是配合 ai_slo_watchdog_job W-3b grace+30min 設計的
  反消音化雙保險,不是 bug。

驗證計畫(部署後):
1. curl /metrics | grep awoooi_flywheel  → 看到飛輪指標
2. Prom query awoooi_flywheel_execution_success_rate  → 非空
3. ALERTS{alertname="FlywheelExecutionRateMissing"}  → resolved
4. 30 分鐘觀察 Telegram 不再收 INC-20260507-99ADF2

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 15:32:47 +08:00
Your Name
c38227e945 fix(ai): remove 188 ollama provider 2026-05-06 14:33:16 +08:00
Your Name
1b4a6c1e8c fix(awooop): align console with flywheel execution metrics 2026-05-06 00:44:53 +08:00
Your Name
894174da5b fix(ops): harden cold-start schedule recovery 2026-05-05 22:14:54 +08:00
Your Name
10cd9fc025 fix(openclaw): gate alert cloud fallback behind flag 2026-05-05 20:53:12 +08:00
Your Name
8161ccf83f fix(ops): persist host resource guardrails 2026-05-05 16:13:02 +08:00
69 changed files with 3272 additions and 541 deletions

View File

@@ -108,7 +108,9 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
# worker and its local kubeconfig points at 127.0.0.1:6443.
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -138,10 +140,10 @@ jobs:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
run: |
cat k8s/awoooi-dev/02-configmap.yaml | \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

View File

@@ -406,8 +406,11 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl must run on the 120 control-plane.
# 121 is a worker after cold-start recovery; its kubeconfig points at
# 127.0.0.1:6443 and fails ADR-035 secret patching.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -634,19 +637,21 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
# 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
# control-plane, not 121 worker.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ───
cat k8s/awoooi-prod/04-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ ConfigMap 已更新"
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ Service Registry ConfigMap 已更新"
@@ -688,7 +693,7 @@ jobs:
}
# ─── Step 4: 等待 ArgoCD sync + rollout ───
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -814,7 +819,7 @@ jobs:
- name: Alert Chain Smoke Test
id: alert_chain_smoke
run: |
# 2026-04-05 Claude Code: 使用真實 API 地址192.168.0.121:32334 NodePort
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
if docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
@@ -824,7 +829,7 @@ jobs:
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
else
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT

View File

@@ -6,8 +6,9 @@
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
--
-- 影響範圍:
-- 1. rag_chunks.embedding vector(768) → vector(1024)
-- 2. playbook_embeddings.embedding vector(768) → vector(1024)
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
-- 2. rag_chunks.embedding vector(768) → vector(1024)
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
--
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
@@ -21,7 +22,24 @@
BEGIN;
-- 1. rag_chunks清空向量資料,變更欄位維度
-- 1. knowledge_entries備份舊向量並清空,變更欄位維度
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
SELECT
id,
embedding::text AS embedding_768,
NOW() AS backed_up_at
FROM knowledge_entries
WHERE embedding IS NOT NULL;
ALTER TABLE knowledge_entries
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
COMMENT ON COLUMN knowledge_entries.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
-- 2. rag_chunks清空向量資料變更欄位維度
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
-- 2. playbook_embeddings清空向量資料變更欄位維度
-- 3. playbook_embeddings清空向量資料變更欄位維度
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
ALTER TABLE playbook_embeddings
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
-- 4. 驗證遷移結果
DO $$
DECLARE
v_km_dim integer;
v_rag_dim integer;
v_pb_dim integer;
BEGIN
SELECT atttypmod INTO v_km_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'knowledge_entries' AND attname = 'embedding';
SELECT atttypmod INTO v_rag_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
@@ -74,15 +98,18 @@ BEGIN
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'playbook_embeddings' AND attname = 'embedding';
-- atttypmod for vector(1024) = 1024 + 1 = 1025
IF v_rag_dim != 1025 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1025, got %', v_rag_dim;
-- pgvector atttypmod stores the configured dimension.
IF v_km_dim != 1024 THEN
RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗expected 1024, got %', v_km_dim;
END IF;
IF v_pb_dim != 1025 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1025, got %', v_pb_dim;
IF v_rag_dim != 1024 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1024, got %', v_rag_dim;
END IF;
IF v_pb_dim != 1024 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1024, got %', v_pb_dim;
END IF;
RAISE NOTICE '✅ embedding 遷移驗證通過rag_chunksplaybook_embeddings 均為 vector(1024)';
RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunksplaybook_embeddings 均為 vector(1024)';
END $$;
COMMIT;

View File

@@ -11,7 +11,7 @@ Endpoints:
Components Checked:
- PostgreSQL (192.168.0.188:5432)
- Redis (192.168.0.188:6380)
- Ollama (192.168.0.188:11434)
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
- OpenClaw (192.168.0.188:8089)
- SigNoz (192.168.0.188:3301)
"""

View File

@@ -108,8 +108,9 @@ async def list_runs(
)
async def list_approvals(
project_id: str | None = Query(None, description="租戶 ID可選"),
run_id: str | None = Query(None, description="Run ID可選M8 詳情頁查單筆)"),
) -> dict[str, Any]:
return await list_approvals_svc(project_id=project_id)
return await list_approvals_svc(project_id=project_id, run_id=run_id)
@router.post(

View File

@@ -145,7 +145,7 @@ class Settings(BaseSettings):
# ==========================================================================
# ADR-104: LLM Playbook Generator
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
# 成本護欄:實作層只走 local providerOllama 111 → Ollama 188),不新增雲端 fallback。
# 成本護欄:實作層只走 local providerGCP-A → GCP-B → 111),不新增雲端 fallback。
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
# ==========================================================================
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
@@ -504,6 +504,22 @@ class Settings(BaseSettings):
"unexpected cloud spend from Gitea push/PR alerts."
),
)
ALERT_AI_ALLOW_CLOUD_FALLBACK: bool = Field(
default=True,
description=(
"Allow incident/alert OpenClaw analysis to use cloud fallback "
"providers after the GCP-A/GCP-B/111 Ollama lane is exhausted. "
"Default true so Gemini can act as the final backup, after the "
"ordered Ollama lane is exhausted."
),
)
ALERT_AI_ENFORCE_OLLAMA_FIRST: bool = Field(
default=True,
description=(
"Force incident/alert OpenClaw analysis to try GCP-A, then GCP-B, "
"then local 111 before cloud backup providers such as Gemini."
),
)
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
NVIDIA_API_KEY: str = Field(
default="",
@@ -855,7 +871,7 @@ class Settings(BaseSettings):
# ==========================================================================
# MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — 188 是 Ollama HubPrometheus 實際在 110
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — Prometheus 實際在 110
# ConfigMap 04-configmap.yaml 也是 110;governance_agent / SLO check 連 188 會 timeout
# 此 drift 是 SPF-4 (governance_agent silently fail) 根因之一
PROMETHEUS_URL: str = Field(
@@ -929,7 +945,7 @@ class Settings(BaseSettings):
"devops": "192.168.0.110", # Harbor, GH Runner
"security": "192.168.0.112", # Kali Scanner
"k3s_master": "192.168.0.120", # K3s Master
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, Ollama
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, SignOz
}

View File

@@ -479,7 +479,7 @@ async def _collect_all_k8s_assets() -> tuple[list[dict[str, Any]], list[dict[str
# 6. Prometheus targets — 補齊 host-install services (110/112/188/125 等非 K8s)
# Gap 1 修補 (2026-04-19 audit): 原本 asset_inventory 只涵蓋 K8s,
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis/Ollama host-install 全漏
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis host-install 全漏
# 用 Prometheus /api/v1/targets 自動發現全節點服務
try:
prom_assets, host_relationships = await _collect_prometheus_targets()

View File

@@ -172,7 +172,7 @@ _LLM_FORECAST_PROMPT = """你是 AWOOOI 容量規劃專家。以下 host 過去
{findings_json}
## 當前主機環境資訊
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/Ollama/MinIO)
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/MinIO)
- 判斷請考慮: 該主機上跑什麼服務、常見瓶頸模式
## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)

View File

@@ -80,6 +80,7 @@ from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 fe
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.services.flywheel_stats_service import get_flywheel_stats_service
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry
@@ -683,7 +684,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
logger.warning("ollama_failover_system_start_failed", error=str(e))
# 2026-04-27 P3.2.2 by Claude — AI Provider 版本追蹤(每 1 小時)
# 探測 5 Providerollama/ollama_188/gemini/claude/openclaw_nemo版本
# 探測 5 Providerollama/ollama_local/gemini/claude/openclaw_nemo版本
# 寫入 ai_provider_version_history版本變更時 log warningP3.2.3 alerter 後續整合
try:
async def _run_model_version_tracker_loop() -> None:
@@ -1005,10 +1006,17 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Prometheus metrics endpoint for alerting.

    Returns the output of ``generate_latest()`` followed by the flywheel
    metrics produced by ``get_flywheel_stats_service().compute()`` and
    rendered with ``to_prometheus_lines()``. If the flywheel part fails,
    the base metrics are still returned and the failure is logged.
    """
    content = generate_latest().decode("utf-8")
    # 2026-05-07 ogt + Claude Sonnet 4.6 — INC-20260507-99ADF2 fix.
    # The flywheel metrics (awoooi_flywheel_*) were previously exposed only at
    # /api/v1/stats/flywheel/metrics, so the Prometheus awoooi-api job that
    # scrapes /metrics never saw them and FlywheelExecutionRateMissing kept
    # firing. Appending them here lets the existing scrape job pick them up
    # without adding a new job.
    try:
        flywheel_metrics = await get_flywheel_stats_service().compute()
        content += flywheel_metrics.to_prometheus_lines()
    except Exception as e:
        # Best effort: a flywheel failure must not break the whole scrape,
        # but record why it failed so the gap can be diagnosed.
        logger.warning("prometheus_metrics_flywheel_error", error=str(e))
    return Response(content=content, media_type=CONTENT_TYPE_LATEST)
# =============================================================================

View File

@@ -29,7 +29,7 @@ from __future__ import annotations
from prometheus_client import Histogram
# Buckets 對齊 NIM 實測分佈2-27s並覆蓋三段 timeout 30/20/15s 邊界
# 低端0.5-5s快速路徑Ollama 188 本地
# 低端0.5-5s快速路徑Ollama provider pool
# 中端5-20sNIM + Gemini fallback
# 高端20-60s超時 / 慢速 Provider
_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0]

View File

@@ -60,13 +60,17 @@ class MetricsDBRepository(IMetricsRepository):
cutoff = datetime.now(UTC) - timedelta(hours=hours)
# Query: 統計 executed vs total (approved + executed + execution_failed)
# 2026-05-06 ogt + Codex:
# approval_records.status 目前實際寫入的是大寫 enum
# (APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED)。舊查詢只看
# lowercase executed導致 AI Success 在報表層永遠趨近 0。
query = text("""
SELECT
COUNT(CASE WHEN status = 'executed' THEN 1 END) as executed_count,
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) as executed_count,
COUNT(*) as total_count
FROM approval_records
WHERE created_at >= :cutoff
AND status IN ('approved', 'executed', 'execution_failed')
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
""")
result = await session.execute(query, {"cutoff": cutoff})
@@ -127,11 +131,11 @@ class MetricsDBRepository(IMetricsRepository):
trend_query = text("""
SELECT
date_trunc('hour', created_at) as hour_bucket,
COUNT(CASE WHEN status = 'executed' THEN 1 END) * 100.0 /
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) * 100.0 /
NULLIF(COUNT(*), 0) as hourly_rate
FROM approval_records
WHERE created_at >= :cutoff
AND status IN ('approved', 'executed', 'execution_failed')
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
GROUP BY hour_bucket
ORDER BY hour_bucket DESC
LIMIT :limit

View File

@@ -104,7 +104,7 @@ async def get_agent_thinking(
) -> StreamingResponse:
"""
OpenClaw 思考軌跡 (SSE 串流)
Phase 1.2: 真實串接 Ollama at 192.168.0.188:11434
Phase 1.2: 真實串接設定中的 Ollama provider pool
"""
async def generate_thinking_stream():

View File

@@ -1,10 +1,10 @@
"""
Ollama Provider - Phase 24 ADR-052
====================================
本地 LLM 推理 (192.168.0.188 VMware VM, CPU-only)
本地 / 私有 LLM 推理 Provider。
搬移自: openclaw.py _call_ollama (L349-409)
特性: 免費、隱私安全 (local)、但 CPU 慢 (~97s/30tokens for qwen2.5:7b)
特性: 免費、隱私安全 (local)、可依 ADR-110 指向 GCP-A/GCP-B/111。
2026-04-02 ogt: Phase 24-A 從 openclaw.py 抽出
"""
@@ -268,33 +268,27 @@ class OllamaProvider:
self._http_client = None
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — OLLAMA_188 provider 註冊
class Ollama188Provider(OllamaProvider):
# 2026-05-06 Codex — 188 不再作為 Ollama Provider本地備援統一命名為 ollama_local。
class OllamaLocalProvider(OllamaProvider):
"""
Ollama 188 CPU-only 備援 Provider
Ollama Local fallback Provider
繼承 OllamaProvider使用 OLLAMA_FALLBACK_URL192.168.0.188:11434
作為推理端點,模型預設 OLLAMA_HEALTH_CHECK_MODELqwen2.5:7b-instruct
B1 修復:原本 _init_registry 未登錄此 provider導致
executor.execute() 遇到 "ollama_188" → not_registered → 跳過,
188 從未被打到。此類別補全登錄鏈路。
2026-04-26 Wave5 B1-fix by Claude Engineer-A4
使用 OLLAMA_FALLBACK_URL 作為本地最後防線端點。
ADR-110 目前設定為 110 nginx proxy → 111 Ollama188 不得再作為 Ollama provider
"""
@property
def name(self) -> str:
return "ollama_188"
return "ollama_local"
@property
def is_enabled(self) -> bool:
import os
# 優先查 ENABLE_OLLAMA_188;若未設定(預設 true則看 OLLAMA_FALLBACK_URL 是否有值
env_override = os.getenv("ENABLE_OLLAMA_188", "true").lower() == "true"
# 優先查 ENABLE_OLLAMA_LOCAL;若未設定(預設 true則看 OLLAMA_FALLBACK_URL 是否有值
env_override = os.getenv("ENABLE_OLLAMA_LOCAL", "true").lower() == "true"
if not env_override:
return False
# OLLAMA_FALLBACK_URL 空字串 → 未設定 188 節點 → 停用
# OLLAMA_FALLBACK_URL 空字串 → 未設定本地節點 → 停用
return bool(getattr(settings, "OLLAMA_FALLBACK_URL", ""))
def _endpoint_url(self) -> str:
@@ -319,18 +313,18 @@ class Ollama188Provider(OllamaProvider):
client = await self._get_client()
registry = get_model_registry()
# 嘗試取 ollama_188 專屬設定fallback 到 ollama 預設
# 嘗試取本地 fallback 專屬設定fallback 到 ollama 預設
try:
model_name = registry.get_model("ollama_188", "rca")
model_name = registry.get_model("ollama_local", "rca")
except Exception:
model_name = getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")
try:
options = registry.get_provider_options("ollama_188")
options = registry.get_provider_options("ollama_local")
except Exception:
options = registry.get_provider_options("ollama")
# CPU-only 備援:固定使用較長 timeoutCPU 推理慢)
# 本地備援:固定使用較長 timeout,避免 111 模型載入時被過早判死。
task_type = (context or {}).get("task_type", "")
if task_type in ("diagnose", "force_local"):
read_timeout = float(getattr(settings, "OLLAMA_DIAGNOSE_TIMEOUT_SECONDS", 200))
@@ -359,7 +353,7 @@ class Ollama188Provider(OllamaProvider):
latency = (time.perf_counter() - start) * 1000
logger.info(
"ollama_188_provider_success",
"ollama_local_provider_success",
response_length=len(result),
tokens=tokens,
latency_ms=round(latency, 1),
@@ -375,12 +369,12 @@ class Ollama188Provider(OllamaProvider):
except httpx.TimeoutException as e:
latency = (time.perf_counter() - start) * 1000
logger.warning("ollama_188_provider_timeout", error=str(e), latency_ms=round(latency, 1))
logger.warning("ollama_local_provider_timeout", error=str(e), latency_ms=round(latency, 1))
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=f"Timeout: {e}")
except Exception as e:
latency = (time.perf_counter() - start) * 1000
logger.warning("ollama_188_provider_failed", error=str(e), latency_ms=round(latency, 1))
logger.warning("ollama_local_provider_failed", error=str(e), latency_ms=round(latency, 1))
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=str(e))
async def health_check(self) -> bool:

View File

@@ -73,10 +73,6 @@ class AIProviderEnum(str, Enum):
"""AI 提供者"""
OLLAMA = "ollama"
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2
# P1.1b OllamaFailoverManager 使用 provider_name="ollama_188"
# 但 AIProviderEnum 沒有此值 → P1.2 整合時 lookup 失敗
OLLAMA_188 = "ollama_188" # 188 CPU-only 備援節點P1.1b
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災
# OllamaFailoverManager 回傳 provider_name="ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"
# 缺少 enum 值 → AIProviderEnum(primary_str) 拋 ValueError → fallback chain 清空 → 直跳 Gemini
@@ -96,8 +92,6 @@ class AIProviderEnum(str, Enum):
# Provider 對應延遲預算 (ms)
PROVIDER_LATENCY_BUDGET: dict[AIProviderEnum, int] = {
AIProviderEnum.OLLAMA: 60000, # 本地,允許較長處理時間
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2 — 188 CPU-only 推理較慢
AIProviderEnum.OLLAMA_188: 120000, # 120s budget for CPU inference
# 2026-05-04 ogt: ADR-110 GCP 三層容災 — GCP NVMe SSD 推理快60s 足夠
AIProviderEnum.OLLAMA_GCP_A: 60000,
AIProviderEnum.OLLAMA_GCP_B: 60000,
@@ -432,7 +426,7 @@ class AIRouter:
model = failover_result.primary.model
reason = f"{reason} [failover→{primary_str}]"
except ValueError:
# provider_name 無法對應已知 enum理論上不應發生OLLAMA_188 已加)
# provider_name 無法對應已知 enum;避免未知 provider 靜默進入執行層。
logger.warning(
"ai_router_unknown_failover_provider",
provider=primary_str,
@@ -1078,11 +1072,46 @@ class AIRouterExecutor:
cached = await redis.get(cache_key)
if cached:
data = _json.loads(cached)
cached_provider = data.get("provider", "cache")
provider_allowed = cached_provider in provider_order
ollama_first_required = (
bool(context)
and any(
key in context
for key in (
"alert_type",
"alertname",
"alert_name",
"fingerprint",
"incident_id",
"severity",
"target_resource",
)
)
and bool(provider_order)
and provider_order[0].startswith("ollama")
)
if (
cached_provider == "ollama"
and any(provider.startswith("ollama") for provider in provider_order)
):
provider_allowed = True
if ollama_first_required and not cached_provider.startswith("ollama"):
provider_allowed = False
if not provider_allowed:
logger.info(
"ai_router_cache_provider_mismatch_skip",
cache_key=cache_key[:30],
cached_provider=cached_provider,
provider_order=provider_order,
ollama_first_required=ollama_first_required,
)
raise ValueError("cached provider not allowed by current provider_order")
logger.info("ai_router_cache_hit", cache_key=cache_key[:30])
return AIResult(
raw_response=data.get("response", ""),
success=True,
provider=data.get("provider", "cache"),
provider=cached_provider,
from_cache=True,
)
except Exception as e:
@@ -1306,7 +1335,7 @@ def _init_registry() -> AIProviderRegistry:
"""初始化 Provider Registry (首次呼叫時自動註冊所有 Provider)"""
from src.services.ai_providers.ollama import (
OllamaProvider,
Ollama188Provider,
OllamaLocalProvider,
OllamaGcpBProvider, # 2026-05-04 ADR-110 GCP-B
)
from src.services.ai_providers.gemini import GeminiProvider
@@ -1327,8 +1356,9 @@ def _init_registry() -> AIProviderRegistry:
from src.services.ai_providers.nemotron import NemotronProvider
registry.register(NemotronProvider())
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — 補登 OLLAMA_188 備援 provider
ollama_local = Ollama188Provider()
# 2026-05-06 Codex: 188 不再作為 Ollama provider
# Local fallback 統一命名為 ollama_local端點由 OLLAMA_FALLBACK_URL 指向 111/110 proxy。
ollama_local = OllamaLocalProvider()
registry.register(ollama_local)
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災修復
@@ -1337,7 +1367,7 @@ def _init_registry() -> AIProviderRegistry:
# 修復:
# "ollama_gcp_a" alias → 同 OllamaProviderOLLAMA_URL = GCP-A
# "ollama_gcp_b" → 新 OllamaGcpBProviderOLLAMA_SECONDARY_URL = GCP-B
# "ollama_local" alias → 同 Ollama188ProviderOLLAMA_FALLBACK_URL = 111
# "ollama_local" OllamaLocalProviderOLLAMA_FALLBACK_URL = 111 / 110:11437
registry._providers["ollama_gcp_a"] = ollama_gcp_a
registry.register(OllamaGcpBProvider())
registry._providers["ollama_local"] = ollama_local

View File

@@ -457,6 +457,8 @@ class AutoRepairService:
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
self._record_auto_repair_metric(playbook, success=True)
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
try:
@@ -630,6 +632,8 @@ class AutoRepairService:
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
self._record_auto_repair_metric(playbook, success=False)
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
try:
@@ -700,6 +704,35 @@ class AutoRepairService:
return max_risk
def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
    """Record an actual auto-repair execution as a Prometheus metric.

    2026-05-06 ogt + Codex: the DB already has auto_repair_executions, but
    core.metrics.record_auto_repair() had no caller for a long time, so
    governance / heartbeat views based on Prometheus looked as if the
    flywheel did nothing. The label uses action_type rather than
    playbook_id to avoid high cardinality.

    Args:
        playbook: Playbook whose first repair step supplies the action label
            ("unknown" when there is no usable first step).
        success: Whether the repair execution succeeded.

    Any exception raised while recording is logged as a warning and not
    propagated.
    """
    try:
        from src.core.metrics import record_auto_repair

        # Label by the first repair step's action type.
        leading_step = None
        if playbook.repair_steps:
            leading_step = playbook.repair_steps[0]
        if leading_step:
            action_label = leading_step.action_type.value
        else:
            action_label = "unknown"

        # Map the playbook's maximum risk level to a numeric tier; levels
        # not in the table fall back to 0.
        highest_risk = self._get_max_risk_level(playbook)
        risk_tiers = {
            RiskLevel.LOW: 1,
            RiskLevel.MEDIUM: 2,
            RiskLevel.HIGH: 3,
            RiskLevel.CRITICAL: 4,
        }
        tier_value = risk_tiers.get(highest_risk, 0)

        record_auto_repair(action=action_label, tier=tier_value, success=success)
    except Exception as exc:
        logger.warning(
            "auto_repair_metric_record_failed",
            playbook_id=playbook.playbook_id,
            success=success,
            error=str(exc),
        )
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""

View File

@@ -607,7 +607,7 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) -
"""
MCP Phase 4a: NemoClaw second opinion — 信心 < 0.7 時觸發
============================================================
用 deepseek-r1:14b (Ollama 188) 對同一份資料做獨立推理,
用 deepseek-r1:14b (設定的 Ollama primary) 對同一份資料做獨立推理,
輸出純文字 advisory_note不執行任何操作。
2026-04-11 Claude Sonnet 4.6 Asia/Taipei
@@ -666,7 +666,7 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
MCP Phase 4c: Playbook 無命中時,自動生成 AI 草稿 Playbook 寫入 KM
=====================================================================
- 僅在 KM 中不存在同 alertname 的 Playbook 時觸發(避免重複)
- 用 qwen2.5:7b-instruct (Ollama 188) 生成結構化 Playbook 草稿
- 用 qwen2.5:7b-instruct (設定的 Ollama primary) 生成結構化 Playbook 草稿
- 寫入 KnowledgeEntrystatus=DRAFT需人工審核後升為 APPROVED
- 寫入 AlertOperationLog PLAYBOOK_DRAFT_CREATED 事件

View File

@@ -237,6 +237,31 @@ class FlywheelStatsService:
except (json.JSONDecodeError, KeyError):
continue
# 2026-05-06 ogt + Codex:
# 執行成功率的 source of truth 是 auto_repair_executions。
# Redis playbook success_count/failure_count 會因回寫鏈路中斷而落後,
# 造成 governance / heartbeat 判定「飛輪沒有執行」。
try:
async with get_db_context() as db:
row = await db.execute(
text("""
SELECT
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
COUNT(*) AS total
FROM auto_repair_executions
WHERE created_at >= NOW() - interval '24 hours'
""")
)
repair_stats = row.one()
db_total_exec = int(repair_stats.total or 0)
if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
db_total_success = int(repair_stats.success or 0)
return count, db_total_success / db_total_exec
if db_total_exec > 0:
return count, None
except Exception:
logger.warning("flywheel_stats_auto_repair_execution_query_failed")
if total_exec < FLYWHEEL_MIN_SAMPLE:
# 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷
return count, None

View File

@@ -15,7 +15,7 @@ HeartbeatReportService — ADR-073 心跳監控重構
import asyncio
import html
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from datetime import datetime
from typing import Optional
import httpx
@@ -420,8 +420,8 @@ class HeartbeatReportService:
try:
# KM 向量化率DB 查詢)
from src.db.base import get_db_context
from src.db.models import IncidentRecord, KnowledgeEntryRecord
from sqlalchemy import func, select
from src.db.models import KnowledgeEntryRecord
from sqlalchemy import func, select, text as sa_text
async with get_db_context() as db:
# KM 總數
km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord))
@@ -436,20 +436,22 @@ class HeartbeatReportService:
stats.km_vectorized = vec_result.scalar() or 0
# 24h 修復統計
since = datetime.utcnow() - timedelta(hours=24)
outcomes = await db.execute(
select(IncidentRecord.outcome).where(
IncidentRecord.created_at >= since,
IncidentRecord.outcome.isnot(None),
)
)
outcome_list = [r[0] for r in outcomes.all() if r[0]]
stats.attempt_24h = len(outcome_list)
stats.success_24h = sum(
1 for o in outcome_list
if isinstance(o, dict) and o.get("execution_success")
or isinstance(o, str) and "success" in o.lower()
# 2026-05-06 ogt + Codex:
# incidents.outcome 已不是自動修復 source of truth。實際執行紀錄
# 寫在 auto_repair_executions舊查詢會讓心跳報告顯示 0/15
# 造成「全系統正常」但飛輪 KPI 失真的假象。
repair_result = await db.execute(
sa_text("""
SELECT
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
COUNT(*) AS total
FROM auto_repair_executions
WHERE created_at >= NOW() - interval '24 hours'
""")
)
repair_row = repair_result.one()
stats.success_24h = int(repair_row.success or 0)
stats.attempt_24h = int(repair_row.total or 0)
# 最後學習活動
last_km = await db.scalar(
@@ -865,9 +867,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("☸️ <b>Kubernetes Pods</b>")
for i, pod in enumerate(report.pods):
prefix = "└─" if i == len(report.pods) - 1 else "├─"
ready_icon = "" if pod.ready else ""
ready_icon = "" if pod.ready or pod.status in ("Succeeded", "Completed") else ""
restart_str = f" (重啟×{pod.restarts})" if pod.restarts > 0 else ""
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}")
status_str = "" if pod.ready else f" <code>{html.escape(pod.status)}</code>"
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}{status_str}")
# --- Scanner 狀態 ---
if report.scanners.last_runs:

View File

@@ -7,7 +7,7 @@ Hosts:
- 192.168.0.110: DevOps 金庫 (Harbor, GH Runner)
- 192.168.0.112: Kali Security (Scanner API)
- 192.168.0.120: K3s Master (awoooi-prod namespace)
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, Ollama, OpenClaw, SigNoz)
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, OpenClaw, SigNoz)
Features:
- asyncio.gather for parallel fetching

View File

@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
# 台北時區
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
# Prometheus 端點
PROMETHEUS_URL = "http://192.168.0.121:30090"
# Prometheus endpoint.
#
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
# kube-state-metrics 查詢
PROM_QUERIES = {
@@ -215,7 +219,7 @@ class K3sMonitorService:
# 發送訊息
formatted = status.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("k3s_daily_report_sent", date=status.report_date)

View File

@@ -5,7 +5,7 @@ AI Provider 版本探測 — 為每個 Provider 提供 get_version()
Provider:
- ollama : 34.143.170.20 GCP-A Ollama (primary) — 2026-05-03 ogt: ADR-110 GCP-A Primary
- ollama_188 : 192.168.0.188 Ollama (fallback)
- ollama_local : 192.168.0.111 / 110 proxy Ollama (local fallback)
- gemini : Google Gemini API (版本 = model name)
- claude : Anthropic Claude (版本 = model name)
- openclaw_nemo : OpenClaw NemoTron (版本 = OPENCLAW_DEFAULT_MODEL)
@@ -31,7 +31,7 @@ TAIPEI_TZ = timezone(timedelta(hours=8))
class ProviderVersionInfo:
"""AI Provider 版本快照"""
provider: str # "ollama" / "ollama_188" / "gemini" / "claude" / "openclaw_nemo"
provider: str # "ollama" / "ollama_local" / "gemini" / "claude" / "openclaw_nemo"
model: str
version: str # version string 或 tagOllama 用 modified_at其他用 model name
digest: str | None = None # SHA256 digest僅 Ollama 有)
@@ -43,7 +43,7 @@ class ProviderVersionInfo:
# =============================================================================
async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
"""探測 OllamaGCP-A 或 188GET /api/tags 取 model digest + modified_at
"""探測 OllamaGCP-A/GCP-B 或本地 111GET /api/tags 取 model digest + modified_at
Args:
url: Ollama base URL例如 "http://34.143.170.20:11434"GCP-A Primary
@@ -58,15 +58,12 @@ async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
"""
import httpx
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 擴展 provider 判斷邏輯支援 GCP 三層容災
# 188 保留 ollama_188 命名CPU-only 主機,雖移出 routing chain 但仍可被 probe
# 2026-05-06 Codex: 188 不再作為 Ollama provider;local fallback 一律標示 ollama_local。
_GCP_OLLAMA_IPS = {"34.143.170.20", "34.21.145.224"}
if any(ip in url for ip in _GCP_OLLAMA_IPS):
provider_name = "ollama"
elif "192.168.0.111" in url:
elif "192.168.0.111" in url or "192.168.0.110:11437" in url:
provider_name = "ollama_local"
elif "192.168.0.188" in url:
provider_name = "ollama_188"
else:
provider_name = "ollama_remote"
@@ -179,7 +176,7 @@ async def probe_claude_version() -> ProviderVersionInfo:
async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
"""OpenClaw NemoTron版本字串從 settings.OPENCLAW_DEFAULT_MODEL 讀取
NemoTron 運行在 OpenClaw 188 節點(使用 Ollama 推理)
NemoTron 運行在 OpenClaw 節點
透過 OPENCLAW_URL /api/tags 探測,模型名稱即版本識別。
Returns:
@@ -195,18 +192,18 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
# OpenClaw 底層是 Ollama使用 OPENCLAW_URL 的 host:port 加上 Ollama port
# OPENCLAW_URL 是 8088OpenClaw APIOllama 通常在 11434
# 188 的 Ollama URL 若有設定則直接用 OLLAMA_FALLBACK_URL
ollama_188_url = settings.OLLAMA_FALLBACK_URL
if not ollama_188_url:
# OpenClaw 底層 tags 來源優先使用本地 fallback Ollama URL。
ollama_local_url = settings.OLLAMA_FALLBACK_URL
if not ollama_local_url:
# fallback從 OPENCLAW_URL host 構建 Ollama URL
from urllib.parse import urlparse
parsed = urlparse(settings.OPENCLAW_URL)
ollama_188_url = f"{parsed.scheme}://{parsed.hostname}:11434"
ollama_local_url = f"{parsed.scheme}://{parsed.hostname}:11434"
import httpx
async with httpx.AsyncClient(timeout=5.0) as client:
resp = await client.get(f"{ollama_188_url}/api/tags")
resp = await client.get(f"{ollama_local_url}/api/tags")
resp.raise_for_status()
models = resp.json().get("models", [])
@@ -220,7 +217,7 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
)
# model 不在清單時version 用 model namedigest=None
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_188_url)
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_local_url)
return ProviderVersionInfo(
provider="openclaw_nemo",
model=model,
@@ -257,7 +254,7 @@ async def probe_all_providers() -> list[ProviderVersionInfo]:
raw = await asyncio.gather(*tasks, return_exceptions=True)
results: list[ProviderVersionInfo] = []
provider_labels = ["ollama", "ollama_188", "gemini", "claude", "openclaw_nemo"]
provider_labels = ["ollama", "ollama_local", "gemini", "claude", "openclaw_nemo"]
for label, outcome in zip(provider_labels, raw, strict=True):
if isinstance(outcome, ProviderVersionInfo):
results.append(outcome)

View File

@@ -5,7 +5,7 @@ Phase 5: OpenClaw 實體化升級 (2026-03-21)
統帥校正: SignOz 為唯一全能視力中心
Features:
- 真實 LLM SDK 整合 (Ollama → Gemini → Claude)
- 真實 LLM SDK 整合 (告警預設 Ollama GCP-A → GCP-B → 111 → Gemini)
- SignOz Gold Metrics 即時擷取 (P99/Error/RPS)
- AIOps Agent 專業人格 (K8s 維運 + SRE RCA 專精)
- 強制結構化 JSON 輸出 (符合 API 契約)
@@ -144,8 +144,8 @@ class OpenClawService:
"""
OpenClaw AI 決策服務 - True LLM + SignOz Integration
實作 AI_FALLBACK_ORDER 備援機制:
Ollama → Gemini → Claude → Mock
實作 AI_FALLBACK_ORDER 備援機制
告警/incident 上下文預設套用成本防線,只允許 Ollama GCP-A → GCP-B → 111。
新增 SignOz 整合:
- 自動擷取 Gold Metrics
@@ -176,6 +176,89 @@ class OpenClawService:
await self._http_client.aclose()
self._http_client = None
def _is_incident_alert_context(self, alert_context: dict | None) -> bool:
"""Return true when a request came from the alert/incident automation path."""
if not alert_context:
return False
alert_keys = {
"alert_type",
"alertname",
"alert_name",
"fingerprint",
"incident_id",
"severity",
"signals",
"target_resource",
}
return any(key in alert_context for key in alert_keys)
def _cloud_fallback_allowed_for_alert(self, alert_context: dict | None) -> bool:
    """Tell whether cloud providers may be used as a fallback for this request.

    Contexts that are not recognised as alert/incident contexts always allow
    cloud fallback. Alert contexts follow ``settings.ALERT_AI_ALLOW_CLOUD_FALLBACK``,
    which is treated as True when the attribute is absent.
    """
    if self._is_incident_alert_context(alert_context):
        allow_flag = getattr(settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
        return bool(allow_flag)
    return True
def _alert_enforces_ollama_first(self, alert_context: dict | None) -> bool:
    """Tell whether this request must try the ordered Ollama lane first.

    Only alert/incident contexts can enforce the lane; for those the answer
    follows ``settings.ALERT_AI_ENFORCE_OLLAMA_FIRST``, which is treated as
    True when the attribute is absent. The setting is not read for
    non-alert contexts.
    """
    if not self._is_incident_alert_context(alert_context):
        return False
    enforce_flag = getattr(settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    return bool(enforce_flag)
async def _resolve_alert_provider_order(
self,
task_type: str = "diagnose",
alert_context: dict | None = None,
cloud_provider_order: list[str] | None = None,
) -> list[str]:
"""Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
provider_order: list[str] = []
try:
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
provider_order = [
endpoint.provider_name
for endpoint in route.all_endpoints_in_order()
if endpoint.provider_name.startswith("ollama")
]
except Exception as route_error:
logger.warning(
"alert_ollama_route_lookup_failed",
error=str(route_error),
task_type=task_type,
)
if not provider_order:
provider_order = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
deduped: list[str] = []
for provider_name in provider_order:
if provider_name and provider_name not in deduped:
deduped.append(provider_name)
if not self._alert_enforces_ollama_first(alert_context):
return deduped
ollama_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
ordered_ollama = [
provider_name
for provider_name in deduped
if provider_name in ollama_order
]
ordered_ollama.sort(key=lambda provider_name: ollama_order[provider_name])
if not ordered_ollama:
ordered_ollama = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
if not self._cloud_fallback_allowed_for_alert(alert_context):
return ordered_ollama
cloud_candidates = cloud_provider_order or []
cloud_backup: list[str] = []
for provider_name in [*cloud_candidates, "gemini"]:
if provider_name == "gemini" and provider_name not in cloud_backup:
cloud_backup.append(provider_name)
return ordered_ollama + cloud_backup
# =========================================================================
# SignOz Integration
# =========================================================================
@@ -437,13 +520,13 @@ class OpenClawService:
# 完整移除時機: Phase 24 完整驗收後 (ADR-052 D11)
# =========================================================================
async def _call_ollama(self, prompt: str) -> tuple[str, bool]:
async def _call_ollama(self, prompt: str, *, ollama_only: bool = False) -> tuple[str, bool]:
"""
呼叫 Ollama (支援 JSON Mode)。
USE_AI_ROUTER=true 正常會走 AIRouterExecutor這裡是 legacy safety-net。
2026-05-05 Codex: safety-net 也必須遵守 ADR-110 三層 Ollama
路由,不能只打 OLLAMA_URL 後直接掉 Gemini。
路由,告警路徑預設只允許 GCP-A/GCP-B/111不能只打 OLLAMA_URL 後直接掉 Gemini。
"""
try:
client = await self._get_client()
@@ -484,6 +567,26 @@ class OpenClawService:
endpoints.append((provider_name, endpoint_url))
seen_urls.add(endpoint_url)
if ollama_only:
allowed_provider_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
endpoints = [
(provider_name, endpoint_url)
for provider_name, endpoint_url in endpoints
if provider_name in allowed_provider_order
]
endpoints.sort(key=lambda item: allowed_provider_order[item[0]])
if not endpoints:
endpoints = [
("ollama_gcp_a", settings.OLLAMA_URL),
("ollama_gcp_b", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
("ollama_local", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
]
endpoints = [
(provider_name, endpoint_url)
for provider_name, endpoint_url in endpoints
if endpoint_url
]
last_error = ""
for provider_name, endpoint_url in endpoints:
try:
@@ -973,7 +1076,11 @@ class OpenClawService:
try:
# 2026-04-02 ogt: C2 修復 — 呼叫 AIRouter.route() 智慧路由 (非靜態 order)
# D1 意圖分類路由、D7 隱私保護 (DIAGNOSE/CODE_REVIEW 強制 local) 生效
from src.services.ai_router import get_ai_router, get_ai_executor, IntentType
from src.services.ai_router import (
IntentType,
get_ai_executor,
get_ai_router,
)
router = get_ai_router()
executor = get_ai_executor()
@@ -987,7 +1094,10 @@ class OpenClawService:
if p.value != decision.selected_provider.value
]
try:
from src.services.ai_control import get_primary_provider, is_provider_disabled
from src.services.ai_control import (
get_primary_provider,
is_provider_disabled,
)
_primary = await get_primary_provider()
if _primary and _primary != decision.selected_provider.value:
# 把 primary 移到首位 (保留原始 fallback)
@@ -1003,6 +1113,20 @@ class OpenClawService:
except Exception as _e:
logger.warning("ai_control_override_failed", error=str(_e))
if self._alert_enforces_ollama_first(alert_context):
original_provider_order = list(provider_order)
provider_order = await self._resolve_alert_provider_order(
task_type=decision.intent.value if decision.intent else "diagnose",
alert_context=alert_context,
cloud_provider_order=original_provider_order,
)
logger.info(
"alert_ollama_first_provider_order",
original_provider_order=original_provider_order,
provider_order=provider_order,
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
)
# Step 3: D7 隱私 — CODE_REVIEW 強制 local
# 2026-04-15 ogt: DIAGNOSE 移除 require_localv4.3 決策NIM 為主力,無隱私問題)
# ai_router.py v4.3 已明確「NIM 從 Phase 22 起就是主力,無隱私問題」
@@ -1045,13 +1169,18 @@ class OpenClawService:
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
if _rule_id == "generic_fallback":
import asyncio
from src.services.alert_rule_engine import auto_generate_rule
try:
asyncio.create_task(auto_generate_rule(
alert_context or {},
ollama_url=settings.OLLAMA_URL,
model=settings.OPENCLAW_DEFAULT_MODEL,
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
gemini_api_key=(
getattr(settings, "GEMINI_API_KEY", "")
if self._cloud_fallback_allowed_for_alert(alert_context)
else ""
),
))
except Exception as _e:
logger.warning("auto_rule_trigger_failed", error=str(_e))
@@ -1086,7 +1215,18 @@ class OpenClawService:
from src.services.ai_rate_limiter import get_ai_rate_limiter
rate_limiter = get_ai_rate_limiter()
for provider in settings.AI_FALLBACK_ORDER:
legacy_provider_order = list(settings.AI_FALLBACK_ORDER)
if self._alert_enforces_ollama_first(alert_context):
legacy_provider_order = ["ollama"]
if self._cloud_fallback_allowed_for_alert(alert_context):
legacy_provider_order.append("gemini")
logger.info(
"legacy_alert_ollama_first_provider_order",
provider_order=legacy_provider_order,
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
)
for provider in legacy_provider_order:
# Rate Limit 檢查 (nvidia/gemini/claude 需檢查ollama 不限)
# 2026-03-30 ogt: 加入 nvidia (RPM=5 限制)
if provider in ("nvidia", "gemini", "claude"):
@@ -1109,7 +1249,10 @@ class OpenClawService:
cost_usd = 0.0
if provider == "ollama":
response, success = await self._call_ollama(prompt)
response, success = await self._call_ollama(
prompt,
ollama_only=self._alert_enforces_ollama_first(alert_context),
)
elif provider == "gemini":
response, success, total_tokens, cost_usd = await self._call_gemini(prompt)
elif provider == "nvidia":
@@ -1165,13 +1308,18 @@ class OpenClawService:
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
if _rule_id == "generic_fallback":
import asyncio
from src.services.alert_rule_engine import auto_generate_rule
try:
asyncio.create_task(auto_generate_rule(
alert_context or {},
ollama_url=settings.OLLAMA_URL,
model=settings.OPENCLAW_DEFAULT_MODEL,
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
gemini_api_key=(
getattr(settings, "GEMINI_API_KEY", "")
if self._cloud_fallback_allowed_for_alert(alert_context)
else ""
),
))
except Exception as _e:
logger.warning("auto_rule_trigger_failed", error=str(_e))
@@ -1218,14 +1366,14 @@ class OpenClawService:
except json.JSONDecodeError:
# 3. 啟發式修補: 如果結尾缺少括號,嘗試補齊
if candidate.startswith("{") and not candidate.endswith("}"):
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
try:
repaired = candidate + '"' * (i-1) + "}" * i
json.loads(repaired)
logger.info("json_repaired_heuristically", level=i)
return repaired
except:
continue
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
try:
repaired = candidate + '"' * (i - 1) + "}" * i
json.loads(repaired)
logger.info("json_repaired_heuristically", level=i)
return repaired
except json.JSONDecodeError:
continue
continue
# 4. 極端情況: 找出最後一個有效 key
@@ -1235,11 +1383,11 @@ class OpenClawService:
# 暴力去除非法尾綴 (如 \t\t...)
candidate = re.sub(r"[ \t\r\n]+$", "", candidate)
if not candidate.endswith("}"):
candidate += '"}' # 嘗試最簡單的閉合
candidate += '"}' # 嘗試最簡單的閉合
try:
json.loads(candidate)
return candidate
except:
except json.JSONDecodeError:
pass
return None
@@ -1791,7 +1939,7 @@ Focus on:
from src.services.ai_router import get_ai_registry
ai_registry = get_ai_registry()
provider = ai_registry.get("ollama") or ai_registry.get("ollama_188")
provider = ai_registry.get("ollama") or ai_registry.get("ollama_local")
if provider is None or not hasattr(provider, "analyze_with_tools"):
logger.warning(
"openclaw_agent_loop_shadow_skipped",
@@ -2200,6 +2348,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
}
"""
import asyncio
from src.services.nvidia_provider import get_nvidia_provider
nvidia = get_nvidia_provider()
@@ -2334,7 +2483,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
"latency_ms": latency_ms,
}
except asyncio.TimeoutError:
except TimeoutError:
latency_ms = (time.time() - start_time) * 1000
logger.error(
"nemotron_tool_call_timeout",
@@ -2528,6 +2677,7 @@ async def _fetch_k8s_inventory_for_openclaw(
"awoooi-api, awoooi-web, ..." 格式字串,失敗時返回 ""
"""
import asyncio as _asyncio
import structlog as _structlog
_logger = _structlog.get_logger(__name__)
try:
@@ -2542,7 +2692,7 @@ async def _fetch_k8s_inventory_for_openclaw(
)
try:
stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
except _asyncio.TimeoutError:
except TimeoutError:
proc.kill()
_logger.warning("k8s_inventory_timeout_openclaw", namespace=namespace)
return ""

View File

@@ -9,8 +9,6 @@ ADR-106AwoooP Agent Platform
from __future__ import annotations
import uuid
from datetime import datetime
from decimal import Decimal
from typing import Any
from uuid import UUID
@@ -153,8 +151,21 @@ async def list_runs(
# Approvals
# =============================================================================
async def list_approvals(project_id: str | None) -> dict[str, Any]:
"""列出所有 waiting_approval 狀態的 runs。"""
async def list_approvals(
project_id: str | None,
run_id: str | None = None,
) -> dict[str, Any]:
"""列出 waiting_approval runs可依 project_id 或 run_id 篩選。"""
run_uuid: UUID | None = None
if run_id:
try:
run_uuid = uuid.UUID(run_id)
except ValueError as exc:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"run_id 格式錯誤: {exc}",
) from exc
async with get_db_context("awoooi") as db:
stmt = (
select(AwoooPRunState)
@@ -163,6 +174,8 @@ async def list_approvals(project_id: str | None) -> dict[str, Any]:
)
if project_id is not None:
stmt = stmt.where(AwoooPRunState.project_id == project_id)
if run_uuid is not None:
stmt = stmt.where(AwoooPRunState.run_id == run_uuid)
count_stmt = select(func.count()).select_from(stmt.subquery())
total_result = await db.execute(count_stmt)

View File

@@ -4,7 +4,7 @@ LLM Playbook Generator - ADR-104 T1/T2/T6
從成功修復案例生成可治理的 Playbook 草稿。
設計重點:
- 只用 local provider 順序Ollama 111 -> Ollama 188),避免新增雲端成本。
- 只用 local/provider pool 順序GCP-A -> 111 local),避免新增雲端成本。
- LLM 產出必須經 Pydantic + action_parser 安全收斂。
- 不直接 APPROVED先 DRAFT/REVIEW再交治理 job 晉級。
"""
@@ -30,7 +30,6 @@ from src.models.playbook import (
RiskLevel,
SymptomPattern,
)
from src.services.action_parser import is_safe_kubectl_action
from src.services.action_parser import kubectl_safety_reason
logger = structlog.get_logger(__name__)
@@ -218,7 +217,7 @@ class LLMPlaybookGenerator:
executor = get_ai_executor()
result = await executor.execute(
prompt=prompt,
provider_order=["ollama", "ollama_188"],
provider_order=["ollama", "ollama_local"],
context=context,
cache_ttl=86400,
require_local=True,

View File

@@ -244,7 +244,7 @@ class WeeklyReportService:
# 發送訊息
formatted = report.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("weekly_report_sent", week=report.week_range)

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
import json
from typing import Any
import pytest
from src.services import ai_router as ai_router_module
from src.services.ai_providers.interfaces import AIResult
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor
class _FakeRedis:
def __init__(self, cached_provider: str) -> None:
self.cached_provider = cached_provider
self.set_calls: list[tuple[str, str, int | None]] = []
async def get(self, key: str) -> str:
return json.dumps({
"response": '{"provider":"stale"}',
"provider": self.cached_provider,
})
async def set(self, key: str, value: str, ex: int | None = None) -> None:
self.set_calls.append((key, value, ex))
class _FakeProvider:
    """Minimal provider double that counts calls and always succeeds."""

    # Static attributes exposed by this double.
    name = "ollama_gcp_a"
    privacy_level = "local"
    is_enabled = True
    capabilities = {"rca", "chat"}

    def __init__(self) -> None:
        # Number of times ``analyze`` has been awaited.
        self.calls = 0

    async def analyze(self, prompt: str, context: dict[str, Any] | None = None) -> AIResult:
        """Bump the call counter and return a canned successful result."""
        self.calls += 1
        fresh_result = AIResult(
            raw_response='{"provider":"fresh_ollama"}',
            success=True,
            provider=self.name,
        )
        return fresh_result
@pytest.mark.asyncio
async def test_executor_skips_cached_cloud_provider_when_ollama_lane_is_required(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A cache entry attributed to gemini must not be served for an alert request.

    The fake Redis always returns a gemini-attributed hit; the test expects
    the executor to call the Ollama provider instead and write a new entry.
    """
    redis_stub = _FakeRedis(cached_provider="gemini")
    ollama_stub = _FakeProvider()
    provider_registry = AIProviderRegistry()
    provider_registry.register(ollama_stub)

    monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
    monkeypatch.setattr("src.core.redis_client.get_redis", lambda: redis_stub)

    executor = AIRouterExecutor(provider_registry)
    result = await executor.execute(
        prompt="diagnose alert",
        provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"],
        context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
    )

    # The provider was really invoked, and its answer is what came back.
    assert result.provider == "ollama_gcp_a"
    assert result.raw_response == '{"provider":"fresh_ollama"}'
    assert ollama_stub.calls == 1
    # At least one cache write was recorded.
    assert redis_stub.set_calls
@pytest.mark.asyncio
async def test_executor_allows_cached_ollama_provider_for_ollama_lane(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A cache entry attributed to an Ollama provider is served as-is.

    The fake Redis returns an ollama-attributed hit; the test expects the
    cached result to be returned without invoking the registered provider.
    """
    redis_stub = _FakeRedis(cached_provider="ollama")
    ollama_stub = _FakeProvider()
    provider_registry = AIProviderRegistry()
    provider_registry.register(ollama_stub)

    monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
    monkeypatch.setattr("src.core.redis_client.get_redis", lambda: redis_stub)

    executor = AIRouterExecutor(provider_registry)
    result = await executor.execute(
        prompt="diagnose alert",
        provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local"],
        context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
    )

    # Served from cache: provider label comes from the cached payload.
    assert result.provider == "ollama"
    assert result.from_cache is True
    assert ollama_stub.calls == 0

View File

@@ -124,8 +124,9 @@ def test_diagnose_fallback_chain_ollama_primary():
assert AIProviderEnum.OPENCLAW_NEMO in providers_in_chain
assert AIProviderEnum.GEMINI in providers_in_chain
assert AIProviderEnum.CLAUDE in providers_in_chain
# OLLAMA_188 (CPU-only 備援) 仍排除M1 Pro 111 才是 GPU 主推理)
assert AIProviderEnum.OLLAMA_188 not in providers_in_chain
# 188 不得作為 Ollama provider;本地備援只允許 ollama_local。
provider_values = {p.value for p in providers_in_chain}
assert "ollama_188" not in provider_values
def test_diagnose_fallback_chain_contains_cloud_providers():
@@ -159,7 +160,7 @@ async def test_diagnose_route_primary_is_ollama():
# 雲端 fallback 仍在OpenClaw / Gemini / Claude 救命備援)
fb_providers = [p for p, _ in decision.fallback_chain]
# ollama_failover_manager 可能轉到 ollama_188但 ollama variant 必須有
# ollama_failover_manager 可能轉到 GCP-B / ollama_local,但雲端救命備援仍必須存在。
has_cloud_fallback = (
AIProviderEnum.GEMINI in fb_providers or AIProviderEnum.CLAUDE in fb_providers
)

View File

@@ -83,7 +83,7 @@ async def test_router_uses_failover_when_ollama_initial_provider():
return_value=_make_failover_result(
primary_provider="gemini",
primary_model="gemini-1.5-flash",
fallback=[("ollama_188", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
fallback=[("ollama_local", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
)
)
@@ -109,14 +109,14 @@ async def test_router_uses_failover_when_ollama_initial_provider():
@pytest.mark.asyncio
async def test_router_failover_fallback_chain_converted():
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_188"""
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_LOCAL"""
mock_fm = MagicMock()
mock_fm.select_provider = AsyncMock(
return_value=_make_failover_result(
primary_provider="gemini",
primary_model="gemini-1.5-flash",
fallback=[
("ollama_188", "qwen2.5:7b-instruct"),
("ollama_local", "qwen2.5:7b-instruct"),
("nemotron", "nvidia/nemotron-mini-4b-instruct"),
("claude", "claude-haiku-4-5-20251001"),
],
@@ -134,8 +134,8 @@ async def test_router_failover_fallback_chain_converted():
decision = await router.route("test alert message")
fb_providers = [p for p, _ in decision.fallback_chain]
assert AIProviderEnum.OLLAMA_188 in fb_providers, (
f"OLLAMA_188 not in fallback_chain: {fb_providers}"
assert AIProviderEnum.OLLAMA_LOCAL in fb_providers, (
f"OLLAMA_LOCAL not in fallback_chain: {fb_providers}"
)
assert AIProviderEnum.NEMOTRON in fb_providers
assert AIProviderEnum.CLAUDE in fb_providers

View File

@@ -68,7 +68,7 @@ async def test_alert_failover_dedup(mock_redis, mock_telegram_send):
"to_provider": "gemini",
"reason": "111 unhealthy",
"model": "qwen3:8b",
"fallback_chain_str": "gemini → ollama_188",
"fallback_chain_str": "gemini → ollama_local",
}
# 第 1 次dedup pass發送

View File

@@ -1,16 +1,15 @@
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-04-26 @ Asia/Taipei
# 2026-04-26 Wave5 B4 by Claude Engineer-A4 — E2E executor dispatch 測試
# 驗證 failover 切到 OLLAMA_188 後HTTP 請求真的打到 OLLAMA_FALLBACK_URL
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-05-06 @ Asia/Taipei
# 2026-05-06 Codex — 188 不再作為 Ollama Provider驗證 ollama_local dispatch
"""
E2Eexecutor dispatch 層驗證
===============================
測試覆蓋(補全 B4 — 整合測試只驗決策層,未驗執行層):
1. registry 確實有 ollama_188 providerB1 修復後基本健全性)
2. Ollama188Provider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
3. Ollama188Provider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
4. Ollama188Provider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL攔截 httpx
5. executor.execute(provider_order=["ollama_188"]) 真的路由到 188 URL
1. registry 確實有 ollama_local provider且沒有 ollama_188 provider
2. OllamaLocalProvider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
3. OllamaLocalProvider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
4. OllamaLocalProvider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL攔截 httpx
5. executor.execute(provider_order=["ollama_local"]) 真的路由到 local URL
6. Gemini quota pipeline 並行 5 次不超發B3 atomic 驗證)
7. Gemini quota TTL 第一次呼叫即設定
"""
@@ -28,31 +27,30 @@ import pytest
# =============================================================================
def test_registry_has_ollama_188_provider():
"""B1 基本健全性:_init_registry() 後 registry 必須有 ollama_188"""
def test_registry_has_ollama_local_provider_without_ollama_188():
"""_init_registry() 後 registry 必須有 ollama_local且不得有 ollama_188"""
from src.services.ai_router import _init_registry
registry = _init_registry()
# registry.get() 只返回 is_enabled=True 的 provider
# 用 _providers dict 直接檢查(不管 is_enabled
assert "ollama_188" in registry._providers, (
"ollama_188 not found in registry._providers — B1 fix 未生效"
)
assert "ollama_local" in registry._providers
assert "ollama_188" not in registry._providers
def test_ollama_188_provider_name():
"""Ollama188Provider.name == 'ollama_188'"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_provider_name():
"""OllamaLocalProvider.name == 'ollama_local'"""
from src.services.ai_providers.ollama import OllamaLocalProvider
p = Ollama188Provider()
assert p.name == "ollama_188"
p = OllamaLocalProvider()
assert p.name == "ollama_local"
def test_ollama_188_provider_privacy_level():
"""Ollama188Provider.privacy_level == 'local'(本地推理,可接機密資料)"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_provider_privacy_level():
"""OllamaLocalProvider.privacy_level == 'local'(本地推理,可接機密資料)"""
from src.services.ai_providers.ollama import OllamaLocalProvider
p = Ollama188Provider()
p = OllamaLocalProvider()
assert p.privacy_level == "local"
@@ -61,45 +59,44 @@ def test_ollama_188_provider_privacy_level():
# =============================================================================
def test_ollama_188_is_enabled_with_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_188 未設 → is_enabled == True"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.core.config import get_settings
def test_ollama_local_is_enabled_with_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_LOCAL 未設 → is_enabled == True"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
# patch settings 的 OLLAMA_FALLBACK_URL
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OPENCLAW_TIMEOUT = "60"
p = Ollama188Provider()
p = OllamaLocalProvider()
# 直接 patch module-level settings 物件
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is True
def test_ollama_188_is_disabled_without_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == False188 節點未設定)"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_is_disabled_without_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == Falselocal 節點未設定)"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = ""
p = Ollama188Provider()
p = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is False
def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
"""ENABLE_OLLAMA_188=false → is_enabled == False即使有 URL"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_is_disabled_by_env_flag(monkeypatch):
"""ENABLE_OLLAMA_LOCAL=false → is_enabled == False即使有 URL"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "false")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "false")
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
p = Ollama188Provider()
p = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is False
@@ -110,14 +107,14 @@ def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
@pytest.mark.asyncio
async def test_ollama_188_analyze_dispatches_to_fallback_url():
async def test_ollama_local_analyze_dispatches_to_fallback_url():
"""
B4 核心Ollama188Provider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
攔截 httpx.AsyncClient.post記錄實際呼叫 URL斷言包含 188 IP。
B4 核心OllamaLocalProvider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
攔截 httpx.AsyncClient.post記錄實際呼叫 URL斷言包含本地 fallback IP。
"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
FALLBACK_URL = "http://192.168.0.188:11434"
FALLBACK_URL = "http://192.168.0.111:11434"
captured_urls: list[str] = []
mock_response = MagicMock()
@@ -149,7 +146,7 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
"top_p": 0.9,
})
provider = Ollama188Provider()
provider = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
with patch("src.services.ai_providers.ollama.get_model_registry", return_value=mock_registry):
@@ -159,45 +156,45 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
result = await provider.analyze("test prompt", context={})
assert len(captured_urls) > 0, "analyze() 未發出任何 HTTP 請求"
assert any("192.168.0.188" in url for url in captured_urls), (
f"HTTP 請求未打到 188,實際 URL: {captured_urls}"
assert any("192.168.0.111" in url for url in captured_urls), (
f"HTTP 請求未打到 local fallback,實際 URL: {captured_urls}"
)
assert result.provider == "ollama_188"
assert result.provider == "ollama_local"
@pytest.mark.asyncio
async def test_ollama_188_analyze_returns_error_when_no_fallback_url():
async def test_ollama_local_analyze_returns_error_when_no_fallback_url():
"""OLLAMA_FALLBACK_URL 未設定 → analyze() 應返回 success=False不發 HTTP"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = ""
provider = Ollama188Provider()
provider = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
result = await provider.analyze("test prompt")
assert result.success is False
assert result.provider == "ollama_188"
assert result.provider == "ollama_local"
assert "OLLAMA_FALLBACK_URL" in (result.error or "")
@pytest.mark.asyncio
async def test_executor_dispatches_ollama_188_to_fallback_url():
async def test_executor_dispatches_ollama_local_to_fallback_url():
"""
B4 執行層AIRouterExecutor.execute(provider_order=["ollama_188"])
應路由到 Ollama188Provider且 HTTP 打到 OLLAMA_FALLBACK_URL。
B4 執行層AIRouterExecutor.execute(provider_order=["ollama_local"])
應路由到 OllamaLocalProvider且 HTTP 打到 OLLAMA_FALLBACK_URL。
"""
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor, reset_ai_router
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
from src.services.ai_providers.interfaces import AIResult
reset_ai_router()
FALLBACK_URL = "http://192.168.0.188:11434"
FALLBACK_URL = "http://192.168.0.111:11434"
captured_urls: list[str] = []
# 建立真實 registry只登錄 ollama_188
# 建立真實 registry只登錄 ollama_local
registry = AIProviderRegistry()
# mock analyze 讓它回傳成功,但驗 URL 路徑
@@ -206,15 +203,15 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
return AIResult(
raw_response='{"action_title":"ok","confidence":0.9}',
success=True,
provider="ollama_188",
provider="ollama_local",
tokens=10,
)
mock_settings_global = MagicMock()
mock_settings_global.OLLAMA_FALLBACK_URL = FALLBACK_URL
# 建立 Ollama188Providermock 其 analyze + is_enabled
provider = Ollama188Provider()
# 建立 OllamaLocalProvidermock 其 analyze + is_enabled
provider = OllamaLocalProvider()
provider.analyze = fake_analyze # type: ignore[method-assign]
# 強制 is_enabled = True繞過 settings patch 的複雜度)
@@ -233,14 +230,14 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
mock_settings.MOCK_MODE = False
result = await executor.execute(
prompt="test alert",
provider_order=["ollama_188"],
provider_order=["ollama_local"],
context={},
)
assert result.success is True, f"execute 失敗: {result.error}"
assert result.provider == "ollama_188", f"provider 不是 ollama_188: {result.provider}"
assert any("192.168.0.188" in u for u in captured_urls), (
f"HTTP 未打到 188captured: {captured_urls}"
assert result.provider == "ollama_local", f"provider 不是 ollama_local: {result.provider}"
assert any("192.168.0.111" in u for u in captured_urls), (
f"HTTP 未打到 local fallbackcaptured: {captured_urls}"
)

View File

@@ -16,7 +16,7 @@ import httpx
import pytest
# Ollama 伺服器配置
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)
@@ -111,7 +111,7 @@ async def check_ollama_available() -> bool:
@pytest.mark.integration
class TestModelRegression:
"""模型回歸測試 — 需要 Ollama 服務 (192.168.0.188:11434)"""
"""模型回歸測試 — 需要 Ollama 服務(預設 111可用 OLLAMA_URL 覆寫)"""
@pytest.fixture(autouse=True)
async def check_ollama(self):

View File

@@ -90,8 +90,8 @@ class TestProbeOllamaVersion:
assert isinstance(info.captured_at, datetime)
@pytest.mark.asyncio
async def test_success_188_provider(self):
"""188 URL → provider='ollama_188'"""
async def test_success_local_provider(self):
"""111 / local proxy URL → provider='ollama_local'"""
model_entry = {
"name": "deepseek-r1:14b",
"modified_at": "2026-04-02T00:00:00Z",
@@ -106,10 +106,10 @@ class TestProbeOllamaVersion:
with patch("httpx.AsyncClient", return_value=mock_client):
info = await probe_ollama_version(
"http://192.168.0.188:11434", "deepseek-r1:14b"
"http://192.168.0.111:11434", "deepseek-r1:14b"
)
assert info.provider == "ollama_188"
assert info.provider == "ollama_local"
@pytest.mark.asyncio
async def test_model_not_found_raises(self):
@@ -279,7 +279,7 @@ class TestProbeOpenclawNemoVersion:
mock_settings = MagicMock()
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
with patch("src.services.model_version_probe.settings", mock_settings), \
patch("httpx.AsyncClient", return_value=mock_client):
@@ -301,7 +301,7 @@ class TestProbeOpenclawNemoVersion:
mock_settings = MagicMock()
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
with patch("src.services.model_version_probe.settings", mock_settings), \
patch("httpx.AsyncClient", return_value=mock_client):
@@ -333,7 +333,7 @@ class TestProbeAllProviders:
"""5 個 provider 全部成功 → 回傳 5 筆 ProviderVersionInfo"""
fake_results = [
ProviderVersionInfo(provider="ollama", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="ollama_188", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="ollama_local", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="gemini", model="gemini-1.5-flash", version="gemini-1.5-flash"),
ProviderVersionInfo(provider="claude", model="claude-haiku-4-5-20251001", version="claude-haiku-4-5-20251001"),
ProviderVersionInfo(provider="openclaw_nemo", model="deepseek-r1:14b", version="v1"),
@@ -347,7 +347,7 @@ class TestProbeAllProviders:
mock_settings = MagicMock()
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-AADR-110
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
with patch("src.services.model_version_probe.settings", mock_settings):
@@ -364,8 +364,8 @@ class TestProbeAllProviders:
raise RuntimeError("simulated failure")
async def _fail_ollama(url, model):
if "188" in url:
raise RuntimeError("188 offline")
if "111" in url:
raise RuntimeError("local offline")
return good
with patch("src.services.model_version_probe.probe_ollama_version", side_effect=_fail_ollama), \
@@ -379,13 +379,13 @@ class TestProbeAllProviders:
mock_settings = MagicMock()
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-AADR-110
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
with patch("src.services.model_version_probe.settings", mock_settings):
results = await probe_all_providers()
# ollama(ok) + ollama_188(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
# ollama(ok) + ollama_local(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
assert len(results) == 3
providers = {r.provider for r in results}
assert "ollama" in providers

View File

@@ -48,7 +48,7 @@ def _make_info(provider: str, version: str = "v1", digest: str | None = "sha256:
def _make_five() -> list[ProviderVersionInfo]:
return [
_make_info("ollama"),
_make_info("ollama_188"),
_make_info("ollama_local"),
_make_info("gemini", digest=None),
_make_info("claude", digest=None),
_make_info("openclaw_nemo"),

View File

@@ -310,7 +310,7 @@ class TestSelectProvider:
)
with patch.object(manager, "_write_failover_audit", return_value=None):
result = await manager.select_provider()
await manager.select_provider()
# 並行 check 三台主機GCP-A / GCP-B / Local
assert mock_monitor.check.call_count == 3
@@ -625,7 +625,6 @@ class TestWriteFailoverAudit:
@pytest.mark.asyncio
async def test_audit_uses_structlog_not_db(self):
"""_write_failover_audit 應呼叫 structlog不呼叫 DB"""
import structlog
manager = _make_manager()
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
@@ -657,22 +656,22 @@ class TestWriteFailoverAudit:
# =============================================================================
# B2: AIProviderEnum.OLLAMA_188 存在
# 2026-04-25 critic-fix Part2 by Claude Engineer-C2
# B2: AIProviderEnum.OLLAMA_LOCAL 存在
# 2026-05-06 Codex — 188 不再作為 Ollama Provider
# =============================================================================
class TestAIProviderEnumOllama188:
"""B2 修復驗證AIProviderEnum.OLLAMA_188 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
class TestAIProviderEnumOllamaLocal:
"""B2 修復驗證AIProviderEnum.OLLAMA_LOCAL 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
def test_ollama_188_enum_exists(self):
def test_ollama_local_enum_exists(self):
from src.services.ai_router import AIProviderEnum
assert AIProviderEnum.OLLAMA_188.value == "ollama_188"
assert AIProviderEnum.OLLAMA_LOCAL.value == "ollama_local"
def test_ollama_188_in_latency_budget(self):
def test_ollama_local_in_latency_budget(self):
from src.services.ai_router import AIProviderEnum, PROVIDER_LATENCY_BUDGET
assert AIProviderEnum.OLLAMA_188 in PROVIDER_LATENCY_BUDGET
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_188] == 120000
assert AIProviderEnum.OLLAMA_LOCAL in PROVIDER_LATENCY_BUDGET
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_LOCAL] == 90000
# =============================================================================

View File

@@ -42,7 +42,7 @@ from src.services.ollama_health_monitor import (
# =============================================================================
HOST = "http://34.143.170.20:11434" # GCP-A PrimaryADR-110 2026-05-03
HOST_188 = "http://192.168.0.188:11434" # 歷史遺留參考常數(已移出主路由)
HOST_LOCAL = "http://192.168.0.111:11434" # Local fallback已移出 188 主路由)
@pytest.fixture(autouse=True)

View File

@@ -0,0 +1,231 @@
from __future__ import annotations
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Any
from unittest.mock import AsyncMock
import pytest
from src.services import ai_control as ai_control_module
from src.services import ai_router as ai_router_module
from src.services import openclaw as openclaw_module
from src.services.ai_router import AIProviderEnum
from src.services.intent_classifier import IntentType
from src.services.openclaw import OpenClawService
@dataclass
class _FakeEndpoint:
provider_name: str
url: str = "http://example.test"
class _FakeRoute:
    """Routing-result stub that always reports the same four endpoints."""

    def all_endpoints_in_order(self) -> list[_FakeEndpoint]:
        """Return the three Ollama endpoints followed by gemini (empty URL)."""
        ollama_names = ("ollama_gcp_a", "ollama_gcp_b", "ollama_local")
        ordered = [_FakeEndpoint(name) for name in ollama_names]
        # gemini is the only entry created with an explicit (empty) URL.
        ordered.append(_FakeEndpoint("gemini", ""))
        return ordered
class _FakeFailoverManager:
    """Failover-manager stub that logs requested task types and returns a _FakeRoute."""

    def __init__(self) -> None:
        # Each task_type handed to select_provider, in call order.
        self.task_types: list[str] = []

    async def select_provider(self, task_type: str = "general") -> _FakeRoute:
        """Record ``task_type`` and hand back a fresh _FakeRoute."""
        self.task_types += [task_type]
        route = _FakeRoute()
        return route
class _UnorderedFailoverManager:
    """Failover-manager stub whose route lists endpoints in a scrambled order."""

    async def select_provider(self, task_type: str = "general") -> SimpleNamespace:
        """Return a namespace exposing ``all_endpoints_in_order`` as a zero-arg callable."""

        def _scrambled() -> list[_FakeEndpoint]:
            # local first, gemini in the middle, GCP hosts last.
            names = ("ollama_local", "gemini", "ollama_gcp_b", "ollama_gcp_a")
            return [_FakeEndpoint(name) for name in names]

        return SimpleNamespace(all_endpoints_in_order=_scrambled)
class _FakeRouter:
    """Router stub that always selects GEMINI with a CLAUDE -> OLLAMA fallback chain."""

    async def route(self, prompt: str, context: dict[str, Any]) -> SimpleNamespace:
        """Return a fixed routing decision; ``prompt`` and ``context`` are not inspected."""
        fallbacks = [
            (AIProviderEnum.CLAUDE, "claude"),
            (AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
        ]
        decision = SimpleNamespace(
            selected_provider=AIProviderEnum.GEMINI,
            fallback_chain=fallbacks,
            intent=IntentType.DIAGNOSE,
            routing_reason="high complexity would normally prefer cloud",
        )
        return decision
class _FakeExecutor:
def __init__(self) -> None:
self.provider_order: list[str] | None = None
async def execute(
self,
*,
prompt: str,
provider_order: list[str],
context: dict[str, Any],
cache_ttl: int,
require_local: bool,
) -> SimpleNamespace:
self.provider_order = provider_order
return SimpleNamespace(
raw_response='{"root_cause":"ok","suggested_action":"NO_ACTION"}',
provider=provider_order[0],
success=True,
tokens=42,
cost_usd=0.0,
latency_ms=10.0,
)
@pytest.mark.asyncio
async def test_alert_context_uses_ollama_lane_then_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback allowed, the executor gets the Ollama endpoints then gemini."""
    fake_failover = _FakeFailoverManager()
    fake_executor = _FakeExecutor()

    # Settings flags on the openclaw module.
    for flag, value in (
        ("USE_AI_ROUTER", True),
        ("MOCK_MODE", False),
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", True),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    # ai_control lookups are replaced by async mocks with fixed answers.
    for helper, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, helper, AsyncMock(return_value=answer))

    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    alert = {
        "incident_id": "INC-1",
        "alertname": "HostHighCpuLoad",
        "target_resource": "node-exporter-110",
    }
    result = await service._call_with_fallback("diagnose alert", alert_context=alert)

    expected = (
        '{"root_cause":"ok","suggested_action":"NO_ACTION"}',
        "ollama_gcp_a",
        True,
        42,
        0.0,
    )
    assert result == expected
    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
    assert fake_failover.task_types == ["diagnose"]
@pytest.mark.asyncio
async def test_alert_context_can_disable_cloud_backup_for_cost_stop(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ALERT_AI_ALLOW_CLOUD_FALLBACK off, only the three Ollama providers are passed on."""
    fake_failover = _FakeFailoverManager()
    fake_executor = _FakeExecutor()

    # Same setup as the cloud-enabled case, except the fallback flag is False.
    for flag, value in (
        ("USE_AI_ROUTER", True),
        ("MOCK_MODE", False),
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    for helper, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, helper, AsyncMock(return_value=answer))

    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    alert = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    await service._call_with_fallback("diagnose alert", alert_context=alert)

    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_non_alert_context_keeps_router_cloud_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A context carrying only ``intent_hint`` keeps the router's gemini/claude/ollama order."""
    fake_executor = _FakeExecutor()

    for flag, value in (
        ("USE_AI_ROUTER", True),
        ("MOCK_MODE", False),
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    for helper, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, helper, AsyncMock(return_value=answer))

    # No failover manager is patched in this test.
    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    query_context = {"intent_hint": "query"}
    await service._call_with_fallback("general question", alert_context=query_context)

    assert fake_executor.provider_order == ["gemini", "claude", "ollama"]
@pytest.mark.asyncio
async def test_alert_context_uses_gcp_a_gcp_b_then_111_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """_resolve_alert_provider_order yields gcp_a, gcp_b, local when cloud fallback is off."""
    fake_failover = _FakeFailoverManager()
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)
    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    alert = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    resolved = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert,
    )

    assert resolved == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Scrambled endpoints come back as gcp_a, gcp_b, local with gemini removed."""
    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)
    # The failover stub hands back endpoints out of order, gemini included.
    monkeypatch.setattr(
        openclaw_module,
        "get_ollama_failover_manager",
        lambda: _UnorderedFailoverManager(),
    )

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    alert = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    resolved = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert,
    )

    assert resolved == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback on, the sorted Ollama providers are followed by gemini only."""
    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", True),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)
    # The failover stub hands back endpoints out of order, gemini included.
    monkeypatch.setattr(
        openclaw_module,
        "get_ollama_failover_manager",
        lambda: _UnorderedFailoverManager(),
    )

    # Build the service without running its __init__.
    service = object.__new__(OpenClawService)
    alert = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    cloud_candidates = ["claude", "gemini", "ollama"]
    resolved = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert,
        cloud_provider_order=cloud_candidates,
    )

    assert resolved == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]

View File

@@ -18,7 +18,7 @@ import pytest
from src.core.prompts import OPENCLAW_TEST_PROMPT
# Ollama 配置
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)

View File

@@ -44,6 +44,8 @@ ARG NEXT_PUBLIC_SENTRY_DSN=
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
ENV NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN}
ENV NEXT_TELEMETRY_DISABLED=1
# 2026-05-05 ogt + Codex: keep self-hosted 110 runner builds from saturating CPU.
ENV NEXT_PRIVATE_BUILD_WORKER_COUNT=1
# 2026-04-06 ogt: --mount=type=cache 持久化 .next/cache跨 build 增量編譯
# 只有變更的頁面重新編譯,未變更頁面直接用 cache → 節省 3-4 min
@@ -51,7 +53,7 @@ ENV NEXT_TELEMETRY_DISABLED=1
# /root/.cache/turbo 存放 turbo 的 task 輸出快取,避免每次重跑未變動的 packages
RUN --mount=type=cache,target=/app/apps/web/.next/cache \
--mount=type=cache,target=/root/.cache/turbo \
pnpm turbo build --filter=@awoooi/web
pnpm turbo build --filter=@awoooi/web --concurrency=1
FROM base AS runner
WORKDIR /app

View File

@@ -67,7 +67,8 @@
"operations": "Operations",
"securityCompliance": "Security & Compliance",
"classicAICenter": "Classic AI Center",
"governance": "AI Governance"
"governance": "AI Governance",
"awooop": "AwoooP"
},
"locale": {
"switch": "Switch Language",
@@ -1480,4 +1481,4 @@
"retry": "Retry"
}
}
}
}

View File

@@ -67,7 +67,8 @@
"operations": "營運",
"securityCompliance": "安全合規",
"classicAICenter": "經典 AI 中心",
"governance": "AI 治理"
"governance": "AI 治理",
"awooop": "AwoooP"
},
"locale": {
"switch": "切換語系",
@@ -1481,4 +1482,4 @@
"retry": "重試"
}
}
}
}

View File

@@ -114,7 +114,7 @@ function ApprovalRow({ approval }: { approval: Approval }) {
<tr
className={cn(
"border-b border-border hover:bg-accent/30 transition-colors",
isCritical && "bg-red-900/10 hover:bg-red-900/20"
isCritical && "bg-[#fff0ef] hover:bg-[#ffe4e1]"
)}
>
<td className="px-4 py-3">
@@ -232,7 +232,7 @@ export default function ApprovalsPage() {
{/* Error State */}
{error && (
<div className="flex items-start gap-3 p-4 bg-red-900/20 border border-red-800/40 rounded-lg">
<div className="flex items-start gap-3 border border-[#e2a29b] bg-[#fff0ef] p-4">
<AlertCircle className="w-5 h-5 text-red-400 flex-shrink-0 mt-0.5" aria-hidden="true" />
<div>
<p className="text-sm font-medium text-red-300"></p>
@@ -243,7 +243,7 @@ export default function ApprovalsPage() {
{/* Empty State — 所有審批已處理 */}
{!loading && approvals.length === 0 && !error && (
<div className="flex flex-col items-center justify-center py-16 bg-card border border-border rounded-xl">
<div className="flex flex-col items-center justify-center border border-[#e0ddd4] bg-white py-16">
<ShieldCheck className="w-12 h-12 text-green-400 mb-3" aria-hidden="true" />
<p className="text-sm font-medium text-foreground mb-1"></p>
<p className="text-xs text-muted-foreground"> Run</p>
@@ -252,7 +252,7 @@ export default function ApprovalsPage() {
{/* Table */}
{(loading || approvals.length > 0) && (
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="審批佇列">
<thead>

View File

@@ -19,20 +19,25 @@ import { cn } from "@/lib/utils";
// Types
// =============================================================================
type ContractStatus = "draft" | "published" | "active";
type ContractStatus = "draft" | "published" | "active" | "revoked";
interface Contract {
id: string;
contract_id: string;
contract_family: string;
project_id: string;
status: ContractStatus;
lifecycle_status: ContractStatus;
body_hash: string;
created_at: string;
}
interface Tenant {
project_id: string;
name: string;
display_name: string;
}
interface ContractsResponse {
contracts?: Contract[];
items?: Contract[];
}
// =============================================================================
@@ -47,21 +52,27 @@ const STATUS_CONFIG: Record<
> = {
draft: {
label: "草稿",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
published: {
label: "已發佈",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
bg: "bg-[#eef5ff]",
text: "text-[#1f5b9b]",
border: "border-[#9bb6d9]",
},
active: {
label: "生效中",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
revoked: {
label: "已撤銷",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
};
@@ -109,7 +120,7 @@ function ContractRow({ contract }: { contract: Contract }) {
</span>
</td>
<td className="px-4 py-3">
<StatusBadge status={contract.status} />
<StatusBadge status={contract.lifecycle_status} />
</td>
<td className="px-4 py-3">
<span className="font-mono text-xs text-muted-foreground bg-muted px-2 py-0.5 rounded">
@@ -140,7 +151,10 @@ export default function ContractsPage() {
useEffect(() => {
fetch(`${API_BASE}/api/v1/platform/tenants`)
.then((r) => r.json())
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
.then((data) => {
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
})
.catch(() => {});
}, []);
@@ -154,8 +168,9 @@ export default function ContractsPage() {
`${API_BASE}/api/v1/platform/contracts?${params.toString()}`
);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data = await res.json();
setContracts(Array.isArray(data.items) ? data.items : []);
const data: ContractsResponse = await res.json();
const rows = Array.isArray(data.contracts) ? data.contracts : data.items;
setContracts(Array.isArray(rows) ? rows : []);
} catch (err) {
setError(err instanceof Error ? err.message : "載入失敗");
} finally {
@@ -192,7 +207,7 @@ export default function ContractsPage() {
</div>
{/* Filters */}
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl">
<div className="flex items-center gap-3 border border-[#e0ddd4] bg-white p-4">
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
<span className="text-sm text-muted-foreground"></span>
<div className="relative">
@@ -205,7 +220,7 @@ export default function ContractsPage() {
<option value=""></option>
{tenants.map((t) => (
<option key={t.project_id} value={t.project_id}>
{t.name || t.project_id}
{t.display_name || t.project_id}
</option>
))}
</select>
@@ -225,7 +240,7 @@ export default function ContractsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="合約清單">
<thead>
@@ -269,7 +284,7 @@ export default function ContractsPage() {
</tr>
) : (
contracts.map((contract) => (
<ContractRow key={contract.id} contract={contract} />
<ContractRow key={contract.contract_id + contract.body_hash} contract={contract} />
))
)}
</tbody>

View File

@@ -6,8 +6,9 @@
"use client";
import { AppLayout } from "@/components/layout";
import { Link, usePathname } from "@/i18n/routing";
import { Building2, FileText, Activity, ShieldCheck } from "lucide-react";
import { Activity, BrainCircuit, Building2, ClipboardList, FileText, ShieldCheck } from "lucide-react";
import { cn } from "@/lib/utils";
// =============================================================================
@@ -15,6 +16,11 @@ import { cn } from "@/lib/utils";
// =============================================================================
const navItems = [
{
label: "工作鏈路",
href: "/awooop/work-items" as const,
icon: ClipboardList,
},
{
label: "租戶管理",
href: "/awooop/tenants" as const,
@@ -43,64 +49,72 @@ const navItems = [
export default function AwoooPLayout({
children,
params,
}: {
children: React.ReactNode;
params: { locale: string };
}) {
const pathname = usePathname();
return (
<div className="min-h-full flex flex-col">
{/* Console Header */}
<div className="bg-card border-b border-border px-6 py-4">
<div className="flex items-center justify-between mb-4">
<div>
<h1 className="text-xl font-bold text-foreground tracking-tight">
AwoooP Operator Console
</h1>
<p className="text-xs text-muted-foreground mt-0.5">
Agent · · Run ·
</p>
</div>
<div className="flex items-center gap-2">
<span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs font-medium bg-brand-accent/10 text-brand-accent border border-brand-accent/20">
<span className="w-1.5 h-1.5 rounded-full bg-brand-accent animate-pulse" />
<AppLayout locale={params.locale} showBackground={false}>
<div className="min-h-[calc(100vh-116px)] bg-[#f7f5ee] border border-[#e0ddd4]">
<div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-5 py-4">
<div className="flex flex-wrap items-center justify-between gap-3">
<div className="flex items-center gap-3">
<span className="flex h-9 w-9 items-center justify-center border border-[#d8d3c7] bg-white text-[#141413]">
<BrainCircuit className="h-4 w-4" aria-hidden="true" />
</span>
<div>
<h1 className="text-lg font-semibold tracking-normal text-[#141413]">
AwoooP Operator Console
</h1>
<div className="mt-1 flex items-center gap-2 text-xs text-[#77736a]">
<span className="font-mono">Control Plane</span>
<span className="h-1 w-1 rounded-full bg-[#d97757]" />
<span className="font-mono">Shadow First</span>
</div>
</div>
</div>
<span className="inline-flex items-center gap-2 border border-[#d8d3c7] bg-white px-3 py-1.5 text-xs font-semibold text-[#141413]">
<span className="h-1.5 w-1.5 rounded-full bg-[#22c55e]" />
OPERATOR
</span>
</div>
<nav
className="mt-4 flex flex-wrap gap-1"
role="navigation"
aria-label="AwoooP 主要導航"
>
{navItems.map((item) => {
const Icon = item.icon;
const isActive =
pathname === item.href ||
pathname?.startsWith(item.href + "/");
return (
<Link
key={item.href}
href={item.href}
aria-current={isActive ? "page" : undefined}
className={cn(
"inline-flex items-center gap-2 border px-3 py-2 text-sm font-medium transition-colors",
isActive
? "border-[#d97757] bg-white text-[#141413]"
: "border-transparent text-[#77736a] hover:border-[#d8d3c7] hover:bg-white hover:text-[#141413]"
)}
>
<Icon className="h-4 w-4" aria-hidden="true" />
{item.label}
</Link>
);
})}
</nav>
</div>
{/* Tab Navigation */}
<nav className="flex gap-1" role="navigation" aria-label="AwoooP 主要導航">
{navItems.map((item) => {
const Icon = item.icon;
const isActive =
pathname === item.href ||
pathname?.startsWith(item.href + "/");
return (
<Link
key={item.href}
href={item.href}
aria-current={isActive ? "page" : undefined}
className={cn(
"flex items-center gap-2 px-4 py-2 rounded-lg text-sm font-medium transition-all duration-150",
isActive
? "bg-brand-accent/15 text-brand-accent border border-brand-accent/30"
: "text-muted-foreground hover:text-foreground hover:bg-accent"
)}
>
<Icon className="w-4 h-4" aria-hidden="true" />
{item.label}
</Link>
);
})}
</nav>
<main className="px-5 py-5">{children}</main>
</div>
{/* Page Content */}
<main className="flex-1 px-6 py-6">
{children}
</main>
</div>
</AppLayout>
);
}

View File

@@ -2,8 +2,12 @@
// WOOO AIOps - AwoooP Console 入口重導向
// =============================================================================
import { redirect } from "@/i18n/routing";
import { redirect } from "next/navigation";
export default function AwoooPPage() {
redirect("/awooop/tenants");
export default function AwoooPPage({
params,
}: {
params: { locale: string };
}) {
redirect(`/${params.locale}/awooop/work-items`);
}

View File

@@ -23,16 +23,14 @@ import { cn } from "@/lib/utils";
// =============================================================================
type RunState =
| "CREATED"
| "QUEUED"
| "POLICY_RESOLVED"
| "RUNNING"
| "WAITING_TOOL"
| "WAITING_APPROVAL"
| "RESUMED"
| "COMPLETED"
| "FAILED"
| "CANCELLED";
| "pending"
| "running"
| "waiting_tool"
| "waiting_approval"
| "completed"
| "failed"
| "cancelled"
| "timeout";
interface Run {
run_id: string;
@@ -40,18 +38,19 @@ interface Run {
agent_id: string;
state: RunState;
is_shadow: boolean;
token_usage_input: number | null;
token_usage_output: number | null;
cost_usd: number | string;
step_count: number;
created_at: string;
}
interface Tenant {
project_id: string;
name: string;
display_name: string;
}
interface RunsResponse {
items: Run[];
runs?: Run[];
items?: Run[];
total: number;
page: number;
per_page: number;
@@ -69,66 +68,54 @@ const STATE_CONFIG: Record<
RunState,
{ label: string; bg: string; text: string; border: string; pulse?: boolean }
> = {
CREATED: {
label: "已建立",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
pending: {
label: "待執行",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
QUEUED: {
label: "排隊中",
bg: "bg-gray-800",
text: "text-gray-400",
border: "border-gray-600",
},
POLICY_RESOLVED: {
label: "策略已解析",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
},
RUNNING: {
running: {
label: "執行中",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
pulse: true,
},
WAITING_TOOL: {
waiting_tool: {
label: "等待工具",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
WAITING_APPROVAL: {
waiting_approval: {
label: "等待審批",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
RESUMED: {
label: "已恢復",
bg: "bg-purple-900/40",
text: "text-purple-300",
border: "border-purple-600/40",
},
COMPLETED: {
completed: {
label: "已完成",
bg: "bg-green-900/40",
text: "text-green-400",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
FAILED: {
failed: {
label: "失敗",
bg: "bg-red-900/40",
text: "text-red-300",
border: "border-red-600/40",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
CANCELLED: {
cancelled: {
label: "已取消",
bg: "bg-red-900/30",
text: "text-red-400",
border: "border-red-700/40",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
timeout: {
label: "已超時",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
};
@@ -137,7 +124,7 @@ const STATE_CONFIG: Record<
// =============================================================================
function RunStateBadge({ state }: { state: RunState }) {
const config = STATE_CONFIG[state] ?? STATE_CONFIG.CREATED;
const config = STATE_CONFIG[state] ?? STATE_CONFIG.pending;
return (
<span
className={cn(
@@ -158,7 +145,7 @@ function RunStateBadge({ state }: { state: RunState }) {
function ShadowBadge({ isShadow }: { isShadow: boolean }) {
if (!isShadow) return <span className="text-muted-foreground text-sm">--</span>;
return (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-800 text-gray-400 border border-gray-600">
<span className="inline-flex items-center border border-[#d8d3c7] bg-white px-2 py-0.5 text-xs font-medium text-[#5f5b52]">
Shadow
</span>
);
@@ -174,8 +161,7 @@ function RunRow({ run }: { run: Run }) {
})
: "--";
const totalTokens =
(run.token_usage_input ?? 0) + (run.token_usage_output ?? 0);
const cost = Number(run.cost_usd ?? 0);
return (
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
@@ -202,12 +188,12 @@ function RunRow({ run }: { run: Run }) {
</td>
<td className="px-4 py-3">
<span className="flex items-center gap-1 text-sm font-mono text-muted-foreground">
{totalTokens > 0 ? (
{run.step_count > 0 || cost > 0 ? (
<>
<Cpu className="w-3.5 h-3.5" aria-hidden="true" />
{totalTokens.toLocaleString()}
{run.step_count.toLocaleString()} steps
<span className="text-xs text-muted-foreground/60">
({run.token_usage_input ?? 0} {run.token_usage_output ?? 0})
(${cost.toFixed(4)})
</span>
</>
) : (
@@ -244,7 +230,10 @@ export default function RunsPage() {
useEffect(() => {
fetch(`${API_BASE}/api/v1/platform/tenants`)
.then((r) => r.json())
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
.then((data) => {
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
})
.catch(() => {});
}, []);
@@ -253,7 +242,7 @@ export default function RunsPage() {
setError(null);
const params = new URLSearchParams();
if (projectFilter) params.set("project_id", projectFilter);
if (statusFilter) params.set("status", statusFilter);
if (statusFilter) params.set("state", statusFilter);
params.set("page", String(page));
params.set("per_page", String(PER_PAGE));
@@ -262,7 +251,8 @@ export default function RunsPage() {
);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data: RunsResponse = await res.json();
setRuns(Array.isArray(data.items) ? data.items : []);
const rows = Array.isArray(data.runs) ? data.runs : data.items;
setRuns(Array.isArray(rows) ? rows : []);
setTotal(data.total ?? 0);
setLastRefresh(new Date());
} catch (err) {
@@ -320,7 +310,7 @@ export default function RunsPage() {
</div>
{/* Filters */}
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl flex-wrap">
<div className="flex flex-wrap items-center gap-3 border border-[#e0ddd4] bg-white p-4">
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
<span className="text-sm text-muted-foreground"></span>
@@ -335,7 +325,7 @@ export default function RunsPage() {
<option value=""></option>
{tenants.map((t) => (
<option key={t.project_id} value={t.project_id}>
{t.name || t.project_id}
{t.display_name || t.project_id}
</option>
))}
</select>
@@ -373,7 +363,7 @@ export default function RunsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="Run 清單">
<thead>
@@ -394,7 +384,7 @@ export default function RunsPage() {
Shadow
</th>
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">
Token
/ Steps
</th>
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">

View File

@@ -21,6 +21,7 @@ import { cn } from "@/lib/utils";
// =============================================================================
type MigrationMode =
| "legacy_awoooi_default"
| "shadow"
| "canary"
| "read_only"
@@ -29,14 +30,15 @@ type MigrationMode =
interface Tenant {
project_id: string;
name: string;
display_name: string;
migration_mode: MigrationMode;
budget_limit_usd: number | null;
is_suspended: boolean;
budget_limit_usd: number | string | null;
is_active: boolean;
}
interface ApiResponse {
items: Tenant[];
tenants?: Tenant[];
items?: Tenant[];
total: number;
}
@@ -50,35 +52,41 @@ const MIGRATION_MODE_CONFIG: Record<
MigrationMode,
{ label: string; bg: string; text: string; border: string }
> = {
legacy_awoooi_default: {
label: "Legacy",
bg: "bg-white",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
shadow: {
label: "Shadow",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
canary: {
label: "Canary",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
read_only: {
label: "Read Only",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
bg: "bg-[#eef5ff]",
text: "text-[#1f5b9b]",
border: "border-[#9bb6d9]",
},
suggest: {
label: "Suggest",
bg: "bg-purple-900/40",
text: "text-purple-300",
border: "border-purple-600/40",
bg: "bg-[#f6f0ff]",
text: "text-[#6541a5]",
border: "border-[#baa7de]",
},
auto_remediate: {
label: "Auto Remediate",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
};
@@ -104,12 +112,12 @@ function MigrationModeBadge({ mode }: { mode: MigrationMode }) {
function SuspendedBadge({ suspended }: { suspended: boolean }) {
return suspended ? (
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-red-900/40 text-red-300 border border-red-600/40">
<span className="inline-flex items-center gap-1 border border-[#e2a29b] bg-[#fff0ef] px-2.5 py-0.5 text-xs font-medium text-[#9f2f25]">
<Ban className="w-3 h-3" aria-hidden="true" />
</span>
) : (
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-green-900/40 text-green-300 border border-green-600/40">
<span className="inline-flex items-center gap-1 border border-[#9bc7a4] bg-[#f0faf2] px-2.5 py-0.5 text-xs font-medium text-[#17602a]">
<CheckCircle2 className="w-3 h-3" aria-hidden="true" />
</span>
@@ -117,6 +125,9 @@ function SuspendedBadge({ suspended }: { suspended: boolean }) {
}
function TenantRow({ tenant }: { tenant: Tenant }) {
const budget =
tenant.budget_limit_usd == null ? null : Number(tenant.budget_limit_usd);
return (
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
<td className="px-4 py-3">
@@ -125,7 +136,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
</span>
</td>
<td className="px-4 py-3">
<span className="text-sm text-foreground font-medium">{tenant.name || "--"}</span>
<span className="text-sm text-foreground font-medium">{tenant.display_name || "--"}</span>
</td>
<td className="px-4 py-3">
<MigrationModeBadge mode={tenant.migration_mode} />
@@ -135,7 +146,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
{tenant.budget_limit_usd != null ? (
<>
<DollarSign className="w-3.5 h-3.5" aria-hidden="true" />
{tenant.budget_limit_usd.toLocaleString("en-US", {
{budget?.toLocaleString("en-US", {
minimumFractionDigits: 2,
})}
</>
@@ -145,7 +156,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
</span>
</td>
<td className="px-4 py-3">
<SuspendedBadge suspended={tenant.is_suspended} />
<SuspendedBadge suspended={!tenant.is_active} />
</td>
</tr>
);
@@ -166,7 +177,8 @@ export default function TenantsPage() {
const res = await fetch(`${API_BASE}/api/v1/platform/tenants`);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data: ApiResponse = await res.json();
setTenants(Array.isArray(data.items) ? data.items : []);
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
} catch (err) {
setError(err instanceof Error ? err.message : "載入失敗");
} finally {
@@ -214,7 +226,7 @@ export default function TenantsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="租戶清單">
<thead>

View File

@@ -0,0 +1,254 @@
// =============================================================================
// WOOO AIOps - AwoooP 工作鏈路
// =============================================================================
// 將 AwoooP 實施項目對齊到 Operator Console 可觀測面。
"use client";
import {
Activity,
ArrowRight,
ClipboardList,
Database,
Gauge,
GitBranch,
Network,
ShieldCheck,
} from "lucide-react";
import { Link } from "@/i18n/routing";
import { cn } from "@/lib/utils";
// Lifecycle state of a work item; statusConfig holds one display entry per value.
type WorkStatus = "live" | "in_progress" | "blocked" | "watching";
// One row of the work-chain table rendered by this page.
type WorkItem = {
// Phase tag shown in the first table column (the entries below use "P0", "P1", "P2").
phase: string;
// Short human-readable description of the work item.
title: string;
// Current state; rendered through StatusBadge.
status: WorkStatus;
// Name of the console surface (page) where the item can be observed.
surface: string;
// Identifier of the data the item is backed by (e.g. a table or event name).
source: string;
// Free-text condition that gates the item.
gate: string;
// Link target for the row; limited to the four console sub-pages listed here.
href: "/awooop/tenants" | "/awooop/contracts" | "/awooop/runs" | "/awooop/approvals";
};
// Display metadata per WorkStatus: the badge label and the border/background/text
// colour classes that StatusBadge merges into its base classes.
const statusConfig: Record<WorkStatus, { label: string; className: string }> = {
live: {
label: "已接線",
className: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
},
in_progress: {
label: "推進中",
className: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
},
blocked: {
label: "阻塞",
className: "border-[#e2a29b] bg-[#fff0ef] text-[#9f2f25]",
},
watching: {
label: "觀察期",
className: "border-[#9bb6d9] bg-[#eef5ff] text-[#1f5b9b]",
},
};
// Static list of work items shown on this page, in display order.
// The list is hard-coded here; nothing in this file fetches it at runtime.
// Both the summary counters and the table rows are derived from this array.
const workItems: WorkItem[] = [
{
phase: "P0",
title: "AI 路由以 GCP-A/GCP-B/111 Ollama 優先",
status: "live",
surface: "Run 監控",
source: "ai_routing_decision / ollama_failover_decision",
gate: "Gemini 僅能作為 fallback",
href: "/awooop/runs",
},
{
phase: "P0",
title: "飛輪 KPI 改讀 auto_repair_executions",
status: "in_progress",
surface: "工作鏈路 / 系統報告",
source: "auto_repair_executions",
gate: "修復率不得再讀 incidents.outcome",
href: "/awooop/runs",
},
{
phase: "P0",
title: "審批與 Run State 對齊",
status: "live",
surface: "審批佇列",
source: "awooop_run_state",
gate: "waiting_approval 才能 decide",
href: "/awooop/approvals",
},
{
phase: "P1",
title: "Contract Lifecycle",
status: "watching",
surface: "合約儀表板",
source: "awooop_contract_revisions",
gate: "draft → published → active",
href: "/awooop/contracts",
},
{
phase: "P1",
title: "Tenant Migration State",
status: "watching",
surface: "租戶管理",
source: "awooop_projects",
gate: "shadow gate 需量化",
href: "/awooop/tenants",
},
{
phase: "P1",
title: "MCP Gateway 與 Context Firewall",
status: "watching",
surface: "Run 監控",
source: "mcp_gateway audit / redaction",
gate: "tool call 必須帶 project_id",
href: "/awooop/runs",
},
{
phase: "P2",
title: "Communication Hub",
status: "watching",
surface: "Run 監控 / 審批佇列",
source: "conversation_event / outbound_message",
gate: "Telegram 先 mirror 再切流",
href: "/awooop/runs",
},
{
phase: "P2",
title: "Operator Console 正式接入主站",
status: "in_progress",
surface: "AwoooP",
source: "apps/web/src/app/[locale]/awooop",
gate: "/zh-TW/awooop 不得再 redirect 異常",
href: "/awooop/tenants",
},
];
/**
 * Small bordered pill for a work-item status.
 *
 * The visible text and the colour classes are looked up in `statusConfig`;
 * the colour classes are merged with the shared pill classes through `cn`.
 */
function StatusBadge({ status }: { status: WorkStatus }) {
  const { label, className: toneClasses } = statusConfig[status];
  const pillClasses = cn(
    "inline-flex items-center border px-2 py-0.5 text-xs font-semibold",
    toneClasses
  );

  return <span className={pillClasses}>{label}</span>;
}
// Number of entries in workItems whose status equals the given one.
const countWithStatus = (wanted: WorkStatus): number =>
  workItems.filter((entry) => entry.status === wanted).length;

// Summary tiles shown above the table: one label, count and icon per status.
const summary = [
  { label: "Live", value: countWithStatus("live"), icon: Activity },
  { label: "In Progress", value: countWithStatus("in_progress"), icon: GitBranch },
  { label: "Watching", value: countWithStatus("watching"), icon: Gauge },
  { label: "Blocked", value: countWithStatus("blocked"), icon: ShieldCheck },
];
/**
 * Work-chain page of the AwoooP console (default export of this file).
 *
 * Renders three stacked sections from module-level data only (no props, no
 * state, no fetching in this component):
 *  1. a header row that shows `workItems.length`,
 *  2. a four-column grid with one tile per entry of `summary`
 *     (label, icon, count),
 *  3. a horizontally scrollable table with one row per entry of `workItems`,
 *     keyed by `phase` + `title`, whose last cell is a `Link` to `item.href`.
 */
export default function AwoooPWorkItemsPage() {
return (
<div className="space-y-5">
<div className="flex flex-wrap items-center justify-between gap-3">
<div className="flex items-center gap-3">
<ClipboardList className="h-5 w-5 text-[#d97757]" aria-hidden="true" />
<div>
<h2 className="text-lg font-semibold tracking-normal text-[#141413]">
</h2>
<p className="text-xs text-[#77736a]">
{workItems.length}
</p>
</div>
</div>
</div>
<div className="grid gap-px border border-[#e0ddd4] bg-[#e0ddd4] md:grid-cols-4">
{summary.map((item) => {
// JSX needs a capitalised identifier to render the icon as a component.
const Icon = item.icon;
return (
<div key={item.label} className="bg-white px-4 py-3">
<div className="flex items-center justify-between">
<span className="text-xs font-medium text-[#77736a]">{item.label}</span>
<Icon className="h-4 w-4 text-[#87867f]" aria-hidden="true" />
</div>
<div className="mt-2 font-mono text-2xl font-semibold text-[#141413]">
{item.value}
</div>
</div>
);
})}
</div>
<div className="overflow-hidden border border-[#e0ddd4] bg-white">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="AwoooP 工作鏈路">
<thead>
<tr className="border-b border-[#e0ddd4] bg-[#faf9f3]">
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
Phase
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
Source
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
Gate
</th>
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
Link
</th>
</tr>
</thead>
<tbody>
{workItems.map((item) => (
<tr key={`${item.phase}-${item.title}`} className="border-b border-[#eee9dd] last:border-b-0">
<td className="px-4 py-3 font-mono text-xs font-semibold text-[#141413]">
{item.phase}
</td>
<td className="px-4 py-3 text-sm font-medium text-[#141413]">
{item.title}
</td>
<td className="px-4 py-3">
<StatusBadge status={item.status} />
</td>
<td className="px-4 py-3 text-sm text-[#5f5b52]">
{item.surface}
</td>
<td className="px-4 py-3">
<span className="inline-flex items-center gap-1.5 font-mono text-xs text-[#5f5b52]">
<Database className="h-3.5 w-3.5" aria-hidden="true" />
{item.source}
</span>
</td>
<td className="px-4 py-3">
<span className="inline-flex items-center gap-1.5 text-sm text-[#5f5b52]">
<Network className="h-3.5 w-3.5" aria-hidden="true" />
{item.gate}
</span>
</td>
<td className="px-4 py-3">
<Link
href={item.href}
className="inline-flex items-center gap-1.5 border border-[#d8d3c7] px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
>
<ArrowRight className="h-3.5 w-3.5" aria-hidden="true" />
</Link>
</td>
</tr>
))}
</tbody>
</table>
</div>
</div>
</div>
);
}

View File

@@ -87,6 +87,7 @@ const NAV_SECTIONS: NavSection[] = [
{ id: 'security-compliance', href: '/security-compliance', labelKey: 'securityCompliance',Icon: Shield },
{ id: 'knowledge', href: '/knowledge', labelKey: 'knowledge', Icon: BookOpen },
{ id: 'governance', href: '/governance', labelKey: 'governance', Icon: ShieldCheck },
{ id: 'awooop', href: '/awooop', labelKey: 'awooop', Icon: BrainCircuit },
],
},
{

View File

@@ -6,6 +6,71 @@
---
## 2026-05-05 | 重開機後排程與 startup baseline 修復
**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`
**本次排程/啟動鏈修補**
- 120/121 K3s 回到 Ready;CD workflow 目標從 121 改為 120,避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗;120 已驗證 limited sudo kubectl 可用。
- K8s CronJob 修正:`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS;手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。
- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。
- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`
- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後以低優先權手動備份成功Prometheus `backup_110_last_success_timestamp` 已更新。
- 188 momo-scheduler 修正 dashboard URL:容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`。
- 188 Google Drive token 從 legacy pickle 轉為 JSON;scheduler 容器內 `GoogleDriveService().authenticate()` 通過。
- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`
- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot` 與 `realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。
- 110 停用兩個過期 startup units:`momo-startup-complete.service`(指向不存在路徑/錯 host)、`wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。
- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`,`systemctl --failed` 為 0。
- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。
- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria要求排程真正可執行才算 reboot recovery 完成。
**最終驗證**
- KM reembed 完成:`1774/1774` success、`0` failed;DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。
- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`
- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test`:`PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`;HostBackupFailed 解除。
- 188/110 負載回到低檔;K3s node CPU 約 3-6%;KM reembed 未造成主機過載。
**下一步**
- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy避免下一版 image 覆蓋 hotfix。
- 修復 `grist.wooo.work` / `registry.wooo.work` public route,或改用 DNS-01 renewal;目前舊 renewal config 已停用,以避免 certbot timer 每次失敗。
## 2026-05-05 | 110 Sentry resource limits persistence gap closed
**背景**110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。
**現場結論**
- 188 已回穩:load 約 `2.26 / 2.84 / 3.21`;momo/litellm/SignOz 核心容器都有 live CPU/memory guardrail;仍有 `HostBackupFailed`,但與 CPU/load 無關。
- 110 仍是 Sentry 長尾,不是 runner 或 momo 類事故;ClickHouse 約 2.2-3.0 cores,Kafka 約 0.6 core,taskworker/taskbroker/taskscheduler/redis/uptime-checker 合計形成背景 load。
- ClickHouse 目前不是查詢卡死:`system.processes` 無長查詢,`system.mutations` 無 pending,`system.merges` 只看到短 transaction merge;最大資料表是 `eap_items_1_local`(`6.68 GiB`)。
- Kafka consumer lag 查詢未見 backlog 膨脹;目前不應再靠降低 ClickHouse/Kafka memory 或泛用 restart。
- 真正缺口:110 live limit 已存在,但 `/opt/sentry/docker-compose.yml` 只持久化了 `process-spans`;ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis 一旦 compose recreate 可能回到 unlimited。
**本次 live 修補**
- 110 `/opt/sentry/docker-compose.yml` 已備份為 `docker-compose.yml.bak-20260505-155707-codex-resource-limits`
- 持久化 Sentry 核心 guardrailClickHouse `2 CPU / 8 GiB / 16 GiB swap`、Kafka `2 CPU / 3 GiB / 6 GiB swap`、taskworker `2 CPU / 2 GiB / 4 GiB swap`、taskbroker `1 CPU / 512 MiB / 1 GiB swap`、taskscheduler `0.5 CPU / 512 MiB / 1 GiB swap`、redis `0.5 CPU / 512 MiB / 1 GiB swap`、uptime-checker `0.5 CPU / 512 MiB / 1 GiB swap`
- 只對 uptime-checker 補 live `docker update`,未重啟 Sentry/ClickHouse/Kafka容器仍 `Up 5 days`
- 110 `/opt/sentry/clickhouse/config.xml` 已備份為 `config.xml.bak-20260505-160120-codex-merge-pool4`;ClickHouse 背景 merge 從 pool `8` 降到 `4`,三門檻從 `6/4/6` 降到 `3/2/3`,`max_bytes_to_merge_at_max_space_in_pool` 從 `512MiB` 降到 `256MiB`。
- `SYSTEM RELOAD CONFIG` 不會熱套用這些 ClickHouse 25.3 設定,因此只重啟 `sentry-self-hosted-clickhouse-1`;重啟前 active foreground processes `1`(查詢本身)、pending mutations `0`。
**驗證**
- `/opt/sentry/docker-compose.yml` `docker compose config` passed(僅 upstream `version` obsolete warning)。
- `docker inspect` 顯示 ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis/uptime-checker live limit 全部與 compose baseline 一致。
- 110 load 從約 `12.50 / 13.10 / 13.35` 降到 `7.41 / 10.60 / 12.35`;`HostLoadAverageSustainedHigh` 未 firing,`DockerContainerCpuSustainedHigh` 僅 pending 於 Sentry ClickHouse。
- ClickHouse 重啟後 16 秒 healthy;runtime setting 已確認 `background_pool_size=4`、三門檻 `3/2/3`、merge 上限 `268435456` bytes;active merges `0`、pending mutations `0`、ClickHouse CPU 約從 `2.1-2.7 cores` 降到 `0.67 core`。
- 因 4 條 merge thread 仍可讓 ClickHouse 短暫回到 2.7 cores將 live + compose CPU quota 從 `4` 收到 `2`,記憶體維持 `8 GiB`;後續 topk 顯示 ClickHouse 約 `2.0 cores`,由 CPU quota 保護 host。
- 後續 host `ps` 顯示剩餘 `HostHighCpuLoad` 主因之一是 CD Web image build:`node /app/.../next build` 約 `1.4 cores`,疊加 Gitea/ClickHouse/Kafka;已在 `apps/web/Dockerfile` 設定 `NEXT_PRIVATE_BUILD_WORKER_COUNT=1`,並將 `pnpm turbo build --filter=@awoooi/web` 改為 `--concurrency=1`,避免 Web build 再把 110 推到長時間高 CPU。
- 將 `HostHighCpuLoad` 從 `CPU >80% for 5m` 調成 `CPU >90% for 10m` 的早期 warning;真正長時間過載/自動診斷交給 `HostLoadAverageSustainedHigh`(`load5/core >1.5 for 15m`)。
- Prometheus firing alerts 只剩 `FlywheelExecutionRateMissing` 與 188 `HostBackupFailed`;Docker/runner guardrail alerts clean。
**下一步**
- 110 若 ClickHouse sustained CPU 仍 pending 超過 drain window下一步查 EAP/profiling/replay/uptime 是否需要保留;不要先降 ClickHouse memory 或重啟。
- 將其他 unlimited 低流量容器分批納入 baseline不一次全量加避免把 Sentry/Harbor/monitoring 次要服務壓出新事故。
- 188 優先修 `HostBackupFailed` 與 momo scheduler Google Drive/白頁檢查雜訊CPU/load 不是當前阻塞。
## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地
**背景**:統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置,造成服務卡死或慢性過載;本輪接續盤點 live Docker inspect / docker stats / compose 宣告。
@@ -3033,3 +3098,130 @@ C1evolver 加 YAML_RULE guard+ C2seeder SQL `AND status != 'deprecated'
```bash
psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
```
---
## 2026-05-05台北— 四主機重開機後全站冷啟動救援
**觸發**110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。
### 已恢復
| 範圍 | 結果 |
|------|------|
| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal``k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows120 / 121 重新 Ready |
| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` RunningVIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal``momo-pro-system` / scheduler / bot / DB 全部 healthy公網 `/` 200、`/health` 200 |
| 110 host overload | actions runner units 維持最後放行Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復Sentry stack healthy |
| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption已 clean-clone 可讀資料並保留原始 corrupt volumeSignOz HTTP 恢復 |
| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md``scripts/reboot-recovery/full-stack-cold-start-check.sh` |
### 驗證
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
# PASS=31 WARN=0 BLOCKED=0
# Result: GREEN. Full stack is ready for controlled runner/CD release.
```
### Dirty reboot 資料保全
- 110 Sentry ClickHouse原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`
- 110 Sentry Kafkamalformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint不刪 topic/log data。
- 188 SignOz ClickHouse原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。
- 188 `momo-db`WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`
### 已知隔離 / 後續
- 110 actions runner units 仍按策略最後放行;guardrail 已套用,`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。
- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error不阻斷主服務後續可清理或調查。
---
## 2026-05-05台北— GCP Ollama 告警路徑止血與內網化決策
**觸發**:告警卡仍顯示 `Router: Gemini`,且 GCP-A / GCP-B Ollama 先前在告警 JSON prompt 上連續 504導致 Gemini 備援產生費用。
### 已執行
| 範圍 | 結果 |
|------|------|
| 告警模型 | 將告警專用 Ollama 模型固定為 `gemma3:4b`,避免 `qwen3:14b` / `qwen2.5-coder:32b` 冷啟動拖入 Gemini |
| Production image | `awoooi-api` / `awoooi-worker` 已手動切到 `192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e` |
| Production env | 已明確設定 `ALERT_AI_ENFORCE_OLLAMA_FIRST=true``ALERT_AI_ALLOW_CLOUD_FALLBACK=true``ALERT_OLLAMA_MODEL=gemma3:4b` |
| GCP Ollama 保溫 | GCP-A / GCP-B 已卸載 14B / 32B 重模型,並以 `keep_alive=8h` 保溫 `gemma3:4b` |
| Meta W-6 降噪 | Trust Drift 未達 20% 時不再升級為 Meta System現場 Redis 已加 6h dedup 防止重複通知 |
### 現場驗證
```bash
kubectl -n awoooi-prod get deploy awoooi-api awoooi-worker -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .spec.template.spec.containers[*]}{.name}={.image}{" "}{end}{"\n"}{end}'
# awoooi-api api=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
# awoooi-worker worker=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv | grep -E 'ALERT_OLLAMA_MODEL|ALERT_AI_|OLLAMA_.*URL'
# ALERT_OLLAMA_MODEL=gemma3:4b
# ALERT_AI_ALLOW_CLOUD_FALLBACK=true
# ALERT_AI_ENFORCE_OLLAMA_FIRST=true
# OLLAMA_URL=http://192.168.0.110:11435
# OLLAMA_SECONDARY_URL=http://192.168.0.110:11436
# OLLAMA_FALLBACK_URL=http://192.168.0.111:11434
```
### 架構決策
- 目前 `192.168.0.110:11435/11436` 是經由 110 nginx 轉發到 GCP 公網 IP屬於過渡方案不應作為長期 primary Ollama lane。
- 建議建立 WireGuard site-to-site private mesh讓 K3s / 110 / 111 / GCP-A / GCP-B 以私網 IP 互連Ollama 僅綁定 mesh interface並由 AwoooP Inference Gateway 統一路由、熔斷、佇列與模型保溫。
- 注意:目前 GCP-A / GCP-B `/api/ps` 顯示 `size_vram: 0`,內網化可解決連線與安全問題,但無法讓 CPU-only GCP 等同 111 的 VRAM/GPU 效能;大模型應留在 111 或改用 GPU 型 GCP 節點。
---
## 2026-05-06台北— AwoooP Operator Console 與飛輪 KPI 對齊
**觸發**00:30 系統報告顯示「全系統正常」,但飛輪狀態為 `修復 0/15 (0%)`,使用者指出 AI 自動化幾乎沒有做;同步要求 AwoooP 工作項目必須與前端頁面、邏輯、操作面對齊。
### 已修正
| 範圍 | 結果 |
|------|------|
| 心跳報告 | `HeartbeatReportService._get_flywheel_stats()` 改讀 `auto_repair_executions`,不再用已失準的 `incidents.outcome` 推估修復率 |
| 飛輪 Prometheus KPI | `FlywheelStatsService._playbook_stats()` 優先以 `auto_repair_executions` 計算 24h execution success rateRedis playbook counter 僅作 fallback |
| AI Success | `MetricsDBRepository` 改用 `UPPER(status::text)` 對齊實際 `APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED` 狀態值 |
| Auto-repair metric | `AutoRepairService.execute_auto_repair()` 成功/失敗都呼叫 `record_auto_repair()`,修正 Prometheus 指標零 caller 問題 |
| K8s Pod 報告 | Completed/Succeeded CronJob pod 不再顯示為紅色失敗Telegram 報告會顯示 phase |
| AwoooP 前端 | `/zh-TW/awooop` redirect 修正Console 接入主 `AppLayout` 與 sidebar新增 `工作鏈路` 頁映射 P0/P1/P2 工作項目、source of truth、gate 與操作面 |
| AwoooP API | `GET /api/v1/platform/approvals?run_id=` 支援 M8 詳情頁查單筆 waiting approval |
### 驗證
```bash
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
apps/api/.venv/bin/python -m py_compile \
apps/api/src/repositories/metrics_repository.py \
apps/api/src/services/heartbeat_report_service.py \
apps/api/src/services/auto_repair_service.py \
apps/api/src/services/flywheel_stats_service.py \
apps/api/src/api/v1/platform/operator_runs.py \
apps/api/src/services/platform_operator_service.py
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
apps/api/.venv/bin/python -m ruff check --select E9,F401,F821 \
apps/api/src/repositories/metrics_repository.py \
apps/api/src/services/heartbeat_report_service.py \
apps/api/src/services/auto_repair_service.py \
apps/api/src/services/flywheel_stats_service.py \
apps/api/src/api/v1/platform/operator_runs.py \
apps/api/src/services/platform_operator_service.py
# All checks passed!
pnpm --filter @awoooi/web typecheck
# tsc --noEmit passed
```
### 後續
- 仍需處理 `approval_records.matched_playbook_id = NULL` 問題,否則執行結果無法完整回寫 Playbook trust。
- 仍需攔截 AI action hallucination(alertname 被當 deployment/host、namespace 亂填)進入 approval 前的路徑。
- AwoooP Console 下一步應接入真實 run step journal / trace view,而不是只列 run state。

View File

@@ -0,0 +1,497 @@
# AWOOOI Full-Stack Cold Start SOP
> Version: v1.0
> Last updated: 2026-05-05 Asia/Taipei
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
---
## 0. When To Use This
Use this SOP when any of these happen:
- 110/120/121/188 reboot unexpectedly.
- All services are abnormal after a power/network event.
- K3s is stuck `activating`.
- Host load remains high during startup and service health is mixed.
- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
The rule is simple: **recover the dependency chain, not the loudest symptom.**
---
## 1. Golden Startup Order
```text
0. Freeze automation and preserve evidence
1. Physical/network layer
2. 188 data layer
3. 110 registry/observability layer
4. 120/121 K3s layer
5. AWOOOI workload layer
6. Public routes and alert chain
7. High-load batch/consumer/crawler services
8. Runner/CD
9. AI auto-remediation
10. 112 Kali scanner, if needed
```
Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
---
## 2. Automation Freeze
Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
| Item | Cold-start policy | Reason |
|------|-------------------|--------|
| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
---
## 3. P0 Evidence And Network
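Run from any machine on the same LAN. Note that the snippet is not fully portable: `nc -G 3` is the macOS connect-timeout flag (on Linux use `nc -w 3 -z` instead), and `ping -W 2` means 2 seconds on Linux but 2 milliseconds on macOS, so adjust the value for the machine you run it from: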
```bash
for h in 110 120 121 188; do
ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
done
arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
for h in 110 120 121 188; do
nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
done
```
Then capture reboot evidence:
```bash
ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
```
If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
---
## 4. P0 188 Data Layer
188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
### 4.1 Startup order
1. `containerd`
2. `docker`
3. `postgresql@14-main`
4. `k3s_datastore.kine` maintenance
5. `redis-server` on `6380`
6. `ollama` or current AI proxy dependencies
7. `nginx`
8. Docker networks
9. MinIO / OpenClaw / SignOz
10. momo / litellm / batch services after load is stable
### 4.2 Read-only check
```bash
ssh ollama@192.168.0.188 '
hostname; date; uptime; free -h
systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
pg_isready -h localhost -p 5432 || true
redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
'
```
### 4.3 PostgreSQL WAL checkpoint damage
Signature:
```text
PANIC: could not locate a valid checkpoint record
invalid primary checkpoint record
unexpected pageaddr ... in log segment ...
```
This blocks:
- `188:5432`
- K3s startup on 120/121
- AWOOOI API DB access
- Alertmanager webhook if API cannot start
Human-approved recovery command on 188:
```bash
sudo systemctl stop postgresql@14-main
sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
sudo systemctl start postgresql@14-main
pg_isready -h localhost -p 5432
sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;"
```
Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
---
## 5. P0/P1 110 Registry And Observability
110 must recover Harbor/Gitea/Monitoring early, but runners last.
### 5.1 Startup order
1. `docker`
2. Remove `Exited (128)` / `Exited (137)` orphan containers
3. Harbor `harbor-log`
4. Harbor full stack
5. Gitea
6. Prometheus / Alertmanager / Grafana / exporters
7. Langfuse
8. SignOz
9. Sentry DB layer
10. Sentry web/worker/consumer layer
11. Gitea host runner and actions runners
### 5.2 Checks
```bash
ssh wooo@192.168.0.110 '
hostname; date; uptime; free -h
systemctl is-active docker || true
curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
docker ps --format "{{.Names}}\t{{.Status}}" | head -120
'
```
Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
### 5.3 Runner gate
Runner may start only after all are true:
- `188 PostgreSQL` ready
- `110 Harbor` ready
- `110 Gitea` ready
- `120/121 K3s` nodes ready
- AWOOOI API health passes
- 110 load/core is below `1.0` for at least 15 minutes
- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
Check:
```bash
ssh wooo@192.168.0.110 '
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
echo "=== $u ==="
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
done
'
```
If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
```bash
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
```
---
## 6. P1 120/121 K3s
K3s must wait for 188 PostgreSQL and 110 Harbor.
### 6.1 Startup order
1. 120 `k3s.service`
2. 121 `k3s-agent.service` or its live role
3. CNI / kube-proxy
4. Nodes Ready
5. Core pods
6. `awoooi-prod` pods
7. keepalived VIP `192.168.0.125`
8. NodePorts `32334` and `32335`
### 6.2 Checks
```bash
ssh wooo@192.168.0.120 '
hostname; uptime
pg_isready -h 192.168.0.188 -p 5432 || true
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
kubectl get nodes -o wide 2>/dev/null || true
kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
ssh wooo@192.168.0.121 '
hostname; uptime
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
```
If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
---
## 7. P2 AWOOOI Workloads
Run after K3s nodes are Ready:
```bash
ssh wooo@192.168.0.120 '
kubectl get deploy -n awoooi-prod
kubectl get pods -n awoooi-prod -o wide
kubectl get svc -n awoooi-prod
kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
'
curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
```
If pods are `ImagePullBackOff`, go back to 110 Harbor.
If API health fails because DB/Redis is down, go back to 188.
---
## 8. P2 Alert Chain
Current main path:
```text
Prometheus/Alertmanager on 110
-> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
-> AWOOOI API
-> TelegramGateway
-> Telegram
```
Alertmanager health alone is not enough. Run E2E:
```bash
curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
-H 'Content-Type: application/json' \
-d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
```
Expected: API returns success and Telegram receives the test alert.
---
## 9. P2 Schedules And Delayed Work
Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
| Host / Layer | Required check | Success baseline |
|--------------|----------------|------------------|
| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
Useful checks:
```bash
ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
```
If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
---
## 10. P2/P3 Stateful Service Guardrails
| Tier | Examples | Automation |
|------|----------|------------|
| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
Never use generic `docker restart $(docker ps -q)` during cold start.
### 10.1 Dirty-Reboot Storage Corruption
Treat these log signatures as storage corruption, not ordinary service flakiness:
- `Bad message`
- `Structure needs cleaning`
- `Unknown codec`
- `PANIC: could not locate a valid checkpoint record`
- Kafka `Malformed line` in checkpoint files
- ClickHouse `broken and needs manual correction`
Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
### 10.2 ClickHouse Clean-Clone Recovery Pattern
Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
```text
1. Stop the compose stack or at least stop dependent consumers.
2. Disable restart loops for the failing container.
3. Save logs and build an exclude list from unreadable store paths.
4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
5. Create a clean _data clone with readable files only.
6. Add flags/force_restore_data.
7. Start ClickHouse first, then web/API, then consumers.
8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
```
Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifact is:
```text
/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
```
### 10.3 Kafka Checkpoint Recovery Pattern
If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
```text
log-start-offset-checkpoint
recovery-point-offset-checkpoint
replication-offset-checkpoint
```
Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
---
## 11. P3 High-Load Services
Only release these after P0/P1/P2 gates are green:
| Host | Service | Release condition |
|------|---------|-------------------|
| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
| 188 | litellm | `/health/liveliness` good and provider route verified |
| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
| 110 | Sentry uptime-checker | Sentry web/DB healthy |
| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
---
## 12. Baseline And AI Auto-Remediation Gate
### 12.1 Stable Runtime Baseline
These are release gates after the first cold-start recovery pass:
| Area | Baseline |
|------|----------|
| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
### 12.2 AI Auto-Remediation Gate
AI auto-repair can move from observe-only to limited execution only after:
- Prometheus rules are loaded.
- docker/systemd textfile exporter files are fresh.
- blackbox probes have stable results.
- cron/CronJob schedule checks are green.
- AWOOOI API `/api/v1/health` passes.
- Alertmanager E2E webhook passes.
- Redis/KM/playbook health is available.
- No active restart storm.
- Host load/core remains below `1.0` for 15 minutes.
Until then:
- diagnose only
- notify only
- require human approval for remediation
- no DB/ClickHouse/Harbor/Sentry destructive action
- no generic restart action against stateful services
---
## 13. One-Command Readiness Script
Run:
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh
```
The script is read-only. It reports gates:
- `P0-NETWORK`
- `P0-188-DATA`
- `P0-110-REGISTRY`
- `P1-K3S`
- `P2-WORKLOAD`
- `P2-ALERTCHAIN`
- `P2-PUBLIC-ROUTES`
- `P2-SCHEDULES`
- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
If it prints `BLOCKED`, fix the first blocked gate before moving forward.
---
## 14. Done Criteria
All must be true:
- Four hosts reachable by SSH.
- 188 PostgreSQL and Redis healthy.
- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
- 120/121 K3s nodes Ready.
- VIP `192.168.0.125` present.
- AWOOOI API and Web reachable through NodePort/VIP.
- Alertmanager E2E webhook succeeds.
- cron/CronJob schedules are active, unsuspended, and verified.
- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
- High-load batch services are capped or delayed.
- Runners are guarded and released last.
- AI auto-remediation is not in full execution mode until all gates are green.
---
## 15. Known Drift To Fix After Recovery
These must be cleaned after the incident, not during P0:
- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.

View File

@@ -9,11 +9,13 @@
| Service | Live Limit | Live Usage Snapshot | Verdict |
|---|---:|---:|---|
| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. |
| Sentry ClickHouse | 2 CPU / 8 GiB, merge pool 4 | capped near 2 cores after pool 8 -> 4 restart | Do not lower memory. CPU quota intentionally slows background merge so Sentry cannot dominate 110. If backlog grows, inspect `MergeMutate` and Sentry high-volume features before raising it. |
| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. |
| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. |
| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. |
| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. |
| Sentry redis | 0.5 CPU / 512 MiB | ~15-30% CPU / 19 MiB | Live and compose cap are aligned. |
| Sentry uptime-checker | 0.5 CPU / 512 MiB | ~26-30% CPU / 43-187 MiB | Capped after it showed sustained background CPU. |
| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. |
| GitHub/Gitea runners | unlimited systemd services | one runner had WatchdogSec=5min and 8,490 restarts; `act` CI containers caused load spikes | Must be monitored outside Docker. Remove bad watchdog drop-in and apply per-runner CPU/Memory quotas. |
| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. |
@@ -28,8 +30,11 @@
| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. |
| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. |
| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. |
| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. |
| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. |
| litellm | 1 CPU / 1 GiB | ~0.5-0.9% CPU / 780 MiB | Good cap; keep stateless mode and do not re-add `DATABASE_URL`. |
| momo-pro-system | 2 CPU / 2 GiB | ~1-2% CPU / 740 MiB | Good cap; startup cache prewarm must stay single-flight. |
| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 105-163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. |
| momo-telegram-bot | 0.5 CPU / 512 MiB | ~0.7% CPU / 66 MiB | Good cap. |
| momo-db | 2 CPU / 4 GiB | DB had short CPU bursts, then ~0.6-29% with no active long query | Good cap; current bursts are query/workload, not limit pressure. |
| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. |
## Baseline Policy
@@ -69,12 +74,13 @@ Use these thresholds for alerting and AI triage:
1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron.
2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts.
3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers.
4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
6. Add modest caps to currently unlimited low-risk services in small batches.
7. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
8. Fix 110 runner services with sudo-capable host maintenance:
3. Persist live limits in the owning compose files before considering the host repaired; live `docker update` alone is not durable.
4. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior or reduce high-volume Sentry features, not Kafka memory.
5. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
6. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
7. Add modest caps to currently unlimited low-risk services in small batches. Do not alert every unlimited auxiliary container at once; promote candidates only after 24h usage data.
8. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
9. Fix 110 runner services with sudo-capable host maintenance:
```bash
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
@@ -88,3 +94,4 @@ sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing.
- Letting monitoring collectors spend seconds per scrape; this turns observability into load.
- Leaving self-hosted runners unlimited on the same host as Sentry/ClickHouse/Gitea.
- Applying live `docker update` without persisting the same guardrail in compose/systemd/IaC.

View File

@@ -13,15 +13,15 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
### Panel 1 — Ollama 可用性 (Stat)
**看什麼**`up{job=~"ollama_111|ollama_188"}` × 100顯示每 Ollama 主機的 scrape 存活狀態。
**看什麼**:`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}` × 100,顯示每個 Ollama provider endpoint 的 scrape 存活狀態。
| 顏色 | 意義 |
|------|------|
| 綠色 100% | Prometheus 探測正常,主機在線 |
| 黃色 50% | 一台離線,另一台在線(容災中) |
| 紅色 0% | 兩台全離線,高風險 |
| 黃色 | 部分 endpoint 離線,系統應進入容災 |
| 紅色 0% | Ollama provider pool 全離線,高風險 |
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名 `ollama_111` / `ollama_188`
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名對齊 `ollama_gcp_a` / `ollama_gcp_b` / `ollama_local`
設定檔位於 `ops/monitoring/generated/prometheus-scrape-generated.yaml`
---
@@ -47,9 +47,10 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
| 分布 | 意義 |
|------|------|
| ollama 佔 >90% | 正常,111 健康 |
| gemini 佔多數 | 111 SLOW/DEGRADED/OFFLINE容災 |
| ollama_188 出現 | Gemini 配額耗盡備援,或 111 和 Gemini 同時失敗 |
| ollama / ollama_gcp_a 佔 >90% | 正常,GCP-A 健康 |
| ollama_gcp_b 佔多數 | GCP-A SLOW/DEGRADED/OFFLINE,容災到 GCP-B |
| ollama_local 出現 | GCP-A/B 均不可用,容災到 111 local |
| gemini 佔多數 | Ollama provider pool 全部不可用,使用付費備援 |
| 全部 nemotron/claude | 極端情況,所有主力 provider 失敗 |
---
@@ -71,10 +72,10 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
### `OllamaInstanceDown` — Ollama 主機離線
**觸發條件**`up{job=~"ollama_111|ollama_188"} == 0` 持續 2 分鐘。
**觸發條件**`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0` 持續 2 分鐘。
**影響評估**
- 系統應已自動切至 Gemini查 Panel 3 確認)
- 系統應已依序切至 GCP-B / 111 local / Gemini(查 Panel 3 確認)
- 查 Panel 4 是否有 Failover 計數上升
**排查步驟**
@@ -82,11 +83,9 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
```bash
# 步驟 1:確認主機存活
ping -c 3 192.168.0.111
ping -c 3 192.168.0.188
# 步驟 2:SSH 進主機確認 ollama 服務狀態
ssh wooo@192.168.0.111 'systemctl status ollama'
ssh wooo@192.168.0.188 'systemctl status ollama'
# 步驟 3:查 ollama 最近的 journal log
ssh wooo@192.168.0.111 'journalctl -u ollama -n 50 --no-pager'
@@ -210,8 +209,9 @@ ssh wooo@192.168.0.111 'systemctl status ollama && nvidia-smi'
| Metric | 類型 | 狀態 | 說明 |
|--------|------|------|------|
| `up{job="ollama_111"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_188"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_gcp_a"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_gcp_b"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_local"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `ollama_failover_triggered_total` | Counter | ✅ P2.3 補入 | failover 切換次數(labels: from_provider, to_provider) |
| `ollama_recovery_triggered_total` | Counter | ✅ P2.3 補入 | recovery 切回次數(labels: from_provider) |
| `ollama_health_status{host}` | Gauge | ✅ P2.3 補入 | 健康狀態 1=healthy, 0=not_healthy |

View File

@@ -1,34 +0,0 @@
# ============================================================================
# PATCH: 188 CPU-only Ollama 備援端點
# 日期: 2026-04-25 (台北時區)
# 負責人: ogt + Claude Sonnet 4.6
# ADR 參考: plan_complete_v3.md P0.5
# 診斷實測數據:
# 主機: 192.168.0.188, Intel Xeon Silver 4214 @ 2.2GHz, 12 核, CPU-only
# RAM: 62GB (used 14GB), Disk: 982GB (used 221GB)
# GPU: 無
# 現有模型: qwen2.5:7b-instruct (4.5GB), llama3.2:3b (1.9GB),
# deepseek-r1:14b (8.5GB), nomic-embed-text (261MB)
# 推理延遲實測: qwen2.5:7b-instruct → total=111s, eval_rate=0.09 token/s
# llama3.2:3b → total=155s (cold start, 比 7b 更慢)
# 目標 ~30s 無法達到 (CPU 推理硬上限 ~0.09 token/s)
# 決策: qwen2.5:7b-instruct 已存在,設為備援 (111s 延遲,使用者需知情)
# 連通性: 110 → 188:11434 ✅ 已驗證
# ⚠️ 注意: 188 推理極慢(~111s),應只在 111 GPU Ollama 完全失效時啟用
# 建議: 程式碼層應設 OLLAMA_FALLBACK_188_TIMEOUT_SEC = 150
# ============================================================================
#
# 將以下兩行加入 /Users/ogt/awoooi/k8s/awoooi-prod/04-configmap.yaml
# 建議位置: OLLAMA_URL 行 (第 20 行) 之後
#
# --- 新增內容 ---
# 2026-04-25 ogt + Claude Sonnet 4.6: 188 CPU-only Ollama 備援 (plan_complete_v3 P0.5)
# ⚠️ 188 推理延遲實測 ~111s (0.09 token/s, CPU-only Xeon 4214),僅作 111 完全失效時的降級備援
# 模型已存在: qwen2.5:7b-instruct (4.5GB), 無需重拉
OLLAMA_FALLBACK_188: "http://192.168.0.188:11434"
OLLAMA_188_MODEL: "qwen2.5:7b-instruct"
# --- 新增內容結束 ---
#
# 使用方式 (需用戶 review 後手動 apply):
# kubectl -n awoooi-prod apply -f k8s/awoooi-prod/04-configmap.yaml
# kubectl -n awoooi-prod rollout restart deployment/awoooi-api

View File

@@ -42,8 +42,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: k3s-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +66,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls Prometheus and Telegram.
# The old awoooi-api ServiceAccount does not exist, which prevented
# Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -42,8 +42,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: weekly-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +66,7 @@ spec:
limits:
cpu: "500m"
memory: "256Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls app services, Prometheus,
# Git, and Telegram. The old awoooi-api ServiceAccount does not
# exist, which prevented Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -27,7 +27,10 @@ spec:
jobTemplate:
spec:
backoffLimit: 2
activeDeadlineSeconds: 300
# 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
# The script now fails if the API reports failed rows, so this longer
# deadline does not hide partial vectorization.
activeDeadlineSeconds: 1800
template:
metadata:
labels:
@@ -37,8 +40,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: km-vectorize
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- /app/scripts/cron_km_vectorize.py
@@ -46,7 +52,9 @@ spec:
- name: TZ
value: "Asia/Taipei"
- name: INTERNAL_API_URL
value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
# 2026-05-05 Codex: use the actual Service name; the old
# awoooi-api DNS name does not exist in awoooi-prod.
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
resources:
requests:
cpu: "50m"
@@ -54,4 +62,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this job only calls the internal API. The old
# awoooi-api ServiceAccount does not exist, which prevented Job pods
# from being created after reboot.
serviceAccountName: default

View File

@@ -63,10 +63,11 @@ spec:
print(f"status={r.status_code} body={r.text[:200]}")
asyncio.run(run())
env:
# 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
# 改用 NodePort 直連 K3s worker node同 K8s_API_SERVER_URL 解法)
# 2026-05-05 Codex: call the in-cluster Service instead of a
# fixed worker NodePort. After reboot, 121 can be unavailable
# while the Service and VIP are already healthy.
- name: INTERNAL_API_URL
value: "http://192.168.0.121:32334"
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
- name: DRIFT_SCAN_NAMESPACES
value: "awoooi-prod"
resources:

View File

@@ -88,7 +88,7 @@ spec:
# -----------------------------------------------------------------
- alert: NoAlertsReceived2Hours
expr: |
time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
for: 5m
labels:
severity: warning

View File

@@ -15,6 +15,39 @@
groups:
# =========================================================================
# Full-stack recovery scorecard recording rules
# =========================================================================
- name: full_stack_recovery_scorecard_rules
interval: 60s
rules:
- record: awoooi_recovery_core_ready
expr: |
sum without(result) (
awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
)
* on(host,scope) (
awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
)
* on(host,scope) (
awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
)
* on(host,scope) (
(time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
)
- record: awoooi_recovery_dr_offsite_ready
expr: |
max by(host) (
awoooi_backup_offsite_configured{host="110"} == bool 1
)
* on(host) max by(host) (
awoooi_backup_offsite_fresh{host="110"} == bool 1
)
* on(host) min by(host) (
awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
)
# =========================================================================
# 主機層告警 (host_alerts)
# =========================================================================
@@ -33,20 +66,22 @@ groups:
description: "Node Exporter 無回應超過 1 分鐘"
- alert: HostHighCpuLoad
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
# 2026-05-05 ogt + Codex: keep this as early warning only.
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 10m
labels:
severity: warning
layer: systemd-188
team: ops
auto_repair: "true"
auto_repair: "false"
# MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
mcp_provider: "ssh_host"
host_type: "bare_metal"
alert_category: "host_resource"
annotations:
summary: "主機 {{ $labels.host }} CPU 高負載"
description: "CPU 使用率超過 80%"
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
- alert: HostLoadAverageSustainedHigh
# 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
@@ -165,7 +200,7 @@ groups:
description: "過去 24 小時有備份失敗"
- alert: VeleroBackupNotRun
expr: time() - velero_backup_last_successful_timestamp > 86400
expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
for: 10m
labels:
severity: critical
@@ -175,7 +210,7 @@ groups:
auto_repair: "false"
annotations:
summary: "Velero 超過 24 小時未成功備份"
description: "最後一次成功備份超過 24 小時"
description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
@@ -505,7 +540,7 @@ groups:
description: "Sentry 錯誤可能無法正確處理"
- alert: NoAlertsReceived2Hours
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
for: 5m
labels:
severity: warning
@@ -665,7 +700,7 @@ groups:
- alert: DockerContainerMissingResourceLimit
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
for: 30m
labels:
severity: warning
@@ -1011,10 +1046,10 @@ groups:
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_backup_restore
interval: 1h
interval: 1m
rules:
- alert: BackupRestoreTestFailed
expr: awoooi_backup_restore_test_success == 0
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
for: 5m
labels:
severity: critical
@@ -1023,11 +1058,37 @@ groups:
auto_repair: "false"
annotations:
summary: "備份還原 dry-run 測試失敗"
description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。"
runbook: "先找最新 Completed Velero backup再執行 restore dry-run禁止在 production namespace 做真還原"
- alert: BackupRestoreTestMissing
expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
for: 30m
labels:
severity: warning
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "備份還原 dry-run 監控指標缺失"
description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present；110 backup health exporter 或 120 kubectl 查詢可能失效。"
runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
- alert: BackupRestoreTestCronMissing
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
for: 15m
labels:
severity: critical
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "備份還原 dry-run CronJob 缺失"
description: "velero namespace 找不到 backup-restore-test CronJob備份可還原性沒有定期驗證。"
runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
- alert: BackupRestoreTestStale
expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
for: 10m
labels:
severity: warning
@@ -1036,9 +1097,375 @@ groups:
auto_repair: "false"
annotations:
summary: "備份還原測試超過 8 天未執行"
description: "上次備份測試距今 {{ $value | humanizeDuration }}週排程 CronJob 可能失效。"
description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
# =========================================================================
# Host / service / config backup health
# =========================================================================
- name: full_stack_backup_health_alerts
interval: 1m
rules:
- alert: BackupHealthMonitorMissing110
expr: absent(awoooi_backup_health_monitor_up{host="110"})
for: 20m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份健康指標缺失"
description: "110 沒有輸出 backup_health.prom無法確認資料庫、設定檔與服務備份是否新鮮。"
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
- alert: BackupHealthMonitorMissing188
expr: absent(awoooi_backup_health_monitor_up{host="188"})
for: 20m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
host: "188"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "188 備份健康指標缺失"
description: "188 沒有輸出 backup_health.prom無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
- alert: BackupHealthMonitorStale
expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
description: "backup health textfile exporter stale備份狀態不可觀測。"
runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
- alert: BackupExpectedJobMissing
expr: awoooi_backup_job_configured{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-cron
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron先 dry-run 再執行"
- alert: BackupScheduleDuplicateActiveEntries
expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-cron
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份 crontab 有重複 active entries"
description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry可能造成 offsite sync、verifier 或 status job 重複執行。"
runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry不要刪除未理解的備份排程。"
- alert: BackupScheduleSingletonMismatch
expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-cron
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry目前 count={{ $value }},可能造成排程缺失或重複執行。"
runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
- alert: BackupScriptMissing
expr: awoooi_backup_script_present{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-script
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
- alert: BackupJobStale
expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-freshness
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
- alert: BackupAggregateRunFailed
expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-all
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log修正後執行 /backup/scripts/backup-all.sh"
- alert: BackupConfigCapturePartial
expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-config-capture
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }}，source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh確認 awoooi_backup_config_capture_ok 回到 1最後補跑 Google Drive/rclone offsite sync。"
- alert: BackupConfigCaptureStatusStale
expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-config-capture
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
description: "backup-configs.sh 沒有新鮮的 capture status無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
- alert: BackupIntegrityCheckMissingOrFailed
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
for: 30m
labels:
severity: critical
layer: host-backup
component: backup-integrity
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "110 備份倉庫完整性檢查缺失或失敗"
description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log禁止刪 repo 或 prune 直到確認原因"
- alert: BackupRestoreDrillMissingOrFailed
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-restore-drill
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份抽樣還原演練缺失或失敗"
description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
- alert: BackupOffsiteCopyNotConfigured
expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
for: 1m
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 尚未配置離機備份 provider"
description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
- alert: BackupOffsiteCopyStale
expr: |
(
(sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
and
(sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
)
and
(
(sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
or
((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
)
for: 2h
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 離機備份超過 48 小時未成功"
description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`；full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
- alert: BackupRetentionPolicyNotLatestOnly
expr: |
absent(awoooi_backup_retention_latest_only{host="110"})
or
awoooi_backup_retention_latest_only{host="110"} != 1
or
absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
or
awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份保留策略不是 latest-only"
description: "operator 要求所有備份只保留最新一份；本地 restic 必須 keep-last=1，Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1刷新 backup-health textfile必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
- alert: BackupSnapshotRetentionExceeded
expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshot；latest-only 策略要求每個 repo 全域只保留最新 1 份。"
runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
- alert: BackupOffsiteFullVerifyFailed
expr: |
awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
unless on(host, provider)
(awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 Google Drive full sync 完成但遠端驗證未通過"
description: "full offsite marker 已 fresh但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
- alert: BackupOffsiteRemoteSnapshotRetentionExceeded
expr: |
(awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
and on(host, provider)
(awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot；latest-only 策略要求遠端也只保留最新一份。"
runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
- alert: BackupCredentialEscrowEvidenceMissing
expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
for: 1m
labels:
severity: warning
layer: host-backup
component: credential-escrow
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
# =========================================================================
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
# =========================================================================
@@ -1321,3 +1748,284 @@ groups:
summary: "Prometheus ({{ $labels.instance }}) 停擺"
description: "Prometheus 自己停擺 → 所有其他告警失效"
runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
# =========================================================================
# Full-stack cold-start recovery gate
# =========================================================================
- name: cold_start_recovery_alerts
rules:
- alert: PrometheusRuleDriftGuardFailed
expr: |
absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
or
(time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
or
(awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
or
(awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
for: 10m
labels:
severity: critical
layer: systemd-110
component: prometheus-rule-drift-guard
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "Prometheus 規則漂移防護失效"
description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。"
runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
- alert: PrometheusRuleDriftAutoRepaired
expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
for: 1m
labels:
severity: warning
layer: systemd-110
component: prometheus-rule-drift-guard
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Prometheus 規則漂移已被自動修復"
description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。"
runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。"
- alert: ColdStartMonitorMissing
expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
for: 15m
labels:
severity: warning
layer: systemd-110
component: cold-start-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Cold-start monitor textfile metric missing"
description: "110 沒有輸出 awoooi_cold_start_monitor_up重開機恢復 gate 目前不可觀測。"
runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
- alert: ColdStartMonitorStale
expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
for: 10m
labels:
severity: warning
layer: systemd-110
component: cold-start-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Cold-start monitor stale"
description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。"
runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
- alert: ColdStartRecoveryBlocked
expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
for: 5m
labels:
severity: critical
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "Full-stack cold-start recovery BLOCKED"
description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only先處理第一個 blocked gate。"
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
- alert: K3sNodeFilesystemErrorGateBlocked
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
for: 5m
labels:
severity: critical
layer: k3s
component: node-filesystem
host: "120"
target_host: "120"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200也不可宣告下一次重開機安全。"
runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck禁止 online fsck。"
- alert: ColdStartHost120Unreachable
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
for: 3m
labels:
severity: critical
layer: host
component: host-reachability
host: "120"
target_host: "120"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "120 主機不可達，Full-stack cold-start 已阻擋"
description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s不能宣告所有主機重開機恢復完成。"
runbook: "查看 120 console。若停在 initramfs/manual fsck先對 root LV 做離線 fsck若主機關機或網卡異常先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
- alert: ColdStartRecoveryDegraded
expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
for: 15m
labels:
severity: warning
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Full-stack cold-start recovery DEGRADED"
description: "cold-start gate 有 {{ $value }} 個 WARN gate核心可用但不應放行 runner/CD/AI auto-repair full execution。"
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log修到 PASS/WARN/BLOCKED = green"
- alert: ColdStartLastGreenTooOld
expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
for: 15m
labels:
severity: warning
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Full-stack cold-start gate has not been GREEN recently"
description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。"
runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
# =========================================================================
# Host storage health / dirty reboot evidence
# =========================================================================
- name: host_storage_health_alerts
rules:
- alert: Host110StorageHealthMonitorMissing
expr: absent(awoooi_host_storage_monitor_up{host="110"})
for: 15m
labels:
severity: warning
layer: systemd-110
component: storage-health-monitor
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 storage health textfile metric missing"
description: "110 沒有輸出 storage_health.prom；dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
- alert: Host188StorageHealthMonitorMissing
expr: absent(awoooi_host_storage_monitor_up{host="188"})
for: 15m
labels:
severity: warning
layer: systemd-188
component: storage-health-monitor
host: "188"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "188 storage health textfile metric missing"
description: "188 沒有輸出 storage_health.prom；dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
- alert: HostStorageHealthMonitorStale
expr: time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900
for: 10m
labels:
severity: warning
layer: host-storage
component: storage-health-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} storage health textfile stale"
description: "storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。"
runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
- alert: HostRootFilesystemReadOnly
expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
for: 1m
labels:
severity: critical
layer: host-storage
component: root-filesystem
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。"
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck"
- alert: HostCurrentBootStorageErrorsDetected
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
for: 5m
labels:
severity: critical
layer: host-storage
component: kernel-storage
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。"
runbook: "先執行 read-only 診斷journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理"
- alert: HostPreviousBootStorageErrorsDetected
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
for: 30m
labels:
severity: warning
layer: host-storage
component: dirty-reboot-evidence
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
runbook: "把證據寫入 docs/LOGBOOK.md確認 full-stack cold-start gate 與 P3 gate下一次維護窗補 offline fsck/SMART/RAID 檢查"
- alert: HostFsckLogErrorsDetected
expr: sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0
for: 30m
labels:
severity: warning
layer: host-storage
component: fsck-log
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據"
description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。"
runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項"

View File

@@ -33,8 +33,10 @@ groups:
description: "Node Exporter 無回應超過 1 分鐘"
- alert: HostHighCpuLoad
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
# 2026-05-05 ogt + Codex: keep this as early warning only.
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 10m
labels:
severity: warning
layer: systemd-188
@@ -46,7 +48,7 @@ groups:
alert_category: "host_resource"
annotations:
summary: "主機 {{ $labels.host }} CPU 高負載"
description: "CPU 使用率超過 80%"
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
# 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
runbook: "host CPU 高負載排查：先 SSH ps aux 看 top 進程；若為第三方服務 (Sentry/ClickHouse 等) 寫 ADR 升級資源或調 limit；禁止 kubectl restart 跨 domain"
@@ -671,7 +673,7 @@ groups:
- alert: DockerContainerMissingResourceLimit
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
for: 30m
labels:
severity: warning

View File

@@ -26,8 +26,18 @@
- labels:
criticality: P0
owner: ai-team
service: ollama
url: http://192.168.0.188:11434/api/tags
service: ollama-gcp-a
url: http://192.168.0.110:11435/api/tags
- labels:
criticality: P0
owner: ai-team
service: ollama-gcp-b
url: http://192.168.0.110:11436/api/tags
- labels:
criticality: P0
owner: ai-team
service: ollama-local
url: http://192.168.0.110:11437/api/tags
- labels:
criticality: P0
owner: ai-team

View File

@@ -92,7 +92,9 @@ scrape_configs:
service: ollama
type: docker
targets:
- 192.168.0.188:11434
- 192.168.0.110:11435
- 192.168.0.110:11436
- 192.168.0.110:11437
- job_name: openclaw
static_configs:
- labels:

View File

@@ -82,11 +82,11 @@
"textMode": "auto"
},
"title": "Ollama 可用性",
"description": "up{job=~\"ollama_111|ollama_188\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_111 / ollama_188",
"description": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_gcp_a / ollama_gcp_b / ollama_local",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=~\"ollama_111|ollama_188\"} * 100",
"expr": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} * 100",
"legendFormat": "{{ job }}",
"refId": "A"
}
@@ -188,7 +188,7 @@
"tooltip": { "mode": "single", "sort": "none" }
},
"title": "AI Provider 路由分布",
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama 佔大多數\n- failover 中: gemini / ollama_188 比例上升\n- 全走 gemini = 111 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama / ollama_gcp_a 佔大多數\n- failover 中: ollama_gcp_b / ollama_local / gemini 比例上升\n- 全走 gemini = Ollama provider pool 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },

View File

@@ -6,7 +6,7 @@
# 部署方式: 手動合併至 alerts-unified.yml,或於 scripts/ops/deploy-alerts.sh 支援多檔時直接引用
#
# 標籤規範 (對齊 alerts-unified.yml):
# layer: systemd-188 | docker-188 (Ollama 跑在 188 主機)
# layer: ai-provider
# team: ai
# auto_repair: "true" | "false"
#
@@ -28,16 +28,16 @@ groups:
# -----------------------------------------------------------------------
# 🔴 [ACTIVE] Ollama 主機離線
# metric: up{job=~"ollama_111|ollama_188"}
# 前置條件: Prometheus scrape job 命名為 ollama_111 / ollama_188
# metric: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}
# 前置條件: Prometheus scrape job 命名對齊 ADR-110 provider pool
# (設定位於 ops/monitoring/generated/prometheus-scrape-generated.yaml)
# -----------------------------------------------------------------------
- alert: OllamaInstanceDown
expr: up{job=~"ollama_111|ollama_188"} == 0
expr: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0
for: 2m
labels:
severity: critical
layer: systemd-188
layer: ai-provider
team: ai
auto_repair: "false"
alert_category: "ollama_failover"
@@ -57,7 +57,7 @@ groups:
for: 10m
labels:
severity: warning
layer: systemd-188
layer: ai-provider
team: ai
auto_repair: "false"
alert_category: "ollama_failover"

View File

@@ -19,6 +19,7 @@ Exit Codes:
"""
import json
import os
import subprocess
import sys
from pathlib import Path
@@ -29,7 +30,7 @@ import httpx
# Configuration
# =============================================================================
OLLAMA_URL = "http://192.168.0.188:11434/api/generate"
OLLAMA_URL = os.getenv("OLLAMA_GENERATE_URL", "http://192.168.0.111:11434/api/generate")
MODEL = "llama3.2:8b"
PROJECT_ROOT = Path(__file__).parent.parent
RULES_FILE = PROJECT_ROOT / ".awoooi-agent-rules.md"

View File

@@ -18,17 +18,21 @@ import httpx
async def main() -> int:
api_base = os.environ.get(
"INTERNAL_API_URL",
"http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
"http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
)
url = f"{api_base}/api/v1/knowledge/embed-all"
async with httpx.AsyncClient(timeout=120) as client:
async with httpx.AsyncClient(timeout=1800) as client:
try:
resp = await client.post(url)
print(f"embed-all: {resp.status_code} {resp.text[:200]}")
if resp.status_code >= 400:
print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
return 1
result = resp.json()
if int(result.get("failed", 0)) > 0:
print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
return 1
return 0
except httpx.RequestError as exc:
print(f"ERROR: request failed — {exc}", file=sys.stderr)

View File

@@ -62,7 +62,6 @@ check_url "ArgoCD (121)" "https://192.168.0.121:30443"
echo ""
echo "--- AI 推理層 ---"
check_url "Ollama 111 GPU" "http://192.168.0.111:11434/api/tags"
check_url "Ollama 188 Hub" "http://192.168.0.188:11434/api/tags"
echo ""
echo "--- 觀測層 ---"

View File

@@ -0,0 +1,398 @@
#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
#
# Runs a fixed sequence of probes and tallies each result as PASS, WARN or
# BLOCKED. The script is meant to be read-only: it never restarts, deletes,
# repairs, or writes remote state.
#
# Flags:
#   --send-alert-test   additionally POST a synthetic event to the Alertmanager
#                       webhook (off by default)
#   -h, --help          print usage and exit 0
# Any other argument is rejected with exit status 64.

# errexit is deliberately not enabled here; individual probes report failure
# through the counters below instead of aborting the run.
set -uo pipefail

# Options shared by every ssh invocation: never prompt, give up quickly.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)

# ANSI colour sequences used by the reporting helpers.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# Result tallies, incremented by ok / warn / fail.
PASS=0
WARN=0
FAIL=0

# 1 when --send-alert-test was given, otherwise 0.
SEND_ALERT_TEST=0

for cli_arg in "$@"; do
  case "$cli_arg" in
    --send-alert-test)
      SEND_ALERT_TEST=1
      ;;
    -h|--help)
      cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]
Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
      exit 0
      ;;
    *)
      printf 'Unknown argument: %s\n' "$cli_arg" >&2
      exit 64
      ;;
  esac
done
# Print a coloured "=== <title> ===" banner preceded by a blank line.
# Arguments: $1 - section title
log_section() {
  local title=$1
  printf '\n%s=== %s ===%s\n' "$BLUE" "$title" "$NC"
}
# Report a passed check and bump the PASS counter.
# Arguments: $1 - description of the check
# The counter update is the last command, so the function returns 0.
ok() {
  local message=$1
  printf '%sOK%s %s\n' "$GREEN" "$NC" "$message"
  PASS=$((PASS + 1))
}
# Report a non-blocking problem and bump the WARN counter.
# Arguments: $1 - description of the check
# The counter update is the last command, so the function returns 0.
warn() {
  local message=$1
  printf '%sWARN%s %s\n' "$YELLOW" "$NC" "$message"
  WARN=$((WARN + 1))
}
# Report a blocking problem (printed as "BLOCKED") and bump the FAIL counter.
# Arguments: $1 - description of the check
# The counter update is the last command, so the function returns 0.
fail() {
  local message=$1
  printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$message"
  FAIL=$((FAIL + 1))
}
# Run a local command, record it as ok/fail, and replay its combined output.
# Arguments: $1   - label used in the ok/fail line
#            $2.. - command and its arguments
# Outputs:   the ok/fail line followed by the command's stdout+stderr
# Returns:   0 when the command succeeded, 1 otherwise
run_local() {
  local label="$1"
  shift
  local capture status=0
  # A private mktemp file replaces the former fixed /tmp path, which could be
  # pre-created or symlinked by another user and collided between parallel runs.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    echo "run_local: cannot create temporary file" >&2
    fail "$label"
    return 1
  fi
  "$@" >"$capture" 2>&1 || status=$?
  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"
  # Callers only ever saw 0 or 1, so the command's own status is not propagated.
  if [ "$status" -eq 0 ]; then
    return 0
  fi
  return 1
}
# Run a command string on a remote host over ssh using the shared SSH_OPTS.
# Arguments: $1 - ssh destination (user@host)
#            $2 - command string executed by the remote shell
# When REMOTE_SUDO_PASSWORD is set and non-empty locally, a shell-quoted
# "REMOTE_SUDO_PASSWORD=<value> " assignment is prepended to the remote
# command so the remote script can read it.
# Returns:   the exit status of ssh
ssh_cmd() {
  local target=$1
  local remote_script=$2
  local env_prefix=""
  if [[ -n "${REMOTE_SUDO_PASSWORD:-}" ]]; then
    printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  fi
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}
# Fetch a URL with curl (5 second cap) and print only the HTTP status code.
# Arguments: $1 - URL to request
# Outputs:   the status code reported by curl, or "000" when curl printed nothing
probe_http_code() {
  local target=$1
  local status
  # A curl failure must not abort the caller; only the printed code matters.
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$target" 2>/dev/null) || true
  echo "${status:-000}"
}
# Test whether a TCP port accepts connections, using nc in zero-I/O mode.
# Arguments: $1 - host, $2 - port
# Returns:   0 when either nc invocation succeeds, otherwise the status of
#            the second attempt
# Two flag spellings are tried in turn: "-G 3" first, then "-w 3".
probe_tcp() {
  local target=$1
  local tcp_port=$2
  if nc -G 3 -z "$target" "$tcp_port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$target" "$tcp_port" >/dev/null 2>&1
}
# Print the three-line run banner: title, current local timestamp, and scope.
print_header() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S %Z')
  printf '%s\n' \
    "AWOOOI full-stack cold-start check" \
    "$stamp" \
    "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}
# P0 gate: ICMP and SSH-port reachability of hosts 192.168.0.{110,120,121,188}.
# Each host gets one ping and one TCP probe of port 22; every miss is recorded
# with fail. Matching ARP table rows are printed afterwards for diagnosis, and
# a warning is recorded when the arp|grep pipeline prints none.
check_network() {
  log_section "P0-NETWORK"
  local host addr
  # ping -W is in seconds on Linux (iputils) but in milliseconds on macOS and
  # FreeBSD. A literal 2 there means a 2 ms wait and makes healthy hosts look
  # down, so pick the value that yields a two-second wait on each platform.
  local ping_wait=2
  case "$(uname -s 2>/dev/null)" in
    Darwin|FreeBSD)
      ping_wait=2000
      ;;
  esac
  for host in 110 120 121 188; do
    addr="192.168.0.$host"
    if ping -c 1 -W "$ping_wait" "$addr" >/dev/null 2>&1; then
      ok "ping $addr"
    else
      fail "ping $addr"
    fi
    if probe_tcp "$addr" 22; then
      ok "ssh port $addr:22"
    else
      fail "ssh port $addr:22"
    fi
  done
  arp -an | grep -E '192\.168\.0\.(110|120|121|188)' || warn "no ARP rows printed for one or more hosts"
}
# P0 gate for host 188 (data tier). One ssh session as ollama@192.168.0.188
# collects tagged status lines; the lines are then matched locally.
# Blocking (fail): ssh session failed, PostgreSQL port closed, pg_isready not
# "accepting connections". Everything else is advisory (warn).
check_188() {
  log_section "P0-188-DATA"
  local report
  if ! report=$(ssh_cmd "ollama@192.168.0.188" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  ' 2>&1); then
    fail "ssh 188 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi

  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi

  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi

  # Inverted gate: a match here is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi

  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi

  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
# P0 gate for host 110 (registry and observability). One ssh session as
# wooo@192.168.0.110 collects HTTP status codes from local endpoints, the
# systemd properties of every "actions.runner.*" unit, and the docker ps
# listing; the tagged lines are then matched locally.
# Blocking (fail): ssh session failed, Harbor /v2/ not answering 200 or 401.
# Everything else is advisory (warn).
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local report
  if ! report=$(ssh_cmd "wooo@192.168.0.110" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  ' 2>&1); then
    fail "ssh 110 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi

  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi

  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi

  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi

  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi

  # Passes as soon as any line of the report carries WatchdogUSec=0.
  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi

  # Inverted gate: a match here is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
# P1 gate for the K3s cluster, inspected through wooo@192.168.0.120.
# The remote script prints host info, whether 192.168.0.188:5432 is reachable
# from 120, systemd states, kubectl node/pod listings (via sudo, using
# REMOTE_SUDO_PASSWORD when present) and any interface line containing the
# VIP 192.168.0.125. If the remote report shows no " Ready " node, a local
# kubectl is tried as a fallback source of node status.
# Blocking (fail): ssh session failed, 120 cannot reach PostgreSQL on 188,
# no " Ready " row from either source. A missing VIP is advisory (warn).
check_k3s() {
  log_section "P1-K3S"
  local remote_report
  local fallback_nodes=""
  if ! remote_report=$(ssh_cmd "wooo@192.168.0.120" '
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  ' 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$remote_report"
    return
  fi
  echo "$remote_report"

  # Only consult the local kubectl when the remote listing shows no Ready node.
  if ! grep -q " Ready " <<<"$remote_report"; then
    fallback_nodes=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$fallback_nodes" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$fallback_nodes"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$remote_report"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi

  if grep -q " Ready " <<<"$remote_report$fallback_nodes"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi

  if grep -q "192.168.0.125" <<<"$remote_report"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
# P2 gate: AWOOOI API/Web reachability through 192.168.0.125 and, optionally,
# the Alertmanager webhook.
# The API and Web status codes are probed from host 120 over ssh; when that
# ssh call fails, the same two URLs are probed from the local machine instead.
# An API code not starting with 2 or 3 is blocking (fail); the Web check is
# advisory (warn).
# When SEND_ALERT_TEST is 1, a synthetic "ColdStartCheck" alert is POSTed to
# the webhook from host 120 and a 2xx answer is expected; otherwise the POST
# is skipped and recorded as a warning.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_status web_status webhook_status
  local probe_report
  if probe_report=$(ssh_cmd "wooo@192.168.0.120" '
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  ' 2>/dev/null); then
    api_status=$(awk '/^API_CODE / {print $2}' <<<"$probe_report")
    web_status=$(awk '/^WEB_CODE / {print $2}' <<<"$probe_report")
  else
    # ssh to 120 failed: probe the same endpoints from this machine.
    api_status=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_status=$(probe_http_code "http://192.168.0.125:32335/")
    probe_report="API_CODE $api_status"$'\n'"WEB_CODE $web_status"
  fi
  echo "$probe_report"

  if [[ "$api_status" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi

  if [[ "$web_status" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -ne 1 ]; then
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
    return
  fi

  # The remote command string is kept verbatim; the '"'"' sequences embed
  # literal single quotes for the remote shell.
  webhook_status=$(ssh_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
-X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
-H '"'"'Content-Type: application/json'"'"' \
-d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
  echo "ALERTCHAIN_CODE $webhook_status"
  if [[ "$webhook_status" =~ ^2 ]]; then
    ok "Alertmanager webhook endpoint accepts POST"
  else
    warn "Alertmanager webhook E2E not confirmed"
  fi
}
# P2 gate: probe the public AWOOOI and momo URLs from the local machine.
# The four status codes are printed first, then each is classified: a code
# starting with 2 or 3 counts as reachable, anything else is advisory (warn).
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local api_status web_status momo_status momo_health_status
  api_status=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
  web_status=$(probe_http_code "https://awoooi.wooo.work/")
  momo_status=$(probe_http_code "https://mo.wooo.work/")
  momo_health_status=$(probe_http_code "https://mo.wooo.work/health")

  echo "AWOOOI_PUBLIC_API_CODE $api_status"
  echo "AWOOOI_PUBLIC_WEB_CODE $web_status"
  echo "MOMO_PUBLIC_CODE $momo_status"
  echo "MOMO_PUBLIC_HEALTH_CODE $momo_health_status"

  if [[ "$api_status" =~ ^[23] ]]; then
    ok "AWOOOI public API reachable"
  else
    warn "AWOOOI public API not confirmed"
  fi

  if [[ "$web_status" =~ ^[23] ]]; then
    ok "AWOOOI public web reachable"
  else
    warn "AWOOOI public web not confirmed"
  fi

  if [[ "$momo_status" =~ ^[23] ]]; then
    ok "momo public route reachable"
  else
    warn "momo public route not confirmed"
  fi

  if [[ "$momo_health_status" =~ ^[23] ]]; then
    ok "momo public health reachable"
  else
    warn "momo public health not confirmed"
  fi
}
# Succeed only when the report holds a "<tag> <file> age=<n>" row with n below
# the limit. A "missing" row or no row at all is a failure.
# Arguments: $1 - row tag, $2 - textfile basename, $3 - exclusive age limit in
#            seconds, $4 - report text
_schedule_textfile_fresh() {
  awk -v tag="$1" -v name="$2" -v limit="$3" '
    $1 == tag && $2 == name && $3 ~ /^age=-?[0-9]+$/ {
      fresh = (substr($3, 5) + 0 < limit + 0)
      exit
    }
    END { exit !fresh }
  ' <<<"$4"
}

# Succeed only when the report holds a "<key> <integer>" row whose value
# satisfies the comparison. An absent row is a failure.
# Arguments: $1 - row key, $2 - operator (lt | ge | gt), $3 - threshold,
#            $4 - report text
_schedule_metric_passes() {
  awk -v key="$1" -v op="$2" -v threshold="$3" '
    $1 == key && $2 ~ /^-?[0-9]+$/ {
      value = $2 + 0
      bound = threshold + 0
      if (op == "lt") pass = (value < bound)
      else if (op == "ge") pass = (value >= bound)
      else if (op == "gt") pass = (value > bound)
      exit
    }
    END { exit !pass }
  ' <<<"$4"
}

# P2 gate: scheduled-work health on hosts 188, 110, 120 and 121.
# Each host is inspected through one ssh session that prints tagged rows; the
# rows are then classified locally. Every finding here is advisory (warn),
# including an ssh session that could not be run.
#
# The numeric gates go through the helpers above so that a row which is absent
# from the report (textfile reported "missing", backup metric not present,
# kubectl/python output not produced) is reported as a warning. Previously the
# inline awk programs exited 0 when nothing matched, which turned missing data
# into an OK result.
#
# Note: ok and warn always return 0, so the "test && ok || warn" chains below
# cannot run both branches.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # --- 188: cron, node_exporter textfile ages, backup age, momo scheduler ---
  if out=$(ssh_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    # 90000 s = 25 h; 300 s = 5 min.
    _schedule_textfile_fresh TEXTFILE_188 backup.prom 90000 "$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    _schedule_textfile_fresh TEXTFILE_188 docker_restart_count.prom 300 "$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    _schedule_textfile_fresh TEXTFILE_188 docker_stats.prom 300 "$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    _schedule_metric_passes BACKUP_110_AGE lt 90000 "$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    _schedule_metric_passes SCHEDULER_REGISTERED gt 0 "$out" && ok "188 momo scheduler registered jobs" || warn "188 momo scheduler registration not confirmed"
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # --- 110: cron, failed units, stale startup units, textfile ages ---
  if out=$(ssh_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    _schedule_textfile_fresh TEXTFILE_110 docker_stats.prom 300 "$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    _schedule_textfile_fresh TEXTFILE_110 systemd_units.prom 300 "$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # --- 120: cron plus Kubernetes CronJobs, Jobs and pods in awoooi-prod ---
  # The embedded Python must keep its top-level statements at column 0.
  if out=$(ssh_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); failed=0
for j in d.get(\"items\", []):
    if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in j.get(\"status\",{}).get(\"conditions\",[]) or []):
        failed += 1
print(\"FAILED_JOBS\", failed)"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    _schedule_metric_passes CRONJOB_COUNT ge 4 "$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # --- 121: cron and the dr-drill.sh crontab entry ---
  if out=$(ssh_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
# Print the final tallies and terminate the script with the overall verdict.
# Exit status: 2 when any check was BLOCKED, 1 when only warnings remain;
# when everything passed the function returns normally after printing GREEN.
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
  if (( FAIL > 0 )); then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif (( WARN > 0 )); then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}
# Entry point: run every gate in priority order (P0 network and hosts, P1
# cluster, P2 workload/routes/schedules), then print the verdict. summary
# exits non-zero itself when anything was blocked or degraded.
main() {
  print_header
  check_network
  check_188
  check_110
  check_k3s
  check_workload_and_alertchain
  check_public_routes
  check_schedules
  summary
}

main

View File

@@ -92,10 +92,10 @@ fi
echo ""
echo "🤖 Step 6: Verifying Ollama connection..."
OLLAMA_URL="http://192.168.0.188:11434/api/tags"
OLLAMA_URL="${OLLAMA_URL:-http://192.168.0.111:11434/api/tags}"
if curl -s --connect-timeout 5 "$OLLAMA_URL" > /dev/null 2>&1; then
echo " ✅ Ollama reachable at 192.168.0.188:11434"
echo " ✅ Ollama reachable at ${OLLAMA_URL}"
# Check if llama3.2:8b is available
MODELS=$(curl -s "$OLLAMA_URL" | grep -o '"name":"[^"]*"' || echo "")