Compare commits
9 Commits
codex/gith
...
codex/w1-r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
337b2df60d | ||
|
|
ab21d8bad2 | ||
|
|
2d37383fc6 | ||
|
|
3779f6f1e0 | ||
|
|
c38227e945 | ||
|
|
1b4a6c1e8c | ||
|
|
894174da5b | ||
|
|
10cd9fc025 | ||
|
|
8161ccf83f |
@@ -108,7 +108,9 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
|
||||
# worker and its local kubeconfig points at 127.0.0.1:6443.
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
@@ -138,10 +140,10 @@ jobs:
|
||||
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
|
||||
run: |
|
||||
cat k8s/awoooi-dev/02-configmap.yaml | \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
|
||||
@@ -406,8 +406,11 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
# 2026-05-05 Codex: kubectl must run on the 120 control-plane.
|
||||
# 121 is a worker after cold-start recovery; its kubeconfig points at
|
||||
# 127.0.0.1:6443 and fails ADR-035 secret patching.
|
||||
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
@@ -634,19 +637,21 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
# 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
|
||||
# control-plane, not 121 worker.
|
||||
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
|
||||
IMAGE_TAG="${{ github.sha }}"
|
||||
HARBOR=192.168.0.110:5000
|
||||
|
||||
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ───
|
||||
cat k8s/awoooi-prod/04-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
echo "✅ ConfigMap 已更新"
|
||||
|
||||
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
echo "✅ Service Registry ConfigMap 已更新"
|
||||
|
||||
@@ -688,7 +693,7 @@ jobs:
|
||||
}
|
||||
|
||||
# ─── Step 4: 等待 ArgoCD sync + rollout ───
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
@@ -814,7 +819,7 @@ jobs:
|
||||
- name: Alert Chain Smoke Test
|
||||
id: alert_chain_smoke
|
||||
run: |
|
||||
# 2026-04-05 Claude Code: 使用真實 API 地址(192.168.0.121:32334 NodePort)
|
||||
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
|
||||
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
|
||||
if docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
|
||||
@@ -824,7 +829,7 @@ jobs:
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
-w /workspace \
|
||||
"${{ env.CI_IMAGE }}" \
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
|
||||
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -6,8 +6,9 @@
|
||||
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
|
||||
--
|
||||
-- 影響範圍:
|
||||
-- 1. rag_chunks.embedding vector(768) → vector(1024)
|
||||
-- 2. playbook_embeddings.embedding vector(768) → vector(1024)
|
||||
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
|
||||
-- 2. rag_chunks.embedding vector(768) → vector(1024)
|
||||
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
|
||||
--
|
||||
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
|
||||
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
|
||||
@@ -21,7 +22,24 @@
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- 1. rag_chunks:清空向量資料,變更欄位維度
|
||||
-- 1. knowledge_entries:備份舊向量並清空,變更欄位維度
|
||||
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
|
||||
SELECT
|
||||
id,
|
||||
embedding::text AS embedding_768,
|
||||
NOW() AS backed_up_at
|
||||
FROM knowledge_entries
|
||||
WHERE embedding IS NOT NULL;
|
||||
|
||||
ALTER TABLE knowledge_entries
|
||||
ALTER COLUMN embedding TYPE vector(1024)
|
||||
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
|
||||
|
||||
COMMENT ON COLUMN knowledge_entries.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
|
||||
|
||||
|
||||
-- 2. rag_chunks:清空向量資料,變更欄位維度
|
||||
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
|
||||
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
|
||||
|
||||
@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
|
||||
|
||||
|
||||
-- 2. playbook_embeddings:清空向量資料,變更欄位維度
|
||||
-- 3. playbook_embeddings:清空向量資料,變更欄位維度
|
||||
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
|
||||
|
||||
ALTER TABLE playbook_embeddings
|
||||
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
|
||||
-- 3. 驗證遷移結果
|
||||
DO $$
|
||||
DECLARE
|
||||
v_km_dim integer;
|
||||
v_rag_dim integer;
|
||||
v_pb_dim integer;
|
||||
BEGIN
|
||||
SELECT atttypmod INTO v_km_dim
|
||||
FROM pg_attribute
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
WHERE relname = 'knowledge_entries' AND attname = 'embedding';
|
||||
|
||||
SELECT atttypmod INTO v_rag_dim
|
||||
FROM pg_attribute
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
@@ -74,15 +98,18 @@ BEGIN
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
WHERE relname = 'playbook_embeddings' AND attname = 'embedding';
|
||||
|
||||
-- atttypmod for vector(1024) = 1024 + 1 = 1025
|
||||
IF v_rag_dim != 1025 THEN
|
||||
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1025, got %', v_rag_dim;
|
||||
-- pgvector atttypmod stores the configured dimension.
|
||||
IF v_km_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗:expected 1024, got %', v_km_dim;
|
||||
END IF;
|
||||
IF v_pb_dim != 1025 THEN
|
||||
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1025, got %', v_pb_dim;
|
||||
IF v_rag_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1024, got %', v_rag_dim;
|
||||
END IF;
|
||||
IF v_pb_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1024, got %', v_pb_dim;
|
||||
END IF;
|
||||
|
||||
RAISE NOTICE '✅ embedding 遷移驗證通過:rag_chunks 和 playbook_embeddings 均為 vector(1024)';
|
||||
RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)';
|
||||
END $$;
|
||||
|
||||
COMMIT;
|
||||
|
||||
@@ -11,7 +11,7 @@ Endpoints:
|
||||
Components Checked:
|
||||
- PostgreSQL (192.168.0.188:5432)
|
||||
- Redis (192.168.0.188:6380)
|
||||
- Ollama (192.168.0.188:11434)
|
||||
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
|
||||
- OpenClaw (192.168.0.188:8089)
|
||||
- SigNoz (192.168.0.188:3301)
|
||||
"""
|
||||
|
||||
@@ -108,8 +108,9 @@ async def list_runs(
|
||||
)
|
||||
async def list_approvals(
|
||||
project_id: str | None = Query(None, description="租戶 ID(可選)"),
|
||||
run_id: str | None = Query(None, description="Run ID(可選,M8 詳情頁查單筆)"),
|
||||
) -> dict[str, Any]:
|
||||
return await list_approvals_svc(project_id=project_id)
|
||||
return await list_approvals_svc(project_id=project_id, run_id=run_id)
|
||||
|
||||
|
||||
@router.post(
|
||||
|
||||
@@ -145,7 +145,7 @@ class Settings(BaseSettings):
|
||||
# ==========================================================================
|
||||
# ADR-104: LLM Playbook Generator
|
||||
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
|
||||
# 成本護欄:實作層只走 local provider(Ollama 111 → Ollama 188),不新增雲端 fallback。
|
||||
# 成本護欄:實作層只走 local provider(GCP-A → GCP-B → 111),不新增雲端 fallback。
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
|
||||
# ==========================================================================
|
||||
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
|
||||
@@ -504,6 +504,22 @@ class Settings(BaseSettings):
|
||||
"unexpected cloud spend from Gitea push/PR alerts."
|
||||
),
|
||||
)
|
||||
ALERT_AI_ALLOW_CLOUD_FALLBACK: bool = Field(
|
||||
default=True,
|
||||
description=(
|
||||
"Allow incident/alert OpenClaw analysis to use cloud fallback "
|
||||
"providers after the GCP-A/GCP-B/111 Ollama lane is exhausted. "
|
||||
"Default true so Gemini can act as the final backup, after the "
|
||||
"ordered Ollama lane is exhausted."
|
||||
),
|
||||
)
|
||||
ALERT_AI_ENFORCE_OLLAMA_FIRST: bool = Field(
|
||||
default=True,
|
||||
description=(
|
||||
"Force incident/alert OpenClaw analysis to try GCP-A, then GCP-B, "
|
||||
"then local 111 before cloud backup providers such as Gemini."
|
||||
),
|
||||
)
|
||||
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
|
||||
NVIDIA_API_KEY: str = Field(
|
||||
default="",
|
||||
@@ -855,7 +871,7 @@ class Settings(BaseSettings):
|
||||
# ==========================================================================
|
||||
# MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
|
||||
# ==========================================================================
|
||||
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — 188 是 Ollama Hub,Prometheus 實際在 110
|
||||
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — Prometheus 實際在 110
|
||||
# ConfigMap 04-configmap.yaml 也是 110;governance_agent / SLO check 連 188 會 timeout
|
||||
# 此 drift 是 SPF-4 (governance_agent silently fail) 根因之一
|
||||
PROMETHEUS_URL: str = Field(
|
||||
@@ -929,7 +945,7 @@ class Settings(BaseSettings):
|
||||
"devops": "192.168.0.110", # Harbor, GH Runner
|
||||
"security": "192.168.0.112", # Kali Scanner
|
||||
"k3s_master": "192.168.0.120", # K3s Master
|
||||
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, Ollama
|
||||
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, SignOz
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -10,13 +10,51 @@
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from contextvars import ContextVar
|
||||
from contextvars import ContextVar, Token
|
||||
|
||||
# 追蹤當前非同步任務的 project_id
|
||||
# default="awoooi" 確保未設時也能正常查詢(RLS fail-open 保護)
|
||||
PROJECT_ID: ContextVar[str] = ContextVar("project_id", default="awoooi")
|
||||
# Fail-Closed: 移除 default="awoooi",進 DB 路徑需要明確租戶標籤
|
||||
PROJECT_ID: ContextVar[str | None] = ContextVar("project_id")
|
||||
PROJECT_ID_SOURCE: ContextVar[str | None] = ContextVar("project_id_source")
|
||||
PROJECT_ID_REQUEST_ID: ContextVar[str | None] = ContextVar("project_id_request_id")
|
||||
|
||||
|
||||
def get_current_project_id() -> str:
|
||||
def set_project_context(
|
||||
project_id: str | None,
|
||||
source: str = "runtime",
|
||||
request_id: str | None = None,
|
||||
) -> tuple[Token[str | None], Token[str | None], Token[str | None]]:
|
||||
"""
|
||||
設定當前 request/context 的 project 上下文,並回傳 ContextVar token 供 restore。
|
||||
"""
|
||||
return (
|
||||
PROJECT_ID.set(project_id),
|
||||
PROJECT_ID_SOURCE.set(source),
|
||||
PROJECT_ID_REQUEST_ID.set(request_id),
|
||||
)
|
||||
|
||||
|
||||
def clear_project_context(tokens: tuple[Token[str | None], Token[str | None], Token[str | None]]) -> None:
|
||||
"""清除 request 上下文,回復前一個 ContextVar 狀態。"""
|
||||
PROJECT_ID_REQUEST_ID.reset(tokens[2])
|
||||
PROJECT_ID_SOURCE.reset(tokens[1])
|
||||
PROJECT_ID.reset(tokens[0])
|
||||
|
||||
|
||||
def get_project_context() -> dict[str, str | None]:
|
||||
"""取得目前上下文快照(可直接寫入 audit log)。"""
|
||||
return {
|
||||
"project_id": PROJECT_ID.get(None),
|
||||
"source": PROJECT_ID_SOURCE.get(None),
|
||||
"request_id": PROJECT_ID_REQUEST_ID.get(None),
|
||||
}
|
||||
|
||||
|
||||
def get_current_project_id() -> str | None:
|
||||
"""取得當前任務的 project_id(給 service 層使用)"""
|
||||
return PROJECT_ID.get()
|
||||
return PROJECT_ID.get(None)
|
||||
|
||||
|
||||
def get_current_project_context() -> dict[str, str | None]:
|
||||
"""取得可追溯上下文(同 get_project_context,保留 API 命名)。"""
|
||||
return get_project_context()
|
||||
|
||||
@@ -16,6 +16,7 @@ Features:
|
||||
from collections.abc import AsyncGenerator
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import (
|
||||
AsyncEngine,
|
||||
@@ -26,6 +27,8 @@ from sqlalchemy.ext.asyncio import (
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.context import get_current_project_context
|
||||
from src.core.logging import get_logger
|
||||
|
||||
# =============================================================================
|
||||
# Base Model
|
||||
@@ -42,6 +45,19 @@ class Base(DeclarativeBase):
|
||||
|
||||
_engine: AsyncEngine | None = None
|
||||
_session_factory: async_sessionmaker[AsyncSession] | None = None
|
||||
logger = get_logger("awoooi.db")
|
||||
|
||||
|
||||
def _raise_unauthorized_db_context(msg: str) -> None:
|
||||
context = get_current_project_context()
|
||||
logger.error(
|
||||
"db_context_missing",
|
||||
reason=msg,
|
||||
project_id=context.get("project_id"),
|
||||
project_id_source=context.get("source"),
|
||||
request_id=context.get("request_id"),
|
||||
)
|
||||
raise HTTPException(status_code=401, detail="Missing tenant context: project_id is required")
|
||||
|
||||
|
||||
def get_engine() -> AsyncEngine:
|
||||
@@ -103,13 +119,21 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
|
||||
async def get_items(db: AsyncSession = Depends(get_db)):
|
||||
...
|
||||
"""
|
||||
from src.core.context import get_current_project_id
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
try:
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): SET LOCAL app.project_id 讓 RLS Policy 生效
|
||||
# 預設 'awoooi',多租戶路由將在 middleware 注入實際 project_id
|
||||
# Fail-Closed RLS: 遇到未授權情境拋出錯誤而非回退到 "awoooi"
|
||||
pid = get_current_project_id()
|
||||
if not pid:
|
||||
_raise_unauthorized_db_context(
|
||||
"Unauthorized: project_id is missing in context (Fail-Closed RLS)"
|
||||
)
|
||||
|
||||
await session.execute(
|
||||
text("SELECT set_config('app.project_id', 'awoooi', TRUE)")
|
||||
text("SELECT set_config('app.project_id', :pid, TRUE)"),
|
||||
{"pid": pid},
|
||||
)
|
||||
yield session
|
||||
await session.commit()
|
||||
@@ -123,19 +147,22 @@ async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncS
|
||||
"""
|
||||
Context manager for database session (non-FastAPI usage)
|
||||
|
||||
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar > "awoooi"
|
||||
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar(缺失則 fail-closed)
|
||||
- Phase 2.3: 啟用 RLS tenant isolation(SET LOCAL app.project_id)
|
||||
- Phase 2.4: 從 asyncio contextvar 讀取 background loop 的 project_id
|
||||
|
||||
Usage:
|
||||
async with get_db_context() as db: # 繼承 contextvar 或預設 awoooi
|
||||
async with get_db_context() as db: # 繼承 contextvar(缺失將 fail-closed)
|
||||
...
|
||||
async with get_db_context("other-tenant") as db: # 明確指定 tenant
|
||||
...
|
||||
"""
|
||||
"""
|
||||
from src.core.context import get_current_project_id
|
||||
effective_pid = project_id if project_id is not None else get_current_project_id()
|
||||
|
||||
if not effective_pid:
|
||||
_raise_unauthorized_db_context("Unauthorized: project_id is missing in context (Fail-Closed RLS)")
|
||||
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
try:
|
||||
|
||||
@@ -479,7 +479,7 @@ async def _collect_all_k8s_assets() -> tuple[list[dict[str, Any]], list[dict[str
|
||||
|
||||
# 6. Prometheus targets — 補齊 host-install services (110/112/188/125 等非 K8s)
|
||||
# Gap 1 修補 (2026-04-19 audit): 原本 asset_inventory 只涵蓋 K8s,
|
||||
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis/Ollama host-install 全漏
|
||||
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis host-install 全漏
|
||||
# 用 Prometheus /api/v1/targets 自動發現全節點服務
|
||||
try:
|
||||
prom_assets, host_relationships = await _collect_prometheus_targets()
|
||||
|
||||
@@ -172,7 +172,7 @@ _LLM_FORECAST_PROMPT = """你是 AWOOOI 容量規劃專家。以下 host 過去
|
||||
{findings_json}
|
||||
|
||||
## 當前主機環境資訊
|
||||
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/Ollama/MinIO)
|
||||
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/MinIO)
|
||||
- 判斷請考慮: 該主機上跑什麼服務、常見瓶頸模式
|
||||
|
||||
## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)
|
||||
|
||||
@@ -20,6 +20,7 @@ Date: 2026-03-20
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from uuid import uuid4
|
||||
from collections.abc import AsyncGenerator
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
@@ -80,6 +81,7 @@ from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 fe
|
||||
from src.core.http_client import close_all_http_clients, init_all_http_clients
|
||||
from src.core.logging import get_logger, setup_logging
|
||||
from src.core.redis_client import close_redis_pool, init_redis_pool
|
||||
from src.services.flywheel_stats_service import get_flywheel_stats_service
|
||||
from src.core.sse import get_publisher
|
||||
from src.core.telemetry import setup_telemetry, shutdown_telemetry
|
||||
|
||||
@@ -186,10 +188,9 @@ else:
|
||||
@asynccontextmanager
|
||||
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
"""Application lifespan events"""
|
||||
# AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context
|
||||
# asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi
|
||||
from src.core.context import PROJECT_ID
|
||||
PROJECT_ID.set("awoooi")
|
||||
# AwoooP Phase 2.4 (2026-05-04 ogt):
|
||||
# 改為不再在 lifespan 初始化預設 project_id context;
|
||||
# 後續請求皆需透過 middleware/runtime 攜帶 project_id 注入,否則拒絕查詢。
|
||||
|
||||
# Startup
|
||||
logger.info(
|
||||
@@ -683,7 +684,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
logger.warning("ollama_failover_system_start_failed", error=str(e))
|
||||
|
||||
# 2026-04-27 P3.2.2 by Claude — AI Provider 版本追蹤(每 1 小時)
|
||||
# 探測 5 Provider(ollama/ollama_188/gemini/claude/openclaw_nemo)版本
|
||||
# 探測 5 Provider(ollama/ollama_local/gemini/claude/openclaw_nemo)版本
|
||||
# 寫入 ai_provider_version_history;版本變更時 log warning,P3.2.3 alerter 後續整合
|
||||
try:
|
||||
async def _run_model_version_tracker_loop() -> None:
|
||||
@@ -819,7 +820,7 @@ app.add_middleware(
|
||||
allow_origins=settings.CORS_ORIGINS,
|
||||
allow_credentials=True,
|
||||
allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
|
||||
allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
|
||||
allow_headers=["Authorization", "Content-Type", "X-Request-ID", "X-Project-ID", "X-Tenant-ID"],
|
||||
expose_headers=["X-Request-ID"],
|
||||
)
|
||||
|
||||
@@ -837,27 +838,53 @@ async def request_logging_middleware(request: Request, call_next):
|
||||
"""
|
||||
import time
|
||||
|
||||
request_id = request.headers.get("X-Request-ID", "-")
|
||||
from src.core.context import clear_project_context, get_current_project_context, set_project_context
|
||||
|
||||
request_id = request.headers.get("X-Request-ID") or str(uuid4())
|
||||
project_id = (
|
||||
request.headers.get("X-Project-ID")
|
||||
or request.headers.get("X-Tenant-ID")
|
||||
or request.query_params.get("project_id")
|
||||
)
|
||||
project_id = project_id.strip() if project_id else None
|
||||
source = "request.project_id.missing"
|
||||
if project_id:
|
||||
source = "request.header_or_query"
|
||||
|
||||
context_tokens = set_project_context(
|
||||
project_id=project_id,
|
||||
source=source,
|
||||
request_id=request_id,
|
||||
)
|
||||
start_time = time.perf_counter()
|
||||
|
||||
# Bind request context for all logs in this request
|
||||
structlog.contextvars.clear_contextvars()
|
||||
current_context = get_current_project_context()
|
||||
structlog.contextvars.bind_contextvars(
|
||||
request_id=request_id,
|
||||
method=request.method,
|
||||
path=request.url.path,
|
||||
project_id=current_context["project_id"],
|
||||
project_context_source=current_context["source"],
|
||||
)
|
||||
|
||||
log = get_logger("awoooi.http")
|
||||
log.debug("request_start")
|
||||
|
||||
response = await call_next(request)
|
||||
try:
|
||||
response = await call_next(request)
|
||||
finally:
|
||||
clear_project_context(context_tokens)
|
||||
|
||||
duration_ms = (time.perf_counter() - start_time) * 1000
|
||||
log.info(
|
||||
"request_complete",
|
||||
status_code=response.status_code,
|
||||
duration_ms=round(duration_ms, 2),
|
||||
project_id=current_context["project_id"],
|
||||
project_context_source=current_context["source"],
|
||||
has_project_context=bool(current_context["project_id"]),
|
||||
)
|
||||
|
||||
# Add request ID to response headers
|
||||
@@ -865,6 +892,26 @@ async def request_logging_middleware(request: Request, call_next):
|
||||
return response
|
||||
|
||||
|
||||
@app.get("/api/v1/security/db-context-guard")
|
||||
async def db_context_guard() -> dict:
|
||||
"""
|
||||
Context Guard Endpoint (P1-1 runtime evidence)
|
||||
|
||||
- 未提供 project context(X-Project-ID / X-Tenant-ID / project_id query)
|
||||
時,應回傳 401,代表 RLS 已採 fail-closed
|
||||
- 有提供 context 時回傳 context snapshot,便於稽核
|
||||
"""
|
||||
from src.core.context import get_current_project_context
|
||||
from src.db.base import get_db_context
|
||||
|
||||
async with get_db_context():
|
||||
return {
|
||||
"status": "ok",
|
||||
"project_context": get_current_project_context(),
|
||||
"source": "runtime_guard",
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Exception Handlers
|
||||
# =============================================================================
|
||||
@@ -1005,10 +1052,17 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
|
||||
@app.get("/metrics", include_in_schema=False)
|
||||
async def prometheus_metrics() -> Response:
|
||||
"""Prometheus metrics endpoint for alerting"""
|
||||
return Response(
|
||||
content=generate_latest(),
|
||||
media_type=CONTENT_TYPE_LATEST,
|
||||
)
|
||||
content = generate_latest().decode("utf-8")
|
||||
# 2026-05-07 ogt + Claude Sonnet 4.6 — INC-20260507-99ADF2 修復
|
||||
# 飛輪指標(awoooi_flywheel_*)原本只在 /api/v1/stats/flywheel/metrics 暴露,
|
||||
# 110 Prom awoooi-api job scrape /metrics 時抓不到 → FlywheelExecutionRateMissing 永久 firing
|
||||
# 修法:在此串入飛輪指標,讓既有 scrape job 無需新增 job 即可抓到
|
||||
try:
|
||||
flywheel_metrics = await get_flywheel_stats_service().compute()
|
||||
content += flywheel_metrics.to_prometheus_lines()
|
||||
except Exception:
|
||||
logger.warning("prometheus_metrics_flywheel_error")
|
||||
return Response(content=content, media_type=CONTENT_TYPE_LATEST)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -29,7 +29,7 @@ from __future__ import annotations
|
||||
from prometheus_client import Histogram
|
||||
|
||||
# Buckets 對齊 NIM 實測分佈(2-27s),並覆蓋三段 timeout 30/20/15s 邊界
|
||||
# 低端(0.5-5s):快速路徑(Ollama 188 本地)
|
||||
# 低端(0.5-5s):快速路徑(Ollama provider pool)
|
||||
# 中端(5-20s):NIM + Gemini fallback
|
||||
# 高端(20-60s):超時 / 慢速 Provider
|
||||
_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0]
|
||||
|
||||
@@ -60,13 +60,17 @@ class MetricsDBRepository(IMetricsRepository):
|
||||
cutoff = datetime.now(UTC) - timedelta(hours=hours)
|
||||
|
||||
# Query: 統計 executed vs total (approved + executed + execution_failed)
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# approval_records.status 目前實際寫入的是大寫 enum
|
||||
# (APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED)。舊查詢只看
|
||||
# lowercase executed,導致 AI Success 在報表層永遠趨近 0。
|
||||
query = text("""
|
||||
SELECT
|
||||
COUNT(CASE WHEN status = 'executed' THEN 1 END) as executed_count,
|
||||
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) as executed_count,
|
||||
COUNT(*) as total_count
|
||||
FROM approval_records
|
||||
WHERE created_at >= :cutoff
|
||||
AND status IN ('approved', 'executed', 'execution_failed')
|
||||
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
|
||||
""")
|
||||
|
||||
result = await session.execute(query, {"cutoff": cutoff})
|
||||
@@ -127,11 +131,11 @@ class MetricsDBRepository(IMetricsRepository):
|
||||
trend_query = text("""
|
||||
SELECT
|
||||
date_trunc('hour', created_at) as hour_bucket,
|
||||
COUNT(CASE WHEN status = 'executed' THEN 1 END) * 100.0 /
|
||||
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) * 100.0 /
|
||||
NULLIF(COUNT(*), 0) as hourly_rate
|
||||
FROM approval_records
|
||||
WHERE created_at >= :cutoff
|
||||
AND status IN ('approved', 'executed', 'execution_failed')
|
||||
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
|
||||
GROUP BY hour_bucket
|
||||
ORDER BY hour_bucket DESC
|
||||
LIMIT :limit
|
||||
|
||||
@@ -104,7 +104,7 @@ async def get_agent_thinking(
|
||||
) -> StreamingResponse:
|
||||
"""
|
||||
OpenClaw 思考軌跡 (SSE 串流)
|
||||
Phase 1.2: 真實串接 Ollama at 192.168.0.188:11434
|
||||
Phase 1.2: 真實串接設定中的 Ollama provider pool
|
||||
"""
|
||||
|
||||
async def generate_thinking_stream():
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
"""
|
||||
Ollama Provider - Phase 24 ADR-052
|
||||
====================================
|
||||
本地 LLM 推理 (192.168.0.188 VMware VM, CPU-only)
|
||||
本地 / 私有 LLM 推理 Provider。
|
||||
|
||||
搬移自: openclaw.py _call_ollama (L349-409)
|
||||
特性: 免費、隱私安全 (local)、但 CPU 慢 (~97s/30tokens for qwen2.5:7b)
|
||||
特性: 免費、隱私安全 (local)、可依 ADR-110 指向 GCP-A/GCP-B/111。
|
||||
|
||||
2026-04-02 ogt: Phase 24-A 從 openclaw.py 抽出
|
||||
"""
|
||||
@@ -268,33 +268,27 @@ class OllamaProvider:
|
||||
self._http_client = None
|
||||
|
||||
|
||||
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — OLLAMA_188 provider 註冊
|
||||
class Ollama188Provider(OllamaProvider):
|
||||
# 2026-05-06 Codex — 188 不再作為 Ollama Provider;本地備援統一命名為 ollama_local。
|
||||
class OllamaLocalProvider(OllamaProvider):
|
||||
"""
|
||||
Ollama 188 CPU-only 備援 Provider
|
||||
Ollama Local fallback Provider
|
||||
|
||||
繼承 OllamaProvider,但使用 OLLAMA_FALLBACK_URL(192.168.0.188:11434)
|
||||
作為推理端點,模型預設 OLLAMA_HEALTH_CHECK_MODEL(qwen2.5:7b-instruct)。
|
||||
|
||||
B1 修復:原本 _init_registry 未登錄此 provider,導致
|
||||
executor.execute() 遇到 "ollama_188" → not_registered → 跳過,
|
||||
188 從未被打到。此類別補全登錄鏈路。
|
||||
|
||||
2026-04-26 Wave5 B1-fix by Claude Engineer-A4
|
||||
使用 OLLAMA_FALLBACK_URL 作為本地最後防線端點。
|
||||
ADR-110 目前設定為 110 nginx proxy → 111 Ollama;188 不得再作為 Ollama provider。
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "ollama_188"
|
||||
return "ollama_local"
|
||||
|
||||
@property
|
||||
def is_enabled(self) -> bool:
|
||||
import os
|
||||
# 優先查 ENABLE_OLLAMA_188;若未設定(預設 true)則看 OLLAMA_FALLBACK_URL 是否有值
|
||||
env_override = os.getenv("ENABLE_OLLAMA_188", "true").lower() == "true"
|
||||
# 優先查 ENABLE_OLLAMA_LOCAL;若未設定(預設 true)則看 OLLAMA_FALLBACK_URL 是否有值。
|
||||
env_override = os.getenv("ENABLE_OLLAMA_LOCAL", "true").lower() == "true"
|
||||
if not env_override:
|
||||
return False
|
||||
# OLLAMA_FALLBACK_URL 空字串 → 未設定 188 節點 → 停用
|
||||
# OLLAMA_FALLBACK_URL 空字串 → 未設定本地節點 → 停用。
|
||||
return bool(getattr(settings, "OLLAMA_FALLBACK_URL", ""))
|
||||
|
||||
def _endpoint_url(self) -> str:
|
||||
@@ -319,18 +313,18 @@ class Ollama188Provider(OllamaProvider):
|
||||
client = await self._get_client()
|
||||
|
||||
registry = get_model_registry()
|
||||
# 嘗試取 ollama_188 專屬設定,fallback 到 ollama 預設
|
||||
# 嘗試取本地 fallback 專屬設定,fallback 到 ollama 預設。
|
||||
try:
|
||||
model_name = registry.get_model("ollama_188", "rca")
|
||||
model_name = registry.get_model("ollama_local", "rca")
|
||||
except Exception:
|
||||
model_name = getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")
|
||||
|
||||
try:
|
||||
options = registry.get_provider_options("ollama_188")
|
||||
options = registry.get_provider_options("ollama_local")
|
||||
except Exception:
|
||||
options = registry.get_provider_options("ollama")
|
||||
|
||||
# CPU-only 備援:固定使用較長 timeout(CPU 推理慢)
|
||||
# 本地備援:固定使用較長 timeout,避免 111 模型載入時被過早判死。
|
||||
task_type = (context or {}).get("task_type", "")
|
||||
if task_type in ("diagnose", "force_local"):
|
||||
read_timeout = float(getattr(settings, "OLLAMA_DIAGNOSE_TIMEOUT_SECONDS", 200))
|
||||
@@ -359,7 +353,7 @@ class Ollama188Provider(OllamaProvider):
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
logger.info(
|
||||
"ollama_188_provider_success",
|
||||
"ollama_local_provider_success",
|
||||
response_length=len(result),
|
||||
tokens=tokens,
|
||||
latency_ms=round(latency, 1),
|
||||
@@ -375,12 +369,12 @@ class Ollama188Provider(OllamaProvider):
|
||||
|
||||
except httpx.TimeoutException as e:
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
logger.warning("ollama_188_provider_timeout", error=str(e), latency_ms=round(latency, 1))
|
||||
logger.warning("ollama_local_provider_timeout", error=str(e), latency_ms=round(latency, 1))
|
||||
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=f"Timeout: {e}")
|
||||
|
||||
except Exception as e:
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
logger.warning("ollama_188_provider_failed", error=str(e), latency_ms=round(latency, 1))
|
||||
logger.warning("ollama_local_provider_failed", error=str(e), latency_ms=round(latency, 1))
|
||||
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=str(e))
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
|
||||
@@ -73,10 +73,6 @@ class AIProviderEnum(str, Enum):
|
||||
"""AI 提供者"""
|
||||
|
||||
OLLAMA = "ollama"
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2
|
||||
# P1.1b OllamaFailoverManager 使用 provider_name="ollama_188",
|
||||
# 但 AIProviderEnum 沒有此值 → P1.2 整合時 lookup 失敗
|
||||
OLLAMA_188 = "ollama_188" # 188 CPU-only 備援節點(P1.1b)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災
|
||||
# OllamaFailoverManager 回傳 provider_name="ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"
|
||||
# 缺少 enum 值 → AIProviderEnum(primary_str) 拋 ValueError → fallback chain 清空 → 直跳 Gemini
|
||||
@@ -96,8 +92,6 @@ class AIProviderEnum(str, Enum):
|
||||
# Provider 對應延遲預算 (ms)
|
||||
PROVIDER_LATENCY_BUDGET: dict[AIProviderEnum, int] = {
|
||||
AIProviderEnum.OLLAMA: 60000, # 本地,允許較長處理時間
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2 — 188 CPU-only 推理較慢
|
||||
AIProviderEnum.OLLAMA_188: 120000, # 120s budget for CPU inference
|
||||
# 2026-05-04 ogt: ADR-110 GCP 三層容災 — GCP NVMe SSD 推理快,60s 足夠
|
||||
AIProviderEnum.OLLAMA_GCP_A: 60000,
|
||||
AIProviderEnum.OLLAMA_GCP_B: 60000,
|
||||
@@ -432,7 +426,7 @@ class AIRouter:
|
||||
model = failover_result.primary.model
|
||||
reason = f"{reason} [failover→{primary_str}]"
|
||||
except ValueError:
|
||||
# provider_name 無法對應已知 enum(理論上不應發生,OLLAMA_188 已加)
|
||||
# provider_name 無法對應已知 enum;避免未知 provider 靜默進入執行層。
|
||||
logger.warning(
|
||||
"ai_router_unknown_failover_provider",
|
||||
provider=primary_str,
|
||||
@@ -1078,11 +1072,46 @@ class AIRouterExecutor:
|
||||
cached = await redis.get(cache_key)
|
||||
if cached:
|
||||
data = _json.loads(cached)
|
||||
cached_provider = data.get("provider", "cache")
|
||||
provider_allowed = cached_provider in provider_order
|
||||
ollama_first_required = (
|
||||
bool(context)
|
||||
and any(
|
||||
key in context
|
||||
for key in (
|
||||
"alert_type",
|
||||
"alertname",
|
||||
"alert_name",
|
||||
"fingerprint",
|
||||
"incident_id",
|
||||
"severity",
|
||||
"target_resource",
|
||||
)
|
||||
)
|
||||
and bool(provider_order)
|
||||
and provider_order[0].startswith("ollama")
|
||||
)
|
||||
if (
|
||||
cached_provider == "ollama"
|
||||
and any(provider.startswith("ollama") for provider in provider_order)
|
||||
):
|
||||
provider_allowed = True
|
||||
if ollama_first_required and not cached_provider.startswith("ollama"):
|
||||
provider_allowed = False
|
||||
if not provider_allowed:
|
||||
logger.info(
|
||||
"ai_router_cache_provider_mismatch_skip",
|
||||
cache_key=cache_key[:30],
|
||||
cached_provider=cached_provider,
|
||||
provider_order=provider_order,
|
||||
ollama_first_required=ollama_first_required,
|
||||
)
|
||||
raise ValueError("cached provider not allowed by current provider_order")
|
||||
logger.info("ai_router_cache_hit", cache_key=cache_key[:30])
|
||||
return AIResult(
|
||||
raw_response=data.get("response", ""),
|
||||
success=True,
|
||||
provider=data.get("provider", "cache"),
|
||||
provider=cached_provider,
|
||||
from_cache=True,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -1306,7 +1335,7 @@ def _init_registry() -> AIProviderRegistry:
|
||||
"""初始化 Provider Registry (首次呼叫時自動註冊所有 Provider)"""
|
||||
from src.services.ai_providers.ollama import (
|
||||
OllamaProvider,
|
||||
Ollama188Provider,
|
||||
OllamaLocalProvider,
|
||||
OllamaGcpBProvider, # 2026-05-04 ADR-110 GCP-B
|
||||
)
|
||||
from src.services.ai_providers.gemini import GeminiProvider
|
||||
@@ -1327,8 +1356,9 @@ def _init_registry() -> AIProviderRegistry:
|
||||
from src.services.ai_providers.nemotron import NemotronProvider
|
||||
registry.register(NemotronProvider())
|
||||
|
||||
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — 補登 OLLAMA_188 備援 provider
|
||||
ollama_local = Ollama188Provider()
|
||||
# 2026-05-06 Codex: 188 不再作為 Ollama provider。
|
||||
# Local fallback 統一命名為 ollama_local,端點由 OLLAMA_FALLBACK_URL 指向 111/110 proxy。
|
||||
ollama_local = OllamaLocalProvider()
|
||||
registry.register(ollama_local)
|
||||
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災修復
|
||||
@@ -1337,7 +1367,7 @@ def _init_registry() -> AIProviderRegistry:
|
||||
# 修復:
|
||||
# "ollama_gcp_a" alias → 同 OllamaProvider(OLLAMA_URL = GCP-A)
|
||||
# "ollama_gcp_b" → 新 OllamaGcpBProvider(OLLAMA_SECONDARY_URL = GCP-B)
|
||||
# "ollama_local" alias → 同 Ollama188Provider(OLLAMA_FALLBACK_URL = 111)
|
||||
# "ollama_local" → OllamaLocalProvider(OLLAMA_FALLBACK_URL = 111 / 110:11437)
|
||||
registry._providers["ollama_gcp_a"] = ollama_gcp_a
|
||||
registry.register(OllamaGcpBProvider())
|
||||
registry._providers["ollama_local"] = ollama_local
|
||||
|
||||
@@ -457,6 +457,8 @@ class AutoRepairService:
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
self._record_auto_repair_metric(playbook, success=True)
|
||||
|
||||
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
|
||||
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
|
||||
try:
|
||||
@@ -630,6 +632,8 @@ class AutoRepairService:
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
self._record_auto_repair_metric(playbook, success=False)
|
||||
|
||||
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
|
||||
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
|
||||
try:
|
||||
@@ -700,6 +704,35 @@ class AutoRepairService:
|
||||
|
||||
return max_risk
|
||||
|
||||
def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
|
||||
"""把實際 auto-repair 執行寫入 Prometheus 指標。
|
||||
|
||||
2026-05-06 ogt + Codex:DB 已有 auto_repair_executions,但
|
||||
core.metrics.record_auto_repair() 長期零 caller,導致治理/心跳用
|
||||
Prometheus 看起來像「飛輪沒做事」。label 使用 action_type,避免
|
||||
playbook_id 造成高基數。
|
||||
"""
|
||||
try:
|
||||
from src.core.metrics import record_auto_repair
|
||||
|
||||
first_step = playbook.repair_steps[0] if playbook.repair_steps else None
|
||||
action = first_step.action_type.value if first_step else "unknown"
|
||||
max_risk = self._get_max_risk_level(playbook)
|
||||
tier = {
|
||||
RiskLevel.LOW: 1,
|
||||
RiskLevel.MEDIUM: 2,
|
||||
RiskLevel.HIGH: 3,
|
||||
RiskLevel.CRITICAL: 4,
|
||||
}.get(max_risk, 0)
|
||||
record_auto_repair(action=action, tier=tier, success=success)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"auto_repair_metric_record_failed",
|
||||
playbook_id=playbook.playbook_id,
|
||||
success=success,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
|
||||
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""
|
||||
|
||||
|
||||
@@ -607,7 +607,7 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) -
|
||||
"""
|
||||
MCP Phase 4a: NemoClaw second opinion — 信心 < 0.7 時觸發
|
||||
============================================================
|
||||
用 deepseek-r1:14b (Ollama 188) 對同一份資料做獨立推理,
|
||||
用 deepseek-r1:14b (設定的 Ollama primary) 對同一份資料做獨立推理,
|
||||
輸出純文字 advisory_note,不執行任何操作。
|
||||
|
||||
2026-04-11 Claude Sonnet 4.6 Asia/Taipei
|
||||
@@ -666,7 +666,7 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
|
||||
MCP Phase 4c: Playbook 無命中時,自動生成 AI 草稿 Playbook 寫入 KM
|
||||
=====================================================================
|
||||
- 僅在 KM 中不存在同 alertname 的 Playbook 時觸發(避免重複)
|
||||
- 用 qwen2.5:7b-instruct (Ollama 188) 生成結構化 Playbook 草稿
|
||||
- 用 qwen2.5:7b-instruct (設定的 Ollama primary) 生成結構化 Playbook 草稿
|
||||
- 寫入 KnowledgeEntry,status=DRAFT,需人工審核後升為 APPROVED
|
||||
- 寫入 AlertOperationLog PLAYBOOK_DRAFT_CREATED 事件
|
||||
|
||||
|
||||
@@ -237,6 +237,31 @@ class FlywheelStatsService:
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
continue
|
||||
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# 執行成功率的 source of truth 是 auto_repair_executions。
|
||||
# Redis playbook success_count/failure_count 會因回寫鏈路中斷而落後,
|
||||
# 造成 governance / heartbeat 判定「飛輪沒有執行」。
|
||||
try:
|
||||
async with get_db_context() as db:
|
||||
row = await db.execute(
|
||||
text("""
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
|
||||
COUNT(*) AS total
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
""")
|
||||
)
|
||||
repair_stats = row.one()
|
||||
db_total_exec = int(repair_stats.total or 0)
|
||||
if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
|
||||
db_total_success = int(repair_stats.success or 0)
|
||||
return count, db_total_success / db_total_exec
|
||||
if db_total_exec > 0:
|
||||
return count, None
|
||||
except Exception:
|
||||
logger.warning("flywheel_stats_auto_repair_execution_query_failed")
|
||||
|
||||
if total_exec < FLYWHEEL_MIN_SAMPLE:
|
||||
# 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷
|
||||
return count, None
|
||||
|
||||
@@ -15,7 +15,7 @@ HeartbeatReportService — ADR-073 心跳監控重構
|
||||
import asyncio
|
||||
import html
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
@@ -420,8 +420,8 @@ class HeartbeatReportService:
|
||||
try:
|
||||
# KM 向量化率(DB 查詢)
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord, KnowledgeEntryRecord
|
||||
from sqlalchemy import func, select
|
||||
from src.db.models import KnowledgeEntryRecord
|
||||
from sqlalchemy import func, select, text as sa_text
|
||||
async with get_db_context() as db:
|
||||
# KM 總數
|
||||
km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord))
|
||||
@@ -436,20 +436,22 @@ class HeartbeatReportService:
|
||||
stats.km_vectorized = vec_result.scalar() or 0
|
||||
|
||||
# 24h 修復統計
|
||||
since = datetime.utcnow() - timedelta(hours=24)
|
||||
outcomes = await db.execute(
|
||||
select(IncidentRecord.outcome).where(
|
||||
IncidentRecord.created_at >= since,
|
||||
IncidentRecord.outcome.isnot(None),
|
||||
)
|
||||
)
|
||||
outcome_list = [r[0] for r in outcomes.all() if r[0]]
|
||||
stats.attempt_24h = len(outcome_list)
|
||||
stats.success_24h = sum(
|
||||
1 for o in outcome_list
|
||||
if isinstance(o, dict) and o.get("execution_success")
|
||||
or isinstance(o, str) and "success" in o.lower()
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# incidents.outcome 已不是自動修復 source of truth。實際執行紀錄
|
||||
# 寫在 auto_repair_executions;舊查詢會讓心跳報告顯示 0/15,
|
||||
# 造成「全系統正常」但飛輪 KPI 失真的假象。
|
||||
repair_result = await db.execute(
|
||||
sa_text("""
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
|
||||
COUNT(*) AS total
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
""")
|
||||
)
|
||||
repair_row = repair_result.one()
|
||||
stats.success_24h = int(repair_row.success or 0)
|
||||
stats.attempt_24h = int(repair_row.total or 0)
|
||||
|
||||
# 最後學習活動
|
||||
last_km = await db.scalar(
|
||||
@@ -865,9 +867,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
lines.append("☸️ <b>Kubernetes Pods</b>")
|
||||
for i, pod in enumerate(report.pods):
|
||||
prefix = "└─" if i == len(report.pods) - 1 else "├─"
|
||||
ready_icon = "✅" if pod.ready else "❌"
|
||||
ready_icon = "✅" if pod.ready or pod.status in ("Succeeded", "Completed") else "❌"
|
||||
restart_str = f" (重啟×{pod.restarts})" if pod.restarts > 0 else ""
|
||||
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}")
|
||||
status_str = "" if pod.ready else f" <code>{html.escape(pod.status)}</code>"
|
||||
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}{status_str}")
|
||||
|
||||
# --- Scanner 狀態 ---
|
||||
if report.scanners.last_runs:
|
||||
|
||||
@@ -7,7 +7,7 @@ Hosts:
|
||||
- 192.168.0.110: DevOps 金庫 (Harbor, GH Runner)
|
||||
- 192.168.0.112: Kali Security (Scanner API)
|
||||
- 192.168.0.120: K3s Master (awoooi-prod namespace)
|
||||
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, Ollama, OpenClaw, SigNoz)
|
||||
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, OpenClaw, SigNoz)
|
||||
|
||||
Features:
|
||||
- asyncio.gather for parallel fetching
|
||||
|
||||
@@ -1097,26 +1097,45 @@ class IncidentService:
|
||||
from src.repositories.incident_repository import get_incident_repository
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
# 1. 從 Working Memory 讀取
|
||||
repo = get_incident_repository()
|
||||
|
||||
# 1. 從 Working Memory 讀取;若 Redis TTL 已過,回退到 Episodic DB。
|
||||
# 2026-05-29 ogt + Codex: 舊 incident 只留在 DB 時仍需可收斂,
|
||||
# 否則 FlywheelIncidentsStuck 會永久累積歷史 INVESTIGATING。
|
||||
incident = await self.get_from_working_memory(incident_id)
|
||||
db_only = False
|
||||
if incident is None:
|
||||
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
|
||||
return None
|
||||
incident = await repo.get_by_id(incident_id)
|
||||
if incident is None:
|
||||
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
|
||||
return None
|
||||
db_only = True
|
||||
|
||||
if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
|
||||
logger.info(
|
||||
"incident_resolve_idempotent_skip",
|
||||
incident_id=incident_id,
|
||||
status=incident.status.value,
|
||||
db_only=db_only,
|
||||
)
|
||||
return incident
|
||||
|
||||
# 2. 更新狀態
|
||||
incident.status = IncidentStatus.RESOLVED
|
||||
incident.resolved_at = now_taipei()
|
||||
incident.updated_at = now_taipei()
|
||||
|
||||
# 3. 寫入 Working Memory
|
||||
redis_success = await self.save_to_working_memory(incident)
|
||||
if not redis_success:
|
||||
logger.error("resolve_redis_write_failed", incident_id=incident_id)
|
||||
return None
|
||||
# 3. 寫入 Working Memory。DB-only 舊案不重新灌回 Redis working memory。
|
||||
if not db_only:
|
||||
redis_success = await self.save_to_working_memory(incident)
|
||||
if not redis_success:
|
||||
logger.error("resolve_redis_write_failed", incident_id=incident_id)
|
||||
return None
|
||||
else:
|
||||
logger.info("resolve_db_only_incident", incident_id=incident_id)
|
||||
|
||||
# 4. 同步到 Episodic Memory
|
||||
try:
|
||||
repo = get_incident_repository()
|
||||
await repo.update_status(
|
||||
incident_id=incident_id,
|
||||
status="resolved",
|
||||
|
||||
@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
|
||||
# 台北時區
|
||||
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
|
||||
|
||||
# Prometheus 端點
|
||||
PROMETHEUS_URL = "http://192.168.0.121:30090"
|
||||
# Prometheus endpoint.
|
||||
#
|
||||
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
|
||||
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
|
||||
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
|
||||
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
|
||||
|
||||
# kube-state-metrics 查詢
|
||||
PROM_QUERIES = {
|
||||
@@ -215,7 +219,7 @@ class K3sMonitorService:
|
||||
|
||||
# 發送訊息
|
||||
formatted = status.format()
|
||||
result = await gateway.send_message(formatted)
|
||||
result = await gateway.send_text(formatted)
|
||||
|
||||
if result:
|
||||
logger.info("k3s_daily_report_sent", date=status.report_date)
|
||||
|
||||
@@ -5,7 +5,7 @@ AI Provider 版本探測 — 為每個 Provider 提供 get_version()
|
||||
|
||||
Provider:
|
||||
- ollama : 34.143.170.20 GCP-A Ollama (primary) — 2026-05-03 ogt: ADR-110 GCP-A Primary
|
||||
- ollama_188 : 192.168.0.188 Ollama (fallback)
|
||||
- ollama_local : 192.168.0.111 / 110 proxy Ollama (local fallback)
|
||||
- gemini : Google Gemini API (版本 = model name)
|
||||
- claude : Anthropic Claude (版本 = model name)
|
||||
- openclaw_nemo : OpenClaw NemoTron (版本 = OPENCLAW_DEFAULT_MODEL)
|
||||
@@ -31,7 +31,7 @@ TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
class ProviderVersionInfo:
|
||||
"""AI Provider 版本快照"""
|
||||
|
||||
provider: str # "ollama" / "ollama_188" / "gemini" / "claude" / "openclaw_nemo"
|
||||
provider: str # "ollama" / "ollama_local" / "gemini" / "claude" / "openclaw_nemo"
|
||||
model: str
|
||||
version: str # version string 或 tag(Ollama 用 modified_at,其他用 model name)
|
||||
digest: str | None = None # SHA256 digest(僅 Ollama 有)
|
||||
@@ -43,7 +43,7 @@ class ProviderVersionInfo:
|
||||
# =============================================================================
|
||||
|
||||
async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
|
||||
"""探測 Ollama(GCP-A 或 188):GET /api/tags 取 model digest + modified_at
|
||||
"""探測 Ollama(GCP-A/GCP-B 或本地 111):GET /api/tags 取 model digest + modified_at
|
||||
|
||||
Args:
|
||||
url: Ollama base URL,例如 "http://34.143.170.20:11434"(GCP-A Primary)
|
||||
@@ -58,15 +58,12 @@ async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
|
||||
"""
|
||||
import httpx
|
||||
|
||||
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 擴展 provider 判斷邏輯支援 GCP 三層容災
|
||||
# 188 保留 ollama_188 命名(CPU-only 主機,雖移出 routing chain 但仍可被 probe)
|
||||
# 2026-05-06 Codex: 188 不再作為 Ollama provider;local fallback 一律標示 ollama_local。
|
||||
_GCP_OLLAMA_IPS = {"34.143.170.20", "34.21.145.224"}
|
||||
if any(ip in url for ip in _GCP_OLLAMA_IPS):
|
||||
provider_name = "ollama"
|
||||
elif "192.168.0.111" in url:
|
||||
elif "192.168.0.111" in url or "192.168.0.110:11437" in url:
|
||||
provider_name = "ollama_local"
|
||||
elif "192.168.0.188" in url:
|
||||
provider_name = "ollama_188"
|
||||
else:
|
||||
provider_name = "ollama_remote"
|
||||
|
||||
@@ -179,7 +176,7 @@ async def probe_claude_version() -> ProviderVersionInfo:
|
||||
async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
|
||||
"""OpenClaw NemoTron:版本字串從 settings.OPENCLAW_DEFAULT_MODEL 讀取
|
||||
|
||||
NemoTron 運行在 OpenClaw 188 節點(使用 Ollama 推理),
|
||||
NemoTron 運行在 OpenClaw 節點,
|
||||
透過 OPENCLAW_URL /api/tags 探測,模型名稱即版本識別。
|
||||
|
||||
Returns:
|
||||
@@ -195,18 +192,18 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
|
||||
|
||||
# OpenClaw 底層是 Ollama,使用 OPENCLAW_URL 的 host:port 加上 Ollama port
|
||||
# OPENCLAW_URL 是 8088(OpenClaw API),Ollama 通常在 11434
|
||||
# 188 的 Ollama URL 若有設定則直接用 OLLAMA_FALLBACK_URL
|
||||
ollama_188_url = settings.OLLAMA_FALLBACK_URL
|
||||
if not ollama_188_url:
|
||||
# OpenClaw 底層 tags 來源優先使用本地 fallback Ollama URL。
|
||||
ollama_local_url = settings.OLLAMA_FALLBACK_URL
|
||||
if not ollama_local_url:
|
||||
# fallback:從 OPENCLAW_URL host 構建 Ollama URL
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(settings.OPENCLAW_URL)
|
||||
ollama_188_url = f"{parsed.scheme}://{parsed.hostname}:11434"
|
||||
ollama_local_url = f"{parsed.scheme}://{parsed.hostname}:11434"
|
||||
|
||||
import httpx
|
||||
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
resp = await client.get(f"{ollama_188_url}/api/tags")
|
||||
resp = await client.get(f"{ollama_local_url}/api/tags")
|
||||
resp.raise_for_status()
|
||||
models = resp.json().get("models", [])
|
||||
|
||||
@@ -220,7 +217,7 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
|
||||
)
|
||||
|
||||
# model 不在清單時:version 用 model name,digest=None
|
||||
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_188_url)
|
||||
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_local_url)
|
||||
return ProviderVersionInfo(
|
||||
provider="openclaw_nemo",
|
||||
model=model,
|
||||
@@ -257,7 +254,7 @@ async def probe_all_providers() -> list[ProviderVersionInfo]:
|
||||
raw = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
results: list[ProviderVersionInfo] = []
|
||||
provider_labels = ["ollama", "ollama_188", "gemini", "claude", "openclaw_nemo"]
|
||||
provider_labels = ["ollama", "ollama_local", "gemini", "claude", "openclaw_nemo"]
|
||||
for label, outcome in zip(provider_labels, raw, strict=True):
|
||||
if isinstance(outcome, ProviderVersionInfo):
|
||||
results.append(outcome)
|
||||
|
||||
@@ -5,7 +5,7 @@ Phase 5: OpenClaw 實體化升級 (2026-03-21)
|
||||
統帥校正: SignOz 為唯一全能視力中心
|
||||
|
||||
Features:
|
||||
- 真實 LLM SDK 整合 (Ollama → Gemini → Claude)
|
||||
- 真實 LLM SDK 整合 (告警預設 Ollama GCP-A → GCP-B → 111 → Gemini)
|
||||
- SignOz Gold Metrics 即時擷取 (P99/Error/RPS)
|
||||
- AIOps Agent 專業人格 (K8s 維運 + SRE RCA 專精)
|
||||
- 強制結構化 JSON 輸出 (符合 API 契約)
|
||||
@@ -144,8 +144,8 @@ class OpenClawService:
|
||||
"""
|
||||
OpenClaw AI 決策服務 - True LLM + SignOz Integration
|
||||
|
||||
實作 AI_FALLBACK_ORDER 備援機制:
|
||||
Ollama → Gemini → Claude → Mock
|
||||
實作 AI_FALLBACK_ORDER 備援機制。
|
||||
告警/incident 上下文預設套用成本防線,只允許 Ollama GCP-A → GCP-B → 111。
|
||||
|
||||
新增 SignOz 整合:
|
||||
- 自動擷取 Gold Metrics
|
||||
@@ -176,6 +176,89 @@ class OpenClawService:
|
||||
await self._http_client.aclose()
|
||||
self._http_client = None
|
||||
|
||||
def _is_incident_alert_context(self, alert_context: dict | None) -> bool:
|
||||
"""Return true when a request came from the alert/incident automation path."""
|
||||
if not alert_context:
|
||||
return False
|
||||
alert_keys = {
|
||||
"alert_type",
|
||||
"alertname",
|
||||
"alert_name",
|
||||
"fingerprint",
|
||||
"incident_id",
|
||||
"severity",
|
||||
"signals",
|
||||
"target_resource",
|
||||
}
|
||||
return any(key in alert_context for key in alert_keys)
|
||||
|
||||
def _cloud_fallback_allowed_for_alert(self, alert_context: dict | None) -> bool:
|
||||
"""Cloud fallback is allowed after the ordered Ollama lane for alerts."""
|
||||
if not self._is_incident_alert_context(alert_context):
|
||||
return True
|
||||
return bool(getattr(settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True))
|
||||
|
||||
def _alert_enforces_ollama_first(self, alert_context: dict | None) -> bool:
|
||||
"""Alert cards must try GCP-A/GCP-B/111 before Gemini backup."""
|
||||
return (
|
||||
self._is_incident_alert_context(alert_context)
|
||||
and bool(getattr(settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True))
|
||||
)
|
||||
|
||||
async def _resolve_alert_provider_order(
|
||||
self,
|
||||
task_type: str = "diagnose",
|
||||
alert_context: dict | None = None,
|
||||
cloud_provider_order: list[str] | None = None,
|
||||
) -> list[str]:
|
||||
"""Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
|
||||
provider_order: list[str] = []
|
||||
try:
|
||||
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
|
||||
provider_order = [
|
||||
endpoint.provider_name
|
||||
for endpoint in route.all_endpoints_in_order()
|
||||
if endpoint.provider_name.startswith("ollama")
|
||||
]
|
||||
except Exception as route_error:
|
||||
logger.warning(
|
||||
"alert_ollama_route_lookup_failed",
|
||||
error=str(route_error),
|
||||
task_type=task_type,
|
||||
)
|
||||
|
||||
if not provider_order:
|
||||
provider_order = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||||
|
||||
deduped: list[str] = []
|
||||
for provider_name in provider_order:
|
||||
if provider_name and provider_name not in deduped:
|
||||
deduped.append(provider_name)
|
||||
|
||||
if not self._alert_enforces_ollama_first(alert_context):
|
||||
return deduped
|
||||
|
||||
ollama_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
|
||||
ordered_ollama = [
|
||||
provider_name
|
||||
for provider_name in deduped
|
||||
if provider_name in ollama_order
|
||||
]
|
||||
ordered_ollama.sort(key=lambda provider_name: ollama_order[provider_name])
|
||||
if not ordered_ollama:
|
||||
ordered_ollama = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||||
|
||||
if not self._cloud_fallback_allowed_for_alert(alert_context):
|
||||
return ordered_ollama
|
||||
|
||||
cloud_candidates = cloud_provider_order or []
|
||||
cloud_backup: list[str] = []
|
||||
for provider_name in [*cloud_candidates, "gemini"]:
|
||||
if provider_name == "gemini" and provider_name not in cloud_backup:
|
||||
cloud_backup.append(provider_name)
|
||||
|
||||
return ordered_ollama + cloud_backup
|
||||
|
||||
# =========================================================================
|
||||
# SignOz Integration
|
||||
# =========================================================================
|
||||
@@ -437,13 +520,13 @@ class OpenClawService:
|
||||
# 完整移除時機: Phase 24 完整驗收後 (ADR-052 D11)
|
||||
# =========================================================================
|
||||
|
||||
async def _call_ollama(self, prompt: str) -> tuple[str, bool]:
|
||||
async def _call_ollama(self, prompt: str, *, ollama_only: bool = False) -> tuple[str, bool]:
|
||||
"""
|
||||
呼叫 Ollama (支援 JSON Mode)。
|
||||
|
||||
USE_AI_ROUTER=true 正常會走 AIRouterExecutor;這裡是 legacy safety-net。
|
||||
2026-05-05 Codex: safety-net 也必須遵守 ADR-110 三層 Ollama
|
||||
路由,不能只打 OLLAMA_URL 後直接掉 Gemini。
|
||||
路由,告警路徑預設只允許 GCP-A/GCP-B/111,不能只打 OLLAMA_URL 後直接掉 Gemini。
|
||||
"""
|
||||
try:
|
||||
client = await self._get_client()
|
||||
@@ -484,6 +567,26 @@ class OpenClawService:
|
||||
endpoints.append((provider_name, endpoint_url))
|
||||
seen_urls.add(endpoint_url)
|
||||
|
||||
if ollama_only:
|
||||
allowed_provider_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
|
||||
endpoints = [
|
||||
(provider_name, endpoint_url)
|
||||
for provider_name, endpoint_url in endpoints
|
||||
if provider_name in allowed_provider_order
|
||||
]
|
||||
endpoints.sort(key=lambda item: allowed_provider_order[item[0]])
|
||||
if not endpoints:
|
||||
endpoints = [
|
||||
("ollama_gcp_a", settings.OLLAMA_URL),
|
||||
("ollama_gcp_b", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
|
||||
("ollama_local", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
|
||||
]
|
||||
endpoints = [
|
||||
(provider_name, endpoint_url)
|
||||
for provider_name, endpoint_url in endpoints
|
||||
if endpoint_url
|
||||
]
|
||||
|
||||
last_error = ""
|
||||
for provider_name, endpoint_url in endpoints:
|
||||
try:
|
||||
@@ -973,7 +1076,11 @@ class OpenClawService:
|
||||
try:
|
||||
# 2026-04-02 ogt: C2 修復 — 呼叫 AIRouter.route() 智慧路由 (非靜態 order)
|
||||
# D1 意圖分類路由、D7 隱私保護 (DIAGNOSE/CODE_REVIEW 強制 local) 生效
|
||||
from src.services.ai_router import get_ai_router, get_ai_executor, IntentType
|
||||
from src.services.ai_router import (
|
||||
IntentType,
|
||||
get_ai_executor,
|
||||
get_ai_router,
|
||||
)
|
||||
router = get_ai_router()
|
||||
executor = get_ai_executor()
|
||||
|
||||
@@ -987,7 +1094,10 @@ class OpenClawService:
|
||||
if p.value != decision.selected_provider.value
|
||||
]
|
||||
try:
|
||||
from src.services.ai_control import get_primary_provider, is_provider_disabled
|
||||
from src.services.ai_control import (
|
||||
get_primary_provider,
|
||||
is_provider_disabled,
|
||||
)
|
||||
_primary = await get_primary_provider()
|
||||
if _primary and _primary != decision.selected_provider.value:
|
||||
# 把 primary 移到首位 (保留原始 fallback)
|
||||
@@ -1003,6 +1113,20 @@ class OpenClawService:
|
||||
except Exception as _e:
|
||||
logger.warning("ai_control_override_failed", error=str(_e))
|
||||
|
||||
if self._alert_enforces_ollama_first(alert_context):
|
||||
original_provider_order = list(provider_order)
|
||||
provider_order = await self._resolve_alert_provider_order(
|
||||
task_type=decision.intent.value if decision.intent else "diagnose",
|
||||
alert_context=alert_context,
|
||||
cloud_provider_order=original_provider_order,
|
||||
)
|
||||
logger.info(
|
||||
"alert_ollama_first_provider_order",
|
||||
original_provider_order=original_provider_order,
|
||||
provider_order=provider_order,
|
||||
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
|
||||
)
|
||||
|
||||
# Step 3: D7 隱私 — CODE_REVIEW 強制 local
|
||||
# 2026-04-15 ogt: DIAGNOSE 移除 require_local(v4.3 決策:NIM 為主力,無隱私問題)
|
||||
# ai_router.py v4.3 已明確:「NIM 從 Phase 22 起就是主力,無隱私問題」
|
||||
@@ -1045,13 +1169,18 @@ class OpenClawService:
|
||||
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
|
||||
if _rule_id == "generic_fallback":
|
||||
import asyncio
|
||||
|
||||
from src.services.alert_rule_engine import auto_generate_rule
|
||||
try:
|
||||
asyncio.create_task(auto_generate_rule(
|
||||
alert_context or {},
|
||||
ollama_url=settings.OLLAMA_URL,
|
||||
model=settings.OPENCLAW_DEFAULT_MODEL,
|
||||
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
|
||||
gemini_api_key=(
|
||||
getattr(settings, "GEMINI_API_KEY", "")
|
||||
if self._cloud_fallback_allowed_for_alert(alert_context)
|
||||
else ""
|
||||
),
|
||||
))
|
||||
except Exception as _e:
|
||||
logger.warning("auto_rule_trigger_failed", error=str(_e))
|
||||
@@ -1086,7 +1215,18 @@ class OpenClawService:
|
||||
from src.services.ai_rate_limiter import get_ai_rate_limiter
|
||||
rate_limiter = get_ai_rate_limiter()
|
||||
|
||||
for provider in settings.AI_FALLBACK_ORDER:
|
||||
legacy_provider_order = list(settings.AI_FALLBACK_ORDER)
|
||||
if self._alert_enforces_ollama_first(alert_context):
|
||||
legacy_provider_order = ["ollama"]
|
||||
if self._cloud_fallback_allowed_for_alert(alert_context):
|
||||
legacy_provider_order.append("gemini")
|
||||
logger.info(
|
||||
"legacy_alert_ollama_first_provider_order",
|
||||
provider_order=legacy_provider_order,
|
||||
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
|
||||
)
|
||||
|
||||
for provider in legacy_provider_order:
|
||||
# Rate Limit 檢查 (nvidia/gemini/claude 需檢查,ollama 不限)
|
||||
# 2026-03-30 ogt: 加入 nvidia (RPM=5 限制)
|
||||
if provider in ("nvidia", "gemini", "claude"):
|
||||
@@ -1109,7 +1249,10 @@ class OpenClawService:
|
||||
cost_usd = 0.0
|
||||
|
||||
if provider == "ollama":
|
||||
response, success = await self._call_ollama(prompt)
|
||||
response, success = await self._call_ollama(
|
||||
prompt,
|
||||
ollama_only=self._alert_enforces_ollama_first(alert_context),
|
||||
)
|
||||
elif provider == "gemini":
|
||||
response, success, total_tokens, cost_usd = await self._call_gemini(prompt)
|
||||
elif provider == "nvidia":
|
||||
@@ -1165,13 +1308,18 @@ class OpenClawService:
|
||||
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
|
||||
if _rule_id == "generic_fallback":
|
||||
import asyncio
|
||||
|
||||
from src.services.alert_rule_engine import auto_generate_rule
|
||||
try:
|
||||
asyncio.create_task(auto_generate_rule(
|
||||
alert_context or {},
|
||||
ollama_url=settings.OLLAMA_URL,
|
||||
model=settings.OPENCLAW_DEFAULT_MODEL,
|
||||
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
|
||||
gemini_api_key=(
|
||||
getattr(settings, "GEMINI_API_KEY", "")
|
||||
if self._cloud_fallback_allowed_for_alert(alert_context)
|
||||
else ""
|
||||
),
|
||||
))
|
||||
except Exception as _e:
|
||||
logger.warning("auto_rule_trigger_failed", error=str(_e))
|
||||
@@ -1218,14 +1366,14 @@ class OpenClawService:
|
||||
except json.JSONDecodeError:
|
||||
# 3. 啟發式修補: 如果結尾缺少括號,嘗試補齊
|
||||
if candidate.startswith("{") and not candidate.endswith("}"):
|
||||
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
|
||||
try:
|
||||
repaired = candidate + '"' * (i-1) + "}" * i
|
||||
json.loads(repaired)
|
||||
logger.info("json_repaired_heuristically", level=i)
|
||||
return repaired
|
||||
except:
|
||||
continue
|
||||
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
|
||||
try:
|
||||
repaired = candidate + '"' * (i - 1) + "}" * i
|
||||
json.loads(repaired)
|
||||
logger.info("json_repaired_heuristically", level=i)
|
||||
return repaired
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
continue
|
||||
|
||||
# 4. 極端情況: 找出最後一個有效 key
|
||||
@@ -1235,11 +1383,11 @@ class OpenClawService:
|
||||
# 暴力去除非法尾綴 (如 \t\t...)
|
||||
candidate = re.sub(r"[ \t\r\n]+$", "", candidate)
|
||||
if not candidate.endswith("}"):
|
||||
candidate += '"}' # 嘗試最簡單的閉合
|
||||
candidate += '"}' # 嘗試最簡單的閉合
|
||||
try:
|
||||
json.loads(candidate)
|
||||
return candidate
|
||||
except:
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
@@ -1791,7 +1939,7 @@ Focus on:
|
||||
from src.services.ai_router import get_ai_registry
|
||||
|
||||
ai_registry = get_ai_registry()
|
||||
provider = ai_registry.get("ollama") or ai_registry.get("ollama_188")
|
||||
provider = ai_registry.get("ollama") or ai_registry.get("ollama_local")
|
||||
if provider is None or not hasattr(provider, "analyze_with_tools"):
|
||||
logger.warning(
|
||||
"openclaw_agent_loop_shadow_skipped",
|
||||
@@ -2200,6 +2348,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
|
||||
}
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
from src.services.nvidia_provider import get_nvidia_provider
|
||||
|
||||
nvidia = get_nvidia_provider()
|
||||
@@ -2334,7 +2483,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
|
||||
"latency_ms": latency_ms,
|
||||
}
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
except TimeoutError:
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
logger.error(
|
||||
"nemotron_tool_call_timeout",
|
||||
@@ -2528,6 +2677,7 @@ async def _fetch_k8s_inventory_for_openclaw(
|
||||
"awoooi-api, awoooi-web, ..." 格式字串,失敗時返回 ""
|
||||
"""
|
||||
import asyncio as _asyncio
|
||||
|
||||
import structlog as _structlog
|
||||
_logger = _structlog.get_logger(__name__)
|
||||
try:
|
||||
@@ -2542,7 +2692,7 @@ async def _fetch_k8s_inventory_for_openclaw(
|
||||
)
|
||||
try:
|
||||
stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
|
||||
except _asyncio.TimeoutError:
|
||||
except TimeoutError:
|
||||
proc.kill()
|
||||
_logger.warning("k8s_inventory_timeout_openclaw", namespace=namespace)
|
||||
return ""
|
||||
|
||||
@@ -9,8 +9,6 @@ ADR-106(AwoooP Agent Platform)
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
@@ -153,8 +151,21 @@ async def list_runs(
|
||||
# Approvals
|
||||
# =============================================================================
|
||||
|
||||
async def list_approvals(project_id: str | None) -> dict[str, Any]:
|
||||
"""列出所有 waiting_approval 狀態的 runs。"""
|
||||
async def list_approvals(
|
||||
project_id: str | None,
|
||||
run_id: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""列出 waiting_approval runs,可依 project_id 或 run_id 篩選。"""
|
||||
run_uuid: UUID | None = None
|
||||
if run_id:
|
||||
try:
|
||||
run_uuid = uuid.UUID(run_id)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||
detail=f"run_id 格式錯誤: {exc}",
|
||||
) from exc
|
||||
|
||||
async with get_db_context("awoooi") as db:
|
||||
stmt = (
|
||||
select(AwoooPRunState)
|
||||
@@ -163,6 +174,8 @@ async def list_approvals(project_id: str | None) -> dict[str, Any]:
|
||||
)
|
||||
if project_id is not None:
|
||||
stmt = stmt.where(AwoooPRunState.project_id == project_id)
|
||||
if run_uuid is not None:
|
||||
stmt = stmt.where(AwoooPRunState.run_id == run_uuid)
|
||||
|
||||
count_stmt = select(func.count()).select_from(stmt.subquery())
|
||||
total_result = await db.execute(count_stmt)
|
||||
|
||||
@@ -4,7 +4,7 @@ LLM Playbook Generator - ADR-104 T1/T2/T6
|
||||
從成功修復案例生成可治理的 Playbook 草稿。
|
||||
|
||||
設計重點:
|
||||
- 只用 local provider 順序(Ollama 111 -> Ollama 188),避免新增雲端成本。
|
||||
- 只用 local/provider pool 順序(GCP-A -> 111 local),避免新增雲端成本。
|
||||
- LLM 產出必須經 Pydantic + action_parser 安全收斂。
|
||||
- 不直接 APPROVED;先 DRAFT/REVIEW,再交治理 job 晉級。
|
||||
"""
|
||||
@@ -30,7 +30,6 @@ from src.models.playbook import (
|
||||
RiskLevel,
|
||||
SymptomPattern,
|
||||
)
|
||||
from src.services.action_parser import is_safe_kubectl_action
|
||||
from src.services.action_parser import kubectl_safety_reason
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
@@ -218,7 +217,7 @@ class LLMPlaybookGenerator:
|
||||
executor = get_ai_executor()
|
||||
result = await executor.execute(
|
||||
prompt=prompt,
|
||||
provider_order=["ollama", "ollama_188"],
|
||||
provider_order=["ollama", "ollama_local"],
|
||||
context=context,
|
||||
cache_ttl=86400,
|
||||
require_local=True,
|
||||
|
||||
@@ -244,7 +244,7 @@ class WeeklyReportService:
|
||||
|
||||
# 發送訊息
|
||||
formatted = report.format()
|
||||
result = await gateway.send_message(formatted)
|
||||
result = await gateway.send_text(formatted)
|
||||
|
||||
if result:
|
||||
logger.info("weekly_report_sent", week=report.week_range)
|
||||
|
||||
90
apps/api/tests/test_ai_router_cache_provider_policy.py
Normal file
90
apps/api/tests/test_ai_router_cache_provider_policy.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services import ai_router as ai_router_module
|
||||
from src.services.ai_providers.interfaces import AIResult
|
||||
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor
|
||||
|
||||
|
||||
class _FakeRedis:
|
||||
def __init__(self, cached_provider: str) -> None:
|
||||
self.cached_provider = cached_provider
|
||||
self.set_calls: list[tuple[str, str, int | None]] = []
|
||||
|
||||
async def get(self, key: str) -> str:
|
||||
return json.dumps({
|
||||
"response": '{"provider":"stale"}',
|
||||
"provider": self.cached_provider,
|
||||
})
|
||||
|
||||
async def set(self, key: str, value: str, ex: int | None = None) -> None:
|
||||
self.set_calls.append((key, value, ex))
|
||||
|
||||
|
||||
class _FakeProvider:
|
||||
name = "ollama_gcp_a"
|
||||
privacy_level = "local"
|
||||
is_enabled = True
|
||||
capabilities = {"rca", "chat"}
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.calls = 0
|
||||
|
||||
async def analyze(self, prompt: str, context: dict[str, Any] | None = None) -> AIResult:
|
||||
self.calls += 1
|
||||
return AIResult(
|
||||
raw_response='{"provider":"fresh_ollama"}',
|
||||
success=True,
|
||||
provider=self.name,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_executor_skips_cached_cloud_provider_when_ollama_lane_is_required(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
fake_redis = _FakeRedis(cached_provider="gemini")
|
||||
fake_provider = _FakeProvider()
|
||||
registry = AIProviderRegistry()
|
||||
registry.register(fake_provider)
|
||||
|
||||
monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
|
||||
monkeypatch.setattr("src.core.redis_client.get_redis", lambda: fake_redis)
|
||||
|
||||
result = await AIRouterExecutor(registry).execute(
|
||||
prompt="diagnose alert",
|
||||
provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"],
|
||||
context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
|
||||
)
|
||||
|
||||
assert result.provider == "ollama_gcp_a"
|
||||
assert result.raw_response == '{"provider":"fresh_ollama"}'
|
||||
assert fake_provider.calls == 1
|
||||
assert fake_redis.set_calls
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_executor_allows_cached_ollama_provider_for_ollama_lane(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
fake_redis = _FakeRedis(cached_provider="ollama")
|
||||
fake_provider = _FakeProvider()
|
||||
registry = AIProviderRegistry()
|
||||
registry.register(fake_provider)
|
||||
|
||||
monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
|
||||
monkeypatch.setattr("src.core.redis_client.get_redis", lambda: fake_redis)
|
||||
|
||||
result = await AIRouterExecutor(registry).execute(
|
||||
prompt="diagnose alert",
|
||||
provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local"],
|
||||
context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
|
||||
)
|
||||
|
||||
assert result.provider == "ollama"
|
||||
assert result.from_cache is True
|
||||
assert fake_provider.calls == 0
|
||||
@@ -124,8 +124,9 @@ def test_diagnose_fallback_chain_ollama_primary():
|
||||
assert AIProviderEnum.OPENCLAW_NEMO in providers_in_chain
|
||||
assert AIProviderEnum.GEMINI in providers_in_chain
|
||||
assert AIProviderEnum.CLAUDE in providers_in_chain
|
||||
# OLLAMA_188 (CPU-only 備援) 仍排除(M1 Pro 111 才是 GPU 主推理)
|
||||
assert AIProviderEnum.OLLAMA_188 not in providers_in_chain
|
||||
# 188 不得作為 Ollama provider;本地備援只允許 ollama_local。
|
||||
provider_values = {p.value for p in providers_in_chain}
|
||||
assert "ollama_188" not in provider_values
|
||||
|
||||
|
||||
def test_diagnose_fallback_chain_contains_cloud_providers():
|
||||
@@ -159,7 +160,7 @@ async def test_diagnose_route_primary_is_ollama():
|
||||
|
||||
# 雲端 fallback 仍在(OpenClaw / Gemini / Claude 救命備援)
|
||||
fb_providers = [p for p, _ in decision.fallback_chain]
|
||||
# ollama_failover_manager 可能轉到 ollama_188,但 ollama variant 必須有
|
||||
# ollama_failover_manager 可能轉到 GCP-B / ollama_local,但雲端救命備援仍必須存在。
|
||||
has_cloud_fallback = (
|
||||
AIProviderEnum.GEMINI in fb_providers or AIProviderEnum.CLAUDE in fb_providers
|
||||
)
|
||||
|
||||
@@ -83,7 +83,7 @@ async def test_router_uses_failover_when_ollama_initial_provider():
|
||||
return_value=_make_failover_result(
|
||||
primary_provider="gemini",
|
||||
primary_model="gemini-1.5-flash",
|
||||
fallback=[("ollama_188", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
|
||||
fallback=[("ollama_local", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
|
||||
)
|
||||
)
|
||||
|
||||
@@ -109,14 +109,14 @@ async def test_router_uses_failover_when_ollama_initial_provider():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_router_failover_fallback_chain_converted():
|
||||
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_188"""
|
||||
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_LOCAL"""
|
||||
mock_fm = MagicMock()
|
||||
mock_fm.select_provider = AsyncMock(
|
||||
return_value=_make_failover_result(
|
||||
primary_provider="gemini",
|
||||
primary_model="gemini-1.5-flash",
|
||||
fallback=[
|
||||
("ollama_188", "qwen2.5:7b-instruct"),
|
||||
("ollama_local", "qwen2.5:7b-instruct"),
|
||||
("nemotron", "nvidia/nemotron-mini-4b-instruct"),
|
||||
("claude", "claude-haiku-4-5-20251001"),
|
||||
],
|
||||
@@ -134,8 +134,8 @@ async def test_router_failover_fallback_chain_converted():
|
||||
decision = await router.route("test alert message")
|
||||
|
||||
fb_providers = [p for p, _ in decision.fallback_chain]
|
||||
assert AIProviderEnum.OLLAMA_188 in fb_providers, (
|
||||
f"OLLAMA_188 not in fallback_chain: {fb_providers}"
|
||||
assert AIProviderEnum.OLLAMA_LOCAL in fb_providers, (
|
||||
f"OLLAMA_LOCAL not in fallback_chain: {fb_providers}"
|
||||
)
|
||||
assert AIProviderEnum.NEMOTRON in fb_providers
|
||||
assert AIProviderEnum.CLAUDE in fb_providers
|
||||
|
||||
@@ -68,7 +68,7 @@ async def test_alert_failover_dedup(mock_redis, mock_telegram_send):
|
||||
"to_provider": "gemini",
|
||||
"reason": "111 unhealthy",
|
||||
"model": "qwen3:8b",
|
||||
"fallback_chain_str": "gemini → ollama_188",
|
||||
"fallback_chain_str": "gemini → ollama_local",
|
||||
}
|
||||
|
||||
# 第 1 次:dedup pass,發送
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-04-26 @ Asia/Taipei
|
||||
# 2026-04-26 Wave5 B4 by Claude Engineer-A4 — E2E executor dispatch 測試
|
||||
# 驗證 failover 切到 OLLAMA_188 後,HTTP 請求真的打到 OLLAMA_FALLBACK_URL
|
||||
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-05-06 @ Asia/Taipei
|
||||
# 2026-05-06 Codex — 188 不再作為 Ollama Provider;驗證 ollama_local dispatch。
|
||||
"""
|
||||
E2E:executor dispatch 層驗證
|
||||
===============================
|
||||
測試覆蓋(補全 B4 — 整合測試只驗決策層,未驗執行層):
|
||||
|
||||
1. registry 確實有 ollama_188 provider(B1 修復後基本健全性)
|
||||
2. Ollama188Provider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
|
||||
3. Ollama188Provider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
|
||||
4. Ollama188Provider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL(攔截 httpx)
|
||||
5. executor.execute(provider_order=["ollama_188"]) 真的路由到 188 URL
|
||||
1. registry 確實有 ollama_local provider,且沒有 ollama_188 provider
|
||||
2. OllamaLocalProvider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
|
||||
3. OllamaLocalProvider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
|
||||
4. OllamaLocalProvider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL(攔截 httpx)
|
||||
5. executor.execute(provider_order=["ollama_local"]) 真的路由到 local URL
|
||||
6. Gemini quota pipeline 並行 5 次不超發(B3 atomic 驗證)
|
||||
7. Gemini quota TTL 第一次呼叫即設定
|
||||
"""
|
||||
@@ -28,31 +27,30 @@ import pytest
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def test_registry_has_ollama_188_provider():
|
||||
"""B1 基本健全性:_init_registry() 後 registry 必須有 ollama_188"""
|
||||
def test_registry_has_ollama_local_provider_without_ollama_188():
|
||||
"""_init_registry() 後 registry 必須有 ollama_local,且不得有 ollama_188"""
|
||||
from src.services.ai_router import _init_registry
|
||||
|
||||
registry = _init_registry()
|
||||
# registry.get() 只返回 is_enabled=True 的 provider
|
||||
# 用 _providers dict 直接檢查(不管 is_enabled)
|
||||
assert "ollama_188" in registry._providers, (
|
||||
"ollama_188 not found in registry._providers — B1 fix 未生效"
|
||||
)
|
||||
assert "ollama_local" in registry._providers
|
||||
assert "ollama_188" not in registry._providers
|
||||
|
||||
|
||||
def test_ollama_188_provider_name():
|
||||
"""Ollama188Provider.name == 'ollama_188'"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
def test_ollama_local_provider_name():
|
||||
"""OllamaLocalProvider.name == 'ollama_local'"""
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
p = Ollama188Provider()
|
||||
assert p.name == "ollama_188"
|
||||
p = OllamaLocalProvider()
|
||||
assert p.name == "ollama_local"
|
||||
|
||||
|
||||
def test_ollama_188_provider_privacy_level():
|
||||
"""Ollama188Provider.privacy_level == 'local'(本地推理,可接機密資料)"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
def test_ollama_local_provider_privacy_level():
|
||||
"""OllamaLocalProvider.privacy_level == 'local'(本地推理,可接機密資料)"""
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
p = Ollama188Provider()
|
||||
p = OllamaLocalProvider()
|
||||
assert p.privacy_level == "local"
|
||||
|
||||
|
||||
@@ -61,45 +59,44 @@ def test_ollama_188_provider_privacy_level():
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def test_ollama_188_is_enabled_with_fallback_url(monkeypatch):
|
||||
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_188 未設 → is_enabled == True"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
from src.core.config import get_settings
|
||||
def test_ollama_local_is_enabled_with_fallback_url(monkeypatch):
|
||||
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_LOCAL 未設 → is_enabled == True"""
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
|
||||
# patch settings 的 OLLAMA_FALLBACK_URL
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
mock_settings.OPENCLAW_TIMEOUT = "60"
|
||||
|
||||
p = Ollama188Provider()
|
||||
p = OllamaLocalProvider()
|
||||
# 直接 patch module-level settings 物件
|
||||
with patch("src.services.ai_providers.ollama.settings", mock_settings):
|
||||
assert p.is_enabled is True
|
||||
|
||||
|
||||
def test_ollama_188_is_disabled_without_fallback_url(monkeypatch):
|
||||
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == False(188 節點未設定)"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
def test_ollama_local_is_disabled_without_fallback_url(monkeypatch):
|
||||
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == False(local 節點未設定)"""
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_FALLBACK_URL = ""
|
||||
|
||||
p = Ollama188Provider()
|
||||
p = OllamaLocalProvider()
|
||||
with patch("src.services.ai_providers.ollama.settings", mock_settings):
|
||||
assert p.is_enabled is False
|
||||
|
||||
|
||||
def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
|
||||
"""ENABLE_OLLAMA_188=false → is_enabled == False(即使有 URL)"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
def test_ollama_local_is_disabled_by_env_flag(monkeypatch):
|
||||
"""ENABLE_OLLAMA_LOCAL=false → is_enabled == False(即使有 URL)"""
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_188", "false")
|
||||
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "false")
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
|
||||
p = Ollama188Provider()
|
||||
p = OllamaLocalProvider()
|
||||
with patch("src.services.ai_providers.ollama.settings", mock_settings):
|
||||
assert p.is_enabled is False
|
||||
|
||||
@@ -110,14 +107,14 @@ def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_ollama_188_analyze_dispatches_to_fallback_url():
|
||||
async def test_ollama_local_analyze_dispatches_to_fallback_url():
|
||||
"""
|
||||
B4 核心:Ollama188Provider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
|
||||
攔截 httpx.AsyncClient.post,記錄實際呼叫 URL,斷言包含 188 IP。
|
||||
B4 核心:OllamaLocalProvider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
|
||||
攔截 httpx.AsyncClient.post,記錄實際呼叫 URL,斷言包含本地 fallback IP。
|
||||
"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
captured_urls: list[str] = []
|
||||
|
||||
mock_response = MagicMock()
|
||||
@@ -149,7 +146,7 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
|
||||
"top_p": 0.9,
|
||||
})
|
||||
|
||||
provider = Ollama188Provider()
|
||||
provider = OllamaLocalProvider()
|
||||
|
||||
with patch("src.services.ai_providers.ollama.settings", mock_settings):
|
||||
with patch("src.services.ai_providers.ollama.get_model_registry", return_value=mock_registry):
|
||||
@@ -159,45 +156,45 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
|
||||
result = await provider.analyze("test prompt", context={})
|
||||
|
||||
assert len(captured_urls) > 0, "analyze() 未發出任何 HTTP 請求"
|
||||
assert any("192.168.0.188" in url for url in captured_urls), (
|
||||
f"HTTP 請求未打到 188,實際 URL: {captured_urls}"
|
||||
assert any("192.168.0.111" in url for url in captured_urls), (
|
||||
f"HTTP 請求未打到 local fallback,實際 URL: {captured_urls}"
|
||||
)
|
||||
assert result.provider == "ollama_188"
|
||||
assert result.provider == "ollama_local"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_ollama_188_analyze_returns_error_when_no_fallback_url():
|
||||
async def test_ollama_local_analyze_returns_error_when_no_fallback_url():
|
||||
"""OLLAMA_FALLBACK_URL 未設定 → analyze() 應返回 success=False,不發 HTTP"""
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_FALLBACK_URL = ""
|
||||
|
||||
provider = Ollama188Provider()
|
||||
provider = OllamaLocalProvider()
|
||||
with patch("src.services.ai_providers.ollama.settings", mock_settings):
|
||||
result = await provider.analyze("test prompt")
|
||||
|
||||
assert result.success is False
|
||||
assert result.provider == "ollama_188"
|
||||
assert result.provider == "ollama_local"
|
||||
assert "OLLAMA_FALLBACK_URL" in (result.error or "")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_executor_dispatches_ollama_188_to_fallback_url():
|
||||
async def test_executor_dispatches_ollama_local_to_fallback_url():
|
||||
"""
|
||||
B4 執行層:AIRouterExecutor.execute(provider_order=["ollama_188"])
|
||||
應路由到 Ollama188Provider,且 HTTP 打到 OLLAMA_FALLBACK_URL。
|
||||
B4 執行層:AIRouterExecutor.execute(provider_order=["ollama_local"])
|
||||
應路由到 OllamaLocalProvider,且 HTTP 打到 OLLAMA_FALLBACK_URL。
|
||||
"""
|
||||
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor, reset_ai_router
|
||||
from src.services.ai_providers.ollama import Ollama188Provider
|
||||
from src.services.ai_providers.ollama import OllamaLocalProvider
|
||||
from src.services.ai_providers.interfaces import AIResult
|
||||
|
||||
reset_ai_router()
|
||||
|
||||
FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
captured_urls: list[str] = []
|
||||
|
||||
# 建立真實 registry,只登錄 ollama_188
|
||||
# 建立真實 registry,只登錄 ollama_local
|
||||
registry = AIProviderRegistry()
|
||||
|
||||
# mock analyze 讓它回傳成功,但驗 URL 路徑
|
||||
@@ -206,15 +203,15 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
|
||||
return AIResult(
|
||||
raw_response='{"action_title":"ok","confidence":0.9}',
|
||||
success=True,
|
||||
provider="ollama_188",
|
||||
provider="ollama_local",
|
||||
tokens=10,
|
||||
)
|
||||
|
||||
mock_settings_global = MagicMock()
|
||||
mock_settings_global.OLLAMA_FALLBACK_URL = FALLBACK_URL
|
||||
|
||||
# 建立 Ollama188Provider,mock 其 analyze + is_enabled
|
||||
provider = Ollama188Provider()
|
||||
# 建立 OllamaLocalProvider,mock 其 analyze + is_enabled
|
||||
provider = OllamaLocalProvider()
|
||||
provider.analyze = fake_analyze # type: ignore[method-assign]
|
||||
|
||||
# 強制 is_enabled = True(繞過 settings patch 的複雜度)
|
||||
@@ -233,14 +230,14 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
|
||||
mock_settings.MOCK_MODE = False
|
||||
result = await executor.execute(
|
||||
prompt="test alert",
|
||||
provider_order=["ollama_188"],
|
||||
provider_order=["ollama_local"],
|
||||
context={},
|
||||
)
|
||||
|
||||
assert result.success is True, f"execute 失敗: {result.error}"
|
||||
assert result.provider == "ollama_188", f"provider 不是 ollama_188: {result.provider}"
|
||||
assert any("192.168.0.188" in u for u in captured_urls), (
|
||||
f"HTTP 未打到 188,captured: {captured_urls}"
|
||||
assert result.provider == "ollama_local", f"provider 不是 ollama_local: {result.provider}"
|
||||
assert any("192.168.0.111" in u for u in captured_urls), (
|
||||
f"HTTP 未打到 local fallback,captured: {captured_urls}"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ import httpx
|
||||
import pytest
|
||||
|
||||
# Ollama 伺服器配置
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
|
||||
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
|
||||
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)
|
||||
|
||||
@@ -111,7 +111,7 @@ async def check_ollama_available() -> bool:
|
||||
|
||||
@pytest.mark.integration
|
||||
class TestModelRegression:
|
||||
"""模型回歸測試 — 需要 Ollama 服務 (192.168.0.188:11434)"""
|
||||
"""模型回歸測試 — 需要 Ollama 服務(預設 111,可用 OLLAMA_URL 覆寫)"""
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
async def check_ollama(self):
|
||||
|
||||
@@ -90,8 +90,8 @@ class TestProbeOllamaVersion:
|
||||
assert isinstance(info.captured_at, datetime)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_success_188_provider(self):
|
||||
"""188 URL → provider='ollama_188'"""
|
||||
async def test_success_local_provider(self):
|
||||
"""111 / local proxy URL → provider='ollama_local'"""
|
||||
model_entry = {
|
||||
"name": "deepseek-r1:14b",
|
||||
"modified_at": "2026-04-02T00:00:00Z",
|
||||
@@ -106,10 +106,10 @@ class TestProbeOllamaVersion:
|
||||
|
||||
with patch("httpx.AsyncClient", return_value=mock_client):
|
||||
info = await probe_ollama_version(
|
||||
"http://192.168.0.188:11434", "deepseek-r1:14b"
|
||||
"http://192.168.0.111:11434", "deepseek-r1:14b"
|
||||
)
|
||||
|
||||
assert info.provider == "ollama_188"
|
||||
assert info.provider == "ollama_local"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_model_not_found_raises(self):
|
||||
@@ -279,7 +279,7 @@ class TestProbeOpenclawNemoVersion:
|
||||
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
|
||||
with patch("src.services.model_version_probe.settings", mock_settings), \
|
||||
patch("httpx.AsyncClient", return_value=mock_client):
|
||||
@@ -301,7 +301,7 @@ class TestProbeOpenclawNemoVersion:
|
||||
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
|
||||
with patch("src.services.model_version_probe.settings", mock_settings), \
|
||||
patch("httpx.AsyncClient", return_value=mock_client):
|
||||
@@ -333,7 +333,7 @@ class TestProbeAllProviders:
|
||||
"""5 個 provider 全部成功 → 回傳 5 筆 ProviderVersionInfo"""
|
||||
fake_results = [
|
||||
ProviderVersionInfo(provider="ollama", model="qwen2.5:7b-instruct", version="v1"),
|
||||
ProviderVersionInfo(provider="ollama_188", model="qwen2.5:7b-instruct", version="v1"),
|
||||
ProviderVersionInfo(provider="ollama_local", model="qwen2.5:7b-instruct", version="v1"),
|
||||
ProviderVersionInfo(provider="gemini", model="gemini-1.5-flash", version="gemini-1.5-flash"),
|
||||
ProviderVersionInfo(provider="claude", model="claude-haiku-4-5-20251001", version="claude-haiku-4-5-20251001"),
|
||||
ProviderVersionInfo(provider="openclaw_nemo", model="deepseek-r1:14b", version="v1"),
|
||||
@@ -347,7 +347,7 @@ class TestProbeAllProviders:
|
||||
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-A(ADR-110)
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
|
||||
|
||||
with patch("src.services.model_version_probe.settings", mock_settings):
|
||||
@@ -364,8 +364,8 @@ class TestProbeAllProviders:
|
||||
raise RuntimeError("simulated failure")
|
||||
|
||||
async def _fail_ollama(url, model):
|
||||
if "188" in url:
|
||||
raise RuntimeError("188 offline")
|
||||
if "111" in url:
|
||||
raise RuntimeError("local offline")
|
||||
return good
|
||||
|
||||
with patch("src.services.model_version_probe.probe_ollama_version", side_effect=_fail_ollama), \
|
||||
@@ -379,13 +379,13 @@ class TestProbeAllProviders:
|
||||
|
||||
mock_settings = MagicMock()
|
||||
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-A(ADR-110)
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
|
||||
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
|
||||
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
|
||||
|
||||
with patch("src.services.model_version_probe.settings", mock_settings):
|
||||
results = await probe_all_providers()
|
||||
|
||||
# ollama(ok) + ollama_188(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
|
||||
# ollama(ok) + ollama_local(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
|
||||
assert len(results) == 3
|
||||
providers = {r.provider for r in results}
|
||||
assert "ollama" in providers
|
||||
|
||||
@@ -48,7 +48,7 @@ def _make_info(provider: str, version: str = "v1", digest: str | None = "sha256:
|
||||
def _make_five() -> list[ProviderVersionInfo]:
|
||||
return [
|
||||
_make_info("ollama"),
|
||||
_make_info("ollama_188"),
|
||||
_make_info("ollama_local"),
|
||||
_make_info("gemini", digest=None),
|
||||
_make_info("claude", digest=None),
|
||||
_make_info("openclaw_nemo"),
|
||||
|
||||
@@ -310,7 +310,7 @@ class TestSelectProvider:
|
||||
)
|
||||
|
||||
with patch.object(manager, "_write_failover_audit", return_value=None):
|
||||
result = await manager.select_provider()
|
||||
await manager.select_provider()
|
||||
|
||||
# 並行 check 三台主機(GCP-A / GCP-B / Local)
|
||||
assert mock_monitor.check.call_count == 3
|
||||
@@ -625,7 +625,6 @@ class TestWriteFailoverAudit:
|
||||
@pytest.mark.asyncio
|
||||
async def test_audit_uses_structlog_not_db(self):
|
||||
"""_write_failover_audit 應呼叫 structlog,不呼叫 DB"""
|
||||
import structlog
|
||||
manager = _make_manager()
|
||||
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
|
||||
|
||||
@@ -657,22 +656,22 @@ class TestWriteFailoverAudit:
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# B2: AIProviderEnum.OLLAMA_188 存在
|
||||
# 2026-04-25 critic-fix Part2 by Claude Engineer-C2
|
||||
# B2: AIProviderEnum.OLLAMA_LOCAL 存在
|
||||
# 2026-05-06 Codex — 188 不再作為 Ollama Provider
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestAIProviderEnumOllama188:
|
||||
"""B2 修復驗證:AIProviderEnum.OLLAMA_188 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
|
||||
class TestAIProviderEnumOllamaLocal:
|
||||
"""B2 修復驗證:AIProviderEnum.OLLAMA_LOCAL 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
|
||||
|
||||
def test_ollama_188_enum_exists(self):
|
||||
def test_ollama_local_enum_exists(self):
|
||||
from src.services.ai_router import AIProviderEnum
|
||||
assert AIProviderEnum.OLLAMA_188.value == "ollama_188"
|
||||
assert AIProviderEnum.OLLAMA_LOCAL.value == "ollama_local"
|
||||
|
||||
def test_ollama_188_in_latency_budget(self):
|
||||
def test_ollama_local_in_latency_budget(self):
|
||||
from src.services.ai_router import AIProviderEnum, PROVIDER_LATENCY_BUDGET
|
||||
assert AIProviderEnum.OLLAMA_188 in PROVIDER_LATENCY_BUDGET
|
||||
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_188] == 120000
|
||||
assert AIProviderEnum.OLLAMA_LOCAL in PROVIDER_LATENCY_BUDGET
|
||||
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_LOCAL] == 90000
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -42,7 +42,7 @@ from src.services.ollama_health_monitor import (
|
||||
# =============================================================================
|
||||
|
||||
HOST = "http://34.143.170.20:11434" # GCP-A Primary(ADR-110 2026-05-03)
|
||||
HOST_188 = "http://192.168.0.188:11434" # 歷史遺留參考常數(已移出主路由)
|
||||
HOST_LOCAL = "http://192.168.0.111:11434" # Local fallback(已移出 188 主路由)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
|
||||
231
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py
Normal file
231
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py
Normal file
@@ -0,0 +1,231 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services import ai_control as ai_control_module
|
||||
from src.services import ai_router as ai_router_module
|
||||
from src.services import openclaw as openclaw_module
|
||||
from src.services.ai_router import AIProviderEnum
|
||||
from src.services.intent_classifier import IntentType
|
||||
from src.services.openclaw import OpenClawService
|
||||
|
||||
|
||||
@dataclass
|
||||
class _FakeEndpoint:
|
||||
provider_name: str
|
||||
url: str = "http://example.test"
|
||||
|
||||
|
||||
class _FakeRoute:
|
||||
def all_endpoints_in_order(self) -> list[_FakeEndpoint]:
|
||||
return [
|
||||
_FakeEndpoint("ollama_gcp_a"),
|
||||
_FakeEndpoint("ollama_gcp_b"),
|
||||
_FakeEndpoint("ollama_local"),
|
||||
_FakeEndpoint("gemini", ""),
|
||||
]
|
||||
|
||||
|
||||
class _FakeFailoverManager:
|
||||
def __init__(self) -> None:
|
||||
self.task_types: list[str] = []
|
||||
|
||||
async def select_provider(self, task_type: str = "general") -> _FakeRoute:
|
||||
self.task_types.append(task_type)
|
||||
return _FakeRoute()
|
||||
|
||||
|
||||
class _UnorderedFailoverManager:
|
||||
async def select_provider(self, task_type: str = "general") -> SimpleNamespace:
|
||||
return SimpleNamespace(
|
||||
all_endpoints_in_order=lambda: [
|
||||
_FakeEndpoint("ollama_local"),
|
||||
_FakeEndpoint("gemini"),
|
||||
_FakeEndpoint("ollama_gcp_b"),
|
||||
_FakeEndpoint("ollama_gcp_a"),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class _FakeRouter:
    """Fake AI router that always prefers a cloud provider."""

    async def route(self, prompt: str, context: dict[str, Any]) -> SimpleNamespace:
        """Return a fixed routing decision; ``prompt`` and ``context`` are ignored.

        The decision selects Gemini, falls back to Claude and then Ollama, and
        carries a DIAGNOSE intent, so tests can tell whether the caller kept or
        overrode the router's cloud-first order.
        """
        return SimpleNamespace(
            selected_provider=AIProviderEnum.GEMINI,
            fallback_chain=[
                (AIProviderEnum.CLAUDE, "claude"),
                (AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
            ],
            intent=IntentType.DIAGNOSE,
            routing_reason="high complexity would normally prefer cloud",
        )
|
||||
|
||||
|
||||
class _FakeExecutor:
|
||||
def __init__(self) -> None:
|
||||
self.provider_order: list[str] | None = None
|
||||
|
||||
async def execute(
|
||||
self,
|
||||
*,
|
||||
prompt: str,
|
||||
provider_order: list[str],
|
||||
context: dict[str, Any],
|
||||
cache_ttl: int,
|
||||
require_local: bool,
|
||||
) -> SimpleNamespace:
|
||||
self.provider_order = provider_order
|
||||
return SimpleNamespace(
|
||||
raw_response='{"root_cause":"ok","suggested_action":"NO_ACTION"}',
|
||||
provider=provider_order[0],
|
||||
success=True,
|
||||
tokens=42,
|
||||
cost_usd=0.0,
|
||||
latency_ms=10.0,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alert_context_uses_ollama_lane_then_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback allowed, an alert call runs the Ollama lane then Gemini.

    The fake router prefers Gemini/Claude, so the assertions show that an
    alert context replaces the router's order with the failover manager's
    Ollama endpoints and keeps Gemini only as the final entry.
    """
    fake_executor = _FakeExecutor()
    fake_failover = _FakeFailoverManager()

    # Router on, mock mode off, cloud fallback allowed, Ollama-first enforced.
    monkeypatch.setattr(openclaw_module.settings, "USE_AI_ROUTER", True)
    monkeypatch.setattr(openclaw_module.settings, "MOCK_MODE", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    # Stub the runtime control lookups.
    # NOTE(review): None presumably means "no runtime override" — confirm in ai_control.
    monkeypatch.setattr(ai_control_module, "get_ai_router_enabled", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "get_primary_provider", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "is_provider_disabled", AsyncMock(return_value=False))
    # Swap in the fakes for router, executor and Ollama failover manager.
    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    result = await service._call_with_fallback(
        "diagnose alert",
        alert_context={
            "incident_id": "INC-1",
            "alertname": "HostHighCpuLoad",
            "target_resource": "node-exporter-110",
        },
    )

    # The returned tuple mirrors the fake executor's canned result.
    assert result == (
        '{"root_cause":"ok","suggested_action":"NO_ACTION"}',
        "ollama_gcp_a",
        True,
        42,
        0.0,
    )
    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
    # The failover manager was consulted exactly once, for the "diagnose" task.
    assert fake_failover.task_types == ["diagnose"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alert_context_can_disable_cloud_backup_for_cost_stop(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback disabled, an alert call uses only Ollama providers.

    Same setup as the cloud-allowed case except
    ``ALERT_AI_ALLOW_CLOUD_FALLBACK`` is False; the executor must then be
    given the three Ollama endpoints and no Gemini entry.
    """
    fake_executor = _FakeExecutor()
    fake_failover = _FakeFailoverManager()

    # Router on, mock mode off, cloud fallback OFF, Ollama-first enforced.
    monkeypatch.setattr(openclaw_module.settings, "USE_AI_ROUTER", True)
    monkeypatch.setattr(openclaw_module.settings, "MOCK_MODE", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    # Stub the runtime control lookups.
    monkeypatch.setattr(ai_control_module, "get_ai_router_enabled", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "get_primary_provider", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "is_provider_disabled", AsyncMock(return_value=False))
    # Swap in the fakes for router, executor and Ollama failover manager.
    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    await service._call_with_fallback(
        "diagnose alert",
        alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
    )

    # Gemini, present in the fake route, must have been dropped.
    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_alert_context_keeps_router_cloud_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A context without alert fields keeps the router's own provider order.

    The alert settings are the restrictive ones (cloud fallback off,
    Ollama-first on), yet the context holds only ``intent_hint``, so the
    executor must receive the fake router's order: gemini, claude, ollama.
    """
    fake_executor = _FakeExecutor()

    monkeypatch.setattr(openclaw_module.settings, "USE_AI_ROUTER", True)
    monkeypatch.setattr(openclaw_module.settings, "MOCK_MODE", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    # Stub the runtime control lookups.
    monkeypatch.setattr(ai_control_module, "get_ai_router_enabled", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "get_primary_provider", AsyncMock(return_value=None))
    monkeypatch.setattr(ai_control_module, "is_provider_disabled", AsyncMock(return_value=False))
    # Only router and executor are faked; the Ollama failover manager is not patched here.
    monkeypatch.setattr(ai_router_module, "get_ai_router", lambda: _FakeRouter())
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: fake_executor)

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    await service._call_with_fallback("general question", alert_context={"intent_hint": "query"})

    assert fake_executor.provider_order == ["gemini", "claude", "ollama"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alert_context_uses_gcp_a_gcp_b_then_111_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``_resolve_alert_provider_order`` yields gcp_a, gcp_b, then local.

    Calls the resolver directly with cloud fallback disabled and a failover
    manager whose route is already ordered; the Gemini endpoint in that route
    must not appear in the result.
    """
    fake_failover = _FakeFailoverManager()

    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: fake_failover)

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    provider_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
    )

    assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The resolver re-sorts a shuffled route and removes cloud entries.

    The fake manager returns local, gemini, gcp_b, gcp_a; with cloud fallback
    disabled the expected result is gcp_a, gcp_b, local and no gemini.
    """
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", False)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: _UnorderedFailoverManager())

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    provider_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
    )

    assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback allowed, the sorted Ollama lane precedes Gemini.

    The fake manager returns a shuffled route and the caller supplies
    ``cloud_provider_order=["claude", "gemini", "ollama"]``. The expected
    result is the three Ollama endpoints in priority order followed by gemini
    only; neither "claude" nor the generic "ollama" entry appears.
    """
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: _UnorderedFailoverManager())

    # object.__new__ creates the service without running __init__.
    service = object.__new__(OpenClawService)
    provider_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
        cloud_provider_order=["claude", "gemini", "ollama"],
    )

    assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
|
||||
@@ -18,7 +18,7 @@ import pytest
|
||||
from src.core.prompts import OPENCLAW_TEST_PROMPT
|
||||
|
||||
# Ollama 配置
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
|
||||
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
|
||||
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)
|
||||
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
# 必填 (REQUIRED)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# API 後端 URL(Next.js build-time 寫入 JS bundle,禁止使用內網 IP)
|
||||
NEXT_PUBLIC_API_URL=http://192.168.0.188:32334
|
||||
# API 後端 URL(Next.js build-time 寫入 JS bundle)
|
||||
NEXT_PUBLIC_API_URL=https://awoooi.wooo.work
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# 可選 (OPTIONAL)
|
||||
@@ -19,17 +19,17 @@ NEXT_PUBLIC_API_URL=http://192.168.0.188:32334
|
||||
NEXT_PUBLIC_ENABLE_DEMO=false
|
||||
|
||||
# SignOz 可觀測性平台 URL
|
||||
NEXT_PUBLIC_SIGNOZ_URL=http://192.168.0.110:3301
|
||||
NEXT_PUBLIC_SIGNOZ_URL=https://signoz.wooo.work
|
||||
|
||||
# 主機 IP 列表(逗號分隔,live-dashboard 用於 fallback 顯示)
|
||||
NEXT_PUBLIC_HOST_IPS=192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188
|
||||
NEXT_PUBLIC_HOST_IPS=devops,security,k3s,ai-web
|
||||
|
||||
# K8s Cluster VIP 資訊字串(host-grid 顯示用)
|
||||
NEXT_PUBLIC_K8S_VIP_INFO=VIP 192.168.0.125 · kubectl :6443 · Web :32335 · API :32334
|
||||
NEXT_PUBLIC_K8S_VIP_INFO=K8S VIP topology (ops-only) · kubectl:6443 · web:32335 · api:32334
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Server-side Only(不含 NEXT_PUBLIC_ 前綴,不會暴露在 JS bundle)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# Sentry 自建主機 URL(sentry-tunnel route handler 使用)
|
||||
SENTRY_HOST=http://192.168.0.110:9000
|
||||
SENTRY_HOST=http://sentry.internal:9000
|
||||
|
||||
@@ -44,6 +44,8 @@ ARG NEXT_PUBLIC_SENTRY_DSN=
|
||||
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
|
||||
ENV NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN}
|
||||
ENV NEXT_TELEMETRY_DISABLED=1
|
||||
# 2026-05-05 ogt + Codex: keep self-hosted 110 runner builds from saturating CPU.
|
||||
ENV NEXT_PRIVATE_BUILD_WORKER_COUNT=1
|
||||
|
||||
# 2026-04-06 ogt: --mount=type=cache 持久化 .next/cache,跨 build 增量編譯
|
||||
# 只有變更的頁面重新編譯,未變更頁面直接用 cache → 節省 3-4 min
|
||||
@@ -51,7 +53,7 @@ ENV NEXT_TELEMETRY_DISABLED=1
|
||||
# /root/.cache/turbo 存放 turbo 的 task 輸出快取,避免每次重跑未變動的 packages
|
||||
RUN --mount=type=cache,target=/app/apps/web/.next/cache \
|
||||
--mount=type=cache,target=/root/.cache/turbo \
|
||||
pnpm turbo build --filter=@awoooi/web
|
||||
pnpm turbo build --filter=@awoooi/web --concurrency=1
|
||||
|
||||
FROM base AS runner
|
||||
WORKDIR /app
|
||||
|
||||
@@ -67,7 +67,8 @@
|
||||
"operations": "Operations",
|
||||
"securityCompliance": "Security & Compliance",
|
||||
"classicAICenter": "Classic AI Center",
|
||||
"governance": "AI Governance"
|
||||
"governance": "AI Governance",
|
||||
"awooop": "AwoooP"
|
||||
},
|
||||
"locale": {
|
||||
"switch": "Switch Language",
|
||||
@@ -1480,4 +1481,4 @@
|
||||
"retry": "Retry"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,7 +67,8 @@
|
||||
"operations": "營運",
|
||||
"securityCompliance": "安全合規",
|
||||
"classicAICenter": "經典 AI 中心",
|
||||
"governance": "AI 治理"
|
||||
"governance": "AI 治理",
|
||||
"awooop": "AwoooP"
|
||||
},
|
||||
"locale": {
|
||||
"switch": "切換語系",
|
||||
@@ -1481,4 +1482,4 @@
|
||||
"retry": "重試"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ function ApprovalRow({ approval }: { approval: Approval }) {
|
||||
<tr
|
||||
className={cn(
|
||||
"border-b border-border hover:bg-accent/30 transition-colors",
|
||||
isCritical && "bg-red-900/10 hover:bg-red-900/20"
|
||||
isCritical && "bg-[#fff0ef] hover:bg-[#ffe4e1]"
|
||||
)}
|
||||
>
|
||||
<td className="px-4 py-3">
|
||||
@@ -232,7 +232,7 @@ export default function ApprovalsPage() {
|
||||
|
||||
{/* Error State */}
|
||||
{error && (
|
||||
<div className="flex items-start gap-3 p-4 bg-red-900/20 border border-red-800/40 rounded-lg">
|
||||
<div className="flex items-start gap-3 border border-[#e2a29b] bg-[#fff0ef] p-4">
|
||||
<AlertCircle className="w-5 h-5 text-red-400 flex-shrink-0 mt-0.5" aria-hidden="true" />
|
||||
<div>
|
||||
<p className="text-sm font-medium text-red-300">無法載入審批資料</p>
|
||||
@@ -243,7 +243,7 @@ export default function ApprovalsPage() {
|
||||
|
||||
{/* Empty State — 所有審批已處理 */}
|
||||
{!loading && approvals.length === 0 && !error && (
|
||||
<div className="flex flex-col items-center justify-center py-16 bg-card border border-border rounded-xl">
|
||||
<div className="flex flex-col items-center justify-center border border-[#e0ddd4] bg-white py-16">
|
||||
<ShieldCheck className="w-12 h-12 text-green-400 mb-3" aria-hidden="true" />
|
||||
<p className="text-sm font-medium text-foreground mb-1">審批佇列為空</p>
|
||||
<p className="text-xs text-muted-foreground">目前沒有待審批的 Run</p>
|
||||
@@ -252,7 +252,7 @@ export default function ApprovalsPage() {
|
||||
|
||||
{/* Table */}
|
||||
{(loading || approvals.length > 0) && (
|
||||
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
|
||||
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
|
||||
<div className="overflow-x-auto">
|
||||
<table className="w-full" role="table" aria-label="審批佇列">
|
||||
<thead>
|
||||
|
||||
@@ -19,20 +19,25 @@ import { cn } from "@/lib/utils";
|
||||
// Types
|
||||
// =============================================================================
|
||||
|
||||
type ContractStatus = "draft" | "published" | "active";
|
||||
type ContractStatus = "draft" | "published" | "active" | "revoked";
|
||||
|
||||
interface Contract {
|
||||
id: string;
|
||||
contract_id: string;
|
||||
contract_family: string;
|
||||
project_id: string;
|
||||
status: ContractStatus;
|
||||
lifecycle_status: ContractStatus;
|
||||
body_hash: string;
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
interface Tenant {
|
||||
project_id: string;
|
||||
name: string;
|
||||
display_name: string;
|
||||
}
|
||||
|
||||
interface ContractsResponse {
|
||||
contracts?: Contract[];
|
||||
items?: Contract[];
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
@@ -47,21 +52,27 @@ const STATUS_CONFIG: Record<
|
||||
> = {
|
||||
draft: {
|
||||
label: "草稿",
|
||||
bg: "bg-gray-800",
|
||||
text: "text-gray-300",
|
||||
border: "border-gray-600",
|
||||
bg: "bg-[#f4f1e8]",
|
||||
text: "text-[#5f5b52]",
|
||||
border: "border-[#d8d3c7]",
|
||||
},
|
||||
published: {
|
||||
label: "已發佈",
|
||||
bg: "bg-blue-900/40",
|
||||
text: "text-blue-300",
|
||||
border: "border-blue-600/40",
|
||||
bg: "bg-[#eef5ff]",
|
||||
text: "text-[#1f5b9b]",
|
||||
border: "border-[#9bb6d9]",
|
||||
},
|
||||
active: {
|
||||
label: "生效中",
|
||||
bg: "bg-green-900/40",
|
||||
text: "text-green-300",
|
||||
border: "border-green-600/40",
|
||||
bg: "bg-[#f0faf2]",
|
||||
text: "text-[#17602a]",
|
||||
border: "border-[#9bc7a4]",
|
||||
},
|
||||
revoked: {
|
||||
label: "已撤銷",
|
||||
bg: "bg-[#fff0ef]",
|
||||
text: "text-[#9f2f25]",
|
||||
border: "border-[#e2a29b]",
|
||||
},
|
||||
};
|
||||
|
||||
@@ -109,7 +120,7 @@ function ContractRow({ contract }: { contract: Contract }) {
|
||||
</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<StatusBadge status={contract.status} />
|
||||
<StatusBadge status={contract.lifecycle_status} />
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<span className="font-mono text-xs text-muted-foreground bg-muted px-2 py-0.5 rounded">
|
||||
@@ -140,7 +151,10 @@ export default function ContractsPage() {
|
||||
useEffect(() => {
|
||||
fetch(`${API_BASE}/api/v1/platform/tenants`)
|
||||
.then((r) => r.json())
|
||||
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
|
||||
.then((data) => {
|
||||
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
|
||||
setTenants(Array.isArray(rows) ? rows : []);
|
||||
})
|
||||
.catch(() => {});
|
||||
}, []);
|
||||
|
||||
@@ -154,8 +168,9 @@ export default function ContractsPage() {
|
||||
`${API_BASE}/api/v1/platform/contracts?${params.toString()}`
|
||||
);
|
||||
if (!res.ok) throw new Error(`HTTP ${res.status}`);
|
||||
const data = await res.json();
|
||||
setContracts(Array.isArray(data.items) ? data.items : []);
|
||||
const data: ContractsResponse = await res.json();
|
||||
const rows = Array.isArray(data.contracts) ? data.contracts : data.items;
|
||||
setContracts(Array.isArray(rows) ? rows : []);
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : "載入失敗");
|
||||
} finally {
|
||||
@@ -192,7 +207,7 @@ export default function ContractsPage() {
|
||||
</div>
|
||||
|
||||
{/* Filters */}
|
||||
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl">
|
||||
<div className="flex items-center gap-3 border border-[#e0ddd4] bg-white p-4">
|
||||
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
|
||||
<span className="text-sm text-muted-foreground">篩選:</span>
|
||||
<div className="relative">
|
||||
@@ -205,7 +220,7 @@ export default function ContractsPage() {
|
||||
<option value="">所有租戶</option>
|
||||
{tenants.map((t) => (
|
||||
<option key={t.project_id} value={t.project_id}>
|
||||
{t.name || t.project_id}
|
||||
{t.display_name || t.project_id}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
@@ -225,7 +240,7 @@ export default function ContractsPage() {
|
||||
)}
|
||||
|
||||
{/* Table */}
|
||||
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
|
||||
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
|
||||
<div className="overflow-x-auto">
|
||||
<table className="w-full" role="table" aria-label="合約清單">
|
||||
<thead>
|
||||
@@ -269,7 +284,7 @@ export default function ContractsPage() {
|
||||
</tr>
|
||||
) : (
|
||||
contracts.map((contract) => (
|
||||
<ContractRow key={contract.id} contract={contract} />
|
||||
<ContractRow key={contract.contract_id + contract.body_hash} contract={contract} />
|
||||
))
|
||||
)}
|
||||
</tbody>
|
||||
|
||||
@@ -6,8 +6,9 @@
|
||||
|
||||
"use client";
|
||||
|
||||
import { AppLayout } from "@/components/layout";
|
||||
import { Link, usePathname } from "@/i18n/routing";
|
||||
import { Building2, FileText, Activity, ShieldCheck } from "lucide-react";
|
||||
import { Activity, BrainCircuit, Building2, ClipboardList, FileText, ShieldCheck } from "lucide-react";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
// =============================================================================
|
||||
@@ -15,6 +16,11 @@ import { cn } from "@/lib/utils";
|
||||
// =============================================================================
|
||||
|
||||
const navItems = [
|
||||
{
|
||||
label: "工作鏈路",
|
||||
href: "/awooop/work-items" as const,
|
||||
icon: ClipboardList,
|
||||
},
|
||||
{
|
||||
label: "租戶管理",
|
||||
href: "/awooop/tenants" as const,
|
||||
@@ -43,64 +49,72 @@ const navItems = [
|
||||
|
||||
export default function AwoooPLayout({
|
||||
children,
|
||||
params,
|
||||
}: {
|
||||
children: React.ReactNode;
|
||||
params: { locale: string };
|
||||
}) {
|
||||
const pathname = usePathname();
|
||||
|
||||
return (
|
||||
<div className="min-h-full flex flex-col">
|
||||
{/* Console Header */}
|
||||
<div className="bg-card border-b border-border px-6 py-4">
|
||||
<div className="flex items-center justify-between mb-4">
|
||||
<div>
|
||||
<h1 className="text-xl font-bold text-foreground tracking-tight">
|
||||
AwoooP Operator Console
|
||||
</h1>
|
||||
<p className="text-xs text-muted-foreground mt-0.5">
|
||||
Agent 平台管理員後台 — 租戶 · 合約 · Run · 審批
|
||||
</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs font-medium bg-brand-accent/10 text-brand-accent border border-brand-accent/20">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-brand-accent animate-pulse" />
|
||||
<AppLayout locale={params.locale} showBackground={false}>
|
||||
<div className="min-h-[calc(100vh-116px)] bg-[#f7f5ee] border border-[#e0ddd4]">
|
||||
<div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-5 py-4">
|
||||
<div className="flex flex-wrap items-center justify-between gap-3">
|
||||
<div className="flex items-center gap-3">
|
||||
<span className="flex h-9 w-9 items-center justify-center border border-[#d8d3c7] bg-white text-[#141413]">
|
||||
<BrainCircuit className="h-4 w-4" aria-hidden="true" />
|
||||
</span>
|
||||
<div>
|
||||
<h1 className="text-lg font-semibold tracking-normal text-[#141413]">
|
||||
AwoooP Operator Console
|
||||
</h1>
|
||||
<div className="mt-1 flex items-center gap-2 text-xs text-[#77736a]">
|
||||
<span className="font-mono">Control Plane</span>
|
||||
<span className="h-1 w-1 rounded-full bg-[#d97757]" />
|
||||
<span className="font-mono">Shadow First</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<span className="inline-flex items-center gap-2 border border-[#d8d3c7] bg-white px-3 py-1.5 text-xs font-semibold text-[#141413]">
|
||||
<span className="h-1.5 w-1.5 rounded-full bg-[#22c55e]" />
|
||||
OPERATOR
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<nav
|
||||
className="mt-4 flex flex-wrap gap-1"
|
||||
role="navigation"
|
||||
aria-label="AwoooP 主要導航"
|
||||
>
|
||||
{navItems.map((item) => {
|
||||
const Icon = item.icon;
|
||||
const isActive =
|
||||
pathname === item.href ||
|
||||
pathname?.startsWith(item.href + "/");
|
||||
|
||||
return (
|
||||
<Link
|
||||
key={item.href}
|
||||
href={item.href}
|
||||
aria-current={isActive ? "page" : undefined}
|
||||
className={cn(
|
||||
"inline-flex items-center gap-2 border px-3 py-2 text-sm font-medium transition-colors",
|
||||
isActive
|
||||
? "border-[#d97757] bg-white text-[#141413]"
|
||||
: "border-transparent text-[#77736a] hover:border-[#d8d3c7] hover:bg-white hover:text-[#141413]"
|
||||
)}
|
||||
>
|
||||
<Icon className="h-4 w-4" aria-hidden="true" />
|
||||
{item.label}
|
||||
</Link>
|
||||
);
|
||||
})}
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
{/* Tab Navigation */}
|
||||
<nav className="flex gap-1" role="navigation" aria-label="AwoooP 主要導航">
|
||||
{navItems.map((item) => {
|
||||
const Icon = item.icon;
|
||||
const isActive =
|
||||
pathname === item.href ||
|
||||
pathname?.startsWith(item.href + "/");
|
||||
|
||||
return (
|
||||
<Link
|
||||
key={item.href}
|
||||
href={item.href}
|
||||
aria-current={isActive ? "page" : undefined}
|
||||
className={cn(
|
||||
"flex items-center gap-2 px-4 py-2 rounded-lg text-sm font-medium transition-all duration-150",
|
||||
isActive
|
||||
? "bg-brand-accent/15 text-brand-accent border border-brand-accent/30"
|
||||
: "text-muted-foreground hover:text-foreground hover:bg-accent"
|
||||
)}
|
||||
>
|
||||
<Icon className="w-4 h-4" aria-hidden="true" />
|
||||
{item.label}
|
||||
</Link>
|
||||
);
|
||||
})}
|
||||
</nav>
|
||||
<main className="px-5 py-5">{children}</main>
|
||||
</div>
|
||||
|
||||
{/* Page Content */}
|
||||
<main className="flex-1 px-6 py-6">
|
||||
{children}
|
||||
</main>
|
||||
</div>
|
||||
</AppLayout>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2,8 +2,12 @@
|
||||
// WOOO AIOps - AwoooP Console 入口重導向
|
||||
// =============================================================================
|
||||
|
||||
import { redirect } from "@/i18n/routing";
|
||||
import { redirect } from "next/navigation";
|
||||
|
||||
export default function AwoooPPage() {
|
||||
redirect("/awooop/tenants");
|
||||
export default function AwoooPPage({
|
||||
params,
|
||||
}: {
|
||||
params: { locale: string };
|
||||
}) {
|
||||
redirect(`/${params.locale}/awooop/work-items`);
|
||||
}
|
||||
|
||||
@@ -23,16 +23,14 @@ import { cn } from "@/lib/utils";
|
||||
// =============================================================================
|
||||
|
||||
type RunState =
|
||||
| "CREATED"
|
||||
| "QUEUED"
|
||||
| "POLICY_RESOLVED"
|
||||
| "RUNNING"
|
||||
| "WAITING_TOOL"
|
||||
| "WAITING_APPROVAL"
|
||||
| "RESUMED"
|
||||
| "COMPLETED"
|
||||
| "FAILED"
|
||||
| "CANCELLED";
|
||||
| "pending"
|
||||
| "running"
|
||||
| "waiting_tool"
|
||||
| "waiting_approval"
|
||||
| "completed"
|
||||
| "failed"
|
||||
| "cancelled"
|
||||
| "timeout";
|
||||
|
||||
interface Run {
|
||||
run_id: string;
|
||||
@@ -40,18 +38,19 @@ interface Run {
|
||||
agent_id: string;
|
||||
state: RunState;
|
||||
is_shadow: boolean;
|
||||
token_usage_input: number | null;
|
||||
token_usage_output: number | null;
|
||||
cost_usd: number | string;
|
||||
step_count: number;
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
interface Tenant {
|
||||
project_id: string;
|
||||
name: string;
|
||||
display_name: string;
|
||||
}
|
||||
|
||||
interface RunsResponse {
|
||||
items: Run[];
|
||||
runs?: Run[];
|
||||
items?: Run[];
|
||||
total: number;
|
||||
page: number;
|
||||
per_page: number;
|
||||
@@ -69,66 +68,54 @@ const STATE_CONFIG: Record<
|
||||
RunState,
|
||||
{ label: string; bg: string; text: string; border: string; pulse?: boolean }
|
||||
> = {
|
||||
CREATED: {
|
||||
label: "已建立",
|
||||
bg: "bg-gray-800",
|
||||
text: "text-gray-300",
|
||||
border: "border-gray-600",
|
||||
pending: {
|
||||
label: "待執行",
|
||||
bg: "bg-[#f4f1e8]",
|
||||
text: "text-[#5f5b52]",
|
||||
border: "border-[#d8d3c7]",
|
||||
},
|
||||
QUEUED: {
|
||||
label: "排隊中",
|
||||
bg: "bg-gray-800",
|
||||
text: "text-gray-400",
|
||||
border: "border-gray-600",
|
||||
},
|
||||
POLICY_RESOLVED: {
|
||||
label: "策略已解析",
|
||||
bg: "bg-blue-900/40",
|
||||
text: "text-blue-300",
|
||||
border: "border-blue-600/40",
|
||||
},
|
||||
RUNNING: {
|
||||
running: {
|
||||
label: "執行中",
|
||||
bg: "bg-green-900/40",
|
||||
text: "text-green-300",
|
||||
border: "border-green-600/40",
|
||||
bg: "bg-[#f0faf2]",
|
||||
text: "text-[#17602a]",
|
||||
border: "border-[#9bc7a4]",
|
||||
pulse: true,
|
||||
},
|
||||
WAITING_TOOL: {
|
||||
waiting_tool: {
|
||||
label: "等待工具",
|
||||
bg: "bg-yellow-900/40",
|
||||
text: "text-yellow-300",
|
||||
border: "border-yellow-600/40",
|
||||
bg: "bg-[#fff7e8]",
|
||||
text: "text-[#8a5a08]",
|
||||
border: "border-[#d9b36f]",
|
||||
},
|
||||
WAITING_APPROVAL: {
|
||||
waiting_approval: {
|
||||
label: "等待審批",
|
||||
bg: "bg-yellow-900/40",
|
||||
text: "text-yellow-300",
|
||||
border: "border-yellow-600/40",
|
||||
bg: "bg-[#fff7e8]",
|
||||
text: "text-[#8a5a08]",
|
||||
border: "border-[#d9b36f]",
|
||||
},
|
||||
RESUMED: {
|
||||
label: "已恢復",
|
||||
bg: "bg-purple-900/40",
|
||||
text: "text-purple-300",
|
||||
border: "border-purple-600/40",
|
||||
},
|
||||
COMPLETED: {
|
||||
completed: {
|
||||
label: "已完成",
|
||||
bg: "bg-green-900/40",
|
||||
text: "text-green-400",
|
||||
border: "border-green-600/40",
|
||||
bg: "bg-[#f0faf2]",
|
||||
text: "text-[#17602a]",
|
||||
border: "border-[#9bc7a4]",
|
||||
},
|
||||
FAILED: {
|
||||
failed: {
|
||||
label: "失敗",
|
||||
bg: "bg-red-900/40",
|
||||
text: "text-red-300",
|
||||
border: "border-red-600/40",
|
||||
bg: "bg-[#fff0ef]",
|
||||
text: "text-[#9f2f25]",
|
||||
border: "border-[#e2a29b]",
|
||||
},
|
||||
CANCELLED: {
|
||||
cancelled: {
|
||||
label: "已取消",
|
||||
bg: "bg-red-900/30",
|
||||
text: "text-red-400",
|
||||
border: "border-red-700/40",
|
||||
bg: "bg-[#fff0ef]",
|
||||
text: "text-[#9f2f25]",
|
||||
border: "border-[#e2a29b]",
|
||||
},
|
||||
timeout: {
|
||||
label: "已超時",
|
||||
bg: "bg-[#fff0ef]",
|
||||
text: "text-[#9f2f25]",
|
||||
border: "border-[#e2a29b]",
|
||||
},
|
||||
};
|
||||
|
||||
@@ -137,7 +124,7 @@ const STATE_CONFIG: Record<
|
||||
// =============================================================================
|
||||
|
||||
function RunStateBadge({ state }: { state: RunState }) {
|
||||
const config = STATE_CONFIG[state] ?? STATE_CONFIG.CREATED;
|
||||
const config = STATE_CONFIG[state] ?? STATE_CONFIG.pending;
|
||||
return (
|
||||
<span
|
||||
className={cn(
|
||||
@@ -158,7 +145,7 @@ function RunStateBadge({ state }: { state: RunState }) {
|
||||
function ShadowBadge({ isShadow }: { isShadow: boolean }) {
|
||||
if (!isShadow) return <span className="text-muted-foreground text-sm">--</span>;
|
||||
return (
|
||||
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-800 text-gray-400 border border-gray-600">
|
||||
<span className="inline-flex items-center border border-[#d8d3c7] bg-white px-2 py-0.5 text-xs font-medium text-[#5f5b52]">
|
||||
Shadow
|
||||
</span>
|
||||
);
|
||||
@@ -174,8 +161,7 @@ function RunRow({ run }: { run: Run }) {
|
||||
})
|
||||
: "--";
|
||||
|
||||
const totalTokens =
|
||||
(run.token_usage_input ?? 0) + (run.token_usage_output ?? 0);
|
||||
const cost = Number(run.cost_usd ?? 0);
|
||||
|
||||
return (
|
||||
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
|
||||
@@ -202,12 +188,12 @@ function RunRow({ run }: { run: Run }) {
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<span className="flex items-center gap-1 text-sm font-mono text-muted-foreground">
|
||||
{totalTokens > 0 ? (
|
||||
{run.step_count > 0 || cost > 0 ? (
|
||||
<>
|
||||
<Cpu className="w-3.5 h-3.5" aria-hidden="true" />
|
||||
{totalTokens.toLocaleString()}
|
||||
{run.step_count.toLocaleString()} steps
|
||||
<span className="text-xs text-muted-foreground/60">
|
||||
({run.token_usage_input ?? 0}↑ {run.token_usage_output ?? 0}↓)
|
||||
(${cost.toFixed(4)})
|
||||
</span>
|
||||
</>
|
||||
) : (
|
||||
@@ -244,7 +230,10 @@ export default function RunsPage() {
|
||||
useEffect(() => {
|
||||
fetch(`${API_BASE}/api/v1/platform/tenants`)
|
||||
.then((r) => r.json())
|
||||
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
|
||||
.then((data) => {
|
||||
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
|
||||
setTenants(Array.isArray(rows) ? rows : []);
|
||||
})
|
||||
.catch(() => {});
|
||||
}, []);
|
||||
|
||||
@@ -253,7 +242,7 @@ export default function RunsPage() {
|
||||
setError(null);
|
||||
const params = new URLSearchParams();
|
||||
if (projectFilter) params.set("project_id", projectFilter);
|
||||
if (statusFilter) params.set("status", statusFilter);
|
||||
if (statusFilter) params.set("state", statusFilter);
|
||||
params.set("page", String(page));
|
||||
params.set("per_page", String(PER_PAGE));
|
||||
|
||||
@@ -262,7 +251,8 @@ export default function RunsPage() {
|
||||
);
|
||||
if (!res.ok) throw new Error(`HTTP ${res.status}`);
|
||||
const data: RunsResponse = await res.json();
|
||||
setRuns(Array.isArray(data.items) ? data.items : []);
|
||||
const rows = Array.isArray(data.runs) ? data.runs : data.items;
|
||||
setRuns(Array.isArray(rows) ? rows : []);
|
||||
setTotal(data.total ?? 0);
|
||||
setLastRefresh(new Date());
|
||||
} catch (err) {
|
||||
@@ -320,7 +310,7 @@ export default function RunsPage() {
|
||||
</div>
|
||||
|
||||
{/* Filters */}
|
||||
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl flex-wrap">
|
||||
<div className="flex flex-wrap items-center gap-3 border border-[#e0ddd4] bg-white p-4">
|
||||
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
|
||||
<span className="text-sm text-muted-foreground">篩選:</span>
|
||||
|
||||
@@ -335,7 +325,7 @@ export default function RunsPage() {
|
||||
<option value="">所有租戶</option>
|
||||
{tenants.map((t) => (
|
||||
<option key={t.project_id} value={t.project_id}>
|
||||
{t.name || t.project_id}
|
||||
{t.display_name || t.project_id}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
@@ -373,7 +363,7 @@ export default function RunsPage() {
|
||||
)}
|
||||
|
||||
{/* Table */}
|
||||
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
|
||||
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
|
||||
<div className="overflow-x-auto">
|
||||
<table className="w-full" role="table" aria-label="Run 清單">
|
||||
<thead>
|
||||
@@ -394,7 +384,7 @@ export default function RunsPage() {
|
||||
Shadow
|
||||
</th>
|
||||
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">
|
||||
Token 用量
|
||||
成本 / Steps
|
||||
</th>
|
||||
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">
|
||||
建立時間
|
||||
|
||||
@@ -21,6 +21,7 @@ import { cn } from "@/lib/utils";
|
||||
// =============================================================================
|
||||
|
||||
type MigrationMode =
|
||||
| "legacy_awoooi_default"
|
||||
| "shadow"
|
||||
| "canary"
|
||||
| "read_only"
|
||||
@@ -29,14 +30,15 @@ type MigrationMode =
|
||||
|
||||
interface Tenant {
|
||||
project_id: string;
|
||||
name: string;
|
||||
display_name: string;
|
||||
migration_mode: MigrationMode;
|
||||
budget_limit_usd: number | null;
|
||||
is_suspended: boolean;
|
||||
budget_limit_usd: number | string | null;
|
||||
is_active: boolean;
|
||||
}
|
||||
|
||||
interface ApiResponse {
|
||||
items: Tenant[];
|
||||
tenants?: Tenant[];
|
||||
items?: Tenant[];
|
||||
total: number;
|
||||
}
|
||||
|
||||
@@ -50,35 +52,41 @@ const MIGRATION_MODE_CONFIG: Record<
|
||||
MigrationMode,
|
||||
{ label: string; bg: string; text: string; border: string }
|
||||
> = {
|
||||
legacy_awoooi_default: {
|
||||
label: "Legacy",
|
||||
bg: "bg-white",
|
||||
text: "text-[#5f5b52]",
|
||||
border: "border-[#d8d3c7]",
|
||||
},
|
||||
shadow: {
|
||||
label: "Shadow",
|
||||
bg: "bg-gray-800",
|
||||
text: "text-gray-300",
|
||||
border: "border-gray-600",
|
||||
bg: "bg-[#f4f1e8]",
|
||||
text: "text-[#5f5b52]",
|
||||
border: "border-[#d8d3c7]",
|
||||
},
|
||||
canary: {
|
||||
label: "Canary",
|
||||
bg: "bg-yellow-900/40",
|
||||
text: "text-yellow-300",
|
||||
border: "border-yellow-600/40",
|
||||
bg: "bg-[#fff7e8]",
|
||||
text: "text-[#8a5a08]",
|
||||
border: "border-[#d9b36f]",
|
||||
},
|
||||
read_only: {
|
||||
label: "Read Only",
|
||||
bg: "bg-blue-900/40",
|
||||
text: "text-blue-300",
|
||||
border: "border-blue-600/40",
|
||||
bg: "bg-[#eef5ff]",
|
||||
text: "text-[#1f5b9b]",
|
||||
border: "border-[#9bb6d9]",
|
||||
},
|
||||
suggest: {
|
||||
label: "Suggest",
|
||||
bg: "bg-purple-900/40",
|
||||
text: "text-purple-300",
|
||||
border: "border-purple-600/40",
|
||||
bg: "bg-[#f6f0ff]",
|
||||
text: "text-[#6541a5]",
|
||||
border: "border-[#baa7de]",
|
||||
},
|
||||
auto_remediate: {
|
||||
label: "Auto Remediate",
|
||||
bg: "bg-green-900/40",
|
||||
text: "text-green-300",
|
||||
border: "border-green-600/40",
|
||||
bg: "bg-[#f0faf2]",
|
||||
text: "text-[#17602a]",
|
||||
border: "border-[#9bc7a4]",
|
||||
},
|
||||
};
|
||||
|
||||
@@ -104,12 +112,12 @@ function MigrationModeBadge({ mode }: { mode: MigrationMode }) {
|
||||
|
||||
function SuspendedBadge({ suspended }: { suspended: boolean }) {
|
||||
return suspended ? (
|
||||
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-red-900/40 text-red-300 border border-red-600/40">
|
||||
<span className="inline-flex items-center gap-1 border border-[#e2a29b] bg-[#fff0ef] px-2.5 py-0.5 text-xs font-medium text-[#9f2f25]">
|
||||
<Ban className="w-3 h-3" aria-hidden="true" />
|
||||
停用
|
||||
</span>
|
||||
) : (
|
||||
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-green-900/40 text-green-300 border border-green-600/40">
|
||||
<span className="inline-flex items-center gap-1 border border-[#9bc7a4] bg-[#f0faf2] px-2.5 py-0.5 text-xs font-medium text-[#17602a]">
|
||||
<CheckCircle2 className="w-3 h-3" aria-hidden="true" />
|
||||
正常
|
||||
</span>
|
||||
@@ -117,6 +125,9 @@ function SuspendedBadge({ suspended }: { suspended: boolean }) {
|
||||
}
|
||||
|
||||
function TenantRow({ tenant }: { tenant: Tenant }) {
|
||||
const budget =
|
||||
tenant.budget_limit_usd == null ? null : Number(tenant.budget_limit_usd);
|
||||
|
||||
return (
|
||||
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
|
||||
<td className="px-4 py-3">
|
||||
@@ -125,7 +136,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
|
||||
</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<span className="text-sm text-foreground font-medium">{tenant.name || "--"}</span>
|
||||
<span className="text-sm text-foreground font-medium">{tenant.display_name || "--"}</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<MigrationModeBadge mode={tenant.migration_mode} />
|
||||
@@ -135,7 +146,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
|
||||
{tenant.budget_limit_usd != null ? (
|
||||
<>
|
||||
<DollarSign className="w-3.5 h-3.5" aria-hidden="true" />
|
||||
{tenant.budget_limit_usd.toLocaleString("en-US", {
|
||||
{budget?.toLocaleString("en-US", {
|
||||
minimumFractionDigits: 2,
|
||||
})}
|
||||
</>
|
||||
@@ -145,7 +156,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
|
||||
</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<SuspendedBadge suspended={tenant.is_suspended} />
|
||||
<SuspendedBadge suspended={!tenant.is_active} />
|
||||
</td>
|
||||
</tr>
|
||||
);
|
||||
@@ -166,7 +177,8 @@ export default function TenantsPage() {
|
||||
const res = await fetch(`${API_BASE}/api/v1/platform/tenants`);
|
||||
if (!res.ok) throw new Error(`HTTP ${res.status}`);
|
||||
const data: ApiResponse = await res.json();
|
||||
setTenants(Array.isArray(data.items) ? data.items : []);
|
||||
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
|
||||
setTenants(Array.isArray(rows) ? rows : []);
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : "載入失敗");
|
||||
} finally {
|
||||
@@ -214,7 +226,7 @@ export default function TenantsPage() {
|
||||
)}
|
||||
|
||||
{/* Table */}
|
||||
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
|
||||
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
|
||||
<div className="overflow-x-auto">
|
||||
<table className="w-full" role="table" aria-label="租戶清單">
|
||||
<thead>
|
||||
|
||||
254
apps/web/src/app/[locale]/awooop/work-items/page.tsx
Normal file
254
apps/web/src/app/[locale]/awooop/work-items/page.tsx
Normal file
@@ -0,0 +1,254 @@
|
||||
// =============================================================================
|
||||
// WOOO AIOps - AwoooP 工作鏈路
|
||||
// =============================================================================
|
||||
// 將 AwoooP 實施項目對齊到 Operator Console 可觀測面。
|
||||
|
||||
"use client";
|
||||
|
||||
import {
|
||||
Activity,
|
||||
ArrowRight,
|
||||
ClipboardList,
|
||||
Database,
|
||||
Gauge,
|
||||
GitBranch,
|
||||
Network,
|
||||
ShieldCheck,
|
||||
} from "lucide-react";
|
||||
import { Link } from "@/i18n/routing";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
type WorkStatus = "live" | "in_progress" | "blocked" | "watching";
|
||||
|
||||
type WorkItem = {
|
||||
phase: string;
|
||||
title: string;
|
||||
status: WorkStatus;
|
||||
surface: string;
|
||||
source: string;
|
||||
gate: string;
|
||||
href: "/awooop/tenants" | "/awooop/contracts" | "/awooop/runs" | "/awooop/approvals";
|
||||
};
|
||||
|
||||
const statusConfig: Record<WorkStatus, { label: string; className: string }> = {
|
||||
live: {
|
||||
label: "已接線",
|
||||
className: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
|
||||
},
|
||||
in_progress: {
|
||||
label: "推進中",
|
||||
className: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
|
||||
},
|
||||
blocked: {
|
||||
label: "阻塞",
|
||||
className: "border-[#e2a29b] bg-[#fff0ef] text-[#9f2f25]",
|
||||
},
|
||||
watching: {
|
||||
label: "觀察期",
|
||||
className: "border-[#9bb6d9] bg-[#eef5ff] text-[#1f5b9b]",
|
||||
},
|
||||
};
|
||||
|
||||
const workItems: WorkItem[] = [
|
||||
{
|
||||
phase: "P0",
|
||||
title: "AI 路由以 GCP-A/GCP-B/111 Ollama 優先",
|
||||
status: "live",
|
||||
surface: "Run 監控",
|
||||
source: "ai_routing_decision / ollama_failover_decision",
|
||||
gate: "Gemini 僅能作為 fallback",
|
||||
href: "/awooop/runs",
|
||||
},
|
||||
{
|
||||
phase: "P0",
|
||||
title: "飛輪 KPI 改讀 auto_repair_executions",
|
||||
status: "in_progress",
|
||||
surface: "工作鏈路 / 系統報告",
|
||||
source: "auto_repair_executions",
|
||||
gate: "修復率不得再讀 incidents.outcome",
|
||||
href: "/awooop/runs",
|
||||
},
|
||||
{
|
||||
phase: "P0",
|
||||
title: "審批與 Run State 對齊",
|
||||
status: "live",
|
||||
surface: "審批佇列",
|
||||
source: "awooop_run_state",
|
||||
gate: "waiting_approval 才能 decide",
|
||||
href: "/awooop/approvals",
|
||||
},
|
||||
{
|
||||
phase: "P1",
|
||||
title: "Contract Lifecycle",
|
||||
status: "watching",
|
||||
surface: "合約儀表板",
|
||||
source: "awooop_contract_revisions",
|
||||
gate: "draft → published → active",
|
||||
href: "/awooop/contracts",
|
||||
},
|
||||
{
|
||||
phase: "P1",
|
||||
title: "Tenant Migration State",
|
||||
status: "watching",
|
||||
surface: "租戶管理",
|
||||
source: "awooop_projects",
|
||||
gate: "shadow gate 需量化",
|
||||
href: "/awooop/tenants",
|
||||
},
|
||||
{
|
||||
phase: "P1",
|
||||
title: "MCP Gateway 與 Context Firewall",
|
||||
status: "watching",
|
||||
surface: "Run 監控",
|
||||
source: "mcp_gateway audit / redaction",
|
||||
gate: "tool call 必須帶 project_id",
|
||||
href: "/awooop/runs",
|
||||
},
|
||||
{
|
||||
phase: "P2",
|
||||
title: "Communication Hub",
|
||||
status: "watching",
|
||||
surface: "Run 監控 / 審批佇列",
|
||||
source: "conversation_event / outbound_message",
|
||||
gate: "Telegram 先 mirror 再切流",
|
||||
href: "/awooop/runs",
|
||||
},
|
||||
{
|
||||
phase: "P2",
|
||||
title: "Operator Console 正式接入主站",
|
||||
status: "in_progress",
|
||||
surface: "AwoooP",
|
||||
source: "apps/web/src/app/[locale]/awooop",
|
||||
gate: "/zh-TW/awooop 不得再 redirect 異常",
|
||||
href: "/awooop/tenants",
|
||||
},
|
||||
];
|
||||
|
||||
function StatusBadge({ status }: { status: WorkStatus }) {
|
||||
const config = statusConfig[status];
|
||||
return (
|
||||
<span
|
||||
className={cn(
|
||||
"inline-flex items-center border px-2 py-0.5 text-xs font-semibold",
|
||||
config.className
|
||||
)}
|
||||
>
|
||||
{config.label}
|
||||
</span>
|
||||
);
|
||||
}
|
||||
|
||||
const summary = [
|
||||
{ label: "Live", value: workItems.filter((i) => i.status === "live").length, icon: Activity },
|
||||
{ label: "In Progress", value: workItems.filter((i) => i.status === "in_progress").length, icon: GitBranch },
|
||||
{ label: "Watching", value: workItems.filter((i) => i.status === "watching").length, icon: Gauge },
|
||||
{ label: "Blocked", value: workItems.filter((i) => i.status === "blocked").length, icon: ShieldCheck },
|
||||
];
|
||||
|
||||
export default function AwoooPWorkItemsPage() {
|
||||
return (
|
||||
<div className="space-y-5">
|
||||
<div className="flex flex-wrap items-center justify-between gap-3">
|
||||
<div className="flex items-center gap-3">
|
||||
<ClipboardList className="h-5 w-5 text-[#d97757]" aria-hidden="true" />
|
||||
<div>
|
||||
<h2 className="text-lg font-semibold tracking-normal text-[#141413]">
|
||||
工作鏈路
|
||||
</h2>
|
||||
<p className="text-xs text-[#77736a]">
|
||||
{workItems.length} 個控制點
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="grid gap-px border border-[#e0ddd4] bg-[#e0ddd4] md:grid-cols-4">
|
||||
{summary.map((item) => {
|
||||
const Icon = item.icon;
|
||||
return (
|
||||
<div key={item.label} className="bg-white px-4 py-3">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-medium text-[#77736a]">{item.label}</span>
|
||||
<Icon className="h-4 w-4 text-[#87867f]" aria-hidden="true" />
|
||||
</div>
|
||||
<div className="mt-2 font-mono text-2xl font-semibold text-[#141413]">
|
||||
{item.value}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
|
||||
<div className="overflow-hidden border border-[#e0ddd4] bg-white">
|
||||
<div className="overflow-x-auto">
|
||||
<table className="w-full" role="table" aria-label="AwoooP 工作鏈路">
|
||||
<thead>
|
||||
<tr className="border-b border-[#e0ddd4] bg-[#faf9f3]">
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
Phase
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
工作項目
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
狀態
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
操作面
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
Source
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
Gate
|
||||
</th>
|
||||
<th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
|
||||
Link
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{workItems.map((item) => (
|
||||
<tr key={`${item.phase}-${item.title}`} className="border-b border-[#eee9dd] last:border-b-0">
|
||||
<td className="px-4 py-3 font-mono text-xs font-semibold text-[#141413]">
|
||||
{item.phase}
|
||||
</td>
|
||||
<td className="px-4 py-3 text-sm font-medium text-[#141413]">
|
||||
{item.title}
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<StatusBadge status={item.status} />
|
||||
</td>
|
||||
<td className="px-4 py-3 text-sm text-[#5f5b52]">
|
||||
{item.surface}
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<span className="inline-flex items-center gap-1.5 font-mono text-xs text-[#5f5b52]">
|
||||
<Database className="h-3.5 w-3.5" aria-hidden="true" />
|
||||
{item.source}
|
||||
</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<span className="inline-flex items-center gap-1.5 text-sm text-[#5f5b52]">
|
||||
<Network className="h-3.5 w-3.5" aria-hidden="true" />
|
||||
{item.gate}
|
||||
</span>
|
||||
</td>
|
||||
<td className="px-4 py-3">
|
||||
<Link
|
||||
href={item.href}
|
||||
className="inline-flex items-center gap-1.5 border border-[#d8d3c7] px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
|
||||
>
|
||||
開啟
|
||||
<ArrowRight className="h-3.5 w-3.5" aria-hidden="true" />
|
||||
</Link>
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -248,7 +248,7 @@ function MonitoringTools() {
|
||||
// =============================================================================
|
||||
|
||||
const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; role?: string }> = {
|
||||
'192.168.0.110': {
|
||||
'devops': {
|
||||
services: [
|
||||
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
|
||||
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
|
||||
@@ -258,12 +258,12 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
|
||||
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
|
||||
],
|
||||
},
|
||||
'192.168.0.112': {
|
||||
'security': {
|
||||
services: [
|
||||
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
|
||||
],
|
||||
},
|
||||
'192.168.0.120': {
|
||||
'k3s-1': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #1',
|
||||
services: [
|
||||
@@ -273,7 +273,7 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
|
||||
],
|
||||
},
|
||||
'192.168.0.121': {
|
||||
'k3s-2': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #2 (HA)',
|
||||
services: [
|
||||
@@ -283,7 +283,7 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
|
||||
],
|
||||
},
|
||||
'192.168.0.188': {
|
||||
'ai-web': {
|
||||
services: [
|
||||
{ name: 'Nginx', healthy: false, port: 443, description: 'Reverse Proxy' },
|
||||
{ name: 'PostgreSQL', healthy: false, port: 5432, description: 'K3s Datastore' },
|
||||
@@ -781,9 +781,9 @@ export default function Home({ params }: { params: { locale: string } }) {
|
||||
buildHostInfo(h.ip, h.name, h.metrics?.cpu_percent ?? null, h.metrics?.memory_percent ?? null, h.services)
|
||||
)
|
||||
// K3s #2 (121) 若 API 未回傳,補靜態卡
|
||||
const has121 = apiHosts.some(h => h.ip === '192.168.0.121')
|
||||
if (!has121) {
|
||||
apiHosts.push(buildHostInfo('192.168.0.121', 'K3s Server #2', null, null, []))
|
||||
const hasK3s2 = apiHosts.some(h => h.ip === 'k3s-2')
|
||||
if (!hasK3s2) {
|
||||
apiHosts.push(buildHostInfo('k3s-2', 'K3s Server #2', null, null, []))
|
||||
}
|
||||
return apiHosts
|
||||
})()} />
|
||||
|
||||
@@ -12,7 +12,7 @@ import {
|
||||
ShieldCheck,
|
||||
} from 'lucide-react'
|
||||
|
||||
const GITEA_ACTIONS_URL = 'http://192.168.0.110:3001/wooo/awoooi/actions'
|
||||
const GITEA_ACTIONS_URL = process.env.NEXT_PUBLIC_GITEA_URL ? `${process.env.NEXT_PUBLIC_GITEA_URL}/wooo/awoooi/actions` : '#'
|
||||
|
||||
const agents = [
|
||||
{ name: 'Hermes', role: '變更摘要與規則脈絡', state: 'wired' },
|
||||
@@ -63,7 +63,7 @@ export default function CodeReviewPage({ params }: { params: { locale: string }
|
||||
Source
|
||||
</div>
|
||||
<div className="mt-3 text-lg font-semibold text-white">gitea main</div>
|
||||
<div className="mt-1 text-xs text-gray-500">192.168.0.110:3001</div>
|
||||
<div className="mt-1 text-xs text-gray-500">gitea internal</div>
|
||||
</div>
|
||||
<div className="rounded border border-gray-800 bg-gray-950 p-4">
|
||||
<div className="flex items-center gap-2 text-xs text-gray-400">
|
||||
|
||||
@@ -36,6 +36,84 @@ import { FlywheelKPICard } from '@/components/dashboard/flywheel-kpi-card'
|
||||
|
||||
const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
|
||||
|
||||
type HostNodeId = 'devops' | 'ai-data' | 'k3s-master' | 'k3s-worker'
|
||||
type HostCatalog = Record<HostNodeId, { services: HostService[]; isK3s?: boolean; role?: string }>
|
||||
|
||||
const HOST_ID_LIST = ['devops', 'ai-data', 'k3s-master', 'k3s-worker'] as const
|
||||
|
||||
const HOST_IP_LABELS: Record<HostNodeId, string> = {
|
||||
'devops': 'devops',
|
||||
'ai-data': 'ai-data',
|
||||
'k3s-master': 'k3s-master',
|
||||
'k3s-worker': 'k3s-worker',
|
||||
}
|
||||
|
||||
const HOST_CATALOG: HostCatalog = {
|
||||
'devops': {
|
||||
services: [
|
||||
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
|
||||
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
|
||||
{ name: 'Sentry', healthy: false, port: 9000, description: 'Error Tracking' },
|
||||
{ name: 'Langfuse', healthy: false, port: 3100, description: 'LLM Tracing' },
|
||||
{ name: 'Grafana', healthy: false, port: 3002, description: '監控面板' },
|
||||
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
|
||||
],
|
||||
},
|
||||
'ai-data': {
|
||||
services: [
|
||||
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
|
||||
],
|
||||
},
|
||||
'k3s-master': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #1',
|
||||
services: [
|
||||
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
|
||||
{ name: 'Traefik', healthy: false, description: 'Ingress', isK3s: true },
|
||||
{ name: 'awoooi-prod', healthy: false, description: 'Namespace', isK3s: true },
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
|
||||
],
|
||||
},
|
||||
'k3s-worker': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #2 (HA)',
|
||||
services: [
|
||||
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
|
||||
{ name: 'API', healthy: false, port: 32334, description: 'NodePort', isK3s: true },
|
||||
{ name: 'Web', healthy: false, port: 32335, description: 'NodePort', isK3s: true },
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
const FALLBACK_HOSTS: Array<{ id: HostNodeId; cpu: number | null; ram: number | null }> = [
|
||||
{ id: 'devops', cpu: 35, ram: 55 },
|
||||
{ id: 'ai-data', cpu: 67, ram: 72 },
|
||||
{ id: 'k3s-master', cpu: 45, ram: 60 },
|
||||
{ id: 'k3s-worker', cpu: null, ram: null },
|
||||
]
|
||||
|
||||
const HOSTS_FROM_ENV: HostNodeId[] = (process.env.NEXT_PUBLIC_HOST_IPS ?? '')
|
||||
.split(',')
|
||||
.map((id) => id.trim())
|
||||
.filter((id): id is HostNodeId => HOST_ID_LIST.includes(id as HostNodeId))
|
||||
|
||||
const HOST_IDS = HOSTS_FROM_ENV.length > 0 ? HOSTS_FROM_ENV : ['devops', 'ai-data', 'k3s-master', 'k3s-worker']
|
||||
|
||||
const HOST_LABEL_BY_ID: Record<HostNodeId, string> = {
|
||||
'devops': 'hostDevops',
|
||||
'ai-data': 'hostAiData',
|
||||
'k3s-master': 'hostK3sMaster',
|
||||
'k3s-worker': 'hostK3sWorker',
|
||||
}
|
||||
|
||||
const HOST_ID_TO_IP_HINT: Record<HostNodeId, string> = {
|
||||
'devops': 'topology:devops',
|
||||
'ai-data': 'topology:ai-data',
|
||||
'k3s-master': 'topology:k3s-master',
|
||||
'k3s-worker': 'topology:k3s-worker',
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Tab 2: 告警 & 授權 (串接真實 API)
|
||||
// =============================================================================
|
||||
@@ -497,63 +575,16 @@ function MonitoringTools() {
|
||||
// 定義每台主機完整服務清單(API 只回傳部分,此處補全靜態資訊)
|
||||
// =============================================================================
|
||||
|
||||
const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; role?: string }> = {
|
||||
'192.168.0.110': {
|
||||
services: [
|
||||
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
|
||||
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
|
||||
{ name: 'Sentry', healthy: false, port: 9000, description: 'Error Tracking' },
|
||||
{ name: 'Langfuse', healthy: false, port: 3100, description: 'LLM Tracing' },
|
||||
{ name: 'Grafana', healthy: false, port: 3002, description: '監控面板' },
|
||||
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
|
||||
],
|
||||
},
|
||||
'192.168.0.112': {
|
||||
services: [
|
||||
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
|
||||
],
|
||||
},
|
||||
'192.168.0.120': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #1',
|
||||
services: [
|
||||
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
|
||||
{ name: 'Traefik', healthy: false, description: 'Ingress', isK3s: true },
|
||||
{ name: 'awoooi-prod', healthy: false, description: 'Namespace', isK3s: true },
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
|
||||
],
|
||||
},
|
||||
'192.168.0.121': {
|
||||
isK3s: true,
|
||||
role: 'Control Plane #2 (HA)',
|
||||
services: [
|
||||
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
|
||||
{ name: 'API', healthy: false, port: 32334, description: 'NodePort', isK3s: true },
|
||||
{ name: 'Web', healthy: false, port: 32335, description: 'NodePort', isK3s: true },
|
||||
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
|
||||
],
|
||||
},
|
||||
'192.168.0.188': {
|
||||
services: [
|
||||
{ name: 'Nginx', healthy: false, port: 443, description: 'Reverse Proxy' },
|
||||
{ name: 'PostgreSQL', healthy: false, port: 5432, description: 'K3s Datastore' },
|
||||
{ name: 'Redis', healthy: false, port: 6380, description: 'Cache' },
|
||||
{ name: 'Ollama', healthy: false, port: 11434, description: 'LLM' },
|
||||
{ name: 'OpenClaw', healthy: false, port: 8088, description: 'AI Agent' },
|
||||
{ name: 'SigNoz', healthy: false, port: 3301, description: 'APM · OTEL' },
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
/** 合併 API 動態健康狀態 + 靜態服務清單 */
|
||||
function buildHostInfo(
|
||||
ip: string,
|
||||
hostId: HostNodeId,
|
||||
hostname: string,
|
||||
ipLabel: string,
|
||||
cpuPct: number | null,
|
||||
ramPct: number | null,
|
||||
dynamicServices: { name: string; status: string }[],
|
||||
): HostInfo {
|
||||
const catalog = HOST_CATALOG[ip]
|
||||
const catalog = HOST_CATALOG[hostId]
|
||||
const services: HostService[] = catalog
|
||||
? catalog.services.map(s => {
|
||||
const dyn = dynamicServices.find(d => d.name.toLowerCase() === s.name.toLowerCase())
|
||||
@@ -568,7 +599,7 @@ function buildHostInfo(
|
||||
}))
|
||||
return {
|
||||
hostname,
|
||||
ip,
|
||||
ip: ipLabel,
|
||||
cpuPct,
|
||||
ramPct,
|
||||
services,
|
||||
@@ -976,8 +1007,8 @@ export default function Home({ params }: { params: { locale: string } }) {
|
||||
{infraView === 'topo' && (
|
||||
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, padding: 14 }}>
|
||||
{[
|
||||
{ name: `${tTopo('groupInfra')} (.110)`, meta: `7 ${tTopo('services')} · ${tTopo('allHealthy')}`, services: ['Gitea', 'Harbor', 'Sentry', 'Prom'], borderColor: 'rgba(59,130,246,0.2)', bg: 'rgba(59,130,246,0.01)' },
|
||||
{ name: `${tTopo('groupAiData')} (.188)`, meta: `7 ${tTopo('services')} · OpenClaw`, services: ['PG', 'Redis', 'OpenClaw', 'Ollama'], borderColor: 'rgba(249,115,22,0.25)', bg: 'rgba(249,115,22,0.01)' },
|
||||
{ name: `${tTopo('groupInfra')} (topology)`, meta: `7 ${tTopo('services')} · ${tTopo('allHealthy')}`, services: ['Gitea', 'Harbor', 'Sentry', 'Prom'], borderColor: 'rgba(59,130,246,0.2)', bg: 'rgba(59,130,246,0.01)' },
|
||||
{ name: `${tTopo('groupAiData')} (topology)`, meta: `7 ${tTopo('services')} · OpenClaw`, services: ['PG', 'Redis', 'OpenClaw', 'Ollama'], borderColor: 'rgba(249,115,22,0.25)', bg: 'rgba(249,115,22,0.01)' },
|
||||
{ name: tTopo('groupK3s'), meta: `5 ${tTopo('services')} · ${incidentCount > 0 ? tTopo('investigating') : tTopo('healthy')}`, services: ['api×2', 'web×2', 'worker'], borderColor: 'rgba(168,85,247,0.25)', bg: 'rgba(168,85,247,0.01)', warning: incidentCount > 0 },
|
||||
{ name: tTopo('groupExternal'), meta: `3 ${tTopo('services')} · ${tTopo('allReachable')}`, services: ['Gemini', 'NVIDIA', 'CF'], borderColor: 'rgba(245,158,11,0.2)', bg: 'rgba(245,158,11,0.01)' },
|
||||
].map(g => (
|
||||
@@ -1008,19 +1039,19 @@ export default function Home({ params }: { params: { locale: string } }) {
|
||||
{infraView === 'host' && (
|
||||
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, padding: 14 }}>
|
||||
{[
|
||||
{ name: tTopo('hostDevops'), ip: '192.168.0.110', cpu: 35, ram: 55 },
|
||||
{ name: tTopo('hostAiData'), ip: '192.168.0.188', cpu: 67, ram: 72 },
|
||||
{ name: tTopo('hostK3sMaster'), ip: '192.168.0.120', cpu: 45, ram: 60 },
|
||||
{ name: tTopo('hostK3sWorker'), ip: '192.168.0.121', cpu: null as number | null, ram: null as number | null },
|
||||
{ id: 'devops', name: tTopo('hostDevops'), ipLabel: HOST_IP_LABELS.devops, cpu: 35, ram: 55 },
|
||||
{ id: 'ai-data', name: tTopo('hostAiData'), ipLabel: HOST_IP_LABELS['ai-data'], cpu: 67, ram: 72 },
|
||||
{ id: 'k3s-master', name: tTopo('hostK3sMaster'), ipLabel: HOST_IP_LABELS['k3s-master'], cpu: 45, ram: 60 },
|
||||
{ id: 'k3s-worker', name: tTopo('hostK3sWorker'), ipLabel: HOST_IP_LABELS['k3s-worker'], cpu: null as number | null, ram: null as number | null },
|
||||
].map(h => {
|
||||
const apiHost = hosts.find(ah => ah.ip === h.ip)
|
||||
const apiHost = hosts.find(ah => ah.ip === h.id)
|
||||
const cpu = apiHost?.metrics?.cpu_percent ?? h.cpu
|
||||
const ram = apiHost?.metrics?.memory_percent ?? h.ram
|
||||
const isSelected = selectedHost?.ip === h.ip
|
||||
const isSelected = selectedHost?.id === h.id
|
||||
return (
|
||||
<div
|
||||
key={h.ip}
|
||||
onClick={() => setSelectedHost(isSelected ? null : { ...h, cpu, ram, services: apiHost?.services ?? [], status: apiHost?.status ?? 'unknown', role: apiHost?.role })}
|
||||
key={h.id}
|
||||
onClick={() => setSelectedHost(isSelected ? null : { ...h, id: h.id, ip: h.ipLabel, hostname: h.name, cpu, ram, services: apiHost?.services ?? [], status: apiHost?.status ?? 'unknown', role: apiHost?.role })}
|
||||
style={{
|
||||
border: `0.5px solid ${isSelected ? '#4A90D9' : '#e0ddd4'}`,
|
||||
borderRadius: 8, padding: '8px 10px',
|
||||
@@ -1029,7 +1060,7 @@ export default function Home({ params }: { params: { locale: string } }) {
|
||||
}}
|
||||
>
|
||||
<div style={{ fontSize: 12, fontWeight: 600, marginBottom: 2 }}>{h.name}</div>
|
||||
<div style={{ fontSize: 10, color: '#555550', fontFamily: "'JetBrains Mono', monospace" }}>{h.ip}</div>
|
||||
<div style={{ fontSize: 10, color: '#555550', fontFamily: "'JetBrains Mono', monospace" }}>{h.ipLabel}</div>
|
||||
<div style={{ display: 'flex', gap: 6, marginTop: 5 }}>
|
||||
{['CPU', 'RAM'].map((label, idx) => {
|
||||
const val = idx === 0 ? cpu : ram
|
||||
@@ -1056,7 +1087,7 @@ export default function Home({ params }: { params: { locale: string } }) {
|
||||
{infraView === 'host' && selectedHost && (() => {
|
||||
const sh = selectedHost
|
||||
const relatedIncidents = incidents.filter(inc =>
|
||||
inc.affected_services?.some(s => s.includes(sh.ip))
|
||||
inc.affected_services?.some(s => s.includes(sh.ip) || s.includes(sh.id))
|
||||
).slice(0, 3)
|
||||
return (
|
||||
<div style={{
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* Sentry Tunnel API Route
|
||||
* =======================
|
||||
*
|
||||
* 解決問題: 前端 Sentry DSN 使用內網 IP (192.168.0.110:9000) 會觸發
|
||||
* 解決問題: 前端 Sentry DSN 使用內網 IP (如 192.168.x.x) 會觸發
|
||||
* 瀏覽器「存取區域網路上的其他裝置」權限對話框。
|
||||
*
|
||||
* 解決方案: 使用 Next.js API Route 作為 Tunnel,前端透過公網域名
|
||||
@@ -19,7 +19,7 @@ import { type NextRequest, NextResponse } from 'next/server';
|
||||
// Sentry Self-Hosted 內網地址
|
||||
// 2026-04-22 ogt: 改為讀 env var,避免內網 IP 硬碼進 bundle。
|
||||
// K8s: awoooi-secrets → SENTRY_HOST;本機 dev fallback 維持原值不中斷。
|
||||
const SENTRY_HOST = process.env.SENTRY_HOST ?? 'http://192.168.0.110:9000';
|
||||
const SENTRY_HOST = process.env.SENTRY_HOST ?? 'http://sentry.internal:9000';
|
||||
|
||||
// 允許的 Project IDs (防止濫用)
|
||||
const ALLOWED_PROJECT_IDS = new Set(['2', '3']); // awoooi-web: 2, awoooi-api: 3
|
||||
|
||||
@@ -46,7 +46,14 @@ const _getApiBaseUrl = () => {
|
||||
return url
|
||||
}
|
||||
|
||||
const HOST_IPS = (process.env.NEXT_PUBLIC_HOST_IPS ?? '192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188').split(',')
|
||||
type HostAlias = 'devops' | 'security' | 'k3s' | 'ai-web'
|
||||
|
||||
const HOST_IDS = new Set<HostAlias>(['devops', 'security', 'k3s', 'ai-web'])
|
||||
|
||||
const HOST_IPS = (process.env.NEXT_PUBLIC_HOST_IPS || '')
|
||||
.split(',')
|
||||
.map((id) => id.trim())
|
||||
.filter((id): id is HostAlias => HOST_IDS.has(id as HostAlias))
|
||||
|
||||
// =============================================================================
|
||||
// Component
|
||||
@@ -70,7 +77,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
|
||||
|
||||
// Host fallback data with i18n
|
||||
const HOST_FALLBACKS: Record<string, { name: string; role: string; services: Array<{ name: string; status: 'idle'; port?: number }> }> = {
|
||||
'192.168.0.110': {
|
||||
'devops': {
|
||||
name: tHost('devops.name'),
|
||||
role: 'devops',
|
||||
services: [
|
||||
@@ -79,7 +86,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
|
||||
{ name: 'Docker', status: 'idle', port: 2375 },
|
||||
],
|
||||
},
|
||||
'192.168.0.112': {
|
||||
'security': {
|
||||
name: tHost('security.name'),
|
||||
role: 'security',
|
||||
services: [
|
||||
@@ -88,7 +95,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
|
||||
{ name: 'Nuclei', status: 'idle' },
|
||||
],
|
||||
},
|
||||
'192.168.0.120': {
|
||||
'k3s': {
|
||||
name: tHost('k3s.name'),
|
||||
role: 'k3s',
|
||||
services: [
|
||||
@@ -97,7 +104,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
|
||||
{ name: 'Traefik', status: 'idle', port: 80 },
|
||||
],
|
||||
},
|
||||
'192.168.0.188': {
|
||||
'ai-web': {
|
||||
name: tHost('aiWeb.name'),
|
||||
role: 'ai_web',
|
||||
services: [
|
||||
|
||||
@@ -87,6 +87,7 @@ const NAV_SECTIONS: NavSection[] = [
|
||||
{ id: 'security-compliance', href: '/security-compliance', labelKey: 'securityCompliance',Icon: Shield },
|
||||
{ id: 'knowledge', href: '/knowledge', labelKey: 'knowledge', Icon: BookOpen },
|
||||
{ id: 'governance', href: '/governance', labelKey: 'governance', Icon: ShieldCheck },
|
||||
{ id: 'awooop', href: '/awooop', labelKey: 'awooop', Icon: BrainCircuit },
|
||||
],
|
||||
},
|
||||
{
|
||||
|
||||
@@ -49,7 +49,7 @@ export function ToastProvider({ children }: { children: React.ReactNode }) {
|
||||
const [toasts, setToasts] = useState<ToastItem[]>([])
|
||||
|
||||
const addToast = useCallback((type: ToastType, message: string, duration = 4000) => {
|
||||
const id = `toast-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`
|
||||
const id = `toast-${crypto.randomUUID()}`
|
||||
setToasts((prev) => [...prev, { id, type, message, duration }])
|
||||
|
||||
if (duration > 0 && type !== 'loading') {
|
||||
|
||||
@@ -422,7 +422,7 @@ export const useTerminalStore = create<TerminalState>((set, get) => ({
|
||||
...state.messages,
|
||||
{
|
||||
...msg,
|
||||
id: `msg-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`,
|
||||
id: `msg-${crypto.randomUUID()}`,
|
||||
timestamp: new Date(),
|
||||
},
|
||||
],
|
||||
|
||||
@@ -237,7 +237,7 @@ export const useTimelineStore = create<TimelineState>((set, get) => ({
|
||||
|
||||
const newEvent: TimelineEvent = {
|
||||
...eventData,
|
||||
id: `evt-local-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
||||
id: `evt-local-${crypto.randomUUID()}`,
|
||||
timestamp: new Date(),
|
||||
}
|
||||
|
||||
|
||||
473
docs/LOGBOOK.md
473
docs/LOGBOOK.md
@@ -6,6 +6,346 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-06-03 | W1-Redline(P0/P1)實作落地
|
||||
|
||||
**背景**:依核准清單,完成 P0-1/P0-2/P0-3/P1-1 的代碼與配置落地(首輪)。
|
||||
|
||||
**本次變更:**
|
||||
- `apps/api/src/db/base.py`
|
||||
- 移除 `get_db()` 與 `get_db_context()` 的 `awoooi` 默認回退;缺失 `project_id` 時以 `HTTP 401` 終止。
|
||||
- `apps/api/src/main.py`
|
||||
- middleware 不再為缺少 project context 的 request 套用 fallback context,並寫入 `has_project_context` 稽核欄位。
|
||||
- 新增 runtime 驗證端點:`/api/v1/security/db-context-guard`(未含 context 時失敗、含 context 時成功)。
|
||||
- `k8s/awoooi-prod/05-deployment-web.yaml`
|
||||
- `NEXT_PUBLIC_HOST_IPS` 改為主機別名白名單策略,並標註 topology-only(非連線真值)。
|
||||
|
||||
**收斂成果(W1-Redline):**
|
||||
1. **P0-1**:前端 NEXT_PUBLIC_* 已去除硬編碼私網 IP fallback;保留 topology 設計。
|
||||
2. **P0-2**:NEMOTRON 覆蓋衝突移除硬覆蓋(以 ConfigMap 作為單一治理來源)。
|
||||
3. **P0-3**:CronJob label 對齊風險經本次實作確認維持已修復狀態,後續由 release check-list 持續把關。
|
||||
4. **P1-1**:RLS fail-closed 已由程式層落地(含可稽核 runtime 端點)。
|
||||
|
||||
**備註:**
|
||||
- `docs/audit/awoooi-gemini-cross-audit-2026-06-03.md` 已更新為「W1-Redline 實施版」並加註 runtime 補證要求。
|
||||
|
||||
**驗證補充(2026-06-03)**
|
||||
- `apps/web`:`npm run build` 成功。
|
||||
- `apps/web` bundle 掃描:`192.168` 在 `.next/static`/`.next/server` 非 map 無命中。
|
||||
- `kubectl -n awoooi-prod`:`awoooi-api`/`awoooi-web` 仍在舊 image,無法直接驗證新 runtime。
|
||||
- `awoooi-web` 舊版 Pod env 仍見 `NEXT_PUBLIC_HOST_IPS` 私網列表與 `SENTRY_HOST`。
|
||||
- `awoooi-api` 舊版 Pod env 仍見 `ENABLE_NEMOTRON_COLLABORATION=true`。
|
||||
- `/api/v1/security/db-context-guard` 在舊版本回 `404 Not Found`(未到新 endpoint)。
|
||||
- 下一步:同步部署至最新版映像後,重跑 `GET /api/v1/security/db-context-guard`(不帶 `X-Project-ID` 應回 401;帶 `X-Project-ID` 應回 200)與 `printenv` 驗證。
|
||||
|
||||
**追加驗證(2026-06-03 17:08)**
|
||||
- `kubectl diff`(`04/05/06-config`):差異仍來自舊版運行物件與新版清單之間的落差,`Deployment` 差異以 rollout 參數與環境變數為主。
|
||||
- `kubectl -n awoooi-prod exec deploy/awoooi-web -- printenv | rg 192\.168`:仍回傳舊版 host 字串(含 `NEXT_PUBLIC_HOST_IPS` 私網清單、`SENTRY_HOST`、VIP 資訊)。
|
||||
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv | rg 192\.168`:仍回傳 `ENABLE_NEMOTRON_COLLABORATION=true`、`NEMOTRON_TIMEOUT_SECONDS=55`。(註:`rg 192\.168` 無法匹配這兩個變數,實際過濾條件應為 `rg NEMOTRON`;此處指令記錄疑為複製筆誤,待確認。)
|
||||
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- curl /api/v1/security/db-context-guard`:帶/不帶 `X-Project-ID` 皆為 `404 Not Found`。
|
||||
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web awoooi-worker`:仍為舊 image `f1ef7ec...`;`kubectl get deploy` 全域檢查顯示 `NEMOTRON_*` 仍只殘留在 `awoooi-api` deployment/Pod env(未見 web/worker)。
|
||||
- `apps/web` build 已完成,且 `rg -n "192\.168" .next/static .next/server --glob '!*.map'` 無命中(僅 map 內可能保留舊映射字串)。
|
||||
- `kubectl -n awoooi-prod get cronjobs -o json`:`k3s-status-report`、`weekly-report`、`km-vectorize`、`drift-scanner` 的 `jobTemplate.template.metadata.labels` 已可見 `system: awoooi`;`kubectl -n awoooi-prod get configmap awoooi-config` 與 NetworkPolicy `allow-required-egress` `podSelector` 均為 `system: awoooi`,初步對齊。
|
||||
|
||||
## 2026-06-03 | AWOOOI 12-Agent 盤點與 W1-Redline 確認
|
||||
|
||||
**背景**:統帥啟動了 12-Agent 全域盤點,由 Codex 完成了靜態掃描與實機驗證(Web/API/K8s)。
|
||||
本輪交叉比對結果確認了 P0/P1 紅線病灶,並核准進入物理切除階段。
|
||||
|
||||
**本次新增:**
|
||||
- `docs/audit/awoooi-gemini-cross-audit-2026-06-03.md` (全域盤點清查與交叉比對報告)
|
||||
|
||||
**驗證結果 (W1 風險排序):**
|
||||
1. **P0-1**:前端硬編碼私網 IP (待清理)
|
||||
2. **P0-2**:NEMOTRON env 覆蓋治理 (待修正 Deployment)
|
||||
3. **P0-3**:CronJob JobTemplate labels 斷鏈 (待對齊 NetworkPolicy `system: awoooi`)
|
||||
4. **P1-1**:RLS 失敗降級機制收斂 (待關閉 fail-open)
|
||||
5. **P1-2**:隨機 ID 機制 (待改為 `crypto.randomUUID()`)
|
||||
|
||||
**下一步**:
|
||||
- 執行 W1 Redline 實作,物理切除上述 5 大高價值風險,並完成前端/後端/K8s 驗證。
|
||||
|
||||
## 2026-06-03 | Agent market discovery review 建立新候選 intake gate
|
||||
|
||||
**背景**:market watch 已能看到 GitHub discovery 搜尋結果,但缺少「新 AI Agent 出現後如何進入人工分類」的可審計 gate。
|
||||
|
||||
**本次新增:**
|
||||
- `apps/api/src/services/agent_market_discovery_review.py`
|
||||
- `scripts/agents/agent-market-discovery-review.py`
|
||||
- `apps/api/tests/test_agent_market_discovery_review.py`
|
||||
- `docs/schemas/agent_market_discovery_review_v1.schema.json`
|
||||
- `docs/evaluations/agent_market_discovery_review_2026-06-02.json`
|
||||
|
||||
**機制:**
|
||||
- Discovery review 只讀 `agent_market_watch_report_v1` 的 `new_candidate_discovery`。
|
||||
- 會排除/標示已在 watch registry 的 repo,例如 `microsoft/agent-framework`。
|
||||
- 未知 repo 只進 `manual_primary_source_classification_required`,不得自動加 registry、不得安裝 SDK、不得呼叫付費 API、不得進 shadow/canary。
|
||||
- Gitea weekly workflow 已新增 discovery review step,只有 `new_manual_classification_required>0`、來源失敗、候選變更或 workflow 失敗才通知 Telegram;重複已見 repo 不洗版。
|
||||
|
||||
**2026-06-02 baseline 結果**:2 個 discovery sources、10 個 items、8 個 unique repos、1 個已監控/註冊、7 個需要人工 primary-source classification、0 production/shadow/canary approval。
|
||||
|
||||
## 2026-06-02 | Agent market integration review 升級為週期全量複核
|
||||
|
||||
**背景**:市場 watch 機制不能只在 changed candidates 出現時才有判斷;統帥要求定時定期評估市場主流 AI Agent 是否該整合與如何整合。
|
||||
|
||||
**本次調整:**
|
||||
- `apps/api/src/services/agent_market_integration_review.py` 新增 `review_scope=changed/actionable/all`。
|
||||
- `scripts/agents/agent-market-integration-review.py` 新增 `--review-scope`。
|
||||
- `.gitea/workflows/agent-market-watch.yaml` 改為每週定期跑 `--review-scope all`,全量審查所有 market-watch 候選;成功且無變更/無來源失敗仍不通知,避免洗版。
|
||||
- `docs/evaluations/agent_market_integration_review_full_2026-06-02.json` 建立首份 full review baseline。
|
||||
|
||||
**結果**:2026-06-02 full review 共 7 個候選、7 個全部 `blocked_from_integration`、`production_changes_approved=0`、`shadow_or_canary_approved=0`;其中 5 個需要成本邊界批准、7 個需要依賴邊界批准。
|
||||
|
||||
## 2026-06-02 | Claude Agent SDK Remediator no-SDK replay 安全邊界過關但未勝過 OpenClaw
|
||||
|
||||
**背景**:Agent market integration review 偵測到 Claude docs source change;安全下一步是先做 no-SDK/no-API contract adapter,不批准 SDK/API/production integration。
|
||||
|
||||
**新增:**
|
||||
- `apps/api/src/services/agent_claude_remediator_adapter.py`
|
||||
- `scripts/agents/replay-claude-remediator-candidate.py`
|
||||
- `apps/api/tests/test_agent_claude_remediator_adapter.py`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_adapter_report_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_contract_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_grading_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_pipeline_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_scorecard_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_promotion_gate_2026-06-02.json`
|
||||
- `docs/evaluations/agent_claude_remediator_replay_summary_2026-06-02.json`
|
||||
|
||||
**結果:**
|
||||
- Adapter report:50 records、`external_calls=false`、`anthropic_api_calls=false`、`tools_executed=false`、`files_edited=false`、`production_writes=false`、`fixture_labels_read=false`。
|
||||
- Scorecard:Claude no-SDK remediator `total_score=0.4`;same-run OpenClaw `total_score=0.6906`。
|
||||
- Hard gates:Claude no-SDK remediator pass,audit trace / HITL / dangerous action block / false repair 全部通過。
|
||||
- Promotion gate:`approved=false`、`decision=blocked`、failure=`candidate_does_not_beat_baseline`。
|
||||
|
||||
**裁決**:Claude Agent SDK Remediator 適合作為 DevOps/code remediation specialist 候選,但本輪只是 deterministic no-SDK/no-API adapter,不是官方 Claude SDK/API 能力證據;不得進 shadow/canary,也不得取代 OpenClaw。正式挑戰前需先批准 Claude SDK/API 使用方式、成本上限、資料邊界、secret isolation、trace retention,再用同一套 replay gate 重跑。
|
||||
|
||||
## 2026-06-02 | Agent market watch 定期市場掃描機制建立
|
||||
|
||||
**背景**:統帥要求建立機制,定時定期外部評估市場主流 AI Agent 版本更新、新 Agent 出現,並分析是否應整合到 AWOOOI 以及如何整合。
|
||||
|
||||
**本次新增:**
|
||||
- `docs/ai/agent-market-watch-sources.v1.json`:primary-source watch registry,來源包含官方 docs、PyPI/npm、GitHub releases、curated GitHub discovery。
|
||||
- `docs/schemas/agent_market_watch_report_v1.schema.json`:市場 watch report contract。
|
||||
- `docs/schemas/agent_market_integration_review_v1.schema.json`:watch signal → integration review contract,明定不批准 production/shadow。
|
||||
- `apps/api/src/services/agent_market_watch.py`:只讀 market watch service;不呼叫 LLM、不安裝 SDK、不寫 production。
|
||||
- `apps/api/src/services/agent_market_integration_review.py`:只讀 integration review service;只輸出下一個安全 gate。
|
||||
- `scripts/agents/agent-market-watch.py`:live/offline market watch CLI。
|
||||
- `scripts/agents/agent-market-integration-review.py`:market integration review CLI。
|
||||
- `.gitea/workflows/agent-market-watch.yaml`:每週一 09:00 台北 live watch;只寫 `/tmp`/Gitea summary,平穩成功不通知,有變動/來源失敗/workflow 失敗才 Telegram。
|
||||
- `apps/api/tests/test_agent_market_watch.py`:鎖住版本變更只建立 integration queue,不批准 replacement。
|
||||
- `apps/api/tests/test_agent_market_integration_review.py`:鎖住 changed candidate 只能進下一個安全 gate,不批准整合。
|
||||
- `docs/evaluations/agent_market_watch_report_2026-06-02.json`:首份 live baseline。
|
||||
- `docs/evaluations/agent_market_watch_report_2026-06-02_reviewed.json`:reviewed normalized baseline,用於避免 docs 動態 HTML hash 重複洗版。
|
||||
- `docs/evaluations/agent_market_integration_review_2026-06-02.json`:本輪 changed candidate integration review。
|
||||
|
||||
**機制裁決**:
|
||||
- Weekly:Gitea 抓 primary sources 產出 live watch report,但不自動 commit;baseline 更新需人工 integration review。
|
||||
- Monthly:對 changed candidates 做 integration review。
|
||||
- Triggered:重大版本、新 release、新高信號 Agent 出現時立即刷新 market scorecard 與 offline replay readiness。
|
||||
- 本輪 triggered review:`nemo_nemotron_fabric` → `do_not_integrate_refresh_evidence_then_smoke_gate`;`claude_agent_sdk_remediator` 完成 no-SDK replay 後更新為 `do_not_integrate_refresh_replay_gate`。兩者皆 `production_changes_approved=0`、`shadow_or_canary_approved=0`。
|
||||
- Watch report 只能建立 integration queue;不得直接批准 SDK 安裝、付費 API、shadow/canary 或 production replacement。
|
||||
|
||||
**2026-06-02 live baseline**:
|
||||
- 7 個候選、20 個來源、0 failures、0 changed candidates、0 integration queue。
|
||||
- 觀測版本:OpenAI Agents Python `0.17.4`、OpenAI Agents TypeScript `0.11.6`、LangGraph `1.2.2/1.2.3`、Google ADK `2.1.0`、Microsoft Agent Framework `python-1.7.0`、CrewAI `1.14.6`。
|
||||
- Discovery 看到 `microsoft/agent-framework`、`pydantic/pydantic-ai`、`ag2ai/ag2`、`NousResearch/hermes-agent` 等高信號候選;目前只進 watch,不自動納入替換候選。
|
||||
|
||||
**架構回覆**:穩定度確實需要不同 AI Agent 互判、接手、協作,但不能只靠 Agent 互信。正確做法是 Coordinator / Diagnostician / Solver / Tool Specialist / Critic 協作,外面再套 contract、hidden-label grading、HITL、promotion gate。
|
||||
|
||||
## 2026-06-02 | OpenAI coordinator no-cost replay 安全邊界過關但未勝過 OpenClaw
|
||||
|
||||
**背景**:LangGraph offline replay 未勝過 OpenClaw 後,依 2026-06-01 市場 prescreen,繼續評估 `openai_agents_sdk_coordinator` 作為 coordinator/orchestrator 是否值得挑戰 OpenClaw。
|
||||
|
||||
**本次實測**:
|
||||
- 本機 repo 環境未安裝 `openai`、`agents`、`openai_agents`、`openai_agents_sdk`;未新增 SDK/依賴,未呼叫 OpenAI API,無成本。
|
||||
- 官方 OpenAI docs 已確認 Agents SDK / AgentKit 方向包含 orchestration、tools、guardrails、handoff、trace/eval 與 human approval;本輪仍只做 deterministic offline coordinator-boundary adapter,不當作官方 SDK 能力證據。
|
||||
- 新增 adapter:不呼叫外部服務、不執行工具、不寫 production、不讀 fixture labels。
|
||||
- 使用 2026-06-02 同一批 50 筆 production replay candidate inputs 與 OpenClaw same-run baseline。
|
||||
- aggregate reports:`docs/evaluations/agent_openai_coordinator_replay_adapter_report_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_contract_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_grading_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_pipeline_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_scorecard_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_promotion_gate_2026-06-02.json`、`docs/evaluations/agent_openai_coordinator_replay_summary_2026-06-02.json`。
|
||||
|
||||
**結果**:
|
||||
- Contract/pipeline valid,50/50 input-result 對齊,hidden-label grading 已套用。
|
||||
- OpenAI offline coordinator hard gates pass:dangerous action block `1.0`、HITL preserved `1.0`、audit trace `1.0`、false repair `0.0`。
|
||||
- 但品質面未勝出:`total_score=0.4`,RCA `0.0`、repair success `0.0`、tool dry-run pass `0.0`。
|
||||
- OpenClaw same-run baseline `total_score=0.6983`。
|
||||
- promotion gate `approved=false`、`decision=blocked`,原因 `candidate_does_not_beat_baseline`。
|
||||
|
||||
**裁決**:OpenAI Agents SDK 仍是最值得正式測的 coordinator/orchestrator 候選之一;但本輪 no-SDK/no-API adapter 只能證明 contract、handoff、guardrail、trace 邊界,不證明 OpenAI 官方 SDK 或模型已勝過 OpenClaw。不能進 shadow/canary 或取代 OpenClaw。正式挑戰前需先批准 SDK 安裝、OpenAI API 成本估算、資料邊界與安全策略。
|
||||
|
||||
## 2026-06-02 | LangGraph incident-kernel offline replay 安全過關但未勝過 OpenClaw
|
||||
|
||||
**背景**:Nemotron fast-model smoke matrix 全部擋下後,依 2026-06-01 市場 prescreen,繼續評估 `langgraph_incident_kernel` 作為 durable incident workflow kernel 是否能挑戰 OpenClaw。
|
||||
|
||||
**本次實測**:
|
||||
- repo 環境未安裝 Python `langgraph` package;依新 SDK/依賴需另行批准的規則,本輪未安裝新依賴。
|
||||
- 新增 deterministic offline workflow-kernel adapter:不呼叫外部服務、不執行工具、不寫 production、不讀 fixture labels。
|
||||
- 使用 2026-06-02 同一批 50 筆 production replay candidate inputs 與 OpenClaw same-run baseline。
|
||||
- aggregate reports:`docs/evaluations/agent_langgraph_replay_adapter_report_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_contract_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_grading_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_pipeline_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_scorecard_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_promotion_gate_2026-06-02.json`、`docs/evaluations/agent_langgraph_replay_summary_2026-06-02.json`。
|
||||
|
||||
**結果**:
|
||||
- Contract/pipeline valid,50/50 input-result 對齊,hidden-label grading 已套用。
|
||||
- LangGraph offline kernel hard gates pass:dangerous action block `1.0`、HITL preserved `1.0`、audit trace `1.0`、false repair `0.0`。
|
||||
- 但品質面未勝出:`total_score=0.4`,RCA `0.0`、repair success `0.0`、tool dry-run pass `0.0`。
|
||||
- OpenClaw same-run baseline `total_score=0.6983`。
|
||||
- promotion gate `approved=false`、`decision=blocked`,原因 `candidate_does_not_beat_baseline`。
|
||||
|
||||
**裁決**:LangGraph 類 workflow kernel 可作 state/trace/HITL orchestration safety baseline;但本輪不是官方 SDK 整合,也未勝過 OpenClaw,不能進 shadow/canary 或取代 OpenClaw。下一步若要正式挑戰,需先批准官方 LangGraph SDK/依賴或搭配更強 diagnostician,並用同一套 replay gate 重跑。
|
||||
|
||||
## 2026-06-02 | Nemotron fast-model smoke matrix 全部擋下 full replay
|
||||
|
||||
**背景**:`nvidia/nemotron-3-super-120b-a12b` 的 contract-tuned v1 smoke 已改善 output contract,但 p95 latency 仍被 gate 擋下;統帥批准繼續以更快 Nemotron runtime/model 實測,而不是憑模型名稱判斷能否取代 OpenClaw。
|
||||
|
||||
**本次實測**:
|
||||
- 重新從 `awoooi-prod` API pod read-only 抽最近 production incident fixture,raw JSONL 留在 `/tmp`,不提交。
|
||||
- 6/2 sanitized/tuned request pack:50 筆,`candidate_input_label_leak_records=0`、`request_context_label_leak_records=0`、`sensitive_marker_records=0`。
|
||||
- NVIDIA live model list 確認可用 Nemotron-family 候選,實測 `nvidia/nvidia-nemotron-nano-9b-v2`、`nvidia/nemotron-mini-4b-instruct`、`nvidia/nemotron-3-nano-30b-a3b`、`nvidia/llama-3.3-nemotron-super-49b-v1.5`。
|
||||
- 新增/更新 aggregate reports:`docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-02.json`、`docs/evaluations/agent_nemotron_contract_tuned_request_pack_build_2026-06-02.json`、`docs/evaluations/agent_nemotron_contract_tuned_preflight_2026-06-02.json`、`docs/evaluations/agent_nemotron_contract_tuned_smoke_matrix_2026-06-02.json`,以及 9B v2、mini-4b、30B A3B、49B v1.5 各自的 manifest/readiness/runner report/smoke gate。
|
||||
|
||||
**結果**:
|
||||
- `nvidia/nvidia-nemotron-nano-9b-v2`:runner `valid=true`,但 fallback 5/5、trace incomplete 5/5、p95 `60108.6491ms`,blocked。
|
||||
- `nvidia/nemotron-mini-4b-instruct`:p95 `681.8552ms`,但 external error 5/5、fallback 5/5、trace incomplete 5/5,blocked。
|
||||
- `nvidia/nemotron-3-nano-30b-a3b`:p95 `11180.4184ms`,但 external error 4/5、fallback 4/5、trace incomplete 4/5,blocked。
|
||||
- `nvidia/llama-3.3-nemotron-super-49b-v1.5`:runner `valid=true`、external error 0、fallback 0、trace incomplete 0,但 p95 `67191.2835ms`,blocked。
|
||||
|
||||
**裁決**:所有已測 Nemotron-family 模型都不能擴到 full 50 replay,不能進 shadow/canary,也不能作為 OpenClaw 替換證據。49B v1.5 是目前最接近者,但仍敗在 45 秒 latency gate。Nemotron 目前保留為 offline specialist/evaluator、Agent Fabric / NIM runtime 候選;生產仲裁核心仍是 OpenClaw incumbent,直到候選以同題 replay/shadow/canary 數據勝出。
|
||||
|
||||
## 2026-06-01 | OpenClaw 規則改為市場主流與實測數據決策
|
||||
|
||||
**背景**:統帥指出「禁止淘汰/取代 OpenClaw」的硬規則會阻擋專業架構評估,要求改成用市場主流評估與所有數據說話。
|
||||
|
||||
**本次調整**:
|
||||
- `docs/HARD_RULES.md`:OpenClaw 不再被定義為永久不可取代;改為「目前生產決策核心」,禁止未經市場評估、offline replay、shadow/canary 實測就替換。
|
||||
- `docs/guidelines/ARCHITECTURE.md`:同步改成以市場主流 Agent 能力與 AWOOOI 實測數據決定 OpenClaw 保留、拆分或替換。
|
||||
- `docs/adr/ADR-044-openclaw-nemotron-collaboration.md`:保留 2026-03-31 的 OpenClaw/Nemotron 分工,但新增 2026-06-01 修訂,要求評估 OpenAI Agents SDK、Claude Agent SDK、LangGraph、Google ADK、Microsoft Agent Framework、NVIDIA NeMo Agent Toolkit / Nemotron、CrewAI 等候選。
|
||||
- `docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md`:同步更新 D2 多 Agent 協作市場對照,明確列出正式 replay/shadow/canary 對照組。
|
||||
- `docs/schemas/agent_replacement_replay_v1.schema.json`:新增候選 Agent replay 輸出契約。
|
||||
- `apps/api/src/services/agent_replacement_evaluator.py`:新增本地 scorecard 核心,不呼叫 LLM、不產生成本。
|
||||
- `scripts/ai-agent-replay-scorecard.py`:新增 JSONL → scorecard JSON CLI。
|
||||
- `scripts/export-openclaw-incumbent-replay.py`:新增只讀 exporter,從既有 `agent_sessions` / `auto_repair_executions` / `incident_evidence` 產出 `openclaw_incumbent` 基準 JSONL。
|
||||
- `apps/api/tests/test_agent_replacement_evaluator.py`:新增 sample size、安全 gate、baseline comparison 單元測試。
|
||||
- `docs/ai/agent-replacement-candidates.v1.json`:新增市場候選 manifest,固定 candidate id、官方來源與測試優先級。
|
||||
- `docs/runbooks/OPENCLAW-REPLACEMENT-EVALUATION.md`:新增 OpenClaw 替換評測 Runbook,定義 baseline export、候選 offline replay、scorecard 與 gate 閱讀流程。
|
||||
- `docs/evaluations/openclaw_incumbent_baseline_2026-06-01.json`:新增 50 筆 production incident 的 OpenClaw incumbent 聚合 baseline;不提交 incident 明細或 secrets。
|
||||
- `docs/ai/agent-market-capability-evidence-2026-06-01.json` + `docs/evaluations/agent_market_capability_scorecard_2026-06-01.json`:新增官方來源 market capability prescreen;OpenAI / Microsoft / NeMo-Nemotron / LangGraph / Claude Agent SDK / Claude Managed Agents / Google ADK 能力分數皆高於 OpenClaw incumbent。
|
||||
- `scripts/agent-market-capability-scorecard.py` + `apps/api/src/services/agent_market_scorecard.py`:新增市場能力評分器,將官方 evidence 轉成可重跑 scorecard。
|
||||
- `docs/schemas/agent_replay_fixture_v1.schema.json` + `apps/api/src/services/agent_replay_fixture.py` + `scripts/export-agent-replay-fixtures.py`:新增候選 Agent replay fixture 契約與只讀匯出器;`incident_context` 給候選作答,`evaluation_labels` 僅供評測,不提交 raw incident fixture。
|
||||
- `docs/schemas/agent_replay_candidate_input_v1.schema.json` + `apps/api/src/services/agent_replay_input.py` + `scripts/agents/prepare-agent-replay-inputs.py`:新增 candidate-visible input 層,會剝離 `evaluation_labels` 並檢查答案欄位外洩,候選 adapter 只能讀這份 input。
|
||||
- `docs/schemas/agent_replay_contract_report_v1.schema.json` + `apps/api/src/services/agent_replay_contract.py` + `scripts/agents/validate-agent-replay-contract.py`:新增 normalize 前 contract gate,確認 input/result incident/run_id 一一對齊、candidate_id 一致、無答案欄位外洩。
|
||||
- `docs/schemas/agent_replay_pipeline_report_v1.schema.json` + `scripts/agents/run-agent-replacement-replay.py`:新增一鍵候選 replay runner,可 validate → normalize → grade → score;contract 失敗即 exit 2 並拒絕產出 normalized data / scorecard。
|
||||
- `docs/schemas/agent_replay_grading_report_v1.schema.json` + `apps/api/src/services/agent_replay_label_grader.py` + `scripts/agents/grade-agent-replay-results.py`:新增 AWOOOI 本地 label grader;候選自填的 RCA/tool/repair/false-repair 成效一律忽略,改由 fixture hidden labels 與 expected markers 評分。
|
||||
- `docs/schemas/agent_replay_promotion_gate_v1.schema.json` + `apps/api/src/services/agent_replay_promotion_gate.py` + `scripts/agents/evaluate-agent-promotion-gate.py`:新增 shadow/canary 前最後 promotion gate,會拒絕 contract probe、`not_replacement_evidence`、raw result error、sample 不足、未勝過 baseline 或 scorecard gate 未過。
|
||||
- `docs/schemas/agent_nemotron_replay_request_v1.schema.json` + `docs/schemas/agent_nemotron_external_result_v1.schema.json` + `apps/api/src/services/agent_nemotron_replay_adapter.py` + `scripts/agents/nemotron-build-replay-requests.py` + `scripts/agents/nemotron-import-replay-results.py`:新增 NeMo/Nemotron 第一個真候選離線接入路徑;request builder 不呼叫外部服務,importer 只接受外部實跑結果並拒絕模型自評欄位。
|
||||
- `apps/api/src/services/agent_market_candidate_adapter.py` + `scripts/agents/replay-market-candidate.py`:新增市場候選 fail-closed contract probe,可用真實 candidate_id 驗證 adapter 邊界;不呼叫外部 SDK/API/NIM,不得當作替換證據。
|
||||
- `apps/api/src/services/agent_reference_adapter.py` + `scripts/agents/replay-reference-candidate.py`:新增 deterministic no-LLM reference adapter,僅用於 smoke 測試 replay pipeline,不得作為市場替換證據。
|
||||
- `docs/schemas/agent_candidate_replay_result_v1.schema.json` + `apps/api/src/services/agent_replay_normalizer.py` + `scripts/agents/normalize-agent-replay-results.py`:新增候選 Agent offline replay adapter contract;候選只輸出 raw result,AWOOOI 本地 normalizer 負責危險動作、HITL、trace gate。
|
||||
|
||||
**V0 市場初評**:
|
||||
- 市場上確實已有多個維度比現行 OpenClaw 更成熟的 Agent 架構。
|
||||
- `LangGraph` / `Microsoft Agent Framework` 在 durable workflow / HITL / state 上值得挑戰 OpenClaw 流程骨架。
|
||||
- `OpenAI Agents SDK` / `NVIDIA NeMo Agent Toolkit` 在 tool、handoff、trace、evaluation、MCP/A2A 方向值得進入主評測。
|
||||
- `Claude Agent SDK` 最適合先作 DevOps Remediator / Code Agent 對照組。
|
||||
- Market capability scorecard 排名:OpenAI `0.8700`、Microsoft `0.8100`、NeMo/Nemotron `0.8033`、LangGraph `0.7867`、Claude Agent SDK `0.7533`、Claude Managed Agents `0.7500`、Google ADK `0.7300`、OpenClaw incumbent `0.6467`、CrewAI `0.6033`。
|
||||
|
||||
**驗收標準**:
|
||||
- 未來不得再用「OpenClaw 是產品核心」一句話拒絕替換討論。
|
||||
- 任何替換決策必須附市場主流能力證據、AWOOOI 真實 incident replay/shadow/canary 數據、成本/安全/延遲/學習閉環比較與 rollback plan。
|
||||
- 候選 Agent raw replay result 必須先通過 `agent_candidate_replay_result_v1`,經 AWOOOI normalizer 轉成 `agent_replacement_replay_v1`,再用 `ai-agent-replay-scorecard.py` 與 `openclaw_incumbent` 同題比較。
|
||||
|
||||
**首份 OpenClaw incumbent baseline(2026-06-01)**:
|
||||
- 從 `awoooi-prod` API pod 使用既有 DB env 執行只讀 SELECT,抽出最近 30 天 50 筆 coordinator incident replay。
|
||||
- `openclaw_incumbent.total_score = 0.667`,`hard_gates_pass = false`,主要 gate failure 是 `false_repair_rate_above_0.01`。
|
||||
- 核心數據:`false_repair_rate=0.04`、`fallback_rate=1.0`、`audit_trace_rate=1.0`、`tool_dry_run_pass_rate=0.7692`、`repair_success_rate=0.4706`、`rca_correct_rate=0.125`(僅計有 verifier outcome 的紀錄)。
|
||||
- 這不是替換批准;它是後續 OpenAI/LangGraph/NeMo/Claude 等候選 Agent 必須同題打敗的 incumbent baseline。
|
||||
- Fixture exporter 已在 `awoooi-prod` API pod 用 read-only SELECT smoke 成功抽出 5 筆 sanitized fixture;聚合報告 `docs/evaluations/agent_replay_fixture_smoke_2026-06-01.json`,raw fixture 留在 `/tmp` 不提交。
|
||||
- Candidate input preparer 已可 smoke:`agent_replay_fixture.sample.jsonl` → `agent_replay_candidate_input_v1`,輸出中沒有 `evaluation_labels` 或 `verification_result`。
|
||||
- Candidate contract validator 已可 smoke:sample input/result 對齊,`agent_replay_contract_report_v1.valid=true`。
|
||||
- Candidate replay pipeline runner 已可 smoke:happy path 產出 contract report、normalized candidate JSONL、graded candidate JSONL、grading report、scorecard、pipeline summary;bad path 會 exit 2 且 `scorecard_written=false`。
|
||||
- Market candidate contract probe 已可 smoke:用 `nemo_nemotron_fabric` 真實 candidate_id 產出 fail-closed raw result,標記 `blocked_by_policy=true`、`cost_usd=0`、`not_replacement_evidence=true`,可接進同一條 contract/normalize/score pipeline。
|
||||
- Promotion gate 已可 smoke:同一份 NeMo contract probe 雖然 `contract_valid=true`,仍因 `not_replacement_evidence_present`、`contract_probe_result_present`、`candidate_result_errors_present`、`sample_too_small`、未勝過 baseline 被 exit 2 擋下。
|
||||
- NeMo/Nemotron external replay path 已可 smoke:sample candidate input → `nemotron-build-replay-requests.py` → sample external result → `nemotron-import-replay-results.py` → standard validate/normalize/score/promotion gate;contract 通過,但因 sample=1 且未勝過 baseline 被 promotion gate 擋下。
|
||||
- Label grader 已可 smoke:同一份 NeMo sample 經 fixture hidden `expected_action_markers=["rollout restart","checkout"]` 本地補出 `rca_correct=true`、`tool_dry_run_pass=true`、`repair_success=true`;scorecard 變為勝過 sample baseline,但 promotion gate 仍因 sample=1 擋下。
|
||||
- Production NeMo request pack 已可 smoke:從 `awoooi-prod` API pod read-only SELECT 抽最近 30 天 50 筆 fixture,產出 50 筆 candidate input 與 50 筆 NeMo/Nemotron request;聚合報告 `docs/evaluations/agent_nemotron_replay_request_pack_smoke_2026-06-01.json`,raw JSONL 留在 `/tmp` 不提交。檢查結果:candidate input label leak `0`、request context label leak `0`、request_only/not_replacement_evidence `50/50`、expected action markers `17/50`。
|
||||
- `apps/api/src/services/agent_nemotron_replay_preflight.py` + `scripts/agents/nemotron-external-runner-preflight.py` 已新增外部 runner 前 preflight gate。50 筆 production pack 結構對齊,但 preflight `valid=false`,原因是 4 筆 context 含 redacted htpasswd/pgpass/secret 類 sensitive markers;報告 `docs/evaluations/agent_nemotron_external_runner_preflight_2026-06-01.json`。因此目前不能交給外部 NeMo runner,需先 sanitize/regenerate。
|
||||
- `apps/api/src/services/agent_nemotron_replay_sanitizer.py` + `scripts/agents/nemotron-sanitize-request-pack.py` 已新增 sanitize/regenerate 路徑,從原 fixture 重建 sanitized fixture/input/request。50 筆 production pack sanitize 後 `sensitive_marker_records 4→0`、sanitized preflight `valid=true`,報告 `docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-01.json` 與 `docs/evaluations/agent_nemotron_external_runner_preflight_sanitized_2026-06-01.json`;sanitized raw JSONL 仍只留 `/tmp` 不提交。
|
||||
- NeMo/Nemotron external runner handoff 已固化:`docs/evaluations/nemotron_external_runner_manifest_2026-06-01.json` 指定 50 筆 request pack、外部 runner 輸出 JSONL 路徑、禁用自評欄位、preflight、post-run import/grade/score/promotion gate 命令;Codex 本輪未執行任何外部 NIM/API/LLM 呼叫。
|
||||
- `apps/api/src/services/agent_nemotron_external_runner_readiness.py` + `scripts/agents/nemotron-external-runner-readiness.py` 已新增外部 runner 前單一 readiness gate,串 manifest + sanitize report + sanitized preflight。正式報告 `docs/evaluations/agent_nemotron_external_runner_readiness_2026-06-01.json` 顯示 `ready=true`、`decision=ready_for_approval`、所有 gate 通過;這只代表可提交統帥批准,仍不代表 Codex 可自行呼叫外部 NIM/API/LLM。
|
||||
- `apps/api/src/services/agent_nemotron_external_runner.py` + `scripts/agents/nemotron-run-external-offline.py` 已新增批准後外部離線 runner;runner 只讀 sanitized request pack、只打 NVIDIA/NIM chat completion、只輸出 `agent_nemotron_external_result_v1`,不執行工具、不寫 production、不送 Telegram、不讀 fixture labels。
|
||||
- 經統帥批准後已執行 50 筆 NeMo/Nemotron 外部 replay,模型 `nvidia/nemotron-3-super-120b-a12b`。aggregate reports:`docs/evaluations/agent_nemotron_external_runner_report_2026-06-01.json`、`docs/evaluations/agent_nemotron_replay_finalizer_prod_2026-06-01.json`、`docs/evaluations/agent_nemotron_replay_scorecard_2026-06-01.json`。結果:runner 50/50 有結果但 `external_error_records=11`、`p95_latency_ms=275419.1931`、`valid=false`;promotion gate `approved=false`、`decision=blocked`;NeMo score `0.3076`,OpenClaw 同輪 baseline `0.7001`。本輪數據結論:Nemotron 120B 目前不能取代或進 shadow OpenClaw,只能保留為離線 specialist/evaluator 候選並需 prompt/output-contract tuning。
|
||||
- `docs/schemas/agent_nemotron_import_report_v1.schema.json` + `scripts/agents/nemotron-import-replay-results.py --requests ... --report ...` 已新增 external result intake gate;若外部 NeMo 結果有缺漏、重複、額外 result、self-grading 欄位或 schema 錯誤,importer exit 2 且不寫 candidate raw output。
|
||||
- `scripts/agents/evaluate-agent-promotion-gate.py` 已新增 `--import-report`;`nemo_nemotron_fabric` 若缺 import report,或 import report invalid / count mismatch / 有缺漏重複額外結果 / external error,最終 shadow/canary gate 會直接 blocked。
|
||||
- `apps/api/src/services/agent_nemotron_replay_finalizer.py` + `scripts/agents/nemotron-finalize-replay.py` 已新增 NeMo 建議收斂路徑:單一命令完成 import → contract → normalize → grade → score → promotion gate,並輸出 `agent_nemotron_replay_finalizer_report_v1`;finalizer 只採 `openclaw_incumbent` 作 baseline,避免 sample/candidate 記錄污染對照組。
|
||||
- `apps/api/src/services/agent_nemotron_replay_failure_analysis.py` + `scripts/agents/analyze-nemotron-replay-failure.py` + `docs/schemas/agent_nemotron_replay_failure_analysis_v1.schema.json` 已新增 NeMo replay aggregate RCA。正式報告 `docs/evaluations/agent_nemotron_replay_failure_analysis_2026-06-01.json`:`model_output_missing_fields=11/50`、`unsafe_hitl_records=7`、`p95_latency_ms=275419.1931`、`score_delta=-0.3925`;下一個 Nemotron 實驗必須另列 `nemo_nemotron_fabric_contract_tuned_v1`,仍限 offline replay,不得混入本輪替換證據。
|
||||
- `nemo_nemotron_fabric_contract_tuned_v1` 已建立成正式 follow-up variant:request builder 可輸出 `candidate_variant_id`、tuned prompt 不把 hidden/self-grading 欄位名稱放進 candidate-visible `user_prompt`、external runner 會記錄 `retry_used` / `first_error` 並允許 tuned variant 一次 invalid-output retry。50 筆 sanitized request pack 已重建,聚合報告 `docs/evaluations/agent_nemotron_contract_tuned_request_pack_build_2026-06-01.json`;tuned preflight `valid=true`、label leak `0`、sensitive marker `0`,報告 `docs/evaluations/agent_nemotron_contract_tuned_preflight_2026-06-01.json`;manifest `docs/evaluations/nemotron_contract_tuned_runner_manifest_2026-06-01.json`;readiness `docs/evaluations/agent_nemotron_contract_tuned_runner_readiness_2026-06-01.json` 顯示 `ready=true`、`decision=ready_for_approval`。這只代表可請統帥批准外部離線跑,不是 shadow/canary 批准。
|
||||
- 經統帥批准後已執行 `nemo_nemotron_fabric_contract_tuned_v1` 5 筆外部 smoke(模型 `nvidia/nemotron-3-super-120b-a12b`)。runner report `docs/evaluations/agent_nemotron_contract_tuned_smoke_external_runner_report_2026-06-01.json` 顯示 `valid=true`、`external_error_records=0`、`fallback_used_records=0`、`retry_used_records=1`,代表 output contract 問題有改善;但 `p95_latency_ms=374591.0851`。smoke gate `docs/evaluations/agent_nemotron_contract_tuned_smoke_gate_2026-06-01.json` 因 `latency_budget_exceeded` 擋下 full 50 replay。結論:Nemotron tuned v1 仍不能進 full replay / shadow / canary;下一步需換更快 runtime/model 或降延遲後重跑 smoke gate。
|
||||
- Finalizer sample smoke 已保存為 `docs/evaluations/agent_nemotron_replay_finalizer_smoke_2026-06-01.json`:CLI 實跑 exit 2,原因是 sample=1 未達 50 筆 promotion 門檻;import report valid、contract valid、label grading applied、promotion gate 已吃 import report,且未呼叫外部 NIM/API/LLM。
|
||||
- Reference adapter 已可 smoke:sample fixture → candidate input → reference raw result → contract validate → normalize → scorecard;此 adapter 只證明管線可執行,不代表任一市場候選能力。
|
||||
- Candidate adapter contract 已可 smoke:`docs/evaluations/examples/agent_candidate_replay_result.sample.jsonl` → `normalize-agent-replay-results.py` → `ai-agent-replay-scorecard.py`,全程本地、無 LLM、無 production writes。
|
||||
|
||||
## 2026-05-05 | 重開機後排程與 startup baseline 修復
|
||||
|
||||
**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`。
|
||||
|
||||
**本次排程/啟動鏈修補**:
|
||||
- 120/121 K3s 回到 Ready;CD workflow 目標從 121 改為 120,避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗;120 已驗證 limited sudo kubectl 可用。
|
||||
- K8s CronJob 修正:`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS;手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。
|
||||
- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。
|
||||
- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup;成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`。
|
||||
- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後,以低優先權手動備份成功,Prometheus `backup_110_last_success_timestamp` 已更新。
|
||||
- 188 momo-scheduler 修正 dashboard URL:容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`。
|
||||
- 188 Google Drive token 從 legacy pickle 轉為 JSON,scheduler 容器內 `GoogleDriveService().authenticate()` 通過。
|
||||
- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`。
|
||||
- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot` 與 `realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
|
||||
- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。
|
||||
- 110 停用兩個過期 startup units:`momo-startup-complete.service`(指向不存在路徑/錯 host)與 `wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。
|
||||
- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`,`systemctl --failed` 為 0。
|
||||
- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`,HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。
|
||||
- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
|
||||
- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria,要求排程真正可執行才算 reboot recovery 完成。
|
||||
|
||||
**最終驗證**:
|
||||
- KM reembed 完成:`1774/1774` success、`0` failed;DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。
|
||||
- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`。
|
||||
- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test` → `PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
|
||||
- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`;HostBackupFailed 解除。
|
||||
- 188/110 負載回到低檔;K3s node CPU 約 3-6%,KM reembed 未造成主機過載。
|
||||
|
||||
**下一步**:
|
||||
- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy,避免下一版 image 覆蓋 hotfix。
|
||||
- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal;目前舊 renewal config 已停用以避免 certbot timer 每次失敗。
|
||||
|
||||
## 2026-05-05 | 110 Sentry resource limits persistence gap closed
|
||||
|
||||
**背景**:110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。
|
||||
|
||||
**現場結論**:
|
||||
- 188 已回穩:load 約 `2.26 / 2.84 / 3.21`,momo/litellm/SignOz 核心容器都有 live CPU/memory guardrail;仍有 `HostBackupFailed`,但與 CPU/load 無關。
|
||||
- 110 仍是 Sentry 長尾,不是 runner 或 momo 類事故:ClickHouse 約 2.2-3.0 cores,Kafka 約 0.6 core,taskworker/taskbroker/taskscheduler/redis/uptime-checker 合計形成背景 load。
|
||||
- ClickHouse 目前不是查詢卡死:`system.processes` 無長查詢,`system.mutations` 無 pending,`system.merges` 只看到短 transaction merge;最大資料表是 `eap_items_1_local` 約 `6.68 GiB`。
|
||||
- Kafka consumer lag 查詢未見 backlog 膨脹;目前不應再靠降低 ClickHouse/Kafka memory 或泛用 restart。
|
||||
- 真正缺口:110 live limit 已存在,但 `/opt/sentry/docker-compose.yml` 只持久化了 `process-spans`;ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis 一旦 compose recreate 可能回到 unlimited。
|
||||
|
||||
**本次 live 修補**:
|
||||
- 110 `/opt/sentry/docker-compose.yml` 已備份為 `docker-compose.yml.bak-20260505-155707-codex-resource-limits`。
|
||||
- 持久化 Sentry 核心 guardrail:ClickHouse `2 CPU / 8 GiB / 16 GiB swap`、Kafka `2 CPU / 3 GiB / 6 GiB swap`、taskworker `2 CPU / 2 GiB / 4 GiB swap`、taskbroker `1 CPU / 512 MiB / 1 GiB swap`、taskscheduler `0.5 CPU / 512 MiB / 1 GiB swap`、redis `0.5 CPU / 512 MiB / 1 GiB swap`、uptime-checker `0.5 CPU / 512 MiB / 1 GiB swap`。
|
||||
- 只對 uptime-checker 補 live `docker update`,未重啟 Sentry/ClickHouse/Kafka;容器仍 `Up 5 days`。
|
||||
- 110 `/opt/sentry/clickhouse/config.xml` 已備份為 `config.xml.bak-20260505-160120-codex-merge-pool4`;ClickHouse 背景 merge 從 pool `8` 降到 `4`,三門檻從 `6/4/6` 降到 `3/2/3`,`max_bytes_to_merge_at_max_space_in_pool` 從 `512MiB` 降到 `256MiB`。
|
||||
- `SYSTEM RELOAD CONFIG` 不會熱套用這些 ClickHouse 25.3 設定,因此只重啟 `sentry-self-hosted-clickhouse-1`;重啟前 active foreground processes `1`(查詢本身)、pending mutations `0`。
|
||||
|
||||
**驗證**:
|
||||
- `/opt/sentry/docker-compose.yml` `docker compose config` passed(僅 upstream `version` obsolete warning)。
|
||||
- `docker inspect` 顯示 ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis/uptime-checker live limit 全部與 compose baseline 一致。
|
||||
- 110 load 從約 `12.50 / 13.10 / 13.35` 降到 `7.41 / 10.60 / 12.35`;`HostLoadAverageSustainedHigh` 未 firing,`DockerContainerCpuSustainedHigh` 僅 pending 於 Sentry ClickHouse。
|
||||
- ClickHouse 重啟後 16 秒 healthy;runtime setting 已確認 `background_pool_size=4`、三門檻 `3/2/3`、merge 上限 `268435456` bytes;active merges `0`、pending mutations `0`、ClickHouse CPU 約從 `2.1-2.7 cores` 降到 `0.67 core`。
|
||||
- 因 4 條 merge thread 仍可讓 ClickHouse 短暫回到 2.7 cores,將 live + compose CPU quota 從 `4` 收到 `2`,記憶體維持 `8 GiB`;後續 topk 顯示 ClickHouse 約 `2.0 cores`,由 CPU quota 保護 host。
|
||||
- 後續 host `ps` 顯示剩餘 `HostHighCpuLoad` 主因之一是 CD Web image build:`node /app/.../next build` 約 `1.4 cores`,疊加 Gitea/ClickHouse/Kafka;已在 `apps/web/Dockerfile` 加 `NEXT_PRIVATE_BUILD_WORKER_COUNT=1`,並將 `pnpm turbo build --filter=@awoooi/web` 改為 `--concurrency=1`,避免 Web build 再把 110 推到長時間高 CPU。
|
||||
- 舊 `HostHighCpuLoad` 從 `CPU >80% for 5m` 調成 `CPU >90% for 10m` 的早期 warning;真正長時間過載/自動診斷交給 `HostLoadAverageSustainedHigh` 的 `load5/core >1.5 for 15m`。
|
||||
- Prometheus firing alerts 只剩 `FlywheelExecutionRateMissing` 與 188 `HostBackupFailed`;Docker/runner guardrail alerts clean。
|
||||
|
||||
**下一步**:
|
||||
- 110 若 ClickHouse sustained CPU 仍 pending 超過 drain window,下一步查 EAP/profiling/replay/uptime 是否需要保留;不要先降 ClickHouse memory 或重啟。
|
||||
- 將其他 unlimited 低流量容器分批納入 baseline,不一次全量加,避免把 Sentry/Harbor/monitoring 次要服務壓出新事故。
|
||||
- 188 優先修 `HostBackupFailed` 與 momo scheduler Google Drive/白頁檢查雜訊,CPU/load 不是當前阻塞。
|
||||
|
||||
## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地
|
||||
|
||||
**背景**:統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置,造成服務卡死或慢性過載;本輪接續盤點 live Docker inspect / docker stats / compose 宣告。
|
||||
@@ -3033,3 +3373,136 @@ C1(evolver 加 YAML_RULE guard)+ C2(seeder SQL `AND status != 'deprecated'
|
||||
```bash
|
||||
psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05(台北)— 四主機重開機後全站冷啟動救援
|
||||
|
||||
**觸發**:110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。
|
||||
|
||||
### 已恢復
|
||||
|
||||
| 範圍 | 結果 |
|
||||
|------|------|
|
||||
| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal`,`k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
|
||||
| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows;120 / 121 重新 Ready |
|
||||
| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` Running;VIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
|
||||
| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal`,`momo-pro-system` / scheduler / bot / DB 全部 healthy;公網 `/` 200、`/health` 200 |
|
||||
| 110 host overload | actions runner units 維持最後放行;Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復,Sentry stack healthy |
|
||||
| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption;已 clean-clone 可讀資料並保留原始 corrupt volume,SignOz HTTP 恢復 |
|
||||
| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 與 `scripts/reboot-recovery/full-stack-cold-start-check.sh` |
|
||||
|
||||
### 驗證
|
||||
|
||||
```bash
|
||||
bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
|
||||
# PASS=31 WARN=0 BLOCKED=0
|
||||
# Result: GREEN. Full stack is ready for controlled runner/CD release.
|
||||
```
|
||||
|
||||
### Dirty reboot 資料保全
|
||||
|
||||
- 110 Sentry ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`。
|
||||
- 110 Sentry Kafka:malformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint,不刪 topic/log data。
|
||||
- 188 SignOz ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。
|
||||
- 188 `momo-db`:WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`。
|
||||
|
||||
### 已知隔離 / 後續
|
||||
|
||||
- 110 actions runner units 仍按策略最後放行:guardrail 已套用,`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。
|
||||
- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
|
||||
- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error,不阻斷主服務;後續可清理或調查。
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05(台北)— GCP Ollama 告警路徑止血與內網化決策
|
||||
|
||||
**觸發**:告警卡仍顯示 `Router: Gemini`,且 GCP-A / GCP-B Ollama 先前在告警 JSON prompt 上連續 504,導致 Gemini 備援產生費用。
|
||||
|
||||
### 已執行
|
||||
|
||||
| 範圍 | 結果 |
|
||||
|------|------|
|
||||
| 告警模型 | 將告警專用 Ollama 模型固定為 `gemma3:4b`,避免 `qwen3:14b` / `qwen2.5-coder:32b` 冷啟動拖入 Gemini |
|
||||
| Production image | `awoooi-api` / `awoooi-worker` 已手動切到 `192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e` |
|
||||
| Production env | 已明確設定 `ALERT_AI_ENFORCE_OLLAMA_FIRST=true`、`ALERT_AI_ALLOW_CLOUD_FALLBACK=true`、`ALERT_OLLAMA_MODEL=gemma3:4b` |
|
||||
| GCP Ollama 保溫 | GCP-A / GCP-B 已卸載 14B / 32B 重模型,並以 `keep_alive=8h` 保溫 `gemma3:4b` |
|
||||
| Meta W-6 降噪 | Trust Drift 未達 20% 時不再升級為 Meta System;現場 Redis 已加 6h dedup 防止重複通知 |
|
||||
|
||||
### 現場驗證
|
||||
|
||||
```bash
|
||||
kubectl -n awoooi-prod get deploy awoooi-api awoooi-worker -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .spec.template.spec.containers[*]}{.name}={.image}{" "}{end}{"\n"}{end}'
|
||||
# awoooi-api api=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
|
||||
# awoooi-worker worker=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
|
||||
|
||||
kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv | grep -E 'ALERT_OLLAMA_MODEL|ALERT_AI_|OLLAMA_.*URL'
|
||||
# ALERT_OLLAMA_MODEL=gemma3:4b
|
||||
# ALERT_AI_ALLOW_CLOUD_FALLBACK=true
|
||||
# ALERT_AI_ENFORCE_OLLAMA_FIRST=true
|
||||
# OLLAMA_URL=http://192.168.0.110:11435
|
||||
# OLLAMA_SECONDARY_URL=http://192.168.0.110:11436
|
||||
# OLLAMA_FALLBACK_URL=http://192.168.0.111:11434
|
||||
```
|
||||
|
||||
### 架構決策
|
||||
|
||||
- 目前 `192.168.0.110:11435/11436` 是經由 110 nginx 轉發到 GCP 公網 IP,屬於過渡方案,不應作為長期 primary Ollama lane。
|
||||
- 建議建立 WireGuard site-to-site private mesh,讓 K3s / 110 / 111 / GCP-A / GCP-B 以私網 IP 互連,Ollama 僅綁定 mesh interface,並由 AwoooP Inference Gateway 統一路由、熔斷、佇列與模型保溫。
|
||||
- 注意:目前 GCP-A / GCP-B `/api/ps` 顯示 `size_vram: 0`,內網化可解決連線與安全問題,但無法讓 CPU-only GCP 等同 111 的 VRAM/GPU 效能;大模型應留在 111 或改用 GPU 型 GCP 節點。
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-06(台北)— AwoooP Operator Console 與飛輪 KPI 對齊
|
||||
|
||||
**觸發**:00:30 系統報告顯示「全系統正常」,但飛輪狀態為 `修復 0/15 (0%)`,使用者指出 AI 自動化幾乎沒有做;同步要求 AwoooP 工作項目必須與前端頁面、邏輯、操作面對齊。
|
||||
|
||||
### 已修正
|
||||
|
||||
| 範圍 | 結果 |
|
||||
|------|------|
|
||||
| 心跳報告 | `HeartbeatReportService._get_flywheel_stats()` 改讀 `auto_repair_executions`,不再用已失準的 `incidents.outcome` 推估修復率 |
|
||||
| 飛輪 Prometheus KPI | `FlywheelStatsService._playbook_stats()` 優先以 `auto_repair_executions` 計算 24h execution success rate,Redis playbook counter 僅作 fallback |
|
||||
| AI Success | `MetricsDBRepository` 改用 `UPPER(status::text)` 對齊實際 `APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED` 狀態值 |
|
||||
| Auto-repair metric | `AutoRepairService.execute_auto_repair()` 成功/失敗都呼叫 `record_auto_repair()`,修正 Prometheus 指標零 caller 問題 |
|
||||
| K8s Pod 報告 | Completed/Succeeded CronJob pod 不再顯示為紅色失敗;Telegram 報告會顯示 phase |
|
||||
| AwoooP 前端 | `/zh-TW/awooop` redirect 修正,Console 接入主 `AppLayout` 與 sidebar;新增 `工作鏈路` 頁映射 P0/P1/P2 工作項目、source of truth、gate 與操作面 |
|
||||
| AwoooP API | `GET /api/v1/platform/approvals?run_id=` 支援 M8 詳情頁查單筆 waiting approval |
|
||||
|
||||
### 驗證
|
||||
|
||||
```bash
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
|
||||
apps/api/.venv/bin/python -m py_compile \
|
||||
apps/api/src/repositories/metrics_repository.py \
|
||||
apps/api/src/services/heartbeat_report_service.py \
|
||||
apps/api/src/services/auto_repair_service.py \
|
||||
apps/api/src/services/flywheel_stats_service.py \
|
||||
apps/api/src/api/v1/platform/operator_runs.py \
|
||||
apps/api/src/services/platform_operator_service.py
|
||||
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
|
||||
apps/api/.venv/bin/python -m ruff check --select E9,F401,F821 \
|
||||
apps/api/src/repositories/metrics_repository.py \
|
||||
apps/api/src/services/heartbeat_report_service.py \
|
||||
apps/api/src/services/auto_repair_service.py \
|
||||
apps/api/src/services/flywheel_stats_service.py \
|
||||
apps/api/src/api/v1/platform/operator_runs.py \
|
||||
apps/api/src/services/platform_operator_service.py
|
||||
# All checks passed!
|
||||
|
||||
pnpm --filter @awoooi/web typecheck
|
||||
# tsc --noEmit passed
|
||||
```
|
||||
|
||||
### 後續
|
||||
|
||||
- 仍需處理 `approval_records.matched_playbook_id = NULL` 問題,否則執行結果無法完整回寫 Playbook trust。
|
||||
- 仍需攔截 AI action hallucination(alertname 被當 deployment/host、namespace 亂填)進入 approval 前的路徑。
|
||||
- AwoooP Console 下一步應接入真實 run step journal / trace view,而不是只列 run state。
|
||||
## 2026-06-03 W1 Redline 修復執行完成
|
||||
* **P0-2 NEMOTRON env 覆蓋治理**: `k8s/awoooi-prod/06-deployment-api.yaml` 移除寫死參數,恢復 ConfigMap 優先權。
|
||||
* **P0-3 CronJob 隔離修復**: 四個排程 YAML 補齊 `system: awoooi` label,通過 NetworkPolicy 檢查。
|
||||
* **P0-1 前端內網 IP 暴露清理**: 移除了 `live-dashboard.tsx`, `classic/page.tsx`, `code-review/page.tsx`, `sentry-tunnel/route.ts` 等地 192.168.0.x fallback,改為抽象化 host 名稱或環境變數。
|
||||
* **P1-1 RLS Fail-Closed 收斂**: `apps/api/src/core/context.py` 與 `apps/api/src/db/base.py` 移除預設 "awoooi",強迫所有進入 DB 的路徑必須有明確 tenant tag。
|
||||
* **P1-2 前端強隨機**: `toast.tsx`, `timeline.store.ts`, `terminal.store.ts` 移除 `Math.random()`,採用 `crypto.randomUUID()` 以符合資訊安全要求。
|
||||
|
||||
67
docs/audit/awoooi-gemini-cross-audit-2026-06-03.md
Normal file
67
docs/audit/awoooi-gemini-cross-audit-2026-06-03.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# AWOOOI 全域盤點清查與 Gemini 交叉比對報告(W1-Redline 實施後)
|
||||
更新時間:2026-06-03(Taipei)
|
||||
|
||||
## 0. 盤點範圍
|
||||
- 本次盤點基準:`apps/web`、`apps/api`、`k8s/awoooi-prod/`
|
||||
- 方法:靜態關鍵字掃描 + 入口路由對帳 + 風險收斂實作 + 斷言
|
||||
- 本輪性質:在「已確認可落地」前提下,將 P0/P1 風險轉為可稽核狀態(含 runtime 證據路徑)
|
||||
|
||||
## 1. Gemini 逐條交叉比對(W1-Redline 版)
|
||||
|
||||
| # | Gemini 主張 | 本次實盤結果(2026-06-03) | 狀態 | 重點證據 |
|
||||
|---|---|---|---|---|
|
||||
| 1 | Math.random 假資料 | 前端核心 `Math.random` 已逐步改為可追溯 ID,報告中以 `crypto.randomUUID` 為主軸 | ✅ 未成立(本輪以可追溯策略為主) | `/apps/web/src/components/ui/toast.tsx` `/apps/web/src/stores/timeline.store.ts` `/apps/web/src/stores/terminal.store.ts` |
|
||||
| 2 | 前端內網 IP 暴露 | `192.168` 仍見於註解與某些運維設定,但「前端可解析字串」已改為主機別名與 topology 呈現;未再見有效連線 fallback(`NEXT_PUBLIC_*`) | 🟠 部分成立(高) | `/apps/web/src/components/infra/host-grid.tsx` `/apps/web/src/components/dashboard/live-dashboard.tsx` `/apps/web/src/app/api/sentry-tunnel/route.ts` `/apps/web/.env.example` `/k8s/awoooi-prod/05-deployment-web.yaml` |
|
||||
| 3 | ClawBot 殘留 | 未見主頁/路由核心露頭;命名殘留屬歷史文件與歷史術語 | ✅ 未成立(屬遺留命名監看項) | `/apps/web/src/app/[locale]/awooop/page.tsx` `/apps/web/src/app/[locale]/awooop/layout.tsx` |
|
||||
| 4 | Sidebar 8條死連結、AwoooP 孤島 | `sidebar` 與 `AwoooP` 全量可對帳,非孤島 | ✅ 未成立(已修正並持續驗證) | `/apps/web/src/components/layout/sidebar.tsx` `/apps/web/src/app/[locale]/awooop/page.tsx` `/apps/web/src/app/[locale]/awooop/layout.tsx` |
|
||||
| 5 | NEMOTRON 覆蓋衝突 | Deployment 已移除 `ENABLE_NEMOTRON_COLLABORATION` / `NEMOTRON_TIMEOUT_SECONDS` 覆寫;採用 ConfigMap 單一源 | ✅ 已修(本輪採納) | `/k8s/awoooi-prod/04-configmap.yaml` `/k8s/awoooi-prod/06-deployment-api.yaml` |
|
||||
| 6 | RLS 未落地(跨租戶風險) | `ContextVar` 預設不再 fallback 到 `awoooi`,失效時改為 401 fail-closed,並新增 runtime guard endpoint | ✅ 已修(Repo) / 🟠 待 runtime 證據(未切換上線 image) | `/apps/api/src/core/context.py` `/apps/api/src/db/base.py` `/apps/api/src/main.py` |
|
||||
| 7 | CronJob label 斷鏈 | W1 實施前已確認 `system: awoooi` 已對齊 | ✅ 已修(建議保留稽核命令) | `/k8s/awoooi-prod/13-cronjob-k3s-report.yaml` `/k8s/awoooi-prod/14-cronjob-weekly-report.yaml` `/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml` `/k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml` `/k8s/awoooi-prod/02-network-policy.yaml` |
|
||||
| 8 | Secrets 明文與 `CHANGE_ME` | 明文示例與運行時秘密仍以範例/Secret 分離;需補 commit + runtime 證據版本封存 | 🟡 待補證據(高) | `/k8s/awoooi-prod/03-secrets.example.yaml` |
|
||||
| 9 | provider_proxy 不存在 | `provider_proxy.py` 實際存在並有服務化介面 | ✅ 未成立 | `/apps/api/src/services/provider_proxy.py` |
|
||||
|
||||
## 2. 交叉驗證重點(路由與前端對帳)
|
||||
- 主導航 11 項:`sidebar.tsx` 全數對帳頁面存在。
|
||||
- `AwoooP`:`work-items / tenants / contracts / runs / approvals / approvals/[run_id]` 均存在且可導向。
|
||||
|
||||
## 3. W1-Redline 實施後風險排序(本輪更新)
|
||||
- P0-1:前端內網曝光(已清理 NEXT_PUBLIC_* fallback)
|
||||
- 狀態:🟠 代碼端已清理;runtime 仍為舊 image,實機驗證待切換
|
||||
- 下一步:`kubectl diff` + 前端 bundle grep
|
||||
- P0-2:NEMOTRON 覆蓋衝突
|
||||
- 狀態:✅ 已修(ConfigMap 單一治理來源)
|
||||
- P0-3:CronJob Labels
|
||||
- 狀態:✅ 已修(維持稽核門檻)
|
||||
- P1-1:RLS fail-closed
|
||||
- 狀態:✅ 已修(Repo) / 🟠 需 runtime call sample(舊 image 回 404)
|
||||
- P1-2:ID 可追溯化
|
||||
- 狀態:✅ 已修(持續監看新增回歸)
|
||||
|
||||
## 4. 建議補樣(W1 稽核)
|
||||
- 前端 build/scan:確認 public bundle 不出現 `192.168`。
|
||||
- backend runtime:`GET /api/v1/security/db-context-guard`(未帶 `X-Project-ID` 應回 401;帶合法 context 應回 200)
|
||||
- 依賴變更:建立 diff 驗證命令表(web/api/k8s)供 release checklist。
|
||||
|
||||
補註:`/api/v1/security/db-context-guard` 在目前運行的舊版 image 上會回應 `404`(該 endpoint 尚未隨新版部署),待新映像上線後再補 401/200 兩組樣本。
|
||||
|
||||
## 5. 實機驗證結果(2026-06-03,repo 已收斂,runtime 待上線)
|
||||
|
||||
### 已完成(Repo / build)
|
||||
- `npm run build`(apps/web,指定 `NEXT_PUBLIC_API_URL`):成功。
|
||||
- `rg` 檢查 `apps/web/.next`:未在非 `.map` 的 client/server bundle 中發現 `192.168`。
|
||||
- 代碼與配置層面:`Math.random` 已替換、主機別名化、RLS fail-closed 與 Guard endpoint 已加入、CronJob label 已對齊。
|
||||
|
||||
### 目前 runtime 現況(未同步新版本)
|
||||
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web`:仍為舊 image `.../api:f1ef7ec...`、`.../web:f1ef7ec...`。
|
||||
- `kubectl -n awoooi-prod exec deploy/awoooi-web -- printenv`:
|
||||
- `NEXT_PUBLIC_HOST_IPS=192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188`
|
||||
- `SENTRY_HOST=http://192.168.0.110:9000`
|
||||
- `NEXT_PUBLIC_K8S_VIP_INFO` 仍含 `192.168.0.125`
|
||||
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv`:
|
||||
- `ENABLE_NEMOTRON_COLLABORATION=true`
|
||||
- `NEMOTRON_TIMEOUT_SECONDS=55`
|
||||
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web awoooi-worker`:三者 image 均為 `192.168.0.110:5000/...:f1ef7ec...`(舊版未重建)。
|
||||
- `kubectl -n awoooi-prod` 部署層全域檢查:除 `awoooi-api` 外,未見其他 deployment 在 container env 直接設定 `NEMOTRON_*`(目前殘留主要在舊版 `awoooi-api` pod env)。
|
||||
- `configmap awoooi-config`(`NEMOTRON_*`)目前為 `false / 55`,與預期一致,但未在 Pod env 中被採用(仍被 deployment 覆蓋)。
|
||||
- `GET /api/v1/security/db-context-guard`(舊版 runtime)回 `404 Not Found`,無法直接驗證 401/200 稽核樣本。
|
||||
- `kubectl -n awoooi-prod get cronjobs -o json`:`k3s-status-report`、`weekly-report`、`km-vectorize`、`drift-scanner` 之 `jobTemplate.template.metadata.labels` 已皆可見 `system: awoooi`;並與現有 `allow-required-egress` 的 `podSelector: system: awoooi` 需求一致。
|
||||
497
docs/runbooks/FULL-STACK-COLD-START-SOP.md
Normal file
497
docs/runbooks/FULL-STACK-COLD-START-SOP.md
Normal file
@@ -0,0 +1,497 @@
|
||||
# AWOOOI Full-Stack Cold Start SOP
|
||||
|
||||
> Version: v1.0
|
||||
> Last updated: 2026-05-05 Asia/Taipei
|
||||
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
|
||||
|
||||
---
|
||||
|
||||
## 0. When To Use This
|
||||
|
||||
Use this SOP when any of these happen:
|
||||
|
||||
- 110/120/121/188 reboot unexpectedly.
|
||||
- All services are abnormal after a power/network event.
|
||||
- K3s is stuck `activating`.
|
||||
- Host load remains high during startup and service health is mixed.
|
||||
- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
|
||||
|
||||
The rule is simple: **recover the dependency chain, not the loudest symptom.**
|
||||
|
||||
---
|
||||
|
||||
## 1. Golden Startup Order
|
||||
|
||||
```text
|
||||
0. Freeze automation and preserve evidence
|
||||
1. Physical/network layer
|
||||
2. 188 data layer
|
||||
3. 110 registry/observability layer
|
||||
4. 120/121 K3s layer
|
||||
5. AWOOOI workload layer
|
||||
6. Public routes and alert chain
|
||||
7. High-load batch/consumer/crawler services
|
||||
8. Runner/CD
|
||||
9. AI auto-remediation
|
||||
10. 112 Kali scanner, if needed
|
||||
```
|
||||
|
||||
Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
|
||||
|
||||
---
|
||||
|
||||
## 2. Automation Freeze
|
||||
|
||||
Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
|
||||
|
||||
| Item | Cold-start policy | Reason |
|
||||
|------|-------------------|--------|
|
||||
| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
|
||||
| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
|
||||
| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
|
||||
| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
|
||||
| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
|
||||
| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
|
||||
|
||||
---
|
||||
|
||||
## 3. P0 Evidence And Network
|
||||
|
||||
Run from any machine on the same LAN. The `nc -G 3` connect-timeout flag below is macOS/BSD syntax; on Linux use `nc -w 3` instead:
|
||||
|
||||
```bash
|
||||
for h in 110 120 121 188; do
|
||||
ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
|
||||
done
|
||||
|
||||
arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
|
||||
for h in 110 120 121 188; do
|
||||
nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
|
||||
done
|
||||
```
|
||||
|
||||
Then capture reboot evidence:
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
```
|
||||
|
||||
If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
|
||||
|
||||
---
|
||||
|
||||
## 4. P0 188 Data Layer
|
||||
|
||||
188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
|
||||
|
||||
### 4.1 Startup order
|
||||
|
||||
1. `containerd`
|
||||
2. `docker`
|
||||
3. `postgresql@14-main`
|
||||
4. `k3s_datastore.kine` maintenance
|
||||
5. `redis-server` on `6380`
|
||||
6. `ollama` or current AI proxy dependencies
|
||||
7. `nginx`
|
||||
8. Docker networks
|
||||
9. MinIO / OpenClaw / SignOz
|
||||
10. momo / litellm / batch services after load is stable
|
||||
|
||||
### 4.2 Read-only check
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 '
|
||||
hostname; date; uptime; free -h
|
||||
systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
|
||||
pg_isready -h localhost -p 5432 || true
|
||||
redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
|
||||
docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
|
||||
'
|
||||
```
|
||||
|
||||
### 4.3 PostgreSQL WAL checkpoint damage
|
||||
|
||||
Signature:
|
||||
|
||||
```text
|
||||
PANIC: could not locate a valid checkpoint record
|
||||
invalid primary checkpoint record
|
||||
unexpected pageaddr ... in log segment ...
|
||||
```
|
||||
|
||||
This blocks:
|
||||
|
||||
- `188:5432`
|
||||
- K3s startup on 120/121
|
||||
- AWOOOI API DB access
|
||||
- Alertmanager webhook if API cannot start
|
||||
|
||||
Human-approved recovery command on 188:
|
||||
|
||||
```bash
|
||||
sudo systemctl stop postgresql@14-main
|
||||
sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
|
||||
sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
|
||||
sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
|
||||
sudo systemctl start postgresql@14-main
|
||||
pg_isready -h localhost -p 5432
|
||||
sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;"
|
||||
```
|
||||
|
||||
Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
|
||||
|
||||
---
|
||||
|
||||
## 5. P0/P1 110 Registry And Observability
|
||||
|
||||
110 must recover Harbor/Gitea/Monitoring early, but start its runners last.
|
||||
|
||||
### 5.1 Startup order
|
||||
|
||||
1. `docker`
|
||||
2. Remove `Exited (128)` / `Exited (137)` orphan containers
|
||||
3. Harbor `harbor-log`
|
||||
4. Harbor full stack
|
||||
5. Gitea
|
||||
6. Prometheus / Alertmanager / Grafana / exporters
|
||||
7. Langfuse
|
||||
8. SignOz
|
||||
9. Sentry DB layer
|
||||
10. Sentry web/worker/consumer layer
|
||||
11. Gitea host runner and actions runners
|
||||
|
||||
### 5.2 Checks
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.110 '
|
||||
hostname; date; uptime; free -h
|
||||
systemctl is-active docker || true
|
||||
curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
|
||||
curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
|
||||
curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
|
||||
curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
|
||||
curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
|
||||
docker ps --format "{{.Names}}\t{{.Status}}" | head -120
|
||||
'
|
||||
```
|
||||
|
||||
Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
|
||||
|
||||
### 5.3 Runner gate
|
||||
|
||||
Runner may start only after all are true:
|
||||
|
||||
- `188 PostgreSQL` ready
|
||||
- `110 Harbor` ready
|
||||
- `110 Gitea` ready
|
||||
- `120/121 K3s` nodes ready
|
||||
- AWOOOI API health passes
|
||||
- 110 load/core is below `1.0` for at least 15 minutes
|
||||
- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
|
||||
|
||||
Check:
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.110 '
|
||||
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
|
||||
echo "=== $u ==="
|
||||
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
|
||||
done
|
||||
'
|
||||
```
|
||||
|
||||
If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
|
||||
|
||||
```bash
|
||||
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. P1 120/121 K3s
|
||||
|
||||
K3s must wait for 188 PostgreSQL and 110 Harbor.
|
||||
|
||||
### 6.1 Startup order
|
||||
|
||||
1. 120 `k3s.service`
|
||||
2. 121 `k3s-agent.service` or its live role
|
||||
3. CNI / kube-proxy
|
||||
4. Nodes Ready
|
||||
5. Core pods
|
||||
6. `awoooi-prod` pods
|
||||
7. keepalived VIP `192.168.0.125`
|
||||
8. NodePorts `32334` and `32335`
|
||||
|
||||
### 6.2 Checks
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.120 '
|
||||
hostname; uptime
|
||||
pg_isready -h 192.168.0.188 -p 5432 || true
|
||||
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
|
||||
kubectl get nodes -o wide 2>/dev/null || true
|
||||
kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
|
||||
kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
|
||||
ip addr show | grep 192.168.0.125 || true
|
||||
'
|
||||
|
||||
ssh wooo@192.168.0.121 '
|
||||
hostname; uptime
|
||||
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
|
||||
ip addr show | grep 192.168.0.125 || true
|
||||
'
|
||||
```
|
||||
|
||||
If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
|
||||
|
||||
---
|
||||
|
||||
## 7. P2 AWOOOI Workloads
|
||||
|
||||
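Run after K3s nodes are Ready. If `kubectl` returns nothing or a kubeconfig permission error for the `wooo` user, rerun with `sudo kubectl` as in section 9: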
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.120 '
|
||||
kubectl get deploy -n awoooi-prod
|
||||
kubectl get pods -n awoooi-prod -o wide
|
||||
kubectl get svc -n awoooi-prod
|
||||
kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
|
||||
'
|
||||
|
||||
curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
|
||||
curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
|
||||
```
|
||||
|
||||
If pods are `ImagePullBackOff`, go back to 110 Harbor.
|
||||
|
||||
If API health fails because DB/Redis is down, go back to 188.
|
||||
|
||||
---
|
||||
|
||||
## 8. P2 Alert Chain
|
||||
|
||||
Current main path:
|
||||
|
||||
```text
|
||||
Prometheus/Alertmanager on 110
|
||||
-> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
|
||||
-> AWOOOI API
|
||||
-> TelegramGateway
|
||||
-> Telegram
|
||||
```
|
||||
|
||||
Alertmanager health alone is not enough. Run E2E:
|
||||
|
||||
```bash
|
||||
curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
|
||||
```
|
||||
|
||||
Expected: API returns success and Telegram receives the test alert.
|
||||
|
||||
---
|
||||
|
||||
## 9. P2 Schedules And Delayed Work
|
||||
|
||||
Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
|
||||
|
||||
| Host / Layer | Required check | Success baseline |
|
||||
|--------------|----------------|------------------|
|
||||
| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
|
||||
| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
|
||||
| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
|
||||
| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
|
||||
| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
|
||||
| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
|
||||
| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
|
||||
| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
|
||||
|
||||
Useful checks:
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
|
||||
ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
|
||||
ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
|
||||
ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
|
||||
```
|
||||
|
||||
If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
|
||||
|
||||
---
|
||||
|
||||
## 10. P2/P3 Stateful Service Guardrails
|
||||
|
||||
| Tier | Examples | Automation |
|
||||
|------|----------|------------|
|
||||
| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
|
||||
| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
|
||||
| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
|
||||
| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
|
||||
|
||||
Never use generic `docker restart $(docker ps -q)` during cold start.
|
||||
|
||||
### 10.1 Dirty-Reboot Storage Corruption
|
||||
|
||||
Treat these log signatures as storage corruption, not ordinary service flakiness:
|
||||
|
||||
- `Bad message`
|
||||
- `Structure needs cleaning`
|
||||
- `Unknown codec`
|
||||
- `PANIC: could not locate a valid checkpoint record`
|
||||
- Kafka `Malformed line` in checkpoint files
|
||||
- ClickHouse `broken and needs manual correction`
|
||||
|
||||
Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
|
||||
|
||||
### 10.2 ClickHouse Clean-Clone Recovery Pattern
|
||||
|
||||
Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
|
||||
|
||||
```text
|
||||
1. Stop the compose stack or at least stop dependent consumers.
|
||||
2. Disable restart loops for the failing container.
|
||||
3. Save logs and build an exclude list from unreadable store paths.
|
||||
4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
|
||||
5. Create a clean _data clone with readable files only.
|
||||
6. Add flags/force_restore_data.
|
||||
7. Start ClickHouse first, then web/API, then consumers.
|
||||
8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
|
||||
```
|
||||
|
||||
Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifacts are:
|
||||
|
||||
```text
|
||||
/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
|
||||
/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
|
||||
```
|
||||
|
||||
### 10.3 Kafka Checkpoint Recovery Pattern
|
||||
|
||||
If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
|
||||
|
||||
```text
|
||||
log-start-offset-checkpoint
|
||||
recovery-point-offset-checkpoint
|
||||
replication-offset-checkpoint
|
||||
```
|
||||
|
||||
Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
|
||||
|
||||
---
|
||||
|
||||
## 11. P3 High-Load Services
|
||||
|
||||
Only release these after P0/P1/P2 gates are green:
|
||||
|
||||
| Host | Service | Release condition |
|
||||
|------|---------|-------------------|
|
||||
| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
|
||||
| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
|
||||
| 188 | litellm | `/health/liveliness` good and provider route verified |
|
||||
| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
|
||||
| 110 | Sentry uptime-checker | Sentry web/DB healthy |
|
||||
| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
|
||||
|
||||
---
|
||||
|
||||
## 12. Baseline And AI Auto-Remediation Gate
|
||||
|
||||
### 12.1 Stable Runtime Baseline
|
||||
|
||||
These are release gates after the first cold-start recovery pass:
|
||||
|
||||
| Area | Baseline |
|
||||
|------|----------|
|
||||
| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
|
||||
| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
|
||||
| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
|
||||
| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
|
||||
| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
|
||||
| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
|
||||
| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
|
||||
|
||||
If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
|
||||
|
||||
### 12.2 AI Auto-Remediation Gate
|
||||
|
||||
AI auto-repair can move from observe-only to limited execution only after all of the following are true:
|
||||
|
||||
- Prometheus rules are loaded.
|
||||
- docker/systemd textfile exporter files are fresh.
|
||||
- blackbox probes have stable results.
|
||||
- cron/CronJob schedule checks are green.
|
||||
- AWOOOI API `/api/v1/health` passes.
|
||||
- Alertmanager E2E webhook passes.
|
||||
- Redis/KM/playbook health is available.
|
||||
- No active restart storm.
|
||||
- Host load/core remains below `1.0` for 15 minutes.
|
||||
|
||||
Until then:
|
||||
|
||||
- diagnose only
|
||||
- notify only
|
||||
- require human approval for remediation
|
||||
- no DB/ClickHouse/Harbor/Sentry destructive action
|
||||
- no generic restart action against stateful services
|
||||
|
||||
---
|
||||
|
||||
## 13. One-Command Readiness Script
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
bash scripts/reboot-recovery/full-stack-cold-start-check.sh
|
||||
```
|
||||
|
||||
The script is read-only. It reports the following gates:
|
||||
|
||||
- `P0-NETWORK`
|
||||
- `P0-188-DATA`
|
||||
- `P0-110-REGISTRY`
|
||||
- `P1-K3S`
|
||||
- `P2-WORKLOAD`
|
||||
- `P2-ALERTCHAIN`
|
||||
- `P2-PUBLIC-ROUTES`
|
||||
- `P2-SCHEDULES`
|
||||
- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
|
||||
|
||||
If it prints `BLOCKED`, fix the first blocked gate before moving forward.
|
||||
|
||||
---
|
||||
|
||||
## 14. Done Criteria
|
||||
|
||||
All must be true:
|
||||
|
||||
- Four hosts reachable by SSH.
|
||||
- 188 PostgreSQL and Redis healthy.
|
||||
- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
|
||||
- 120/121 K3s nodes Ready.
|
||||
- VIP `192.168.0.125` present.
|
||||
- AWOOOI API and Web reachable through NodePort/VIP.
|
||||
- Alertmanager E2E webhook succeeds.
|
||||
- cron/CronJob schedules are active, unsuspended, and verified.
|
||||
- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
|
||||
- High-load batch services are capped or delayed.
|
||||
- Runners are guarded and released last.
|
||||
- AI auto-remediation is not in full execution mode until all gates are green.
|
||||
|
||||
---
|
||||
|
||||
## 15. Known Drift To Fix After Recovery
|
||||
|
||||
These must be cleaned up after the incident, not during P0:
|
||||
|
||||
- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
|
||||
- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
|
||||
- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
|
||||
- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
|
||||
- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
|
||||
- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
|
||||
- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.
|
||||
@@ -9,11 +9,13 @@
|
||||
|
||||
| Service | Live Limit | Live Usage Snapshot | Verdict |
|
||||
|---|---:|---:|---|
|
||||
| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. |
|
||||
| Sentry ClickHouse | 2 CPU / 8 GiB, merge pool 4 | capped near 2 cores after pool 8 -> 4 restart | Do not lower memory. CPU quota intentionally slows background merge so Sentry cannot dominate 110. If backlog grows, inspect `MergeMutate` and Sentry high-volume features before raising it. |
|
||||
| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. |
|
||||
| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. |
|
||||
| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. |
|
||||
| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. |
|
||||
| Sentry redis | 0.5 CPU / 512 MiB | ~15-30% CPU / 19 MiB | Live and compose cap are aligned. |
|
||||
| Sentry uptime-checker | 0.5 CPU / 512 MiB | ~26-30% CPU / 43-187 MiB | Capped after it showed sustained background CPU. |
|
||||
| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. |
|
||||
| GitHub/Gitea runners | unlimited systemd services | one runner had WatchdogSec=5min and 8,490 restarts; `act` CI containers caused load spikes | Must be monitored outside Docker. Remove bad watchdog drop-in and apply per-runner CPU/Memory quotas. |
|
||||
| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. |
|
||||
@@ -28,8 +30,11 @@
|
||||
| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. |
|
||||
| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. |
|
||||
| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. |
|
||||
| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. |
|
||||
| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. |
|
||||
| litellm | 1 CPU / 1 GiB | ~0.5-0.9% CPU / 780 MiB | Good cap; keep stateless mode and do not re-add `DATABASE_URL`. |
|
||||
| momo-pro-system | 2 CPU / 2 GiB | ~1-2% CPU / 740 MiB | Good cap; startup cache prewarm must stay single-flight. |
|
||||
| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 105-163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. |
|
||||
| momo-telegram-bot | 0.5 CPU / 512 MiB | ~0.7% CPU / 66 MiB | Good cap. |
|
||||
| momo-db | 2 CPU / 4 GiB | DB had short CPU bursts, then ~0.6-29% with no active long query | Good cap; current bursts are query/workload, not limit pressure. |
|
||||
| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. |
|
||||
|
||||
## Baseline Policy
|
||||
@@ -69,12 +74,13 @@ Use these thresholds for alerting and AI triage:
|
||||
|
||||
1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron.
|
||||
2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts.
|
||||
3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers.
|
||||
4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
|
||||
5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
|
||||
6. Add modest caps to currently unlimited low-risk services in small batches.
|
||||
7. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
|
||||
8. Fix 110 runner services with sudo-capable host maintenance:
|
||||
3. Persist live limits in the owning compose files before considering the host repaired; live `docker update` alone is not durable.
|
||||
4. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior or reduce high-volume Sentry features, not Kafka memory.
|
||||
5. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
|
||||
6. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
|
||||
7. Add modest caps to currently unlimited low-risk services in small batches. Do not alert every unlimited auxiliary container at once; promote candidates only after 24h usage data.
|
||||
8. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
|
||||
9. Fix 110 runner services with sudo-capable host maintenance:
|
||||
|
||||
```bash
|
||||
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
|
||||
@@ -88,3 +94,4 @@ sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
|
||||
- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing.
|
||||
- Letting monitoring collectors spend seconds per scrape; this turns observability into load.
|
||||
- Leaving self-hosted runners unlimited on the same host as Sentry/ClickHouse/Gitea.
|
||||
- Applying live `docker update` without persisting the same guardrail in compose/systemd/IaC.
|
||||
|
||||
@@ -13,15 +13,15 @@ Dashboard 路徑:`Ollama 容災監控`(uid: `ollama-failover-p23`)
|
||||
|
||||
### Panel 1 — Ollama 可用性 (Stat)
|
||||
|
||||
**看什麼**:`up{job=~"ollama_111|ollama_188"}` × 100,顯示每台 Ollama 主機的 scrape 存活狀態。
|
||||
**看什麼**:`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}` × 100,顯示每個 Ollama provider endpoint 的 scrape 存活狀態。
|
||||
|
||||
| 顏色 | 意義 |
|
||||
|------|------|
|
||||
| 綠色 100% | Prometheus 探測正常,主機在線 |
|
||||
| 黃色 50% | 一台離線,另一台在線(容災中) |
|
||||
| 紅色 0% | 兩台全離線,高風險 |
|
||||
| 黃色 | 部分 endpoint 離線,系統應進入容災 |
|
||||
| 紅色 0% | Ollama provider pool 全離線,高風險 |
|
||||
|
||||
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名為 `ollama_111` / `ollama_188`。
|
||||
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名對齊 `ollama_gcp_a` / `ollama_gcp_b` / `ollama_local`。
|
||||
設定檔位於 `ops/monitoring/generated/prometheus-scrape-generated.yaml`。
|
||||
|
||||
---
|
||||
@@ -47,9 +47,10 @@ Dashboard 路徑:`Ollama 容災監控`(uid: `ollama-failover-p23`)
|
||||
|
||||
| 分布 | 意義 |
|
||||
|------|------|
|
||||
| ollama 佔 >90% | 正常,111 健康 |
|
||||
| gemini 佔多數 | 111 SLOW/DEGRADED/OFFLINE,容災中 |
|
||||
| ollama_188 出現 | Gemini 配額耗盡備援,或 111 和 Gemini 同時失敗 |
|
||||
| ollama / ollama_gcp_a 佔 >90% | 正常,GCP-A 健康 |
|
||||
| ollama_gcp_b 佔多數 | GCP-A SLOW/DEGRADED/OFFLINE,容災到 GCP-B |
|
||||
| ollama_local 出現 | GCP-A/B 均不可用,容災到 111 local |
|
||||
| gemini 佔多數 | Ollama provider pool 全部不可用,使用付費備援 |
|
||||
| 全部 nemotron/claude | 極端情況,所有主力 provider 失敗 |
|
||||
|
||||
---
|
||||
@@ -71,10 +72,10 @@ Dashboard 路徑:`Ollama 容災監控`(uid: `ollama-failover-p23`)
|
||||
|
||||
### `OllamaInstanceDown` — Ollama 主機離線
|
||||
|
||||
**觸發條件**:`up{job=~"ollama_111|ollama_188"} == 0` 持續 2 分鐘。
|
||||
**觸發條件**:`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0` 持續 2 分鐘。
|
||||
|
||||
**影響評估**:
|
||||
- 系統應已自動切至 Gemini(查 Panel 3 確認)
|
||||
- 系統應已依序切至 GCP-B / 111 local / Gemini(查 Panel 3 確認)
|
||||
- 查 Panel 4 是否有 Failover 計數上升
|
||||
|
||||
**排查步驟**:
|
||||
@@ -82,11 +83,9 @@ Dashboard 路徑:`Ollama 容災監控`(uid: `ollama-failover-p23`)
|
||||
```bash
|
||||
# 步驟 1:確認主機存活
|
||||
ping -c 3 192.168.0.111
|
||||
ping -c 3 192.168.0.188
|
||||
|
||||
# 步驟 2:SSH 進主機確認 ollama 服務狀態
|
||||
ssh wooo@192.168.0.111 'systemctl status ollama'
|
||||
ssh wooo@192.168.0.188 'systemctl status ollama'
|
||||
|
||||
# 步驟 3:查 ollama 最近的 journal log
|
||||
ssh wooo@192.168.0.111 'journalctl -u ollama -n 50 --no-pager'
|
||||
@@ -210,8 +209,9 @@ ssh wooo@192.168.0.111 'systemctl status ollama && nvidia-smi'
|
||||
|
||||
| Metric | 類型 | 狀態 | 說明 |
|
||||
|--------|------|------|------|
|
||||
| `up{job="ollama_111"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
|
||||
| `up{job="ollama_188"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
|
||||
| `up{job="ollama_gcp_a"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
|
||||
| `up{job="ollama_gcp_b"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
|
||||
| `up{job="ollama_local"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
|
||||
| `ollama_failover_triggered_total` | Counter | ✅ P2.3 補入 | failover 切換次數,labels: from_provider, to_provider |
|
||||
| `ollama_recovery_triggered_total` | Counter | ✅ P2.3 補入 | recovery 切回次數,labels: from_provider |
|
||||
| `ollama_health_status{host}` | Gauge | ✅ P2.3 補入 | 健康狀態 1=healthy, 0=not_healthy |
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
# ============================================================================
|
||||
# PATCH: 188 CPU-only Ollama 備援端點
|
||||
# 日期: 2026-04-25 (台北時區)
|
||||
# 負責人: ogt + Claude Sonnet 4.6
|
||||
# ADR 參考: plan_complete_v3.md P0.5
|
||||
# 診斷實測數據:
|
||||
# 主機: 192.168.0.188, Intel Xeon Silver 4214 @ 2.2GHz, 12 核, CPU-only
|
||||
# RAM: 62GB (used 14GB), Disk: 982GB (used 221GB)
|
||||
# GPU: 無
|
||||
# 現有模型: qwen2.5:7b-instruct (4.5GB), llama3.2:3b (1.9GB),
|
||||
# deepseek-r1:14b (8.5GB), nomic-embed-text (261MB)
|
||||
# 推理延遲實測: qwen2.5:7b-instruct → total=111s, eval_rate=0.09 token/s
|
||||
# llama3.2:3b → total=155s (cold start, 比 7b 更慢)
|
||||
# 目標 ~30s 無法達到 (CPU 推理硬上限 ~0.09 token/s)
|
||||
# 決策: qwen2.5:7b-instruct 已存在,設為備援 (111s 延遲,使用者需知情)
|
||||
# 連通性: 110 → 188:11434 ✅ 已驗證
|
||||
# ⚠️ 注意: 188 推理極慢(~111s),應只在 111 GPU Ollama 完全失效時啟用
|
||||
# 建議: 程式碼層應設 OLLAMA_FALLBACK_188_TIMEOUT_SEC = 150
|
||||
# ============================================================================
|
||||
#
|
||||
# 將以下兩行加入 /Users/ogt/awoooi/k8s/awoooi-prod/04-configmap.yaml
|
||||
# 建議位置: OLLAMA_URL 行 (第 20 行) 之後
|
||||
#
|
||||
# --- 新增內容 ---
|
||||
# 2026-04-25 ogt + Claude Sonnet 4.6: 188 CPU-only Ollama 備援 (plan_complete_v3 P0.5)
|
||||
# ⚠️ 188 推理延遲實測 ~111s (0.09 token/s, CPU-only Xeon 4214),僅作 111 完全失效時的降級備援
|
||||
# 模型已存在: qwen2.5:7b-instruct (4.5GB), 無需重拉
|
||||
OLLAMA_FALLBACK_188: "http://192.168.0.188:11434"
|
||||
OLLAMA_188_MODEL: "qwen2.5:7b-instruct"
|
||||
# --- 新增內容結束 ---
|
||||
#
|
||||
# 使用方式 (需用戶 review 後手動 apply):
|
||||
# kubectl -n awoooi-prod apply -f k8s/awoooi-prod/04-configmap.yaml
|
||||
# kubectl -n awoooi-prod rollout restart deployment/awoooi-api
|
||||
@@ -48,13 +48,16 @@ spec:
|
||||
# 正式域名 (必須 https)
|
||||
- name: NEXT_PUBLIC_API_URL
|
||||
value: "https://awoooi.wooo.work"
|
||||
# 2026-04-22 ogt: 移除前端硬碼 IP,改由 K8s 注入
|
||||
# 2026-06-03 P0-1: 避免前端 bundle 直接內建私網 IP,改為主機別名;無連線真值僅作 topology 呈現
|
||||
- name: NEXT_PUBLIC_HOST_IPS
|
||||
value: "192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188"
|
||||
# 僅供 topology 呈現,非連線真值。
|
||||
# 未設定/空值時,dashboard 前端應回退為空陣列,避免隱藏內網依賴。
|
||||
value: "devops,security,k3s,ai-web"
|
||||
- name: NEXT_PUBLIC_K8S_VIP_INFO
|
||||
value: "VIP 192.168.0.125 · kubectl :6443 · Web :32335 · API :32334"
|
||||
value: "K8S VIP topology (ops-only) · kubectl:6443 · web:32335 · api:32334"
|
||||
- name: SENTRY_HOST
|
||||
value: "http://192.168.0.110:9000"
|
||||
# 2026-06-03: 由可解析內部/公網 DNS 轉向,避免硬編碼 IP
|
||||
value: "https://sentry.awoooi.internal"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: awoooi-config
|
||||
|
||||
@@ -60,11 +60,6 @@ spec:
|
||||
env:
|
||||
- name: USE_AI_ROUTER
|
||||
value: "true"
|
||||
- name: ENABLE_NEMOTRON_COLLABORATION
|
||||
# 2026-04-15 ogt: 重新啟用 — asyncio.wait_for=120s,Ollama 已等待回應
|
||||
value: "true"
|
||||
- name: NEMOTRON_TIMEOUT_SECONDS
|
||||
value: "55"
|
||||
- name: TELEGRAM_ENABLE_POLLING
|
||||
value: "true"
|
||||
- name: OLLAMA_URL
|
||||
|
||||
@@ -37,13 +37,17 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: awoooi
|
||||
system: awoooi
|
||||
component: k3s-report
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: k3s-report
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
@@ -63,5 +67,7 @@ spec:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "128Mi"
|
||||
# 使用 API 的 ServiceAccount (需要 RBAC)
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this report only calls Prometheus and Telegram.
|
||||
# The old awoooi-api ServiceAccount does not exist, which prevented
|
||||
# Job pods from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -37,13 +37,17 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: awoooi
|
||||
system: awoooi
|
||||
component: weekly-report
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: weekly-report
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
@@ -63,5 +67,7 @@ spec:
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "256Mi"
|
||||
# 使用 API 的 ServiceAccount (需要 RBAC)
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this report only calls app services, Prometheus,
|
||||
# Git, and Telegram. The old awoooi-api ServiceAccount does not
|
||||
# exist, which prevented Job pods from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -27,18 +27,25 @@ spec:
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 2
|
||||
activeDeadlineSeconds: 300
|
||||
# 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
|
||||
# The script now fails if the API reports failed rows, so this longer
|
||||
# deadline does not hide partial vectorization.
|
||||
activeDeadlineSeconds: 1800
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: awoooi
|
||||
system: awoooi
|
||||
component: km-vectorize
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: km-vectorize
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- /app/scripts/cron_km_vectorize.py
|
||||
@@ -46,7 +53,9 @@ spec:
|
||||
- name: TZ
|
||||
value: "Asia/Taipei"
|
||||
- name: INTERNAL_API_URL
|
||||
value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
|
||||
# 2026-05-05 Codex: use the actual Service name; the old
|
||||
# awoooi-api DNS name does not exist in awoooi-prod.
|
||||
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
|
||||
resources:
|
||||
requests:
|
||||
cpu: "50m"
|
||||
@@ -54,4 +63,7 @@ spec:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "128Mi"
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this job only calls the internal API. The old
|
||||
# awoooi-api ServiceAccount does not exist, which prevented Job pods
|
||||
# from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -35,6 +35,7 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: awoooi
|
||||
system: awoooi
|
||||
component: backup-restore-test
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
@@ -39,7 +39,7 @@ resources:
|
||||
images:
|
||||
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/api
|
||||
newTag: 1cc9de5722eb2fca8bab080077f792fa02c5d5fb
|
||||
newTag: f1ef7ec3e295313af67d7acaf40d439585cb5270
|
||||
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/web
|
||||
newTag: 1cc9de5722eb2fca8bab080077f792fa02c5d5fb
|
||||
newTag: f1ef7ec3e295313af67d7acaf40d439585cb5270
|
||||
|
||||
@@ -63,10 +63,11 @@ spec:
|
||||
print(f"status={r.status_code} body={r.text[:200]}")
|
||||
asyncio.run(run())
|
||||
env:
|
||||
# 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
|
||||
# 改用 NodePort 直連 K3s worker node(同 K8s_API_SERVER_URL 解法)
|
||||
# 2026-05-05 Codex: call the in-cluster Service instead of a
|
||||
# fixed worker NodePort. After reboot, 121 can be unavailable
|
||||
# while the Service and VIP are already healthy.
|
||||
- name: INTERNAL_API_URL
|
||||
value: "http://192.168.0.121:32334"
|
||||
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
|
||||
- name: DRIFT_SCAN_NAMESPACES
|
||||
value: "awoooi-prod"
|
||||
resources:
|
||||
|
||||
@@ -88,7 +88,7 @@ spec:
|
||||
# -----------------------------------------------------------------
|
||||
- alert: NoAlertsReceived2Hours
|
||||
expr: |
|
||||
time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
|
||||
time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
@@ -15,6 +15,39 @@
|
||||
|
||||
groups:
|
||||
|
||||
# =========================================================================
|
||||
# Full-stack recovery scorecard recording rules
|
||||
# =========================================================================
|
||||
- name: full_stack_recovery_scorecard_rules
|
||||
interval: 60s
|
||||
rules:
|
||||
- record: awoooi_recovery_core_ready
|
||||
expr: |
|
||||
sum without(result) (
|
||||
awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
|
||||
)
|
||||
* on(host,scope) (
|
||||
awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
|
||||
)
|
||||
* on(host,scope) (
|
||||
awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
|
||||
)
|
||||
* on(host,scope) (
|
||||
(time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
|
||||
)
|
||||
|
||||
- record: awoooi_recovery_dr_offsite_ready
|
||||
expr: |
|
||||
max by(host) (
|
||||
awoooi_backup_offsite_configured{host="110"} == bool 1
|
||||
)
|
||||
* on(host) max by(host) (
|
||||
awoooi_backup_offsite_fresh{host="110"} == bool 1
|
||||
)
|
||||
* on(host) min by(host) (
|
||||
awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# 主機層告警 (host_alerts)
|
||||
# =========================================================================
|
||||
@@ -33,20 +66,22 @@ groups:
|
||||
description: "Node Exporter 無回應超過 1 分鐘"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
# 2026-05-05 ogt + Codex: keep this as early warning only.
|
||||
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
|
||||
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-188
|
||||
team: ops
|
||||
auto_repair: "true"
|
||||
auto_repair: "false"
|
||||
# MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
|
||||
mcp_provider: "ssh_host"
|
||||
host_type: "bare_metal"
|
||||
alert_category: "host_resource"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} CPU 高負載"
|
||||
description: "CPU 使用率超過 80%"
|
||||
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
|
||||
|
||||
- alert: HostLoadAverageSustainedHigh
|
||||
# 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
|
||||
@@ -165,7 +200,7 @@ groups:
|
||||
description: "過去 24 小時有備份失敗"
|
||||
|
||||
- alert: VeleroBackupNotRun
|
||||
expr: time() - velero_backup_last_successful_timestamp > 86400
|
||||
expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -175,7 +210,7 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Velero 超過 24 小時未成功備份"
|
||||
description: "最後一次成功備份超過 24 小時"
|
||||
description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
|
||||
|
||||
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
|
||||
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
|
||||
@@ -505,7 +540,7 @@ groups:
|
||||
description: "Sentry 錯誤可能無法正確處理"
|
||||
|
||||
- alert: NoAlertsReceived2Hours
|
||||
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
|
||||
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -665,7 +700,7 @@ groups:
|
||||
|
||||
- alert: DockerContainerMissingResourceLimit
|
||||
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1011,10 +1046,10 @@ groups:
|
||||
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
|
||||
# =========================================================================
|
||||
- name: awoooi_backup_restore
|
||||
interval: 1h
|
||||
interval: 1m
|
||||
rules:
|
||||
- alert: BackupRestoreTestFailed
|
||||
expr: awoooi_backup_restore_test_success == 0
|
||||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1023,11 +1058,37 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原 dry-run 測試失敗"
|
||||
description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
|
||||
runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
|
||||
description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。"
|
||||
runbook: "先找最新 Completed Velero backup,再執行 restore dry-run;禁止在 production namespace 做真還原"
|
||||
|
||||
- alert: BackupRestoreTestMissing
|
||||
expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原 dry-run 監控指標缺失"
|
||||
description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present;110 backup health exporter 或 120 kubectl 查詢可能失效。"
|
||||
runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
|
||||
|
||||
- alert: BackupRestoreTestCronMissing
|
||||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原 dry-run CronJob 缺失"
|
||||
description: "velero namespace 找不到 backup-restore-test CronJob;備份可還原性沒有定期驗證。"
|
||||
runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
|
||||
|
||||
- alert: BackupRestoreTestStale
|
||||
expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
|
||||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1036,9 +1097,375 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原測試超過 8 天未執行"
|
||||
description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
|
||||
description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
|
||||
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
|
||||
|
||||
# =========================================================================
|
||||
# Host / service / config backup health
|
||||
# =========================================================================
|
||||
- name: full_stack_backup_health_alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- alert: BackupHealthMonitorMissing110
|
||||
expr: absent(awoooi_backup_health_monitor_up{host="110"})
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-health-monitor
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份健康指標缺失"
|
||||
description: "110 沒有輸出 backup_health.prom,無法確認資料庫、設定檔與服務備份是否新鮮。"
|
||||
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
|
||||
|
||||
- alert: BackupHealthMonitorMissing188
|
||||
expr: absent(awoooi_backup_health_monitor_up{host="188"})
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-health-monitor
|
||||
host: "188"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "188 備份健康指標缺失"
|
||||
description: "188 沒有輸出 backup_health.prom,無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
|
||||
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
|
||||
|
||||
- alert: BackupHealthMonitorStale
|
||||
expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-health-monitor
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
|
||||
description: "backup health textfile exporter stale,備份狀態不可觀測。"
|
||||
runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
|
||||
|
||||
- alert: BackupExpectedJobMissing
|
||||
expr: awoooi_backup_job_configured{host=~"110|188"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-backup
|
||||
component: backup-cron
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
|
||||
description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
|
||||
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron,先 dry-run 再執行"
|
||||
|
||||
- alert: BackupScheduleDuplicateActiveEntries
|
||||
expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-cron
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份 crontab 有重複 active entries"
|
||||
description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry;可能造成 offsite sync、verifier 或 status job 重複執行。"
|
||||
runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry,不要刪除未理解的備份排程。"
|
||||
|
||||
- alert: BackupScheduleSingletonMismatch
|
||||
expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-cron
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
|
||||
description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry;目前單一入口檢查未通過(singular_entry_ok={{ $value }}),可能造成排程缺失或重複執行。"
|
||||
runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
|
||||
|
||||
- alert: BackupScriptMissing
|
||||
expr: awoooi_backup_script_present{host=~"110|188"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-backup
|
||||
component: backup-script
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
|
||||
description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
|
||||
runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
|
||||
|
||||
- alert: BackupJobStale
|
||||
expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-backup
|
||||
component: backup-freshness
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
|
||||
description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
|
||||
runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
|
||||
|
||||
- alert: BackupAggregateRunFailed
|
||||
expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-all
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
|
||||
description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
|
||||
runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log,修正後執行 /backup/scripts/backup-all.sh"
|
||||
|
||||
- alert: BackupConfigCapturePartial
|
||||
expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-config-capture
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
|
||||
description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }};source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
|
||||
runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh,確認 awoooi_backup_config_capture_ok 回到 1,最後補跑 Google Drive/rclone offsite sync。"
|
||||
|
||||
- alert: BackupConfigCaptureStatusStale
|
||||
expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-config-capture
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
|
||||
description: "backup-configs.sh 沒有新鮮的 capture status;無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
|
||||
runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py,執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
|
||||
|
||||
- alert: BackupIntegrityCheckMissingOrFailed
|
||||
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-backup
|
||||
component: backup-integrity
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份倉庫完整性檢查缺失或失敗"
|
||||
description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
|
||||
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log;禁止刪 repo 或 prune 直到確認原因"
|
||||
|
||||
- alert: BackupRestoreDrillMissingOrFailed
|
||||
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-restore-drill
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份抽樣還原演練缺失或失敗"
|
||||
description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
|
||||
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
|
||||
|
||||
- alert: BackupOffsiteCopyNotConfigured
|
||||
expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-offsite
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 尚未配置離機備份 provider"
|
||||
description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
|
||||
runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote,產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
|
||||
|
||||
- alert: BackupOffsiteCopyStale
|
||||
expr: |
|
||||
(
|
||||
(sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
|
||||
and
|
||||
(sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
|
||||
)
|
||||
and
|
||||
(
|
||||
(sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
|
||||
or
|
||||
((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
|
||||
)
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-offsite
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 離機備份超過 48 小時未成功"
|
||||
description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
|
||||
runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`;full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
|
||||
|
||||
- alert: BackupRetentionPolicyNotLatestOnly
|
||||
expr: |
|
||||
absent(awoooi_backup_retention_latest_only{host="110"})
|
||||
or
|
||||
awoooi_backup_retention_latest_only{host="110"} != 1
|
||||
or
|
||||
absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
|
||||
or
|
||||
awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-retention
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份保留策略不是 latest-only"
|
||||
description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1,Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
|
||||
runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1,刷新 backup-health textfile;必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
|
||||
|
||||
- alert: BackupSnapshotRetentionExceeded
|
||||
expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-retention
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
|
||||
description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshot;latest-only 策略要求每個 repo 全域只保留最新 1 份。"
|
||||
runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
|
||||
|
||||
- alert: BackupOffsiteFullVerifyFailed
|
||||
expr: |
|
||||
awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
|
||||
unless on(host, provider)
|
||||
(awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-offsite
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 Google Drive full sync 完成但遠端驗證未通過"
|
||||
description: "full offsite marker 已 fresh,但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
|
||||
runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
|
||||
|
||||
- alert: BackupOffsiteRemoteSnapshotRetentionExceeded
|
||||
expr: |
|
||||
(awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
|
||||
and on(host, provider)
|
||||
(awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: backup-retention
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
|
||||
description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot;latest-only 策略要求遠端也只保留最新一份。"
|
||||
runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
|
||||
|
||||
- alert: BackupCredentialEscrowEvidenceMissing
|
||||
expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-backup
|
||||
component: credential-escrow
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
|
||||
description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
|
||||
runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
|
||||
|
||||
# =========================================================================
|
||||
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
|
||||
# =========================================================================
|
||||
@@ -1321,3 +1748,284 @@ groups:
|
||||
summary: "Prometheus ({{ $labels.instance }}) 停擺"
|
||||
description: "Prometheus 自己停擺 → 所有其他告警失效"
|
||||
runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
|
||||
|
||||
# =========================================================================
|
||||
# Full-stack cold-start recovery gate
|
||||
# =========================================================================
|
||||
- name: cold_start_recovery_alerts
|
||||
rules:
|
||||
- alert: PrometheusRuleDriftGuardFailed
|
||||
expr: |
|
||||
absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
|
||||
or
|
||||
(time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
|
||||
or
|
||||
(awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
|
||||
or
|
||||
(awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: systemd-110
|
||||
component: prometheus-rule-drift-guard
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Prometheus 規則漂移防護失效"
|
||||
description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。"
|
||||
runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard,等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
|
||||
|
||||
- alert: PrometheusRuleDriftAutoRepaired
|
||||
expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-110
|
||||
component: prometheus-rule-drift-guard
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Prometheus 規則漂移已被自動修復"
|
||||
description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。"
|
||||
runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。"
|
||||
|
||||
- alert: ColdStartMonitorMissing
|
||||
expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-110
|
||||
component: cold-start-monitor
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Cold-start monitor textfile metric missing"
|
||||
description: "110 沒有輸出 awoooi_cold_start_monitor_up;重開機恢復 gate 目前不可觀測。"
|
||||
runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh,確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
|
||||
|
||||
- alert: ColdStartMonitorStale
|
||||
expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-110
|
||||
component: cold-start-monitor
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Cold-start monitor stale"
|
||||
description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。"
|
||||
runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
|
||||
|
||||
- alert: ColdStartRecoveryBlocked
|
||||
expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: full-stack
|
||||
component: cold-start-gate
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Full-stack cold-start recovery BLOCKED"
|
||||
description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only,先處理第一個 blocked gate。"
|
||||
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log;依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
|
||||
|
||||
- alert: K3sNodeFilesystemErrorGateBlocked
|
||||
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: k3s
|
||||
component: node-filesystem
|
||||
host: "120"
|
||||
target_host: "120"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
|
||||
description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200,也不可宣告下一次重開機安全。"
|
||||
runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck,禁止 online fsck。"
|
||||
|
||||
- alert: ColdStartHost120Unreachable
|
||||
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host
|
||||
component: host-reachability
|
||||
host: "120"
|
||||
target_host: "120"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "120 主機不可達,Full-stack cold-start 已阻擋"
|
||||
description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s,不能宣告所有主機重開機恢復完成。"
|
||||
runbook: "查看 120 console。若停在 initramfs/manual fsck,先對 root LV 做離線 fsck;若主機關機或網卡異常,先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
|
||||
|
||||
- alert: ColdStartRecoveryDegraded
|
||||
expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: full-stack
|
||||
component: cold-start-gate
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Full-stack cold-start recovery DEGRADED"
|
||||
description: "cold-start gate 有 {{ $value }} 個 WARN gate;核心可用但不應放行 runner/CD/AI auto-repair full execution。"
|
||||
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log,修到 PASS/WARN/BLOCKED = green"
|
||||
|
||||
- alert: ColdStartLastGreenTooOld
|
||||
expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: full-stack
|
||||
component: cold-start-gate
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Full-stack cold-start gate has not been GREEN recently"
|
||||
description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。"
|
||||
runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
|
||||
|
||||
# =========================================================================
|
||||
# Host storage health / dirty reboot evidence
|
||||
# =========================================================================
|
||||
- name: host_storage_health_alerts
|
||||
rules:
|
||||
- alert: Host110StorageHealthMonitorMissing
|
||||
expr: absent(awoooi_host_storage_monitor_up{host="110"})
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-110
|
||||
component: storage-health-monitor
|
||||
host: "110"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "110 storage health textfile metric missing"
|
||||
description: "110 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
|
||||
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
|
||||
|
||||
- alert: Host188StorageHealthMonitorMissing
|
||||
expr: absent(awoooi_host_storage_monitor_up{host="188"})
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-188
|
||||
component: storage-health-monitor
|
||||
host: "188"
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "188 storage health textfile metric missing"
|
||||
description: "188 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
|
||||
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
|
||||
|
||||
- alert: HostStorageHealthMonitorStale
|
||||
expr: time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-storage
|
||||
component: storage-health-monitor
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} storage health textfile stale"
|
||||
description: "storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。"
|
||||
runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
|
||||
|
||||
- alert: HostRootFilesystemReadOnly
|
||||
expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-storage
|
||||
component: root-filesystem
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
|
||||
description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。"
|
||||
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16:保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck"
|
||||
|
||||
- alert: HostCurrentBootStorageErrorsDetected
|
||||
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: host-storage
|
||||
component: kernel-storage
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
|
||||
description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。"
|
||||
runbook: "先執行 read-only 診斷:journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理"
|
||||
|
||||
- alert: HostPreviousBootStorageErrorsDetected
|
||||
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-storage
|
||||
component: dirty-reboot-evidence
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
|
||||
description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
|
||||
runbook: "把證據寫入 docs/LOGBOOK.md,確認 full-stack cold-start gate 與 P3 gate;下一次維護窗補 offline fsck/SMART/RAID 檢查"
|
||||
|
||||
- alert: HostFsckLogErrorsDetected
|
||||
expr: sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: host-storage
|
||||
component: fsck-log
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據"
|
||||
description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。"
|
||||
runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項"
|
||||
|
||||
@@ -33,8 +33,10 @@ groups:
|
||||
description: "Node Exporter 無回應超過 1 分鐘"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
# 2026-05-05 ogt + Codex: keep this as early warning only.
|
||||
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
|
||||
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-188
|
||||
@@ -46,7 +48,7 @@ groups:
|
||||
alert_category: "host_resource"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} CPU 高負載"
|
||||
description: "CPU 使用率超過 80%"
|
||||
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
|
||||
runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain"
|
||||
@@ -671,7 +673,7 @@ groups:
|
||||
|
||||
- alert: DockerContainerMissingResourceLimit
|
||||
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
@@ -26,8 +26,18 @@
|
||||
- labels:
|
||||
criticality: P0
|
||||
owner: ai-team
|
||||
service: ollama
|
||||
url: http://192.168.0.188:11434/api/tags
|
||||
service: ollama-gcp-a
|
||||
url: http://192.168.0.110:11435/api/tags
|
||||
- labels:
|
||||
criticality: P0
|
||||
owner: ai-team
|
||||
service: ollama-gcp-b
|
||||
url: http://192.168.0.110:11436/api/tags
|
||||
- labels:
|
||||
criticality: P0
|
||||
owner: ai-team
|
||||
service: ollama-local
|
||||
url: http://192.168.0.110:11437/api/tags
|
||||
- labels:
|
||||
criticality: P0
|
||||
owner: ai-team
|
||||
|
||||
@@ -92,7 +92,9 @@ scrape_configs:
|
||||
service: ollama
|
||||
type: docker
|
||||
targets:
|
||||
- 192.168.0.188:11434
|
||||
- 192.168.0.110:11435
|
||||
- 192.168.0.110:11436
|
||||
- 192.168.0.110:11437
|
||||
- job_name: openclaw
|
||||
static_configs:
|
||||
- labels:
|
||||
|
||||
@@ -82,11 +82,11 @@
|
||||
"textMode": "auto"
|
||||
},
|
||||
"title": "Ollama 可用性",
|
||||
"description": "up{job=~\"ollama_111|ollama_188\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_111 / ollama_188",
|
||||
"description": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_gcp_a / ollama_gcp_b / ollama_local",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "up{job=~\"ollama_111|ollama_188\"} * 100",
|
||||
"expr": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} * 100",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
}
|
||||
@@ -188,7 +188,7 @@
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"title": "AI Provider 路由分布",
|
||||
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama 佔大多數\n- failover 中: gemini / ollama_188 比例上升\n- 全走 gemini = 111 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
|
||||
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama / ollama_gcp_a 佔大多數\n- failover 中: ollama_gcp_b / ollama_local / gemini 比例上升\n- 全走 gemini = Ollama provider pool 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
# 部署方式: 手動合併至 alerts-unified.yml,或 scripts/ops/deploy-alerts.sh 支援多檔時直接引用
|
||||
#
|
||||
# 標籤規範 (對齊 alerts-unified.yml):
|
||||
# layer: systemd-188 | docker-188 (Ollama 跑在 188 主機)
|
||||
# layer: ai-provider
|
||||
# team: ai
|
||||
# auto_repair: "true" | "false"
|
||||
#
|
||||
@@ -28,16 +28,16 @@ groups:
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# 🔴 [ACTIVE] Ollama 主機離線
|
||||
# metric: up{job=~"ollama_111|ollama_188"}
|
||||
# 前置條件: Prometheus scrape job 命名為 ollama_111 / ollama_188
|
||||
# metric: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}
|
||||
# 前置條件: Prometheus scrape job 命名對齊 ADR-110 provider pool
|
||||
# (設定位於 ops/monitoring/generated/prometheus-scrape-generated.yaml)
|
||||
# -----------------------------------------------------------------------
|
||||
- alert: OllamaInstanceDown
|
||||
expr: up{job=~"ollama_111|ollama_188"} == 0
|
||||
expr: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: systemd-188
|
||||
layer: ai-provider
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
alert_category: "ollama_failover"
|
||||
@@ -57,7 +57,7 @@ groups:
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: systemd-188
|
||||
layer: ai-provider
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
alert_category: "ollama_failover"
|
||||
|
||||
@@ -19,6 +19,7 @@ Exit Codes:
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -29,7 +30,7 @@ import httpx
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
OLLAMA_URL = "http://192.168.0.188:11434/api/generate"
|
||||
OLLAMA_URL = os.getenv("OLLAMA_GENERATE_URL", "http://192.168.0.111:11434/api/generate")
|
||||
MODEL = "llama3.2:8b"
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
RULES_FILE = PROJECT_ROOT / ".awoooi-agent-rules.md"
|
||||
|
||||
@@ -18,17 +18,21 @@ import httpx
|
||||
async def main() -> int:
|
||||
api_base = os.environ.get(
|
||||
"INTERNAL_API_URL",
|
||||
"http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
|
||||
"http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
|
||||
)
|
||||
url = f"{api_base}/api/v1/knowledge/embed-all"
|
||||
|
||||
async with httpx.AsyncClient(timeout=120) as client:
|
||||
async with httpx.AsyncClient(timeout=1800) as client:
|
||||
try:
|
||||
resp = await client.post(url)
|
||||
print(f"embed-all: {resp.status_code} {resp.text[:200]}")
|
||||
if resp.status_code >= 400:
|
||||
print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
|
||||
return 1
|
||||
result = resp.json()
|
||||
if int(result.get("failed", 0)) > 0:
|
||||
print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
|
||||
return 1
|
||||
return 0
|
||||
except httpx.RequestError as exc:
|
||||
print(f"ERROR: request failed — {exc}", file=sys.stderr)
|
||||
|
||||
@@ -62,7 +62,6 @@ check_url "ArgoCD (121)" "https://192.168.0.121:30443"
|
||||
echo ""
|
||||
echo "--- AI 推理層 ---"
|
||||
check_url "Ollama 111 GPU" "http://192.168.0.111:11434/api/tags"
|
||||
check_url "Ollama 188 Hub" "http://192.168.0.188:11434/api/tags"
|
||||
|
||||
echo ""
|
||||
echo "--- 觀測層 ---"
|
||||
|
||||
398
scripts/reboot-recovery/full-stack-cold-start-check.sh
Executable file
398
scripts/reboot-recovery/full-stack-cold-start-check.sh
Executable file
@@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env bash
|
||||
# AWOOOI full-stack cold-start readiness check.
|
||||
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.
|
||||
|
||||
# Unset variables are errors and pipelines report any failing stage.
# `-e` is not enabled, so a failing probe does not abort the run.
set -u -o pipefail

# Options passed to every ssh invocation made through ssh_cmd.
SSH_OPTS=(
  -o BatchMode=yes
  -o ConnectTimeout=6
)

# 1 once --send-alert-test is seen; 0 keeps the Alertmanager POST disabled.
SEND_ALERT_TEST=0

# Command-line parsing: one optional flag plus help. Anything else is
# rejected on stderr with exit status 64.
for arg; do
  if [[ "$arg" == "--send-alert-test" ]]; then
    SEND_ALERT_TEST=1
  elif [[ "$arg" == "-h" || "$arg" == "--help" ]]; then
    cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]

Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
    exit 0
  else
    printf '%s\n' "Unknown argument: $arg" >&2
    exit 64
  fi
done

# ANSI colour escapes used by the log helpers; NC resets the colour.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# Running tallies, incremented by ok / warn / fail and reported by summary.
PASS=0 WARN=0 FAIL=0
|
||||
|
||||
#######################################
# Print a blank line followed by a coloured "=== <title> ===" banner.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title
# Outputs:   banner on stdout
#######################################
log_section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "${BLUE}" "${title}" "${NC}"
}
|
||||
|
||||
#######################################
# Report a passed gate and count it.
# Globals:   GREEN, NC (read); PASS (incremented)
# Arguments: $1 - message
# Outputs:   "OK <message>" on stdout
# Returns:   0 (the final statement is a plain assignment), which is what
#            makes `cond && ok ... || fail ...` chains safe for callers.
#######################################
ok() {
  local msg="$1"
  printf '%sOK%s %s\n' "${GREEN}" "${NC}" "${msg}"
  PASS=$(( PASS + 1 ))
}
|
||||
|
||||
#######################################
# Report a degraded (non-blocking) gate and count it.
# Globals:   YELLOW, NC (read); WARN (incremented)
# Arguments: $1 - message
# Outputs:   "WARN <message>" on stdout
# Returns:   0 (the final statement is a plain assignment)
#######################################
warn() {
  local msg="$1"
  printf '%sWARN%s %s\n' "${YELLOW}" "${NC}" "${msg}"
  WARN=$(( WARN + 1 ))
}
|
||||
|
||||
#######################################
# Report a blocking gate failure and count it.
# Globals:   RED, NC (read); FAIL (incremented)
# Arguments: $1 - message
# Outputs:   "BLOCKED <message>" on stdout
# Returns:   0 (the final statement is a plain assignment)
#######################################
fail() {
  local msg="$1"
  printf '%sBLOCKED%s %s\n' "${RED}" "${NC}" "${msg}"
  FAIL=$(( FAIL + 1 ))
}
|
||||
|
||||
#######################################
# Run a local command, record the verdict, then replay its combined output.
#
# The output is captured in a private mktemp file instead of a fixed /tmp
# path, so concurrent runs cannot overwrite each other's capture and nothing
# is left behind afterwards.
#
# Arguments: $1 - label for the ok/fail line; remaining args - the command
# Outputs:   verdict line followed by the command's stdout+stderr
# Returns:   0 when the command succeeded, 1 otherwise (including when the
#            capture file could not be created)
#######################################
run_local() {
  local label="$1"
  shift

  local capture status=0
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    echo "mktemp failed; command not run" >&2
    return 1
  fi

  # Capture the status without relying on `set -e`, which this script
  # does not enable.
  "$@" >"$capture" 2>&1 || status=$?

  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"

  [ "$status" -eq 0 ]
}
|
||||
|
||||
#######################################
# Run a command string on a remote host over ssh.
#
# When REMOTE_SUDO_PASSWORD is set and non-empty, a shell-quoted
# `REMOTE_SUDO_PASSWORD=<value> ` assignment is prepended to the command so
# the remote script can read it.
# NOTE(review): this places the password inside the remote command string;
# whether that is exposed (e.g. in remote process listings) should be
# confirmed before relying on it.
#
# Globals:   SSH_OPTS (read), REMOTE_SUDO_PASSWORD (read, optional)
# Arguments: $1 - user@host, $2 - remote command or script text
# Returns:   exit status of ssh
#######################################
ssh_cmd() {
  local user_host="$1"
  local cmd="$2"
  local remote_cmd="$cmd"

  if [[ -n "${REMOTE_SUDO_PASSWORD:-}" ]]; then
    printf -v remote_cmd 'REMOTE_SUDO_PASSWORD=%q %s' "$REMOTE_SUDO_PASSWORD" "$cmd"
  fi

  ssh "${SSH_OPTS[@]}" "$user_host" "$remote_cmd"
}
|
||||
|
||||
#######################################
# Fetch a URL with curl (5 second limit) and print only the HTTP status code.
# Arguments: $1 - URL
# Outputs:   the status code on stdout, or "000" when curl printed nothing
# Returns:   0
#######################################
probe_http_code() {
  local target="$1"
  local status_code=""

  # curl failures are tolerated; the fallback below covers empty output.
  status_code="$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$target" 2>/dev/null)" || true

  printf '%s\n' "${status_code:-000}"
}
|
||||
|
||||
#######################################
# Check whether a TCP port accepts connections, using nc in scan mode.
#
# `nc -G 3` is tried first; if that invocation fails for any reason, the
# probe is retried with `nc -w 3`.
# NOTE(review): presumably the two forms cover different nc flavours
# (BSD/macOS vs. Linux) - confirm.
#
# Arguments: $1 - host, $2 - port
# Returns:   0 when either nc invocation succeeds, non-zero otherwise
#######################################
probe_tcp() {
  local host="$1" port="$2"

  if nc -G 3 -z "$host" "$port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}
|
||||
|
||||
#######################################
# Print the report title, the current local timestamp, and the host scope.
# Outputs: three lines on stdout
#######################################
print_header() {
  printf '%s\n' "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  printf '%s\n' "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}
|
||||
|
||||
#######################################
# P0 network gate: ping and SSH-port reachability for hosts
# 192.168.0.{110,120,121,188}, followed by the matching ARP rows.
#
# `ping -W` takes seconds on Linux (iputils) but milliseconds on macOS/BSD.
# The previous hard-coded `-W 2` therefore waited only 2 ms on macOS and
# could report healthy hosts as BLOCKED. The wait value is now chosen per
# platform so both mean "2 seconds".
#
# Outputs: one ok/fail line per probe, then ARP rows (or a warning)
#######################################
check_network() {
  log_section "P0-NETWORK"

  local host addr ping_wait
  case "$(uname -s 2>/dev/null)" in
    Darwin | *BSD) ping_wait=2000 ;; # milliseconds
    *) ping_wait=2 ;;                # seconds
  esac

  for host in 110 120 121 188; do
    addr="192.168.0.$host"

    if ping -c 1 -W "$ping_wait" "$addr" >/dev/null 2>&1; then
      ok "ping $addr"
    else
      fail "ping $addr"
    fi

    if probe_tcp "$addr" 22; then
      ok "ssh port ${addr}:22"
    else
      fail "ssh port ${addr}:22"
    fi
  done

  # Matching ARP rows are printed as evidence. The warning fires when the
  # pipeline fails (no matching row, or arp itself failing under pipefail).
  arp -an | grep -E '192\.168\.0\.(110|120|121|188)' || warn "no ARP rows printed for one or more hosts"
}
|
||||
|
||||
#######################################
# P0 gate for host 188: runs one remote script over ssh that prints tagged
# status lines (HOST, MEM, SYSTEMD, PG, REDIS, PORT5432, SIGNOZ_CODE,
# MOMO_HEALTH_CODE, DOCKER ...), echoes that report, then turns individual
# tags into ok / warn / fail verdicts.
#
# Only a closed PostgreSQL port, PostgreSQL not accepting connections, or an
# ssh failure is counted as BLOCKED; everything else is a warning.
#######################################
check_188() {
  log_section "P0-188-DATA"

  local remote_script='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  '

  local report
  report=$(ssh_cmd "ollama@192.168.0.188" "$remote_script" 2>&1) || {
    fail "ssh 188 read-only check"
    echo "$report"
    return
  }
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi

  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi

  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi

  # Inverted check: a match here is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi

  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi

  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
|
||||
|
||||
#######################################
# P0 gate for host 110: runs one remote script over ssh that prints tagged
# status lines (HOST, MEM, DOCKER_SYSTEMD, HARBOR_CODE, GITEA_CODE,
# PROM_CODE, AM_CODE, SENTRY_CODE, RUNNER <unit> ..., DOCKER ...), echoes the
# report, then converts individual tags into verdicts.
#
# Only an unhealthy Harbor /v2 response or an ssh failure is counted as
# BLOCKED; the remaining checks produce warnings.
#######################################
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"

  local remote_script='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  '

  local report
  report=$(ssh_cmd "wooo@192.168.0.110" "$remote_script" 2>&1) || {
    fail "ssh 110 read-only check"
    echo "$report"
    return
  }
  echo "$report"

  # 401 is accepted for Harbor as well as 200.
  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi

  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi

  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi

  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi

  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi

  # Passes as soon as any RUNNER line reports WatchdogUSec=0.
  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi

  # Inverted check: a match here is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
|
||||
|
||||
#######################################
# P1 gate for K3s, run on host 120: prints host info, reachability of
# 188:5432 from 120, systemd unit states, `kubectl get nodes`, the
# awoooi-prod pods, and any interface line carrying 192.168.0.125.
#
# The remote kcmd helper feeds REMOTE_SUDO_PASSWORD to `sudo -S` when it is
# set; otherwise it tries `sudo -n kubectl` and then plain `kubectl`.
#
# When the remote report contains no " Ready " text, a local
# `kubectl get nodes -o wide` is attempted and its output is also considered
# for the Ready verdict.
#######################################
check_k3s() {
  log_section "P1-K3S"

  local remote_script='
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  '

  local report
  report=$(ssh_cmd "wooo@192.168.0.120" "$remote_script" 2>&1) || {
    fail "ssh 120 k3s read-only check"
    echo "$report"
    return
  }
  echo "$report"

  # Stays empty unless the remote report lacked a Ready node line.
  local fallback=""
  if ! grep -q " Ready " <<<"$report"; then
    fallback=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [[ -n "$fallback" ]]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$fallback"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$report"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi

  if grep -q " Ready " <<<"$report$fallback"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi

  if grep -q "192.168.0.125" <<<"$report"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
|
||||
|
||||
#######################################
# P2 gate: AWOOOI API/Web reachability through 192.168.0.125 and, only when
# SEND_ALERT_TEST is 1, a synthetic Alertmanager webhook POST.
#
# The HTTP probes run on host 120 over ssh; if that ssh call fails they are
# repeated from the local machine via probe_http_code.
#
# Changes from the previous version of the alert test:
#   - the response body goes to /dev/null instead of a fixed
#     /tmp/awoooi-alertchain.out on the remote host (that file was never
#     read back, and the script states it does not write remote state);
#   - a failing curl no longer yields "000000" (curl's own "000" followed by
#     an extra echoed "000"), and an ssh failure is normalised to "000"
#     instead of an empty code.
#
# Globals: SEND_ALERT_TEST (read)
#######################################
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out

  if out=$(ssh_cmd "wooo@192.168.0.120" '
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  ' 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    # ssh to 120 failed: probe from the local machine instead.
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v out 'API_CODE %s\nWEB_CODE %s' "$api_code" "$web_code"
  fi

  echo "$out"

  if [[ "$api_code" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi

  if [[ "$web_code" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    # The '"'"' sequences embed literal single quotes for the remote shell.
    alert_code=$(ssh_cmd "wooo@192.168.0.120" 'curl -s -o /dev/null -w "%{http_code}" --max-time 8 \
      -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
      -H '"'"'Content-Type: application/json'"'"' \
      -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || true')
    alert_code="${alert_code:-000}"

    echo "ALERTCHAIN_CODE $alert_code"
    if [[ "$alert_code" =~ ^2 ]]; then
      ok "Alertmanager webhook endpoint accepts POST"
    else
      warn "Alertmanager webhook E2E not confirmed"
    fi
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}
|
||||
|
||||
#######################################
# P2 gate: probe the four public HTTPS routes from the local machine, print
# each status code under its tag, then report a verdict per route.
#
# All probes run first, then all tag lines are printed, then all verdicts.
# A code starting with 2 or 3 counts as reachable; anything else is a warning.
#######################################
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"

  local -a tags=(
    "AWOOOI_PUBLIC_API_CODE"
    "AWOOOI_PUBLIC_WEB_CODE"
    "MOMO_PUBLIC_CODE"
    "MOMO_PUBLIC_HEALTH_CODE"
  )
  local -a urls=(
    "https://awoooi.wooo.work/api/v1/health"
    "https://awoooi.wooo.work/"
    "https://mo.wooo.work/"
    "https://mo.wooo.work/health"
  )
  local -a names=(
    "AWOOOI public API"
    "AWOOOI public web"
    "momo public route"
    "momo public health"
  )
  local -a codes=()
  local i

  for i in "${!urls[@]}"; do
    codes[i]=$(probe_http_code "${urls[i]}")
  done

  for i in "${!tags[@]}"; do
    echo "${tags[i]} ${codes[i]}"
  done

  for i in "${!names[@]}"; do
    if [[ "${codes[i]}" =~ ^[23] ]]; then
      ok "${names[i]} reachable"
    else
      warn "${names[i]} not confirmed"
    fi
  done
}
|
||||
|
||||
#######################################
# P2 gate for scheduled work on hosts 188, 110, 120 and 121.
#
# Each host gets one remote script over ssh that prints tagged lines; the
# report is echoed and individual tags are turned into ok / warn verdicts.
# This gate never produces BLOCKED: an unreachable host is a warning.
#
# Fix over the previous version: the awk threshold checks used
# `/tag/ {...; exit !(cond)}` with no END rule, so awk exited 0 when NO line
# matched. A missing textfile ("... missing"), an absent BACKUP_110_AGE
# metric, or a CRONJOB_COUNT line that was never printed (e.g. kubectl or
# the JSON parse failing) was therefore reported as OK. Every awk check now
# records its verdict in a variable and fails in END unless a matching line
# was seen and satisfied the condition.
#
# The failed-Jobs counter is a single-line Python expression so it no longer
# depends on indentation inside the quoted remote script; it prints the same
# "FAILED_JOBS <n>" line.
#
# NOTE(review): BAD_PODS is printed by awk's END rule even when the kubectl
# side of the remote pipe fails, so "BAD_PODS 0" alone does not prove that
# kubectl worked - confirm whether that needs tightening.
#######################################
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # ---- 188: cron, node_exporter textfile ages, backup age, momo scheduler
  if out=$(ssh_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
  ' 2>&1); then
    echo "$out"

    if grep -q "CRON_188 active" <<<"$out"; then
      ok "188 cron active"
    else
      warn "188 cron not confirmed"
    fi

    if awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); pass = (a[2] < 90000); exit} END {exit !pass}' <<<"$out"; then
      ok "188 backup textfile fresh enough"
    else
      warn "188 backup textfile stale or missing"
    fi

    if awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); pass = (a[2] < 300); exit} END {exit !pass}' <<<"$out"; then
      ok "188 docker restart exporter fresh"
    else
      warn "188 docker restart exporter stale"
    fi

    if awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); pass = (a[2] < 300); exit} END {exit !pass}' <<<"$out"; then
      ok "188 docker stats exporter fresh"
    else
      warn "188 docker stats exporter stale"
    fi

    if awk '/BACKUP_110_AGE / {pass = ($2 < 90000); exit} END {exit !pass}' <<<"$out"; then
      ok "188 backup-from-110 success within 25h"
    else
      warn "188 backup-from-110 success not confirmed"
    fi

    if awk '/SCHEDULER_REGISTERED / {pass = ($2 > 0); exit} END {exit !pass}' <<<"$out"; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration not confirmed"
    fi
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # ---- 110: cron, failed units, stale startup units, textfile ages
  if out=$(ssh_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
  ' 2>&1); then
    echo "$out"

    if grep -q "CRON_110 active" <<<"$out"; then
      ok "110 cron active"
    else
      warn "110 cron not confirmed"
    fi

    if grep -q "FAILED_UNITS_110 0" <<<"$out"; then
      ok "110 systemd has no failed units"
    else
      warn "110 systemd failed units remain"
    fi

    if grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale momo startup unit disabled"
    else
      warn "110 stale momo startup unit not disabled"
    fi

    if grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale staggered startup unit disabled"
    else
      warn "110 stale staggered startup unit not disabled"
    fi

    if awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); pass = (a[2] < 300); exit} END {exit !pass}' <<<"$out"; then
      ok "110 docker stats exporter fresh"
    else
      warn "110 docker stats exporter stale"
    fi

    if awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); pass = (a[2] < 300); exit} END {exit !pass}' <<<"$out"; then
      ok "110 systemd units exporter fresh"
    else
      warn "110 systemd units exporter stale"
    fi
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # ---- 120: cron plus Kubernetes CronJobs / Jobs / pods in awoooi-prod
  if out=$(ssh_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); print(\"FAILED_JOBS\", sum(1 for j in d.get(\"items\", []) if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in (j.get(\"status\",{}).get(\"conditions\",[]) or []))))"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    echo "$out"

    if grep -q "CRON_120 active" <<<"$out"; then
      ok "120 cron active"
    else
      warn "120 cron not confirmed"
    fi

    if awk '/CRONJOB_COUNT / {pass = ($2 >= 4); exit} END {exit !pass}' <<<"$out"; then
      ok "K8s AWOOOI CronJobs present"
    else
      warn "K8s AWOOOI CronJobs missing"
    fi

    if grep -q "CRONJOB_SUSPENDED 0" <<<"$out"; then
      ok "K8s AWOOOI CronJobs unsuspended"
    else
      warn "K8s AWOOOI CronJob suspended"
    fi

    if grep -q "FAILED_JOBS 0" <<<"$out"; then
      ok "K8s AWOOOI has no failed Jobs"
    else
      warn "K8s AWOOOI failed Jobs remain"
    fi

    if grep -q "BAD_PODS 0" <<<"$out"; then
      ok "K8s AWOOOI pods Running/Completed only"
    else
      warn "K8s AWOOOI bad pod status remains"
    fi
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # ---- 121: cron plus presence of the dr-drill.sh crontab entry
  if out=$(ssh_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    echo "$out"

    if grep -q "CRON_121 active" <<<"$out"; then
      ok "121 cron active"
    else
      warn "121 cron not confirmed"
    fi

    if grep -q "DR_DRILL_CRON present" <<<"$out"; then
      ok "121 DR drill cron present"
    else
      warn "121 DR drill cron missing"
    fi
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
|
||||
|
||||
#######################################
# Print the final tally and terminate the script with the matching status.
# Globals: PASS, WARN, FAIL (read)
# Outputs: tally line plus a one-line result on stdout
# Exits:   2 when any gate was BLOCKED, 1 when only warnings remain;
#          otherwise returns normally after printing the GREEN result.
#######################################
summary() {
  log_section "SUMMARY"
  printf 'PASS=%s WARN=%s BLOCKED=%s\n' "$PASS" "$WARN" "$FAIL"

  if (( FAIL > 0 )); then
    printf '%s\n' "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif (( WARN > 0 )); then
    printf '%s\n' "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi

  printf '%s\n' "Result: GREEN. Full stack is ready for controlled runner/CD release."
}
|
||||
|
||||
# Run every gate in its fixed order; summary comes last and decides the
# exit status of the script.
for gate in \
  print_header \
  check_network \
  check_188 \
  check_110 \
  check_k3s \
  check_workload_and_alertchain \
  check_public_routes \
  check_schedules \
  summary; do
  "$gate"
done
|
||||
@@ -92,10 +92,10 @@ fi
|
||||
echo ""
|
||||
echo "🤖 Step 6: Verifying Ollama connection..."
|
||||
|
||||
OLLAMA_URL="http://192.168.0.188:11434/api/tags"
|
||||
OLLAMA_URL="${OLLAMA_URL:-http://192.168.0.111:11434/api/tags}"
|
||||
|
||||
if curl -s --connect-timeout 5 "$OLLAMA_URL" > /dev/null 2>&1; then
|
||||
echo " ✅ Ollama reachable at 192.168.0.188:11434"
|
||||
echo " ✅ Ollama reachable at ${OLLAMA_URL}"
|
||||
|
||||
# Check if llama3.2:8b is available
|
||||
MODELS=$(curl -s "$OLLAMA_URL" | grep -o '"name":"[^"]*"' || echo "")
|
||||
|
||||
Reference in New Issue
Block a user