Compare commits

...

9 Commits

Author SHA1 Message Date
Your Name
337b2df60d chore(cd): deploy latest image tag for prod manifests 2026-06-04 00:13:51 +08:00
Your Name
ab21d8bad2 chore: execute W1-redline convergence updates and evidence log 2026-06-03 20:10:14 +08:00
Your Name
2d37383fc6 fix(monitoring): fix false positive NoAlertsReceived2Hours by filtering only alertmanager source 2026-05-28 15:33:17 +08:00
Your Name
3779f6f1e0 fix(metrics): 串入飛輪指標到 /metrics 主端點,修復 FlywheelExecutionRateMissing 死告警
INC-20260507-99ADF2 根因(feedback_full_chain_first_then_fix.md 全鏈分析):

【鏈路斷點】規則層(5/3 加)vs 指標層(5/6 改)vs scrape 層(從沒同步)
- 577250a6(5/3)「反消音化」commit 加了 FlywheelExecutionRateMissing
  rule,要求 110 Prom scrape 到 awoooi_flywheel_execution_success_rate;
- a2c4b3d4(5/6)Codex 改 FlywheelStatsService 用 auto_repair_executions
  作 source of truth(24h 樣本 1-9 筆回 None 給 W-3b watchdog 接管);
- 但 awoooi_flywheel_* 指標自始至終只在 /api/v1/stats/flywheel/metrics
  暴露,110 Prom awoooi-api job 抓的是 /metrics → absent() 永遠 1
  → 自 2026-05-06T04:14 UTC 起 firing 26h+ 屬 dead alert

【修法】只動 awoooi-api 一處,不碰 Codex 設計、不碰 110 Prom 配置:
- main.py /metrics endpoint 改 async,在 generate_latest() 後串入
  FlywheelStatsService.compute() → to_prometheus_lines()。
- 既有 awoooi-api scrape job 自動拿到飛輪指標。
- 完全保留 Codex a2c4b3d4 設計:1-9 筆回 None 讓 W-3b watchdog 雙保險。

【不碰的部分】
- flywheel_stats_service.py 不動:Codex 5/6 LOGBOOK 已明確說明
  「Redis playbook counter 失準 → 用 auto_repair_executions 為唯一信任源」,
  1-9 筆 return None 是配合 ai_slo_watchdog_job W-3b grace+30min 設計的
  反消音化雙保險,不是 bug。

驗證計畫(部署後):
1. curl /metrics | grep awoooi_flywheel  → 看到飛輪指標
2. Prom query awoooi_flywheel_execution_success_rate  → 非空
3. ALERTS{alertname="FlywheelExecutionRateMissing"}  → resolved
4. 30 分鐘觀察 Telegram 不再收 INC-20260507-99ADF2

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 15:32:47 +08:00
Your Name
c38227e945 fix(ai): remove 188 ollama provider 2026-05-06 14:33:16 +08:00
Your Name
1b4a6c1e8c fix(awooop): align console with flywheel execution metrics 2026-05-06 00:44:53 +08:00
Your Name
894174da5b fix(ops): harden cold-start schedule recovery 2026-05-05 22:14:54 +08:00
Your Name
10cd9fc025 fix(openclaw): gate alert cloud fallback behind flag 2026-05-05 20:53:12 +08:00
Your Name
8161ccf83f fix(ops): persist host resource guardrails 2026-05-05 16:13:02 +08:00
86 changed files with 3916 additions and 667 deletions

View File

@@ -108,7 +108,9 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
# worker and its local kubeconfig points at 127.0.0.1:6443.
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -138,10 +140,10 @@ jobs:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
run: |
cat k8s/awoooi-dev/02-configmap.yaml | \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

View File

@@ -406,8 +406,11 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl must run on the 120 control-plane.
# 121 is a worker after cold-start recovery; its kubeconfig points at
# 127.0.0.1:6443 and fails ADR-035 secret patching.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -634,19 +637,21 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
# 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
# control-plane, not 121 worker.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ───
cat k8s/awoooi-prod/04-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ ConfigMap 已更新"
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ Service Registry ConfigMap 已更新"
@@ -688,7 +693,7 @@ jobs:
}
# ─── Step 4: 等待 ArgoCD sync + rollout ───
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -814,7 +819,7 @@ jobs:
- name: Alert Chain Smoke Test
id: alert_chain_smoke
run: |
# 2026-04-05 Claude Code: 使用真實 API 地址192.168.0.121:32334 NodePort
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
if docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
@@ -824,7 +829,7 @@ jobs:
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
else
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT

View File

@@ -6,8 +6,9 @@
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
--
-- 影響範圍:
-- 1. rag_chunks.embedding vector(768) → vector(1024)
-- 2. playbook_embeddings.embedding vector(768) → vector(1024)
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
-- 2. rag_chunks.embedding vector(768) → vector(1024)
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
--
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
@@ -21,7 +22,24 @@
BEGIN;
-- 1. rag_chunks清空向量資料,變更欄位維度
-- 1. knowledge_entries備份舊向量並清空,變更欄位維度
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
SELECT
id,
embedding::text AS embedding_768,
NOW() AS backed_up_at
FROM knowledge_entries
WHERE embedding IS NOT NULL;
ALTER TABLE knowledge_entries
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
COMMENT ON COLUMN knowledge_entries.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
-- 2. rag_chunks清空向量資料變更欄位維度
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
-- 2. playbook_embeddings清空向量資料變更欄位維度
-- 3. playbook_embeddings清空向量資料變更欄位維度
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
ALTER TABLE playbook_embeddings
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
-- 4. 驗證遷移結果
DO $$
DECLARE
v_km_dim integer;
v_rag_dim integer;
v_pb_dim integer;
BEGIN
SELECT atttypmod INTO v_km_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'knowledge_entries' AND attname = 'embedding';
SELECT atttypmod INTO v_rag_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
@@ -74,15 +98,18 @@ BEGIN
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'playbook_embeddings' AND attname = 'embedding';
-- atttypmod for vector(1024) = 1024 + 1 = 1025
IF v_rag_dim != 1025 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1025, got %', v_rag_dim;
-- pgvector atttypmod stores the configured dimension.
IF v_km_dim != 1024 THEN
RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗expected 1024, got %', v_km_dim;
END IF;
IF v_pb_dim != 1025 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1025, got %', v_pb_dim;
IF v_rag_dim != 1024 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1024, got %', v_rag_dim;
END IF;
IF v_pb_dim != 1024 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1024, got %', v_pb_dim;
END IF;
RAISE NOTICE '✅ embedding 遷移驗證通過rag_chunksplaybook_embeddings 均為 vector(1024)';
RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunksplaybook_embeddings 均為 vector(1024)';
END $$;
COMMIT;

View File

@@ -11,7 +11,7 @@ Endpoints:
Components Checked:
- PostgreSQL (192.168.0.188:5432)
- Redis (192.168.0.188:6380)
- Ollama (192.168.0.188:11434)
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
- OpenClaw (192.168.0.188:8089)
- SigNoz (192.168.0.188:3301)
"""

View File

@@ -108,8 +108,9 @@ async def list_runs(
)
async def list_approvals(
project_id: str | None = Query(None, description="租戶 ID可選"),
run_id: str | None = Query(None, description="Run ID可選M8 詳情頁查單筆)"),
) -> dict[str, Any]:
return await list_approvals_svc(project_id=project_id)
return await list_approvals_svc(project_id=project_id, run_id=run_id)
@router.post(

View File

@@ -145,7 +145,7 @@ class Settings(BaseSettings):
# ==========================================================================
# ADR-104: LLM Playbook Generator
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
# 成本護欄:實作層只走 local providerOllama 111 → Ollama 188),不新增雲端 fallback。
# 成本護欄:實作層只走 local providerGCP-A → GCP-B → 111),不新增雲端 fallback。
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
# ==========================================================================
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
@@ -504,6 +504,22 @@ class Settings(BaseSettings):
"unexpected cloud spend from Gitea push/PR alerts."
),
)
ALERT_AI_ALLOW_CLOUD_FALLBACK: bool = Field(
default=True,
description=(
"Allow incident/alert OpenClaw analysis to use cloud fallback "
"providers after the GCP-A/GCP-B/111 Ollama lane is exhausted. "
"Default true so Gemini can act as the final backup, after the "
"ordered Ollama lane is exhausted."
),
)
ALERT_AI_ENFORCE_OLLAMA_FIRST: bool = Field(
default=True,
description=(
"Force incident/alert OpenClaw analysis to try GCP-A, then GCP-B, "
"then local 111 before cloud backup providers such as Gemini."
),
)
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
NVIDIA_API_KEY: str = Field(
default="",
@@ -855,7 +871,7 @@ class Settings(BaseSettings):
# ==========================================================================
# MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
# ==========================================================================
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — 188 是 Ollama HubPrometheus 實際在 110
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — Prometheus 實際在 110
# ConfigMap 04-configmap.yaml 也是 110governance_agent / SLO check 連 188 會 timeout
# 此 drift 是 SPF-4 (governance_agent silently fail) 根因之一
PROMETHEUS_URL: str = Field(
@@ -929,7 +945,7 @@ class Settings(BaseSettings):
"devops": "192.168.0.110", # Harbor, GH Runner
"security": "192.168.0.112", # Kali Scanner
"k3s_master": "192.168.0.120", # K3s Master
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, Ollama
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, SigNoz
}

View File

@@ -10,13 +10,51 @@
"""
from __future__ import annotations
from contextvars import ContextVar
from contextvars import ContextVar, Token
# 追蹤當前非同步任務的 project_id
# default="awoooi" 確保未設時也能正常查詢RLS fail-open 保護)
PROJECT_ID: ContextVar[str] = ContextVar("project_id", default="awoooi")
# Fail-Closed: 移除 default="awoooi",進 DB 路徑需要明確租戶標籤
PROJECT_ID: ContextVar[str | None] = ContextVar("project_id")
PROJECT_ID_SOURCE: ContextVar[str | None] = ContextVar("project_id_source")
PROJECT_ID_REQUEST_ID: ContextVar[str | None] = ContextVar("project_id_request_id")
def get_current_project_id() -> str:
def set_project_context(
    project_id: str | None,
    source: str = "runtime",
    request_id: str | None = None,
) -> tuple[Token[str | None], Token[str | None], Token[str | None]]:
    """Bind the project context for the current request/task.

    Args:
        project_id: Tenant identifier to bind (may be ``None``).
        source: Label describing where the project id came from.
        request_id: Request identifier to bind alongside it (may be ``None``).

    Returns:
        The three ``ContextVar`` tokens, in the order (project id, source,
        request id), so the caller can restore the previous state later.
    """
    project_token = PROJECT_ID.set(project_id)
    source_token = PROJECT_ID_SOURCE.set(source)
    request_token = PROJECT_ID_REQUEST_ID.set(request_id)
    return project_token, source_token, request_token
def clear_project_context(tokens: tuple[Token[str | None], Token[str | None], Token[str | None]]) -> None:
    """Restore the three project context variables from their tokens.

    Args:
        tokens: Tokens in the order (project id, source, request id).

    The variables are reset in reverse order: request id first, then
    source, then project id.
    """
    reset_plan = (
        (PROJECT_ID_REQUEST_ID, tokens[2]),
        (PROJECT_ID_SOURCE, tokens[1]),
        (PROJECT_ID, tokens[0]),
    )
    for variable, token in reset_plan:
        variable.reset(token)
def get_project_context() -> dict[str, str | None]:
    """Return a snapshot of the current project context.

    Each entry is ``None`` when the corresponding context variable has not
    been set. The returned dict can be written directly into an audit log.
    """
    bound_project = PROJECT_ID.get(None)
    bound_source = PROJECT_ID_SOURCE.get(None)
    bound_request = PROJECT_ID_REQUEST_ID.get(None)
    return {
        "project_id": bound_project,
        "source": bound_source,
        "request_id": bound_request,
    }
def get_current_project_id() -> str | None:
    """Return the project id bound to the current task, or ``None`` if unset.

    Intended for the service layer. ``PROJECT_ID`` is declared without a
    default, so a bare ``PROJECT_ID.get()`` raises ``LookupError`` when no
    context has been bound. Passing ``None`` as the fallback lets callers
    run their own fail-closed check on a falsy result instead.
    """
    return PROJECT_ID.get(None)
def get_current_project_context() -> dict[str, str | None]:
    """Return the traceable context snapshot.

    Equivalent to ``get_project_context``; kept as a separate name for API
    compatibility.
    """
    snapshot = get_project_context()
    return snapshot

View File

@@ -16,6 +16,7 @@ Features:
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from fastapi import HTTPException
from sqlalchemy import text
from sqlalchemy.ext.asyncio import (
AsyncEngine,
@@ -26,6 +27,8 @@ from sqlalchemy.ext.asyncio import (
from sqlalchemy.orm import DeclarativeBase
from src.core.config import settings
from src.core.context import get_current_project_context
from src.core.logging import get_logger
# =============================================================================
# Base Model
@@ -42,6 +45,19 @@ class Base(DeclarativeBase):
_engine: AsyncEngine | None = None
_session_factory: async_sessionmaker[AsyncSession] | None = None
logger = get_logger("awoooi.db")
def _raise_unauthorized_db_context(msg: str) -> None:
    """Log a missing-tenant-context error and abort with HTTP 401.

    Args:
        msg: Reason recorded in the ``db_context_missing`` log entry.

    Raises:
        HTTPException: Always, with status code 401.
    """
    snapshot = get_current_project_context()
    log_fields = {
        "reason": msg,
        "project_id": snapshot.get("project_id"),
        "project_id_source": snapshot.get("source"),
        "request_id": snapshot.get("request_id"),
    }
    logger.error("db_context_missing", **log_fields)
    raise HTTPException(
        status_code=401,
        detail="Missing tenant context: project_id is required",
    )
def get_engine() -> AsyncEngine:
@@ -103,13 +119,21 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
async def get_items(db: AsyncSession = Depends(get_db)):
...
"""
from src.core.context import get_current_project_id
factory = get_session_factory()
async with factory() as session:
try:
# AwoooP Phase 2.3 (2026-05-04 ogt): SET LOCAL app.project_id 讓 RLS Policy 生效
# 預設 'awoooi',多租戶路由將在 middleware 注入實際 project_id
# Fail-Closed RLS: 遇到未授權情境拋出錯誤而非回退到 "awoooi"
pid = get_current_project_id()
if not pid:
_raise_unauthorized_db_context(
"Unauthorized: project_id is missing in context (Fail-Closed RLS)"
)
await session.execute(
text("SELECT set_config('app.project_id', 'awoooi', TRUE)")
text("SELECT set_config('app.project_id', :pid, TRUE)"),
{"pid": pid},
)
yield session
await session.commit()
@@ -123,19 +147,22 @@ async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncS
"""
Context manager for database session (non-FastAPI usage)
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar > "awoooi"
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar(缺失則 fail-closed
- Phase 2.3: 啟用 RLS tenant isolationSET LOCAL app.project_id
- Phase 2.4: 從 asyncio contextvar 讀取 background loop 的 project_id
Usage:
async with get_db_context() as db: # 繼承 contextvar 或預設 awoooi
async with get_db_context() as db: # 繼承 contextvar(缺失將 fail-closed
...
async with get_db_context("other-tenant") as db: # 明確指定 tenant
...
"""
"""
from src.core.context import get_current_project_id
effective_pid = project_id if project_id is not None else get_current_project_id()
if not effective_pid:
_raise_unauthorized_db_context("Unauthorized: project_id is missing in context (Fail-Closed RLS)")
factory = get_session_factory()
async with factory() as session:
try:

View File

@@ -479,7 +479,7 @@ async def _collect_all_k8s_assets() -> tuple[list[dict[str, Any]], list[dict[str
# 6. Prometheus targets — 補齊 host-install services (110/112/188/125 等非 K8s)
# Gap 1 修補 (2026-04-19 audit): 原本 asset_inventory 只涵蓋 K8s,
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis/Ollama host-install 全漏
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis host-install 全漏
# 用 Prometheus /api/v1/targets 自動發現全節點服務
try:
prom_assets, host_relationships = await _collect_prometheus_targets()

View File

@@ -172,7 +172,7 @@ _LLM_FORECAST_PROMPT = """你是 AWOOOI 容量規劃專家。以下 host 過去
{findings_json}
## 當前主機環境資訊
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/Ollama/MinIO)
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/MinIO)
- 判斷請考慮: 該主機上跑什麼服務、常見瓶頸模式
## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)

View File

@@ -20,6 +20,7 @@ Date: 2026-03-20
import asyncio
import os
from uuid import uuid4
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
@@ -80,6 +81,7 @@ from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 fe
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.services.flywheel_stats_service import get_flywheel_stats_service
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry
@@ -186,10 +188,9 @@ else:
@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
"""Application lifespan events"""
# AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context
# asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi
from src.core.context import PROJECT_ID
PROJECT_ID.set("awoooi")
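# AwoooP Phase 2.4 (2026-05-04 ogt):
# 改為不再在 lifespan 初始化預設 project_id context。
# 後續請求皆需透過 middleware/runtime 攜帶 project_id 注入,否則拒絕查詢。
# NOTE(review): background loops started from lifespan via asyncio.create_task()
# previously inherited project_id from here; they presumably must now pass an
# explicit project_id to get_db_context() or they will be rejected too — confirm.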
# Startup
logger.info(
@@ -683,7 +684,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
logger.warning("ollama_failover_system_start_failed", error=str(e))
# 2026-04-27 P3.2.2 by Claude — AI Provider 版本追蹤(每 1 小時)
# 探測 5 Providerollama/ollama_188/gemini/claude/openclaw_nemo版本
# 探測 5 Provider(ollama/ollama_local/gemini/claude/openclaw_nemo)版本
# 寫入 ai_provider_version_history;版本變更時 log warning(P3.2.3 alerter 後續整合)
try:
async def _run_model_version_tracker_loop() -> None:
@@ -819,7 +820,7 @@ app.add_middleware(
allow_origins=settings.CORS_ORIGINS,
allow_credentials=True,
allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
allow_headers=["Authorization", "Content-Type", "X-Request-ID", "X-Project-ID", "X-Tenant-ID"],
expose_headers=["X-Request-ID"],
)
@@ -837,27 +838,53 @@ async def request_logging_middleware(request: Request, call_next):
"""
import time
request_id = request.headers.get("X-Request-ID", "-")
from src.core.context import clear_project_context, get_current_project_context, set_project_context
request_id = request.headers.get("X-Request-ID") or str(uuid4())
project_id = (
request.headers.get("X-Project-ID")
or request.headers.get("X-Tenant-ID")
or request.query_params.get("project_id")
)
project_id = project_id.strip() if project_id else None
source = "request.project_id.missing"
if project_id:
source = "request.header_or_query"
context_tokens = set_project_context(
project_id=project_id,
source=source,
request_id=request_id,
)
start_time = time.perf_counter()
# Bind request context for all logs in this request
structlog.contextvars.clear_contextvars()
current_context = get_current_project_context()
structlog.contextvars.bind_contextvars(
request_id=request_id,
method=request.method,
path=request.url.path,
project_id=current_context["project_id"],
project_context_source=current_context["source"],
)
log = get_logger("awoooi.http")
log.debug("request_start")
response = await call_next(request)
try:
response = await call_next(request)
finally:
clear_project_context(context_tokens)
duration_ms = (time.perf_counter() - start_time) * 1000
log.info(
"request_complete",
status_code=response.status_code,
duration_ms=round(duration_ms, 2),
project_id=current_context["project_id"],
project_context_source=current_context["source"],
has_project_context=bool(current_context["project_id"]),
)
# Add request ID to response headers
@@ -865,6 +892,26 @@ async def request_logging_middleware(request: Request, call_next):
return response
@app.get("/api/v1/security/db-context-guard")
async def db_context_guard() -> dict:
    """Context guard endpoint (P1-1 runtime evidence).

    - Without a project context (X-Project-ID / X-Tenant-ID header or
      ``project_id`` query parameter) the request is expected to be answered
      with 401, showing that RLS is fail-closed.
    - With a context, the current context snapshot is returned for auditing.
    """
    from src.core.context import get_current_project_context
    from src.db.base import get_db_context

    # Opening the DB context is the actual check; the body only reports back.
    async with get_db_context():
        snapshot = get_current_project_context()
        payload = {
            "status": "ok",
            "project_context": snapshot,
            "source": "runtime_guard",
        }
        return payload
# =============================================================================
# Exception Handlers
# =============================================================================
@@ -1005,10 +1052,17 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Prometheus metrics endpoint for alerting.

    Returns the default registry output followed by the flywheel metrics.
    The flywheel section is best-effort: if it cannot be computed, the
    default-registry metrics are still returned and a warning is logged.
    """
    content = generate_latest().decode("utf-8")
    # 2026-05-07 ogt + Claude Sonnet 4.6 — INC-20260507-99ADF2 fix.
    # The awoooi_flywheel_* metrics used to be exposed only at
    # /api/v1/stats/flywheel/metrics, so the existing awoooi-api scrape of
    # /metrics never saw them and FlywheelExecutionRateMissing fired forever.
    # Appending them here lets the existing scrape job pick them up without
    # adding a new job.
    # NOTE(review): if compute() opens a DB session that requires a tenant
    # context, a scrape without project headers would presumably always land
    # in the except branch below — confirm.
    try:
        flywheel_metrics = await get_flywheel_stats_service().compute()
        flywheel_text = flywheel_metrics.to_prometheus_lines()
        # Make sure the appended block starts on its own line.
        if content and not content.endswith("\n"):
            content += "\n"
        content += flywheel_text
    except Exception as e:
        # Best-effort: keep serving the base metrics, but record the cause.
        logger.warning("prometheus_metrics_flywheel_error", error=str(e))
    return Response(content=content, media_type=CONTENT_TYPE_LATEST)
# =============================================================================

View File

@@ -29,7 +29,7 @@ from __future__ import annotations
from prometheus_client import Histogram
# Buckets 對齊 NIM 實測分佈2-27s並覆蓋三段 timeout 30/20/15s 邊界
# 低端0.5-5s快速路徑Ollama 188 本地
# 低端0.5-5s快速路徑Ollama provider pool
# 中端5-20sNIM + Gemini fallback
# 高端20-60s超時 / 慢速 Provider
_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0]

View File

@@ -60,13 +60,17 @@ class MetricsDBRepository(IMetricsRepository):
cutoff = datetime.now(UTC) - timedelta(hours=hours)
# Query: 統計 executed vs total (approved + executed + execution_failed)
# 2026-05-06 ogt + Codex:
# approval_records.status 目前實際寫入的是大寫 enum
# (APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED)。舊查詢只看
# lowercase executed導致 AI Success 在報表層永遠趨近 0。
query = text("""
SELECT
COUNT(CASE WHEN status = 'executed' THEN 1 END) as executed_count,
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) as executed_count,
COUNT(*) as total_count
FROM approval_records
WHERE created_at >= :cutoff
AND status IN ('approved', 'executed', 'execution_failed')
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
""")
result = await session.execute(query, {"cutoff": cutoff})
@@ -127,11 +131,11 @@ class MetricsDBRepository(IMetricsRepository):
trend_query = text("""
SELECT
date_trunc('hour', created_at) as hour_bucket,
COUNT(CASE WHEN status = 'executed' THEN 1 END) * 100.0 /
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) * 100.0 /
NULLIF(COUNT(*), 0) as hourly_rate
FROM approval_records
WHERE created_at >= :cutoff
AND status IN ('approved', 'executed', 'execution_failed')
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
GROUP BY hour_bucket
ORDER BY hour_bucket DESC
LIMIT :limit

View File

@@ -104,7 +104,7 @@ async def get_agent_thinking(
) -> StreamingResponse:
"""
OpenClaw 思考軌跡 (SSE 串流)
Phase 1.2: 真實串接 Ollama at 192.168.0.188:11434
Phase 1.2: 真實串接設定中的 Ollama provider pool
"""
async def generate_thinking_stream():

View File

@@ -1,10 +1,10 @@
"""
Ollama Provider - Phase 24 ADR-052
====================================
本地 LLM 推理 (192.168.0.188 VMware VM, CPU-only)
本地 / 私有 LLM 推理 Provider。
搬移自: openclaw.py _call_ollama (L349-409)
特性: 免費、隱私安全 (local)、但 CPU 慢 (~97s/30tokens for qwen2.5:7b)
特性: 免費、隱私安全 (local)、可依 ADR-110 指向 GCP-A/GCP-B/111。
2026-04-02 ogt: Phase 24-A 從 openclaw.py 抽出
"""
@@ -268,33 +268,27 @@ class OllamaProvider:
self._http_client = None
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — OLLAMA_188 provider 註冊
class Ollama188Provider(OllamaProvider):
# 2026-05-06 Codex — 188 不再作為 Ollama Provider本地備援統一命名為 ollama_local。
class OllamaLocalProvider(OllamaProvider):
"""
Ollama 188 CPU-only 備援 Provider
Ollama Local fallback Provider
繼承 OllamaProvider使用 OLLAMA_FALLBACK_URL192.168.0.188:11434
作為推理端點,模型預設 OLLAMA_HEALTH_CHECK_MODELqwen2.5:7b-instruct
B1 修復:原本 _init_registry 未登錄此 provider導致
executor.execute() 遇到 "ollama_188" → not_registered → 跳過,
188 從未被打到。此類別補全登錄鏈路。
2026-04-26 Wave5 B1-fix by Claude Engineer-A4
使用 OLLAMA_FALLBACK_URL 作為本地最後防線端點。
ADR-110 目前設定為 110 nginx proxy → 111 Ollama188 不得再作為 Ollama provider
"""
@property
def name(self) -> str:
return "ollama_188"
return "ollama_local"
@property
def is_enabled(self) -> bool:
import os
# 優先查 ENABLE_OLLAMA_188;若未設定(預設 true則看 OLLAMA_FALLBACK_URL 是否有值
env_override = os.getenv("ENABLE_OLLAMA_188", "true").lower() == "true"
# 優先查 ENABLE_OLLAMA_LOCAL;若未設定(預設 true則看 OLLAMA_FALLBACK_URL 是否有值
env_override = os.getenv("ENABLE_OLLAMA_LOCAL", "true").lower() == "true"
if not env_override:
return False
# OLLAMA_FALLBACK_URL 空字串 → 未設定 188 節點 → 停用
# OLLAMA_FALLBACK_URL 空字串 → 未設定本地節點 → 停用
return bool(getattr(settings, "OLLAMA_FALLBACK_URL", ""))
def _endpoint_url(self) -> str:
@@ -319,18 +313,18 @@ class Ollama188Provider(OllamaProvider):
client = await self._get_client()
registry = get_model_registry()
# 嘗試取 ollama_188 專屬設定fallback 到 ollama 預設
# 嘗試取本地 fallback 專屬設定fallback 到 ollama 預設
try:
model_name = registry.get_model("ollama_188", "rca")
model_name = registry.get_model("ollama_local", "rca")
except Exception:
model_name = getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")
try:
options = registry.get_provider_options("ollama_188")
options = registry.get_provider_options("ollama_local")
except Exception:
options = registry.get_provider_options("ollama")
# CPU-only 備援:固定使用較長 timeoutCPU 推理慢)
# 本地備援:固定使用較長 timeout,避免 111 模型載入時被過早判死。
task_type = (context or {}).get("task_type", "")
if task_type in ("diagnose", "force_local"):
read_timeout = float(getattr(settings, "OLLAMA_DIAGNOSE_TIMEOUT_SECONDS", 200))
@@ -359,7 +353,7 @@ class Ollama188Provider(OllamaProvider):
latency = (time.perf_counter() - start) * 1000
logger.info(
"ollama_188_provider_success",
"ollama_local_provider_success",
response_length=len(result),
tokens=tokens,
latency_ms=round(latency, 1),
@@ -375,12 +369,12 @@ class Ollama188Provider(OllamaProvider):
except httpx.TimeoutException as e:
latency = (time.perf_counter() - start) * 1000
logger.warning("ollama_188_provider_timeout", error=str(e), latency_ms=round(latency, 1))
logger.warning("ollama_local_provider_timeout", error=str(e), latency_ms=round(latency, 1))
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=f"Timeout: {e}")
except Exception as e:
latency = (time.perf_counter() - start) * 1000
logger.warning("ollama_188_provider_failed", error=str(e), latency_ms=round(latency, 1))
logger.warning("ollama_local_provider_failed", error=str(e), latency_ms=round(latency, 1))
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=str(e))
async def health_check(self) -> bool:

View File

@@ -73,10 +73,6 @@ class AIProviderEnum(str, Enum):
"""AI 提供者"""
OLLAMA = "ollama"
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2
# P1.1b OllamaFailoverManager 使用 provider_name="ollama_188"
# 但 AIProviderEnum 沒有此值 → P1.2 整合時 lookup 失敗
OLLAMA_188 = "ollama_188" # 188 CPU-only 備援節點P1.1b
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災
# OllamaFailoverManager 回傳 provider_name="ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"
# 缺少 enum 值 → AIProviderEnum(primary_str) 拋 ValueError → fallback chain 清空 → 直跳 Gemini
@@ -96,8 +92,6 @@ class AIProviderEnum(str, Enum):
# Provider 對應延遲預算 (ms)
PROVIDER_LATENCY_BUDGET: dict[AIProviderEnum, int] = {
AIProviderEnum.OLLAMA: 60000, # 本地,允許較長處理時間
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2 — 188 CPU-only 推理較慢
AIProviderEnum.OLLAMA_188: 120000, # 120s budget for CPU inference
# 2026-05-04 ogt: ADR-110 GCP 三層容災 — GCP NVMe SSD 推理快60s 足夠
AIProviderEnum.OLLAMA_GCP_A: 60000,
AIProviderEnum.OLLAMA_GCP_B: 60000,
@@ -432,7 +426,7 @@ class AIRouter:
model = failover_result.primary.model
reason = f"{reason} [failover→{primary_str}]"
except ValueError:
# provider_name 無法對應已知 enum理論上不應發生OLLAMA_188 已加)
# provider_name 無法對應已知 enum;避免未知 provider 靜默進入執行層。
logger.warning(
"ai_router_unknown_failover_provider",
provider=primary_str,
@@ -1078,11 +1072,46 @@ class AIRouterExecutor:
cached = await redis.get(cache_key)
if cached:
data = _json.loads(cached)
cached_provider = data.get("provider", "cache")
provider_allowed = cached_provider in provider_order
ollama_first_required = (
bool(context)
and any(
key in context
for key in (
"alert_type",
"alertname",
"alert_name",
"fingerprint",
"incident_id",
"severity",
"target_resource",
)
)
and bool(provider_order)
and provider_order[0].startswith("ollama")
)
if (
cached_provider == "ollama"
and any(provider.startswith("ollama") for provider in provider_order)
):
provider_allowed = True
if ollama_first_required and not cached_provider.startswith("ollama"):
provider_allowed = False
if not provider_allowed:
logger.info(
"ai_router_cache_provider_mismatch_skip",
cache_key=cache_key[:30],
cached_provider=cached_provider,
provider_order=provider_order,
ollama_first_required=ollama_first_required,
)
raise ValueError("cached provider not allowed by current provider_order")
logger.info("ai_router_cache_hit", cache_key=cache_key[:30])
return AIResult(
raw_response=data.get("response", ""),
success=True,
provider=data.get("provider", "cache"),
provider=cached_provider,
from_cache=True,
)
except Exception as e:
@@ -1306,7 +1335,7 @@ def _init_registry() -> AIProviderRegistry:
"""初始化 Provider Registry (首次呼叫時自動註冊所有 Provider)"""
from src.services.ai_providers.ollama import (
OllamaProvider,
Ollama188Provider,
OllamaLocalProvider,
OllamaGcpBProvider, # 2026-05-04 ADR-110 GCP-B
)
from src.services.ai_providers.gemini import GeminiProvider
@@ -1327,8 +1356,9 @@ def _init_registry() -> AIProviderRegistry:
from src.services.ai_providers.nemotron import NemotronProvider
registry.register(NemotronProvider())
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — 補登 OLLAMA_188 備援 provider
ollama_local = Ollama188Provider()
# 2026-05-06 Codex: 188 不再作為 Ollama provider
# Local fallback 統一命名為 ollama_local端點由 OLLAMA_FALLBACK_URL 指向 111/110 proxy。
ollama_local = OllamaLocalProvider()
registry.register(ollama_local)
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災修復
@@ -1337,7 +1367,7 @@ def _init_registry() -> AIProviderRegistry:
# 修復:
# "ollama_gcp_a" alias → 同 OllamaProviderOLLAMA_URL = GCP-A
# "ollama_gcp_b" → 新 OllamaGcpBProviderOLLAMA_SECONDARY_URL = GCP-B
# "ollama_local" alias → 同 Ollama188ProviderOLLAMA_FALLBACK_URL = 111
# "ollama_local" OllamaLocalProviderOLLAMA_FALLBACK_URL = 111 / 110:11437
registry._providers["ollama_gcp_a"] = ollama_gcp_a
registry.register(OllamaGcpBProvider())
registry._providers["ollama_local"] = ollama_local

View File

@@ -457,6 +457,8 @@ class AutoRepairService:
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
self._record_auto_repair_metric(playbook, success=True)
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
try:
@@ -630,6 +632,8 @@ class AutoRepairService:
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
self._record_auto_repair_metric(playbook, success=False)
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
try:
@@ -700,6 +704,35 @@ class AutoRepairService:
return max_risk
def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
    """Record one real auto-repair execution as a Prometheus metric.

    2026-05-06 ogt + Codex: the DB already stores auto_repair_executions, but
    core.metrics.record_auto_repair() went without any caller for a long time,
    so governance / heartbeat views built on Prometheus made it look as if the
    flywheel was doing nothing. The metric is labelled with action_type rather
    than playbook_id to avoid high label cardinality.

    Args:
        playbook: The executed playbook. Its first repair step supplies the
            action label ("unknown" when there is no usable first step) and
            its maximum risk level is mapped to a numeric tier (0 when the
            level is not in the mapping).
        success: Whether the repair execution succeeded.

    Returns:
        None. Any exception raised while recording is logged as a warning and
        swallowed, so metric bookkeeping never breaks the repair flow.
    """
    try:
        from src.core.metrics import record_auto_repair

        # Action label comes from the first repair step only.
        steps = playbook.repair_steps
        lead_step = steps[0] if steps else None
        if lead_step:
            action_label = lead_step.action_type.value
        else:
            action_label = "unknown"

        # Map the playbook's highest risk level onto a 1-4 tier; 0 = unmapped.
        highest_risk = self._get_max_risk_level(playbook)
        tier_by_risk = {
            RiskLevel.LOW: 1,
            RiskLevel.MEDIUM: 2,
            RiskLevel.HIGH: 3,
            RiskLevel.CRITICAL: 4,
        }
        risk_tier = tier_by_risk.get(highest_risk, 0)

        record_auto_repair(action=action_label, tier=risk_tier, success=success)
    except Exception as e:
        logger.warning(
            "auto_repair_metric_record_failed",
            playbook_id=playbook.playbook_id,
            success=success,
            error=str(e),
        )
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""

View File

@@ -607,7 +607,7 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) -
"""
MCP Phase 4a: NemoClaw second opinion — 信心 < 0.7 時觸發
============================================================
用 deepseek-r1:14b (Ollama 188) 對同一份資料做獨立推理,
用 deepseek-r1:14b (設定的 Ollama primary) 對同一份資料做獨立推理,
輸出純文字 advisory_note不執行任何操作。
2026-04-11 Claude Sonnet 4.6 Asia/Taipei
@@ -666,7 +666,7 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
MCP Phase 4c: Playbook 無命中時,自動生成 AI 草稿 Playbook 寫入 KM
=====================================================================
- 僅在 KM 中不存在同 alertname 的 Playbook 時觸發(避免重複)
- 用 qwen2.5:7b-instruct (Ollama 188) 生成結構化 Playbook 草稿
- 用 qwen2.5:7b-instruct (設定的 Ollama primary) 生成結構化 Playbook 草稿
- 寫入 KnowledgeEntrystatus=DRAFT需人工審核後升為 APPROVED
- 寫入 AlertOperationLog PLAYBOOK_DRAFT_CREATED 事件

View File

@@ -237,6 +237,31 @@ class FlywheelStatsService:
except (json.JSONDecodeError, KeyError):
continue
# 2026-05-06 ogt + Codex:
# 執行成功率的 source of truth 是 auto_repair_executions。
# Redis playbook success_count/failure_count 會因回寫鏈路中斷而落後,
# 造成 governance / heartbeat 判定「飛輪沒有執行」。
try:
async with get_db_context() as db:
row = await db.execute(
text("""
SELECT
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
COUNT(*) AS total
FROM auto_repair_executions
WHERE created_at >= NOW() - interval '24 hours'
""")
)
repair_stats = row.one()
db_total_exec = int(repair_stats.total or 0)
if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
db_total_success = int(repair_stats.success or 0)
return count, db_total_success / db_total_exec
if db_total_exec > 0:
return count, None
except Exception:
logger.warning("flywheel_stats_auto_repair_execution_query_failed")
if total_exec < FLYWHEEL_MIN_SAMPLE:
# 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷
return count, None

View File

@@ -15,7 +15,7 @@ HeartbeatReportService — ADR-073 心跳監控重構
import asyncio
import html
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from datetime import datetime
from typing import Optional
import httpx
@@ -420,8 +420,8 @@ class HeartbeatReportService:
try:
# KM 向量化率DB 查詢)
from src.db.base import get_db_context
from src.db.models import IncidentRecord, KnowledgeEntryRecord
from sqlalchemy import func, select
from src.db.models import KnowledgeEntryRecord
from sqlalchemy import func, select, text as sa_text
async with get_db_context() as db:
# KM 總數
km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord))
@@ -436,20 +436,22 @@ class HeartbeatReportService:
stats.km_vectorized = vec_result.scalar() or 0
# 24h 修復統計
since = datetime.utcnow() - timedelta(hours=24)
outcomes = await db.execute(
select(IncidentRecord.outcome).where(
IncidentRecord.created_at >= since,
IncidentRecord.outcome.isnot(None),
)
)
outcome_list = [r[0] for r in outcomes.all() if r[0]]
stats.attempt_24h = len(outcome_list)
stats.success_24h = sum(
1 for o in outcome_list
if isinstance(o, dict) and o.get("execution_success")
or isinstance(o, str) and "success" in o.lower()
# 2026-05-06 ogt + Codex:
# incidents.outcome 已不是自動修復 source of truth。實際執行紀錄
# 寫在 auto_repair_executions舊查詢會讓心跳報告顯示 0/15
# 造成「全系統正常」但飛輪 KPI 失真的假象。
repair_result = await db.execute(
sa_text("""
SELECT
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
COUNT(*) AS total
FROM auto_repair_executions
WHERE created_at >= NOW() - interval '24 hours'
""")
)
repair_row = repair_result.one()
stats.success_24h = int(repair_row.success or 0)
stats.attempt_24h = int(repair_row.total or 0)
# 最後學習活動
last_km = await db.scalar(
@@ -865,9 +867,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("☸️ <b>Kubernetes Pods</b>")
for i, pod in enumerate(report.pods):
prefix = "└─" if i == len(report.pods) - 1 else "├─"
ready_icon = "" if pod.ready else ""
ready_icon = "" if pod.ready or pod.status in ("Succeeded", "Completed") else ""
restart_str = f" (重啟×{pod.restarts})" if pod.restarts > 0 else ""
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}")
status_str = "" if pod.ready else f" <code>{html.escape(pod.status)}</code>"
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}{status_str}")
# --- Scanner 狀態 ---
if report.scanners.last_runs:

View File

@@ -7,7 +7,7 @@ Hosts:
- 192.168.0.110: DevOps 金庫 (Harbor, GH Runner)
- 192.168.0.112: Kali Security (Scanner API)
- 192.168.0.120: K3s Master (awoooi-prod namespace)
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, Ollama, OpenClaw, SigNoz)
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, OpenClaw, SigNoz)
Features:
- asyncio.gather for parallel fetching

View File

@@ -1097,26 +1097,45 @@ class IncidentService:
from src.repositories.incident_repository import get_incident_repository
from src.utils.timezone import now_taipei
# 1. 從 Working Memory 讀取
repo = get_incident_repository()
# 1. 從 Working Memory 讀取;若 Redis TTL 已過,回退到 Episodic DB。
# 2026-05-29 ogt + Codex: 舊 incident 只留在 DB 時仍需可收斂,
# 否則 FlywheelIncidentsStuck 會永久累積歷史 INVESTIGATING。
incident = await self.get_from_working_memory(incident_id)
db_only = False
if incident is None:
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
return None
incident = await repo.get_by_id(incident_id)
if incident is None:
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
return None
db_only = True
if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
logger.info(
"incident_resolve_idempotent_skip",
incident_id=incident_id,
status=incident.status.value,
db_only=db_only,
)
return incident
# 2. 更新狀態
incident.status = IncidentStatus.RESOLVED
incident.resolved_at = now_taipei()
incident.updated_at = now_taipei()
# 3. 寫入 Working Memory
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
# 3. 寫入 Working Memory。DB-only 舊案不重新灌回 Redis working memory。
if not db_only:
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
else:
logger.info("resolve_db_only_incident", incident_id=incident_id)
# 4. 同步到 Episodic Memory
try:
repo = get_incident_repository()
await repo.update_status(
incident_id=incident_id,
status="resolved",

View File

@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
# 台北時區
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
# Prometheus 端點
PROMETHEUS_URL = "http://192.168.0.121:30090"
# Prometheus endpoint.
#
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
# kube-state-metrics 查詢
PROM_QUERIES = {
@@ -215,7 +219,7 @@ class K3sMonitorService:
# 發送訊息
formatted = status.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("k3s_daily_report_sent", date=status.report_date)

View File

@@ -5,7 +5,7 @@ AI Provider 版本探測 — 為每個 Provider 提供 get_version()
Provider:
- ollama : 34.143.170.20 GCP-A Ollama (primary) — 2026-05-03 ogt: ADR-110 GCP-A Primary
- ollama_188 : 192.168.0.188 Ollama (fallback)
- ollama_local : 192.168.0.111 / 110 proxy Ollama (local fallback)
- gemini : Google Gemini API (版本 = model name)
- claude : Anthropic Claude (版本 = model name)
- openclaw_nemo : OpenClaw NemoTron (版本 = OPENCLAW_DEFAULT_MODEL)
@@ -31,7 +31,7 @@ TAIPEI_TZ = timezone(timedelta(hours=8))
class ProviderVersionInfo:
"""AI Provider 版本快照"""
provider: str # "ollama" / "ollama_188" / "gemini" / "claude" / "openclaw_nemo"
provider: str # "ollama" / "ollama_local" / "gemini" / "claude" / "openclaw_nemo"
model: str
version: str # version string 或 tagOllama 用 modified_at其他用 model name
digest: str | None = None # SHA256 digest僅 Ollama 有)
@@ -43,7 +43,7 @@ class ProviderVersionInfo:
# =============================================================================
async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
"""探測 OllamaGCP-A 或 188GET /api/tags 取 model digest + modified_at
"""探測 OllamaGCP-A/GCP-B 或本地 111GET /api/tags 取 model digest + modified_at
Args:
url: Ollama base URL例如 "http://34.143.170.20:11434"GCP-A Primary
@@ -58,15 +58,12 @@ async def probe_ollama_version(url: str, model: str) -> ProviderVersionInfo:
"""
import httpx
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 擴展 provider 判斷邏輯支援 GCP 三層容災
# 188 保留 ollama_188 命名CPU-only 主機,雖移出 routing chain 但仍可被 probe
# 2026-05-06 Codex: 188 不再作為 Ollama provider;local fallback 一律標示 ollama_local。
_GCP_OLLAMA_IPS = {"34.143.170.20", "34.21.145.224"}
if any(ip in url for ip in _GCP_OLLAMA_IPS):
provider_name = "ollama"
elif "192.168.0.111" in url:
elif "192.168.0.111" in url or "192.168.0.110:11437" in url:
provider_name = "ollama_local"
elif "192.168.0.188" in url:
provider_name = "ollama_188"
else:
provider_name = "ollama_remote"
@@ -179,7 +176,7 @@ async def probe_claude_version() -> ProviderVersionInfo:
async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
"""OpenClaw NemoTron版本字串從 settings.OPENCLAW_DEFAULT_MODEL 讀取
NemoTron 運行在 OpenClaw 188 節點(使用 Ollama 推理)
NemoTron 運行在 OpenClaw 節點
透過 OPENCLAW_URL /api/tags 探測,模型名稱即版本識別。
Returns:
@@ -195,18 +192,18 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
# OpenClaw 底層是 Ollama使用 OPENCLAW_URL 的 host:port 加上 Ollama port
# OPENCLAW_URL 是 8088OpenClaw APIOllama 通常在 11434
# 188 的 Ollama URL 若有設定則直接用 OLLAMA_FALLBACK_URL
ollama_188_url = settings.OLLAMA_FALLBACK_URL
if not ollama_188_url:
# OpenClaw 底層 tags 來源優先使用本地 fallback Ollama URL。
ollama_local_url = settings.OLLAMA_FALLBACK_URL
if not ollama_local_url:
# fallback從 OPENCLAW_URL host 構建 Ollama URL
from urllib.parse import urlparse
parsed = urlparse(settings.OPENCLAW_URL)
ollama_188_url = f"{parsed.scheme}://{parsed.hostname}:11434"
ollama_local_url = f"{parsed.scheme}://{parsed.hostname}:11434"
import httpx
async with httpx.AsyncClient(timeout=5.0) as client:
resp = await client.get(f"{ollama_188_url}/api/tags")
resp = await client.get(f"{ollama_local_url}/api/tags")
resp.raise_for_status()
models = resp.json().get("models", [])
@@ -220,7 +217,7 @@ async def probe_openclaw_nemo_version() -> ProviderVersionInfo:
)
# model 不在清單時version 用 model namedigest=None
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_188_url)
logger.warning("openclaw_nemo_model_not_in_tags", model=model, url=ollama_local_url)
return ProviderVersionInfo(
provider="openclaw_nemo",
model=model,
@@ -257,7 +254,7 @@ async def probe_all_providers() -> list[ProviderVersionInfo]:
raw = await asyncio.gather(*tasks, return_exceptions=True)
results: list[ProviderVersionInfo] = []
provider_labels = ["ollama", "ollama_188", "gemini", "claude", "openclaw_nemo"]
provider_labels = ["ollama", "ollama_local", "gemini", "claude", "openclaw_nemo"]
for label, outcome in zip(provider_labels, raw, strict=True):
if isinstance(outcome, ProviderVersionInfo):
results.append(outcome)

View File

@@ -5,7 +5,7 @@ Phase 5: OpenClaw 實體化升級 (2026-03-21)
統帥校正: SignOz 為唯一全能視力中心
Features:
- 真實 LLM SDK 整合 (Ollama → Gemini → Claude)
- 真實 LLM SDK 整合 (告警預設 Ollama GCP-A → GCP-B → 111 → Gemini)
- SignOz Gold Metrics 即時擷取 (P99/Error/RPS)
- AIOps Agent 專業人格 (K8s 維運 + SRE RCA 專精)
- 強制結構化 JSON 輸出 (符合 API 契約)
@@ -144,8 +144,8 @@ class OpenClawService:
"""
OpenClaw AI 決策服務 - True LLM + SignOz Integration
實作 AI_FALLBACK_ORDER 備援機制:
Ollama → Gemini → Claude → Mock
實作 AI_FALLBACK_ORDER 備援機制
告警/incident 上下文預設套用成本防線,只允許 Ollama GCP-A → GCP-B → 111。
新增 SignOz 整合:
- 自動擷取 Gold Metrics
@@ -176,6 +176,89 @@ class OpenClawService:
await self._http_client.aclose()
self._http_client = None
def _is_incident_alert_context(self, alert_context: dict | None) -> bool:
"""Return true when a request came from the alert/incident automation path."""
if not alert_context:
return False
alert_keys = {
"alert_type",
"alertname",
"alert_name",
"fingerprint",
"incident_id",
"severity",
"signals",
"target_resource",
}
return any(key in alert_context for key in alert_keys)
def _cloud_fallback_allowed_for_alert(self, alert_context: dict | None) -> bool:
"""Cloud fallback is allowed after the ordered Ollama lane for alerts."""
if not self._is_incident_alert_context(alert_context):
return True
return bool(getattr(settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True))
def _alert_enforces_ollama_first(self, alert_context: dict | None) -> bool:
"""Alert cards must try GCP-A/GCP-B/111 before Gemini backup."""
return (
self._is_incident_alert_context(alert_context)
and bool(getattr(settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True))
)
async def _resolve_alert_provider_order(
self,
task_type: str = "diagnose",
alert_context: dict | None = None,
cloud_provider_order: list[str] | None = None,
) -> list[str]:
"""Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
provider_order: list[str] = []
try:
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
provider_order = [
endpoint.provider_name
for endpoint in route.all_endpoints_in_order()
if endpoint.provider_name.startswith("ollama")
]
except Exception as route_error:
logger.warning(
"alert_ollama_route_lookup_failed",
error=str(route_error),
task_type=task_type,
)
if not provider_order:
provider_order = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
deduped: list[str] = []
for provider_name in provider_order:
if provider_name and provider_name not in deduped:
deduped.append(provider_name)
if not self._alert_enforces_ollama_first(alert_context):
return deduped
ollama_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
ordered_ollama = [
provider_name
for provider_name in deduped
if provider_name in ollama_order
]
ordered_ollama.sort(key=lambda provider_name: ollama_order[provider_name])
if not ordered_ollama:
ordered_ollama = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
if not self._cloud_fallback_allowed_for_alert(alert_context):
return ordered_ollama
cloud_candidates = cloud_provider_order or []
cloud_backup: list[str] = []
for provider_name in [*cloud_candidates, "gemini"]:
if provider_name == "gemini" and provider_name not in cloud_backup:
cloud_backup.append(provider_name)
return ordered_ollama + cloud_backup
# =========================================================================
# SignOz Integration
# =========================================================================
@@ -437,13 +520,13 @@ class OpenClawService:
# 完整移除時機: Phase 24 完整驗收後 (ADR-052 D11)
# =========================================================================
async def _call_ollama(self, prompt: str) -> tuple[str, bool]:
async def _call_ollama(self, prompt: str, *, ollama_only: bool = False) -> tuple[str, bool]:
"""
呼叫 Ollama (支援 JSON Mode)。
USE_AI_ROUTER=true 正常會走 AIRouterExecutor這裡是 legacy safety-net。
2026-05-05 Codex: safety-net 也必須遵守 ADR-110 三層 Ollama
路由,不能只打 OLLAMA_URL 後直接掉 Gemini。
路由,告警路徑預設只允許 GCP-A/GCP-B/111不能只打 OLLAMA_URL 後直接掉 Gemini。
"""
try:
client = await self._get_client()
@@ -484,6 +567,26 @@ class OpenClawService:
endpoints.append((provider_name, endpoint_url))
seen_urls.add(endpoint_url)
if ollama_only:
allowed_provider_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
endpoints = [
(provider_name, endpoint_url)
for provider_name, endpoint_url in endpoints
if provider_name in allowed_provider_order
]
endpoints.sort(key=lambda item: allowed_provider_order[item[0]])
if not endpoints:
endpoints = [
("ollama_gcp_a", settings.OLLAMA_URL),
("ollama_gcp_b", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
("ollama_local", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
]
endpoints = [
(provider_name, endpoint_url)
for provider_name, endpoint_url in endpoints
if endpoint_url
]
last_error = ""
for provider_name, endpoint_url in endpoints:
try:
@@ -973,7 +1076,11 @@ class OpenClawService:
try:
# 2026-04-02 ogt: C2 修復 — 呼叫 AIRouter.route() 智慧路由 (非靜態 order)
# D1 意圖分類路由、D7 隱私保護 (DIAGNOSE/CODE_REVIEW 強制 local) 生效
from src.services.ai_router import get_ai_router, get_ai_executor, IntentType
from src.services.ai_router import (
IntentType,
get_ai_executor,
get_ai_router,
)
router = get_ai_router()
executor = get_ai_executor()
@@ -987,7 +1094,10 @@ class OpenClawService:
if p.value != decision.selected_provider.value
]
try:
from src.services.ai_control import get_primary_provider, is_provider_disabled
from src.services.ai_control import (
get_primary_provider,
is_provider_disabled,
)
_primary = await get_primary_provider()
if _primary and _primary != decision.selected_provider.value:
# 把 primary 移到首位 (保留原始 fallback)
@@ -1003,6 +1113,20 @@ class OpenClawService:
except Exception as _e:
logger.warning("ai_control_override_failed", error=str(_e))
if self._alert_enforces_ollama_first(alert_context):
original_provider_order = list(provider_order)
provider_order = await self._resolve_alert_provider_order(
task_type=decision.intent.value if decision.intent else "diagnose",
alert_context=alert_context,
cloud_provider_order=original_provider_order,
)
logger.info(
"alert_ollama_first_provider_order",
original_provider_order=original_provider_order,
provider_order=provider_order,
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
)
# Step 3: D7 隱私 — CODE_REVIEW 強制 local
# 2026-04-15 ogt: DIAGNOSE 移除 require_localv4.3 決策NIM 為主力,無隱私問題)
# ai_router.py v4.3 已明確「NIM 從 Phase 22 起就是主力,無隱私問題」
@@ -1045,13 +1169,18 @@ class OpenClawService:
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
if _rule_id == "generic_fallback":
import asyncio
from src.services.alert_rule_engine import auto_generate_rule
try:
asyncio.create_task(auto_generate_rule(
alert_context or {},
ollama_url=settings.OLLAMA_URL,
model=settings.OPENCLAW_DEFAULT_MODEL,
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
gemini_api_key=(
getattr(settings, "GEMINI_API_KEY", "")
if self._cloud_fallback_allowed_for_alert(alert_context)
else ""
),
))
except Exception as _e:
logger.warning("auto_rule_trigger_failed", error=str(_e))
@@ -1086,7 +1215,18 @@ class OpenClawService:
from src.services.ai_rate_limiter import get_ai_rate_limiter
rate_limiter = get_ai_rate_limiter()
for provider in settings.AI_FALLBACK_ORDER:
legacy_provider_order = list(settings.AI_FALLBACK_ORDER)
if self._alert_enforces_ollama_first(alert_context):
legacy_provider_order = ["ollama"]
if self._cloud_fallback_allowed_for_alert(alert_context):
legacy_provider_order.append("gemini")
logger.info(
"legacy_alert_ollama_first_provider_order",
provider_order=legacy_provider_order,
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
)
for provider in legacy_provider_order:
# Rate Limit 檢查 (nvidia/gemini/claude 需檢查ollama 不限)
# 2026-03-30 ogt: 加入 nvidia (RPM=5 限制)
if provider in ("nvidia", "gemini", "claude"):
@@ -1109,7 +1249,10 @@ class OpenClawService:
cost_usd = 0.0
if provider == "ollama":
response, success = await self._call_ollama(prompt)
response, success = await self._call_ollama(
prompt,
ollama_only=self._alert_enforces_ollama_first(alert_context),
)
elif provider == "gemini":
response, success, total_tokens, cost_usd = await self._call_gemini(prompt)
elif provider == "nvidia":
@@ -1165,13 +1308,18 @@ class OpenClawService:
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
if _rule_id == "generic_fallback":
import asyncio
from src.services.alert_rule_engine import auto_generate_rule
try:
asyncio.create_task(auto_generate_rule(
alert_context or {},
ollama_url=settings.OLLAMA_URL,
model=settings.OPENCLAW_DEFAULT_MODEL,
gemini_api_key=getattr(settings, "GEMINI_API_KEY", ""),
gemini_api_key=(
getattr(settings, "GEMINI_API_KEY", "")
if self._cloud_fallback_allowed_for_alert(alert_context)
else ""
),
))
except Exception as _e:
logger.warning("auto_rule_trigger_failed", error=str(_e))
@@ -1218,14 +1366,14 @@ class OpenClawService:
except json.JSONDecodeError:
# 3. 啟發式修補: 如果結尾缺少括號,嘗試補齊
if candidate.startswith("{") and not candidate.endswith("}"):
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
try:
repaired = candidate + '"' * (i-1) + "}" * i
json.loads(repaired)
logger.info("json_repaired_heuristically", level=i)
return repaired
except:
continue
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
try:
repaired = candidate + '"' * (i - 1) + "}" * i
json.loads(repaired)
logger.info("json_repaired_heuristically", level=i)
return repaired
except json.JSONDecodeError:
continue
continue
# 4. 極端情況: 找出最後一個有效 key
@@ -1235,11 +1383,11 @@ class OpenClawService:
# 暴力去除非法尾綴 (如 \t\t...)
candidate = re.sub(r"[ \t\r\n]+$", "", candidate)
if not candidate.endswith("}"):
candidate += '"}' # 嘗試最簡單的閉合
candidate += '"}' # 嘗試最簡單的閉合
try:
json.loads(candidate)
return candidate
except:
except json.JSONDecodeError:
pass
return None
@@ -1791,7 +1939,7 @@ Focus on:
from src.services.ai_router import get_ai_registry
ai_registry = get_ai_registry()
provider = ai_registry.get("ollama") or ai_registry.get("ollama_188")
provider = ai_registry.get("ollama") or ai_registry.get("ollama_local")
if provider is None or not hasattr(provider, "analyze_with_tools"):
logger.warning(
"openclaw_agent_loop_shadow_skipped",
@@ -2200,6 +2348,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
}
"""
import asyncio
from src.services.nvidia_provider import get_nvidia_provider
nvidia = get_nvidia_provider()
@@ -2334,7 +2483,7 @@ Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=st
"latency_ms": latency_ms,
}
except asyncio.TimeoutError:
except TimeoutError:
latency_ms = (time.time() - start_time) * 1000
logger.error(
"nemotron_tool_call_timeout",
@@ -2528,6 +2677,7 @@ async def _fetch_k8s_inventory_for_openclaw(
"awoooi-api, awoooi-web, ..." 格式字串,失敗時返回 ""
"""
import asyncio as _asyncio
import structlog as _structlog
_logger = _structlog.get_logger(__name__)
try:
@@ -2542,7 +2692,7 @@ async def _fetch_k8s_inventory_for_openclaw(
)
try:
stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
except _asyncio.TimeoutError:
except TimeoutError:
proc.kill()
_logger.warning("k8s_inventory_timeout_openclaw", namespace=namespace)
return ""

View File

@@ -9,8 +9,6 @@ ADR-106AwoooP Agent Platform
from __future__ import annotations
import uuid
from datetime import datetime
from decimal import Decimal
from typing import Any
from uuid import UUID
@@ -153,8 +151,21 @@ async def list_runs(
# Approvals
# =============================================================================
async def list_approvals(project_id: str | None) -> dict[str, Any]:
"""列出所有 waiting_approval 狀態的 runs。"""
async def list_approvals(
project_id: str | None,
run_id: str | None = None,
) -> dict[str, Any]:
"""列出 waiting_approval runs可依 project_id 或 run_id 篩選。"""
run_uuid: UUID | None = None
if run_id:
try:
run_uuid = uuid.UUID(run_id)
except ValueError as exc:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"run_id 格式錯誤: {exc}",
) from exc
async with get_db_context("awoooi") as db:
stmt = (
select(AwoooPRunState)
@@ -163,6 +174,8 @@ async def list_approvals(project_id: str | None) -> dict[str, Any]:
)
if project_id is not None:
stmt = stmt.where(AwoooPRunState.project_id == project_id)
if run_uuid is not None:
stmt = stmt.where(AwoooPRunState.run_id == run_uuid)
count_stmt = select(func.count()).select_from(stmt.subquery())
total_result = await db.execute(count_stmt)

View File

@@ -4,7 +4,7 @@ LLM Playbook Generator - ADR-104 T1/T2/T6
從成功修復案例生成可治理的 Playbook 草稿。
設計重點:
- 只用 local provider 順序Ollama 111 -> Ollama 188),避免新增雲端成本。
- 只用 local/provider pool 順序GCP-A -> 111 local),避免新增雲端成本。
- LLM 產出必須經 Pydantic + action_parser 安全收斂。
- 不直接 APPROVED先 DRAFT/REVIEW再交治理 job 晉級。
"""
@@ -30,7 +30,6 @@ from src.models.playbook import (
RiskLevel,
SymptomPattern,
)
from src.services.action_parser import is_safe_kubectl_action
from src.services.action_parser import kubectl_safety_reason
logger = structlog.get_logger(__name__)
@@ -218,7 +217,7 @@ class LLMPlaybookGenerator:
executor = get_ai_executor()
result = await executor.execute(
prompt=prompt,
provider_order=["ollama", "ollama_188"],
provider_order=["ollama", "ollama_local"],
context=context,
cache_ttl=86400,
require_local=True,

View File

@@ -244,7 +244,7 @@ class WeeklyReportService:
# 發送訊息
formatted = report.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("weekly_report_sent", week=report.week_range)

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
import json
from typing import Any
import pytest
from src.services import ai_router as ai_router_module
from src.services.ai_providers.interfaces import AIResult
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor
class _FakeRedis:
def __init__(self, cached_provider: str) -> None:
self.cached_provider = cached_provider
self.set_calls: list[tuple[str, str, int | None]] = []
async def get(self, key: str) -> str:
return json.dumps({
"response": '{"provider":"stale"}',
"provider": self.cached_provider,
})
async def set(self, key: str, value: str, ex: int | None = None) -> None:
self.set_calls.append((key, value, ex))
class _FakeProvider:
    """Stub AI provider registered as ``ollama_gcp_a`` that counts its calls.

    ``analyze`` always succeeds with a fixed payload, so a test can tell a
    freshly generated answer apart from the stale payload served by the fake
    cache.
    """

    # Attributes set on the stub before it is registered with the registry.
    name = "ollama_gcp_a"
    privacy_level = "local"
    is_enabled = True
    capabilities = {"rca", "chat"}

    def __init__(self) -> None:
        # Number of times analyze() has been invoked.
        self.calls = 0

    async def analyze(self, prompt: str, context: dict[str, Any] | None = None) -> AIResult:
        """Count the invocation and return a successful canned result."""
        self.calls = self.calls + 1
        fresh_result = AIResult(
            raw_response='{"provider":"fresh_ollama"}',
            success=True,
            provider=self.name,
        )
        return fresh_result
@pytest.mark.asyncio
async def test_executor_skips_cached_cloud_provider_when_ollama_lane_is_required(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A cached Gemini answer must not be served on an Ollama-first alert.

    The fake cache holds an entry produced by ``gemini`` while the provider
    order starts with Ollama lanes and the context carries ``alert_type``.
    The executor is expected to call the live Ollama stub once and write the
    fresh answer back to the cache.
    """
    stale_cache = _FakeRedis(cached_provider="gemini")
    ollama_stub = _FakeProvider()
    provider_registry = AIProviderRegistry()
    provider_registry.register(ollama_stub)

    monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
    monkeypatch.setattr("src.core.redis_client.get_redis", lambda: stale_cache)

    executor = AIRouterExecutor(provider_registry)
    outcome = await executor.execute(
        prompt="diagnose alert",
        provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"],
        context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
    )

    # The answer comes from the live provider, not from the stale cache entry.
    assert outcome.provider == "ollama_gcp_a"
    assert outcome.raw_response == '{"provider":"fresh_ollama"}'
    assert ollama_stub.calls == 1
    # At least one cache write was issued after the fresh call.
    assert stale_cache.set_calls
@pytest.mark.asyncio
async def test_executor_allows_cached_ollama_provider_for_ollama_lane(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A cached entry tagged ``ollama`` is served on an Ollama-only lane.

    The fake cache reports provider ``ollama`` and the provider order contains
    only Ollama lanes, so the executor is expected to return the cached result
    without calling the live provider stub.
    """
    ollama_cache = _FakeRedis(cached_provider="ollama")
    ollama_stub = _FakeProvider()
    provider_registry = AIProviderRegistry()
    provider_registry.register(ollama_stub)

    monkeypatch.setattr(ai_router_module._settings, "MOCK_MODE", False)
    monkeypatch.setattr("src.core.redis_client.get_redis", lambda: ollama_cache)

    executor = AIRouterExecutor(provider_registry)
    outcome = await executor.execute(
        prompt="diagnose alert",
        provider_order=["ollama_gcp_a", "ollama_gcp_b", "ollama_local"],
        context={"intent_hint": "diagnose", "alert_type": "HostHighCpuLoad"},
    )

    # Served from cache: provider label is the cached one and no live call ran.
    assert outcome.provider == "ollama"
    assert outcome.from_cache is True
    assert ollama_stub.calls == 0

View File

@@ -124,8 +124,9 @@ def test_diagnose_fallback_chain_ollama_primary():
assert AIProviderEnum.OPENCLAW_NEMO in providers_in_chain
assert AIProviderEnum.GEMINI in providers_in_chain
assert AIProviderEnum.CLAUDE in providers_in_chain
# OLLAMA_188 (CPU-only 備援) 仍排除M1 Pro 111 才是 GPU 主推理)
assert AIProviderEnum.OLLAMA_188 not in providers_in_chain
# 188 不得作為 Ollama provider本地備援只允許 ollama_local。
provider_values = {p.value for p in providers_in_chain}
assert "ollama_188" not in provider_values
def test_diagnose_fallback_chain_contains_cloud_providers():
@@ -159,7 +160,7 @@ async def test_diagnose_route_primary_is_ollama():
# 雲端 fallback 仍在OpenClaw / Gemini / Claude 救命備援)
fb_providers = [p for p, _ in decision.fallback_chain]
# ollama_failover_manager 可能轉到 ollama_188但 ollama variant 必須有
# ollama_failover_manager 可能轉到 GCP-B / ollama_local但雲端救命備援仍必須存在。
has_cloud_fallback = (
AIProviderEnum.GEMINI in fb_providers or AIProviderEnum.CLAUDE in fb_providers
)

View File

@@ -83,7 +83,7 @@ async def test_router_uses_failover_when_ollama_initial_provider():
return_value=_make_failover_result(
primary_provider="gemini",
primary_model="gemini-1.5-flash",
fallback=[("ollama_188", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
fallback=[("ollama_local", "qwen2.5:7b-instruct"), ("nemotron", "nvidia/nemotron-mini-4b-instruct")],
)
)
@@ -109,14 +109,14 @@ async def test_router_uses_failover_when_ollama_initial_provider():
@pytest.mark.asyncio
async def test_router_failover_fallback_chain_converted():
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_188"""
"""failover_manager 回傳 fallback_chain → decision.fallback_chain 包含 OLLAMA_LOCAL"""
mock_fm = MagicMock()
mock_fm.select_provider = AsyncMock(
return_value=_make_failover_result(
primary_provider="gemini",
primary_model="gemini-1.5-flash",
fallback=[
("ollama_188", "qwen2.5:7b-instruct"),
("ollama_local", "qwen2.5:7b-instruct"),
("nemotron", "nvidia/nemotron-mini-4b-instruct"),
("claude", "claude-haiku-4-5-20251001"),
],
@@ -134,8 +134,8 @@ async def test_router_failover_fallback_chain_converted():
decision = await router.route("test alert message")
fb_providers = [p for p, _ in decision.fallback_chain]
assert AIProviderEnum.OLLAMA_188 in fb_providers, (
f"OLLAMA_188 not in fallback_chain: {fb_providers}"
assert AIProviderEnum.OLLAMA_LOCAL in fb_providers, (
f"OLLAMA_LOCAL not in fallback_chain: {fb_providers}"
)
assert AIProviderEnum.NEMOTRON in fb_providers
assert AIProviderEnum.CLAUDE in fb_providers

View File

@@ -68,7 +68,7 @@ async def test_alert_failover_dedup(mock_redis, mock_telegram_send):
"to_provider": "gemini",
"reason": "111 unhealthy",
"model": "qwen3:8b",
"fallback_chain_str": "gemini → ollama_188",
"fallback_chain_str": "gemini → ollama_local",
}
# 第 1 次dedup pass發送

View File

@@ -1,16 +1,15 @@
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-04-26 @ Asia/Taipei
# 2026-04-26 Wave5 B4 by Claude Engineer-A4 — E2E executor dispatch 測試
# 驗證 failover 切到 OLLAMA_188 後HTTP 請求真的打到 OLLAMA_FALLBACK_URL
# apps/api/tests/test_failover_e2e_dispatch.py | 2026-05-06 @ Asia/Taipei
# 2026-05-06 Codex — 188 不再作為 Ollama Provider,驗證 ollama_local dispatch
"""
E2Eexecutor dispatch 層驗證
===============================
測試覆蓋(補全 B4 — 整合測試只驗決策層,未驗執行層):
1. registry 確實有 ollama_188 providerB1 修復後基本健全性)
2. Ollama188Provider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
3. Ollama188Provider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
4. Ollama188Provider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL攔截 httpx
5. executor.execute(provider_order=["ollama_188"]) 真的路由到 188 URL
1. registry 確實有 ollama_local provider且沒有 ollama_188 provider
2. OllamaLocalProvider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True
3. OllamaLocalProvider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False
4. OllamaLocalProvider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL攔截 httpx
5. executor.execute(provider_order=["ollama_local"]) 真的路由到 local URL
6. Gemini quota pipeline 並行 5 次不超發B3 atomic 驗證)
7. Gemini quota TTL 第一次呼叫即設定
"""
@@ -28,31 +27,30 @@ import pytest
# =============================================================================
def test_registry_has_ollama_188_provider():
"""B1 基本健全性:_init_registry() 後 registry 必須有 ollama_188"""
def test_registry_has_ollama_local_provider_without_ollama_188():
"""_init_registry() 後 registry 必須有 ollama_local且不得有 ollama_188"""
from src.services.ai_router import _init_registry
registry = _init_registry()
# registry.get() 只返回 is_enabled=True 的 provider
# 用 _providers dict 直接檢查(不管 is_enabled
assert "ollama_188" in registry._providers, (
"ollama_188 not found in registry._providers — B1 fix 未生效"
)
assert "ollama_local" in registry._providers
assert "ollama_188" not in registry._providers
def test_ollama_188_provider_name():
"""Ollama188Provider.name == 'ollama_188'"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_provider_name():
"""OllamaLocalProvider.name == 'ollama_local'"""
from src.services.ai_providers.ollama import OllamaLocalProvider
p = Ollama188Provider()
assert p.name == "ollama_188"
p = OllamaLocalProvider()
assert p.name == "ollama_local"
def test_ollama_188_provider_privacy_level():
"""Ollama188Provider.privacy_level == 'local'(本地推理,可接機密資料)"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_provider_privacy_level():
"""OllamaLocalProvider.privacy_level == 'local'(本地推理,可接機密資料)"""
from src.services.ai_providers.ollama import OllamaLocalProvider
p = Ollama188Provider()
p = OllamaLocalProvider()
assert p.privacy_level == "local"
@@ -61,45 +59,44 @@ def test_ollama_188_provider_privacy_level():
# =============================================================================
def test_ollama_188_is_enabled_with_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_188 未設 → is_enabled == True"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.core.config import get_settings
def test_ollama_local_is_enabled_with_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_LOCAL 未設 → is_enabled == True"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
# patch settings 的 OLLAMA_FALLBACK_URL
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OPENCLAW_TIMEOUT = "60"
p = Ollama188Provider()
p = OllamaLocalProvider()
# 直接 patch module-level settings 物件
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is True
def test_ollama_188_is_disabled_without_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == False188 節點未設定)"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_is_disabled_without_fallback_url(monkeypatch):
"""OLLAMA_FALLBACK_URL 空字串 → is_enabled == Falselocal 節點未設定)"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "true")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "true")
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = ""
p = Ollama188Provider()
p = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is False
def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
"""ENABLE_OLLAMA_188=false → is_enabled == False即使有 URL"""
from src.services.ai_providers.ollama import Ollama188Provider
def test_ollama_local_is_disabled_by_env_flag(monkeypatch):
"""ENABLE_OLLAMA_LOCAL=false → is_enabled == False即使有 URL"""
from src.services.ai_providers.ollama import OllamaLocalProvider
monkeypatch.setenv("ENABLE_OLLAMA_188", "false")
monkeypatch.setenv("ENABLE_OLLAMA_LOCAL", "false")
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
p = Ollama188Provider()
p = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
assert p.is_enabled is False
@@ -110,14 +107,14 @@ def test_ollama_188_is_disabled_by_env_flag(monkeypatch):
@pytest.mark.asyncio
async def test_ollama_188_analyze_dispatches_to_fallback_url():
async def test_ollama_local_analyze_dispatches_to_fallback_url():
"""
B4 核心Ollama188Provider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
攔截 httpx.AsyncClient.post記錄實際呼叫 URL斷言包含 188 IP。
B4 核心OllamaLocalProvider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。
攔截 httpx.AsyncClient.post記錄實際呼叫 URL斷言包含本地 fallback IP。
"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
FALLBACK_URL = "http://192.168.0.188:11434"
FALLBACK_URL = "http://192.168.0.111:11434"
captured_urls: list[str] = []
mock_response = MagicMock()
@@ -149,7 +146,7 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
"top_p": 0.9,
})
provider = Ollama188Provider()
provider = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
with patch("src.services.ai_providers.ollama.get_model_registry", return_value=mock_registry):
@@ -159,45 +156,45 @@ async def test_ollama_188_analyze_dispatches_to_fallback_url():
result = await provider.analyze("test prompt", context={})
assert len(captured_urls) > 0, "analyze() 未發出任何 HTTP 請求"
assert any("192.168.0.188" in url for url in captured_urls), (
f"HTTP 請求未打到 188,實際 URL: {captured_urls}"
assert any("192.168.0.111" in url for url in captured_urls), (
f"HTTP 請求未打到 local fallback,實際 URL: {captured_urls}"
)
assert result.provider == "ollama_188"
assert result.provider == "ollama_local"
@pytest.mark.asyncio
async def test_ollama_188_analyze_returns_error_when_no_fallback_url():
async def test_ollama_local_analyze_returns_error_when_no_fallback_url():
"""OLLAMA_FALLBACK_URL 未設定 → analyze() 應返回 success=False不發 HTTP"""
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
mock_settings = MagicMock()
mock_settings.OLLAMA_FALLBACK_URL = ""
provider = Ollama188Provider()
provider = OllamaLocalProvider()
with patch("src.services.ai_providers.ollama.settings", mock_settings):
result = await provider.analyze("test prompt")
assert result.success is False
assert result.provider == "ollama_188"
assert result.provider == "ollama_local"
assert "OLLAMA_FALLBACK_URL" in (result.error or "")
@pytest.mark.asyncio
async def test_executor_dispatches_ollama_188_to_fallback_url():
async def test_executor_dispatches_ollama_local_to_fallback_url():
"""
B4 執行層AIRouterExecutor.execute(provider_order=["ollama_188"])
應路由到 Ollama188Provider且 HTTP 打到 OLLAMA_FALLBACK_URL。
B4 執行層AIRouterExecutor.execute(provider_order=["ollama_local"])
應路由到 OllamaLocalProvider且 HTTP 打到 OLLAMA_FALLBACK_URL。
"""
from src.services.ai_router import AIProviderRegistry, AIRouterExecutor, reset_ai_router
from src.services.ai_providers.ollama import Ollama188Provider
from src.services.ai_providers.ollama import OllamaLocalProvider
from src.services.ai_providers.interfaces import AIResult
reset_ai_router()
FALLBACK_URL = "http://192.168.0.188:11434"
FALLBACK_URL = "http://192.168.0.111:11434"
captured_urls: list[str] = []
# 建立真實 registry只登錄 ollama_188
# 建立真實 registry只登錄 ollama_local
registry = AIProviderRegistry()
# mock analyze 讓它回傳成功,但驗 URL 路徑
@@ -206,15 +203,15 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
return AIResult(
raw_response='{"action_title":"ok","confidence":0.9}',
success=True,
provider="ollama_188",
provider="ollama_local",
tokens=10,
)
mock_settings_global = MagicMock()
mock_settings_global.OLLAMA_FALLBACK_URL = FALLBACK_URL
# 建立 Ollama188Providermock 其 analyze + is_enabled
provider = Ollama188Provider()
# 建立 OllamaLocalProvidermock 其 analyze + is_enabled
provider = OllamaLocalProvider()
provider.analyze = fake_analyze # type: ignore[method-assign]
# 強制 is_enabled = True繞過 settings patch 的複雜度)
@@ -233,14 +230,14 @@ async def test_executor_dispatches_ollama_188_to_fallback_url():
mock_settings.MOCK_MODE = False
result = await executor.execute(
prompt="test alert",
provider_order=["ollama_188"],
provider_order=["ollama_local"],
context={},
)
assert result.success is True, f"execute 失敗: {result.error}"
assert result.provider == "ollama_188", f"provider 不是 ollama_188: {result.provider}"
assert any("192.168.0.188" in u for u in captured_urls), (
f"HTTP 未打到 188captured: {captured_urls}"
assert result.provider == "ollama_local", f"provider 不是 ollama_local: {result.provider}"
assert any("192.168.0.111" in u for u in captured_urls), (
f"HTTP 未打到 local fallbackcaptured: {captured_urls}"
)

View File

@@ -16,7 +16,7 @@ import httpx
import pytest
# Ollama 伺服器配置
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)
@@ -111,7 +111,7 @@ async def check_ollama_available() -> bool:
@pytest.mark.integration
class TestModelRegression:
"""模型回歸測試 — 需要 Ollama 服務 (192.168.0.188:11434)"""
"""模型回歸測試 — 需要 Ollama 服務(預設 111可用 OLLAMA_URL 覆寫)"""
@pytest.fixture(autouse=True)
async def check_ollama(self):

View File

@@ -90,8 +90,8 @@ class TestProbeOllamaVersion:
assert isinstance(info.captured_at, datetime)
@pytest.mark.asyncio
async def test_success_188_provider(self):
"""188 URL → provider='ollama_188'"""
async def test_success_local_provider(self):
"""111 / local proxy URL → provider='ollama_local'"""
model_entry = {
"name": "deepseek-r1:14b",
"modified_at": "2026-04-02T00:00:00Z",
@@ -106,10 +106,10 @@ class TestProbeOllamaVersion:
with patch("httpx.AsyncClient", return_value=mock_client):
info = await probe_ollama_version(
"http://192.168.0.188:11434", "deepseek-r1:14b"
"http://192.168.0.111:11434", "deepseek-r1:14b"
)
assert info.provider == "ollama_188"
assert info.provider == "ollama_local"
@pytest.mark.asyncio
async def test_model_not_found_raises(self):
@@ -279,7 +279,7 @@ class TestProbeOpenclawNemoVersion:
mock_settings = MagicMock()
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
with patch("src.services.model_version_probe.settings", mock_settings), \
patch("httpx.AsyncClient", return_value=mock_client):
@@ -301,7 +301,7 @@ class TestProbeOpenclawNemoVersion:
mock_settings = MagicMock()
mock_settings.OPENCLAW_DEFAULT_MODEL = "deepseek-r1:14b"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
with patch("src.services.model_version_probe.settings", mock_settings), \
patch("httpx.AsyncClient", return_value=mock_client):
@@ -333,7 +333,7 @@ class TestProbeAllProviders:
"""5 個 provider 全部成功 → 回傳 5 筆 ProviderVersionInfo"""
fake_results = [
ProviderVersionInfo(provider="ollama", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="ollama_188", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="ollama_local", model="qwen2.5:7b-instruct", version="v1"),
ProviderVersionInfo(provider="gemini", model="gemini-1.5-flash", version="gemini-1.5-flash"),
ProviderVersionInfo(provider="claude", model="claude-haiku-4-5-20251001", version="claude-haiku-4-5-20251001"),
ProviderVersionInfo(provider="openclaw_nemo", model="deepseek-r1:14b", version="v1"),
@@ -347,7 +347,7 @@ class TestProbeAllProviders:
mock_settings = MagicMock()
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-AADR-110
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
with patch("src.services.model_version_probe.settings", mock_settings):
@@ -364,8 +364,8 @@ class TestProbeAllProviders:
raise RuntimeError("simulated failure")
async def _fail_ollama(url, model):
if "188" in url:
raise RuntimeError("188 offline")
if "111" in url:
raise RuntimeError("local offline")
return good
with patch("src.services.model_version_probe.probe_ollama_version", side_effect=_fail_ollama), \
@@ -379,13 +379,13 @@ class TestProbeAllProviders:
mock_settings = MagicMock()
mock_settings.OLLAMA_URL = "http://34.143.170.20:11434" # GCP-AADR-110
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.111:11434"
mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
with patch("src.services.model_version_probe.settings", mock_settings):
results = await probe_all_providers()
# ollama(ok) + ollama_188(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
# ollama(ok) + ollama_local(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) → 3
assert len(results) == 3
providers = {r.provider for r in results}
assert "ollama" in providers

View File

@@ -48,7 +48,7 @@ def _make_info(provider: str, version: str = "v1", digest: str | None = "sha256:
def _make_five() -> list[ProviderVersionInfo]:
return [
_make_info("ollama"),
_make_info("ollama_188"),
_make_info("ollama_local"),
_make_info("gemini", digest=None),
_make_info("claude", digest=None),
_make_info("openclaw_nemo"),

View File

@@ -310,7 +310,7 @@ class TestSelectProvider:
)
with patch.object(manager, "_write_failover_audit", return_value=None):
result = await manager.select_provider()
await manager.select_provider()
# 並行 check 三台主機GCP-A / GCP-B / Local
assert mock_monitor.check.call_count == 3
@@ -625,7 +625,6 @@ class TestWriteFailoverAudit:
@pytest.mark.asyncio
async def test_audit_uses_structlog_not_db(self):
"""_write_failover_audit 應呼叫 structlog不呼叫 DB"""
import structlog
manager = _make_manager()
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
@@ -657,22 +656,22 @@ class TestWriteFailoverAudit:
# =============================================================================
# B2: AIProviderEnum.OLLAMA_188 存在
# 2026-04-25 critic-fix Part2 by Claude Engineer-C2
# B2: AIProviderEnum.OLLAMA_LOCAL 存在
# 2026-05-06 Codex — 188 不再作為 Ollama Provider
# =============================================================================
class TestAIProviderEnumOllama188:
"""B2 修復驗證AIProviderEnum.OLLAMA_188 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
class TestAIProviderEnumOllamaLocal:
"""B2 修復驗證AIProviderEnum.OLLAMA_LOCAL 存在且 PROVIDER_LATENCY_BUDGET 有對應值"""
def test_ollama_188_enum_exists(self):
def test_ollama_local_enum_exists(self):
from src.services.ai_router import AIProviderEnum
assert AIProviderEnum.OLLAMA_188.value == "ollama_188"
assert AIProviderEnum.OLLAMA_LOCAL.value == "ollama_local"
def test_ollama_188_in_latency_budget(self):
def test_ollama_local_in_latency_budget(self):
from src.services.ai_router import AIProviderEnum, PROVIDER_LATENCY_BUDGET
assert AIProviderEnum.OLLAMA_188 in PROVIDER_LATENCY_BUDGET
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_188] == 120000
assert AIProviderEnum.OLLAMA_LOCAL in PROVIDER_LATENCY_BUDGET
assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_LOCAL] == 90000
# =============================================================================

View File

@@ -42,7 +42,7 @@ from src.services.ollama_health_monitor import (
# =============================================================================
HOST = "http://34.143.170.20:11434" # GCP-A PrimaryADR-110 2026-05-03
HOST_188 = "http://192.168.0.188:11434" # 歷史遺留參考常數(已移出主路由)
HOST_LOCAL = "http://192.168.0.111:11434" # Local fallback已移出 188 主路由)
@pytest.fixture(autouse=True)

View File

@@ -0,0 +1,231 @@
from __future__ import annotations
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Any
from unittest.mock import AsyncMock
import pytest
from src.services import ai_control as ai_control_module
from src.services import ai_router as ai_router_module
from src.services import openclaw as openclaw_module
from src.services.ai_router import AIProviderEnum
from src.services.intent_classifier import IntentType
from src.services.openclaw import OpenClawService
@dataclass
class _FakeEndpoint:
provider_name: str
url: str = "http://example.test"
class _FakeRoute:
    """Routing result double exposing a fixed, already-ordered endpoint list."""

    def all_endpoints_in_order(self) -> list[_FakeEndpoint]:
        """Return GCP-A, GCP-B, local Ollama, then Gemini (with an empty URL)."""
        ollama_lane = ("ollama_gcp_a", "ollama_gcp_b", "ollama_local")
        endpoints = [_FakeEndpoint(provider) for provider in ollama_lane]
        # The Gemini entry is the only one created with an explicit (empty) URL.
        endpoints.append(_FakeEndpoint("gemini", ""))
        return endpoints
class _FakeFailoverManager:
    """Failover manager double that logs every requested task type."""

    def __init__(self) -> None:
        # Task types received by select_provider, in call order.
        self.task_types: list[str] = []

    async def select_provider(self, task_type: str = "general") -> _FakeRoute:
        """Record ``task_type`` and hand back a fresh ``_FakeRoute``."""
        route = _FakeRoute()
        self.task_types.append(task_type)
        return route
class _UnorderedFailoverManager:
    """Failover manager double whose endpoints come back in a shuffled order."""

    async def select_provider(self, task_type: str = "general") -> SimpleNamespace:
        """Return an object whose ``all_endpoints_in_order`` yields local, gemini, gcp_b, gcp_a."""

        def _shuffled_endpoints() -> list[_FakeEndpoint]:
            # Deliberately NOT in gcp_a -> gcp_b -> local order, with a cloud
            # provider in the middle of the list.
            shuffled_names = ("ollama_local", "gemini", "ollama_gcp_b", "ollama_gcp_a")
            return [_FakeEndpoint(provider) for provider in shuffled_names]

        return SimpleNamespace(all_endpoints_in_order=_shuffled_endpoints)
class _FakeRouter:
    """Router double that always selects Gemini with Claude and Ollama as fallbacks."""

    async def route(self, prompt: str, context: dict[str, Any]) -> SimpleNamespace:
        """Return a canned DIAGNOSE decision; ``prompt`` and ``context`` are ignored."""
        cloud_first_fallbacks = [
            (AIProviderEnum.CLAUDE, "claude"),
            (AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
        ]
        decision = SimpleNamespace(
            selected_provider=AIProviderEnum.GEMINI,
            fallback_chain=cloud_first_fallbacks,
            intent=IntentType.DIAGNOSE,
            routing_reason="high complexity would normally prefer cloud",
        )
        return decision
class _FakeExecutor:
def __init__(self) -> None:
self.provider_order: list[str] | None = None
async def execute(
self,
*,
prompt: str,
provider_order: list[str],
context: dict[str, Any],
cache_ttl: int,
require_local: bool,
) -> SimpleNamespace:
self.provider_order = provider_order
return SimpleNamespace(
raw_response='{"root_cause":"ok","suggested_action":"NO_ACTION"}',
provider=provider_order[0],
success=True,
tokens=42,
cost_usd=0.0,
latency_ms=10.0,
)
@pytest.mark.asyncio
async def test_alert_context_uses_ollama_lane_then_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Alert calls run the Ollama lane first and keep Gemini as the last backup.

    With cloud fallback allowed and Ollama-first enforced, the executor must
    receive gcp_a, gcp_b, local, gemini, and the failover manager must be
    asked for the "diagnose" task type.
    """
    executor_double = _FakeExecutor()
    failover_double = _FakeFailoverManager()

    # Feature flags on the openclaw settings object.
    settings_overrides = {
        "USE_AI_ROUTER": True,
        "MOCK_MODE": False,
        "ALERT_AI_ALLOW_CLOUD_FALLBACK": True,
        "ALERT_AI_ENFORCE_OLLAMA_FIRST": True,
    }
    for flag, value in settings_overrides.items():
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    # Runtime AI-control lookups are stubbed to "no override / not disabled".
    for hook, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, hook, AsyncMock(return_value=answer))

    monkeypatch.setattr(ai_router_module, "get_ai_router", _FakeRouter)
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: executor_double)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: failover_double)

    # Bypass OpenClawService.__init__; only the method under test is exercised.
    service = object.__new__(OpenClawService)
    alert_context = {
        "incident_id": "INC-1",
        "alertname": "HostHighCpuLoad",
        "target_resource": "node-exporter-110",
    }
    result = await service._call_with_fallback("diagnose alert", alert_context=alert_context)

    expected_result = (
        '{"root_cause":"ok","suggested_action":"NO_ACTION"}',
        "ollama_gcp_a",
        True,
        42,
        0.0,
    )
    assert result == expected_result
    assert executor_double.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
    assert failover_double.task_types == ["diagnose"]
@pytest.mark.asyncio
async def test_alert_context_can_disable_cloud_backup_for_cost_stop(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback switched off, alert calls only see the Ollama lane."""
    executor_double = _FakeExecutor()
    failover_double = _FakeFailoverManager()

    # Same flags as the cloud-backup case, except cloud fallback is disabled.
    settings_overrides = {
        "USE_AI_ROUTER": True,
        "MOCK_MODE": False,
        "ALERT_AI_ALLOW_CLOUD_FALLBACK": False,
        "ALERT_AI_ENFORCE_OLLAMA_FIRST": True,
    }
    for flag, value in settings_overrides.items():
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    for hook, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, hook, AsyncMock(return_value=answer))

    monkeypatch.setattr(ai_router_module, "get_ai_router", _FakeRouter)
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: executor_double)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: failover_double)

    # Bypass OpenClawService.__init__; only the method under test is exercised.
    service = object.__new__(OpenClawService)
    alert_context = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    await service._call_with_fallback("diagnose alert", alert_context=alert_context)

    assert executor_double.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_non_alert_context_keeps_router_cloud_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A context without alert fields keeps the router's own provider order."""
    executor_double = _FakeExecutor()

    settings_overrides = {
        "USE_AI_ROUTER": True,
        "MOCK_MODE": False,
        "ALERT_AI_ALLOW_CLOUD_FALLBACK": False,
        "ALERT_AI_ENFORCE_OLLAMA_FIRST": True,
    }
    for flag, value in settings_overrides.items():
        monkeypatch.setattr(openclaw_module.settings, flag, value)

    for hook, answer in (
        ("get_ai_router_enabled", None),
        ("get_primary_provider", None),
        ("is_provider_disabled", False),
    ):
        monkeypatch.setattr(ai_control_module, hook, AsyncMock(return_value=answer))

    monkeypatch.setattr(ai_router_module, "get_ai_router", _FakeRouter)
    monkeypatch.setattr(ai_router_module, "get_ai_executor", lambda: executor_double)

    # Bypass OpenClawService.__init__; only the method under test is exercised.
    service = object.__new__(OpenClawService)
    non_alert_context = {"intent_hint": "query"}
    await service._call_with_fallback("general question", alert_context=non_alert_context)

    # The order matches _FakeRouter's decision: selected provider, then fallbacks.
    assert executor_double.provider_order == ["gemini", "claude", "ollama"]
@pytest.mark.asyncio
async def test_alert_context_uses_gcp_a_gcp_b_then_111_order(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Resolved alert order is gcp_a, gcp_b, local when cloud fallback is off."""
    failover_double = _FakeFailoverManager()

    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", lambda: failover_double)

    # Bypass OpenClawService.__init__; only the resolver is exercised.
    service = object.__new__(OpenClawService)
    alert_context = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    resolved_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert_context,
    )

    assert resolved_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Shuffled failover output is re-sorted and Gemini is dropped when cloud is off."""
    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", False),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)
    # The manager double returns local, gemini, gcp_b, gcp_a in that order.
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", _UnorderedFailoverManager)

    # Bypass OpenClawService.__init__; only the resolver is exercised.
    service = object.__new__(OpenClawService)
    alert_context = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    resolved_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert_context,
    )

    assert resolved_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With cloud fallback on, the sorted Ollama lane is followed by Gemini only."""
    for flag, value in (
        ("ALERT_AI_ALLOW_CLOUD_FALLBACK", True),
        ("ALERT_AI_ENFORCE_OLLAMA_FIRST", True),
    ):
        monkeypatch.setattr(openclaw_module.settings, flag, value)
    # The manager double returns local, gemini, gcp_b, gcp_a in that order.
    monkeypatch.setattr(openclaw_module, "get_ollama_failover_manager", _UnorderedFailoverManager)

    # Bypass OpenClawService.__init__; only the resolver is exercised.
    service = object.__new__(OpenClawService)
    alert_context = {"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}
    router_cloud_order = ["claude", "gemini", "ollama"]
    resolved_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context=alert_context,
        cloud_provider_order=router_cloud_order,
    )

    # Of the router's cloud order, only "gemini" is expected to survive.
    assert resolved_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]

View File

@@ -18,7 +18,7 @@ import pytest
from src.core.prompts import OPENCLAW_TEST_PROMPT
# Ollama 配置
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
TIMEOUT = 300 # 秒 (CPU 推理模式需 ~222-666 秒,見 2026-03-26 評估)

View File

@@ -8,8 +8,8 @@
# 必填 (REQUIRED)
# ----------------------------------------------------------------------------
# API 後端 URLNext.js build-time 寫入 JS bundle,禁止使用內網 IP
NEXT_PUBLIC_API_URL=http://192.168.0.188:32334
# API 後端 URL(Next.js build-time 寫入 JS bundle)
NEXT_PUBLIC_API_URL=https://awoooi.wooo.work
# ----------------------------------------------------------------------------
# 可選 (OPTIONAL)
@@ -19,17 +19,17 @@ NEXT_PUBLIC_API_URL=http://192.168.0.188:32334
NEXT_PUBLIC_ENABLE_DEMO=false
# SignOz 可觀測性平台 URL
NEXT_PUBLIC_SIGNOZ_URL=http://192.168.0.110:3301
NEXT_PUBLIC_SIGNOZ_URL=https://signoz.wooo.work
# 主機 IP 列表(逗號分隔,live-dashboard 用於 fallback 顯示)
NEXT_PUBLIC_HOST_IPS=192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188
NEXT_PUBLIC_HOST_IPS=devops,security,k3s,ai-web
# K8s Cluster VIP 資訊字串(host-grid 顯示用)
NEXT_PUBLIC_K8S_VIP_INFO=VIP 192.168.0.125 · kubectl :6443 · Web :32335 · API :32334
NEXT_PUBLIC_K8S_VIP_INFO=K8S VIP topology (ops-only) · kubectl:6443 · web:32335 · api:32334
# ----------------------------------------------------------------------------
# Server-side Only(不含 NEXT_PUBLIC_ 前綴,不會暴露在 JS bundle)
# ----------------------------------------------------------------------------
# Sentry 自建主機 URL(sentry-tunnel route handler 使用)
SENTRY_HOST=http://192.168.0.110:9000
SENTRY_HOST=http://sentry.internal:9000

View File

@@ -44,6 +44,8 @@ ARG NEXT_PUBLIC_SENTRY_DSN=
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
ENV NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN}
ENV NEXT_TELEMETRY_DISABLED=1
# 2026-05-05 ogt + Codex: keep self-hosted 110 runner builds from saturating CPU.
ENV NEXT_PRIVATE_BUILD_WORKER_COUNT=1
# 2026-04-06 ogt: --mount=type=cache 持久化 .next/cache,跨 build 增量編譯
# 只有變更的頁面重新編譯,未變更頁面直接用 cache → 節省 3-4 min
@@ -51,7 +53,7 @@ ENV NEXT_TELEMETRY_DISABLED=1
# /root/.cache/turbo 存放 turbo 的 task 輸出快取,避免每次重跑未變動的 packages
RUN --mount=type=cache,target=/app/apps/web/.next/cache \
--mount=type=cache,target=/root/.cache/turbo \
pnpm turbo build --filter=@awoooi/web
pnpm turbo build --filter=@awoooi/web --concurrency=1
FROM base AS runner
WORKDIR /app

View File

@@ -67,7 +67,8 @@
"operations": "Operations",
"securityCompliance": "Security & Compliance",
"classicAICenter": "Classic AI Center",
"governance": "AI Governance"
"governance": "AI Governance",
"awooop": "AwoooP"
},
"locale": {
"switch": "Switch Language",
@@ -1480,4 +1481,4 @@
"retry": "Retry"
}
}
}
}

View File

@@ -67,7 +67,8 @@
"operations": "營運",
"securityCompliance": "安全合規",
"classicAICenter": "經典 AI 中心",
"governance": "AI 治理"
"governance": "AI 治理",
"awooop": "AwoooP"
},
"locale": {
"switch": "切換語系",
@@ -1481,4 +1482,4 @@
"retry": "重試"
}
}
}
}

View File

@@ -114,7 +114,7 @@ function ApprovalRow({ approval }: { approval: Approval }) {
<tr
className={cn(
"border-b border-border hover:bg-accent/30 transition-colors",
isCritical && "bg-red-900/10 hover:bg-red-900/20"
isCritical && "bg-[#fff0ef] hover:bg-[#ffe4e1]"
)}
>
<td className="px-4 py-3">
@@ -232,7 +232,7 @@ export default function ApprovalsPage() {
{/* Error State */}
{error && (
<div className="flex items-start gap-3 p-4 bg-red-900/20 border border-red-800/40 rounded-lg">
<div className="flex items-start gap-3 border border-[#e2a29b] bg-[#fff0ef] p-4">
<AlertCircle className="w-5 h-5 text-red-400 flex-shrink-0 mt-0.5" aria-hidden="true" />
<div>
<p className="text-sm font-medium text-red-300"></p>
@@ -243,7 +243,7 @@ export default function ApprovalsPage() {
{/* Empty State — 所有審批已處理 */}
{!loading && approvals.length === 0 && !error && (
<div className="flex flex-col items-center justify-center py-16 bg-card border border-border rounded-xl">
<div className="flex flex-col items-center justify-center border border-[#e0ddd4] bg-white py-16">
<ShieldCheck className="w-12 h-12 text-green-400 mb-3" aria-hidden="true" />
<p className="text-sm font-medium text-foreground mb-1"></p>
<p className="text-xs text-muted-foreground"> Run</p>
@@ -252,7 +252,7 @@ export default function ApprovalsPage() {
{/* Table */}
{(loading || approvals.length > 0) && (
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="審批佇列">
<thead>

View File

@@ -19,20 +19,25 @@ import { cn } from "@/lib/utils";
// Types
// =============================================================================
type ContractStatus = "draft" | "published" | "active";
type ContractStatus = "draft" | "published" | "active" | "revoked";
interface Contract {
id: string;
contract_id: string;
contract_family: string;
project_id: string;
status: ContractStatus;
lifecycle_status: ContractStatus;
body_hash: string;
created_at: string;
}
interface Tenant {
project_id: string;
name: string;
display_name: string;
}
interface ContractsResponse {
contracts?: Contract[];
items?: Contract[];
}
// =============================================================================
@@ -47,21 +52,27 @@ const STATUS_CONFIG: Record<
> = {
draft: {
label: "草稿",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
published: {
label: "已發佈",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
bg: "bg-[#eef5ff]",
text: "text-[#1f5b9b]",
border: "border-[#9bb6d9]",
},
active: {
label: "生效中",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
revoked: {
label: "已撤銷",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
};
@@ -109,7 +120,7 @@ function ContractRow({ contract }: { contract: Contract }) {
</span>
</td>
<td className="px-4 py-3">
<StatusBadge status={contract.status} />
<StatusBadge status={contract.lifecycle_status} />
</td>
<td className="px-4 py-3">
<span className="font-mono text-xs text-muted-foreground bg-muted px-2 py-0.5 rounded">
@@ -140,7 +151,10 @@ export default function ContractsPage() {
useEffect(() => {
fetch(`${API_BASE}/api/v1/platform/tenants`)
.then((r) => r.json())
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
.then((data) => {
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
})
.catch(() => {});
}, []);
@@ -154,8 +168,9 @@ export default function ContractsPage() {
`${API_BASE}/api/v1/platform/contracts?${params.toString()}`
);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data = await res.json();
setContracts(Array.isArray(data.items) ? data.items : []);
const data: ContractsResponse = await res.json();
const rows = Array.isArray(data.contracts) ? data.contracts : data.items;
setContracts(Array.isArray(rows) ? rows : []);
} catch (err) {
setError(err instanceof Error ? err.message : "載入失敗");
} finally {
@@ -192,7 +207,7 @@ export default function ContractsPage() {
</div>
{/* Filters */}
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl">
<div className="flex items-center gap-3 border border-[#e0ddd4] bg-white p-4">
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
<span className="text-sm text-muted-foreground"></span>
<div className="relative">
@@ -205,7 +220,7 @@ export default function ContractsPage() {
<option value=""></option>
{tenants.map((t) => (
<option key={t.project_id} value={t.project_id}>
{t.name || t.project_id}
{t.display_name || t.project_id}
</option>
))}
</select>
@@ -225,7 +240,7 @@ export default function ContractsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="合約清單">
<thead>
@@ -269,7 +284,7 @@ export default function ContractsPage() {
</tr>
) : (
contracts.map((contract) => (
<ContractRow key={contract.id} contract={contract} />
<ContractRow key={contract.contract_id + contract.body_hash} contract={contract} />
))
)}
</tbody>

View File

@@ -6,8 +6,9 @@
"use client";
import { AppLayout } from "@/components/layout";
import { Link, usePathname } from "@/i18n/routing";
import { Building2, FileText, Activity, ShieldCheck } from "lucide-react";
import { Activity, BrainCircuit, Building2, ClipboardList, FileText, ShieldCheck } from "lucide-react";
import { cn } from "@/lib/utils";
// =============================================================================
@@ -15,6 +16,11 @@ import { cn } from "@/lib/utils";
// =============================================================================
const navItems = [
{
label: "工作鏈路",
href: "/awooop/work-items" as const,
icon: ClipboardList,
},
{
label: "租戶管理",
href: "/awooop/tenants" as const,
@@ -43,64 +49,72 @@ const navItems = [
export default function AwoooPLayout({
children,
params,
}: {
children: React.ReactNode;
params: { locale: string };
}) {
const pathname = usePathname();
return (
<div className="min-h-full flex flex-col">
{/* Console Header */}
<div className="bg-card border-b border-border px-6 py-4">
<div className="flex items-center justify-between mb-4">
<div>
<h1 className="text-xl font-bold text-foreground tracking-tight">
AwoooP Operator Console
</h1>
<p className="text-xs text-muted-foreground mt-0.5">
Agent · · Run ·
</p>
</div>
<div className="flex items-center gap-2">
<span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs font-medium bg-brand-accent/10 text-brand-accent border border-brand-accent/20">
<span className="w-1.5 h-1.5 rounded-full bg-brand-accent animate-pulse" />
<AppLayout locale={params.locale} showBackground={false}>
<div className="min-h-[calc(100vh-116px)] bg-[#f7f5ee] border border-[#e0ddd4]">
<div className="border-b border-[#e0ddd4] bg-[#faf9f3] px-5 py-4">
<div className="flex flex-wrap items-center justify-between gap-3">
<div className="flex items-center gap-3">
<span className="flex h-9 w-9 items-center justify-center border border-[#d8d3c7] bg-white text-[#141413]">
<BrainCircuit className="h-4 w-4" aria-hidden="true" />
</span>
<div>
<h1 className="text-lg font-semibold tracking-normal text-[#141413]">
AwoooP Operator Console
</h1>
<div className="mt-1 flex items-center gap-2 text-xs text-[#77736a]">
<span className="font-mono">Control Plane</span>
<span className="h-1 w-1 rounded-full bg-[#d97757]" />
<span className="font-mono">Shadow First</span>
</div>
</div>
</div>
<span className="inline-flex items-center gap-2 border border-[#d8d3c7] bg-white px-3 py-1.5 text-xs font-semibold text-[#141413]">
<span className="h-1.5 w-1.5 rounded-full bg-[#22c55e]" />
OPERATOR
</span>
</div>
<nav
className="mt-4 flex flex-wrap gap-1"
role="navigation"
aria-label="AwoooP 主要導航"
>
{navItems.map((item) => {
const Icon = item.icon;
const isActive =
pathname === item.href ||
pathname?.startsWith(item.href + "/");
return (
<Link
key={item.href}
href={item.href}
aria-current={isActive ? "page" : undefined}
className={cn(
"inline-flex items-center gap-2 border px-3 py-2 text-sm font-medium transition-colors",
isActive
? "border-[#d97757] bg-white text-[#141413]"
: "border-transparent text-[#77736a] hover:border-[#d8d3c7] hover:bg-white hover:text-[#141413]"
)}
>
<Icon className="h-4 w-4" aria-hidden="true" />
{item.label}
</Link>
);
})}
</nav>
</div>
{/* Tab Navigation */}
<nav className="flex gap-1" role="navigation" aria-label="AwoooP 主要導航">
{navItems.map((item) => {
const Icon = item.icon;
const isActive =
pathname === item.href ||
pathname?.startsWith(item.href + "/");
return (
<Link
key={item.href}
href={item.href}
aria-current={isActive ? "page" : undefined}
className={cn(
"flex items-center gap-2 px-4 py-2 rounded-lg text-sm font-medium transition-all duration-150",
isActive
? "bg-brand-accent/15 text-brand-accent border border-brand-accent/30"
: "text-muted-foreground hover:text-foreground hover:bg-accent"
)}
>
<Icon className="w-4 h-4" aria-hidden="true" />
{item.label}
</Link>
);
})}
</nav>
<main className="px-5 py-5">{children}</main>
</div>
{/* Page Content */}
<main className="flex-1 px-6 py-6">
{children}
</main>
</div>
</AppLayout>
);
}

View File

@@ -2,8 +2,12 @@
// WOOO AIOps - AwoooP Console 入口重導向
// =============================================================================
import { redirect } from "@/i18n/routing";
import { redirect } from "next/navigation";
export default function AwoooPPage() {
redirect("/awooop/tenants");
export default function AwoooPPage({
params,
}: {
params: { locale: string };
}) {
redirect(`/${params.locale}/awooop/work-items`);
}

View File

@@ -23,16 +23,14 @@ import { cn } from "@/lib/utils";
// =============================================================================
type RunState =
| "CREATED"
| "QUEUED"
| "POLICY_RESOLVED"
| "RUNNING"
| "WAITING_TOOL"
| "WAITING_APPROVAL"
| "RESUMED"
| "COMPLETED"
| "FAILED"
| "CANCELLED";
| "pending"
| "running"
| "waiting_tool"
| "waiting_approval"
| "completed"
| "failed"
| "cancelled"
| "timeout";
interface Run {
run_id: string;
@@ -40,18 +38,19 @@ interface Run {
agent_id: string;
state: RunState;
is_shadow: boolean;
token_usage_input: number | null;
token_usage_output: number | null;
cost_usd: number | string;
step_count: number;
created_at: string;
}
interface Tenant {
project_id: string;
name: string;
display_name: string;
}
interface RunsResponse {
items: Run[];
runs?: Run[];
items?: Run[];
total: number;
page: number;
per_page: number;
@@ -69,66 +68,54 @@ const STATE_CONFIG: Record<
RunState,
{ label: string; bg: string; text: string; border: string; pulse?: boolean }
> = {
CREATED: {
label: "已建立",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
pending: {
label: "待執行",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
QUEUED: {
label: "排隊中",
bg: "bg-gray-800",
text: "text-gray-400",
border: "border-gray-600",
},
POLICY_RESOLVED: {
label: "策略已解析",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
},
RUNNING: {
running: {
label: "執行中",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
pulse: true,
},
WAITING_TOOL: {
waiting_tool: {
label: "等待工具",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
WAITING_APPROVAL: {
waiting_approval: {
label: "等待審批",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
RESUMED: {
label: "已恢復",
bg: "bg-purple-900/40",
text: "text-purple-300",
border: "border-purple-600/40",
},
COMPLETED: {
completed: {
label: "已完成",
bg: "bg-green-900/40",
text: "text-green-400",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
FAILED: {
failed: {
label: "失敗",
bg: "bg-red-900/40",
text: "text-red-300",
border: "border-red-600/40",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
CANCELLED: {
cancelled: {
label: "已取消",
bg: "bg-red-900/30",
text: "text-red-400",
border: "border-red-700/40",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
timeout: {
label: "已超時",
bg: "bg-[#fff0ef]",
text: "text-[#9f2f25]",
border: "border-[#e2a29b]",
},
};
@@ -137,7 +124,7 @@ const STATE_CONFIG: Record<
// =============================================================================
function RunStateBadge({ state }: { state: RunState }) {
const config = STATE_CONFIG[state] ?? STATE_CONFIG.CREATED;
const config = STATE_CONFIG[state] ?? STATE_CONFIG.pending;
return (
<span
className={cn(
@@ -158,7 +145,7 @@ function RunStateBadge({ state }: { state: RunState }) {
function ShadowBadge({ isShadow }: { isShadow: boolean }) {
if (!isShadow) return <span className="text-muted-foreground text-sm">--</span>;
return (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-800 text-gray-400 border border-gray-600">
<span className="inline-flex items-center border border-[#d8d3c7] bg-white px-2 py-0.5 text-xs font-medium text-[#5f5b52]">
Shadow
</span>
);
@@ -174,8 +161,7 @@ function RunRow({ run }: { run: Run }) {
})
: "--";
const totalTokens =
(run.token_usage_input ?? 0) + (run.token_usage_output ?? 0);
const cost = Number(run.cost_usd ?? 0);
return (
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
@@ -202,12 +188,12 @@ function RunRow({ run }: { run: Run }) {
</td>
<td className="px-4 py-3">
<span className="flex items-center gap-1 text-sm font-mono text-muted-foreground">
{totalTokens > 0 ? (
{run.step_count > 0 || cost > 0 ? (
<>
<Cpu className="w-3.5 h-3.5" aria-hidden="true" />
{totalTokens.toLocaleString()}
{run.step_count.toLocaleString()} steps
<span className="text-xs text-muted-foreground/60">
({run.token_usage_input ?? 0} {run.token_usage_output ?? 0})
(${cost.toFixed(4)})
</span>
</>
) : (
@@ -244,7 +230,10 @@ export default function RunsPage() {
useEffect(() => {
fetch(`${API_BASE}/api/v1/platform/tenants`)
.then((r) => r.json())
.then((data) => setTenants(Array.isArray(data.items) ? data.items : []))
.then((data) => {
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
})
.catch(() => {});
}, []);
@@ -253,7 +242,7 @@ export default function RunsPage() {
setError(null);
const params = new URLSearchParams();
if (projectFilter) params.set("project_id", projectFilter);
if (statusFilter) params.set("status", statusFilter);
if (statusFilter) params.set("state", statusFilter);
params.set("page", String(page));
params.set("per_page", String(PER_PAGE));
@@ -262,7 +251,8 @@ export default function RunsPage() {
);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data: RunsResponse = await res.json();
setRuns(Array.isArray(data.items) ? data.items : []);
const rows = Array.isArray(data.runs) ? data.runs : data.items;
setRuns(Array.isArray(rows) ? rows : []);
setTotal(data.total ?? 0);
setLastRefresh(new Date());
} catch (err) {
@@ -320,7 +310,7 @@ export default function RunsPage() {
</div>
{/* Filters */}
<div className="flex items-center gap-3 p-4 bg-card border border-border rounded-xl flex-wrap">
<div className="flex flex-wrap items-center gap-3 border border-[#e0ddd4] bg-white p-4">
<Filter className="w-4 h-4 text-muted-foreground flex-shrink-0" aria-hidden="true" />
<span className="text-sm text-muted-foreground"></span>
@@ -335,7 +325,7 @@ export default function RunsPage() {
<option value=""></option>
{tenants.map((t) => (
<option key={t.project_id} value={t.project_id}>
{t.name || t.project_id}
{t.display_name || t.project_id}
</option>
))}
</select>
@@ -373,7 +363,7 @@ export default function RunsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="Run 清單">
<thead>
@@ -394,7 +384,7 @@ export default function RunsPage() {
Shadow
</th>
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">
Token
/ Steps
</th>
<th className="text-left px-4 py-3 text-xs font-medium text-muted-foreground uppercase tracking-wider">

View File

@@ -21,6 +21,7 @@ import { cn } from "@/lib/utils";
// =============================================================================
type MigrationMode =
| "legacy_awoooi_default"
| "shadow"
| "canary"
| "read_only"
@@ -29,14 +30,15 @@ type MigrationMode =
interface Tenant {
project_id: string;
name: string;
display_name: string;
migration_mode: MigrationMode;
budget_limit_usd: number | null;
is_suspended: boolean;
budget_limit_usd: number | string | null;
is_active: boolean;
}
interface ApiResponse {
items: Tenant[];
tenants?: Tenant[];
items?: Tenant[];
total: number;
}
@@ -50,35 +52,41 @@ const MIGRATION_MODE_CONFIG: Record<
MigrationMode,
{ label: string; bg: string; text: string; border: string }
> = {
legacy_awoooi_default: {
label: "Legacy",
bg: "bg-white",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
shadow: {
label: "Shadow",
bg: "bg-gray-800",
text: "text-gray-300",
border: "border-gray-600",
bg: "bg-[#f4f1e8]",
text: "text-[#5f5b52]",
border: "border-[#d8d3c7]",
},
canary: {
label: "Canary",
bg: "bg-yellow-900/40",
text: "text-yellow-300",
border: "border-yellow-600/40",
bg: "bg-[#fff7e8]",
text: "text-[#8a5a08]",
border: "border-[#d9b36f]",
},
read_only: {
label: "Read Only",
bg: "bg-blue-900/40",
text: "text-blue-300",
border: "border-blue-600/40",
bg: "bg-[#eef5ff]",
text: "text-[#1f5b9b]",
border: "border-[#9bb6d9]",
},
suggest: {
label: "Suggest",
bg: "bg-purple-900/40",
text: "text-purple-300",
border: "border-purple-600/40",
bg: "bg-[#f6f0ff]",
text: "text-[#6541a5]",
border: "border-[#baa7de]",
},
auto_remediate: {
label: "Auto Remediate",
bg: "bg-green-900/40",
text: "text-green-300",
border: "border-green-600/40",
bg: "bg-[#f0faf2]",
text: "text-[#17602a]",
border: "border-[#9bc7a4]",
},
};
@@ -104,12 +112,12 @@ function MigrationModeBadge({ mode }: { mode: MigrationMode }) {
function SuspendedBadge({ suspended }: { suspended: boolean }) {
return suspended ? (
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-red-900/40 text-red-300 border border-red-600/40">
<span className="inline-flex items-center gap-1 border border-[#e2a29b] bg-[#fff0ef] px-2.5 py-0.5 text-xs font-medium text-[#9f2f25]">
<Ban className="w-3 h-3" aria-hidden="true" />
</span>
) : (
<span className="inline-flex items-center gap-1 px-2.5 py-0.5 rounded-md text-xs font-medium bg-green-900/40 text-green-300 border border-green-600/40">
<span className="inline-flex items-center gap-1 border border-[#9bc7a4] bg-[#f0faf2] px-2.5 py-0.5 text-xs font-medium text-[#17602a]">
<CheckCircle2 className="w-3 h-3" aria-hidden="true" />
</span>
@@ -117,6 +125,9 @@ function SuspendedBadge({ suspended }: { suspended: boolean }) {
}
function TenantRow({ tenant }: { tenant: Tenant }) {
const budget =
tenant.budget_limit_usd == null ? null : Number(tenant.budget_limit_usd);
return (
<tr className="border-b border-border hover:bg-accent/30 transition-colors">
<td className="px-4 py-3">
@@ -125,7 +136,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
</span>
</td>
<td className="px-4 py-3">
<span className="text-sm text-foreground font-medium">{tenant.name || "--"}</span>
<span className="text-sm text-foreground font-medium">{tenant.display_name || "--"}</span>
</td>
<td className="px-4 py-3">
<MigrationModeBadge mode={tenant.migration_mode} />
@@ -135,7 +146,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
{tenant.budget_limit_usd != null ? (
<>
<DollarSign className="w-3.5 h-3.5" aria-hidden="true" />
{tenant.budget_limit_usd.toLocaleString("en-US", {
{budget?.toLocaleString("en-US", {
minimumFractionDigits: 2,
})}
</>
@@ -145,7 +156,7 @@ function TenantRow({ tenant }: { tenant: Tenant }) {
</span>
</td>
<td className="px-4 py-3">
<SuspendedBadge suspended={tenant.is_suspended} />
<SuspendedBadge suspended={!tenant.is_active} />
</td>
</tr>
);
@@ -166,7 +177,8 @@ export default function TenantsPage() {
const res = await fetch(`${API_BASE}/api/v1/platform/tenants`);
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const data: ApiResponse = await res.json();
setTenants(Array.isArray(data.items) ? data.items : []);
const rows = Array.isArray(data.tenants) ? data.tenants : data.items;
setTenants(Array.isArray(rows) ? rows : []);
} catch (err) {
setError(err instanceof Error ? err.message : "載入失敗");
} finally {
@@ -214,7 +226,7 @@ export default function TenantsPage() {
)}
{/* Table */}
<div className="bg-card border border-border rounded-xl overflow-hidden shadow-sm">
<div className="overflow-hidden border border-[#e0ddd4] bg-white shadow-[0_1px_4px_rgba(0,0,0,0.05)]">
<div className="overflow-x-auto">
<table className="w-full" role="table" aria-label="租戶清單">
<thead>

View File

@@ -0,0 +1,254 @@
// =============================================================================
// WOOO AIOps - AwoooP 工作鏈路
// =============================================================================
// 將 AwoooP 實施項目對齊到 Operator Console 可觀測面。
"use client";
import {
Activity,
ArrowRight,
ClipboardList,
Database,
Gauge,
GitBranch,
Network,
ShieldCheck,
} from "lucide-react";
import { Link } from "@/i18n/routing";
import { cn } from "@/lib/utils";
// Rollout state of a work item; each value has a badge entry in `statusConfig`.
type WorkStatus = "live" | "in_progress" | "blocked" | "watching";
// One row of the work-chain table rendered by this page.
type WorkItem = {
// Phase tag shown in the first column (the entries in this file use "P0".."P2").
phase: string;
// Human-readable name of the work item.
title: string;
// Current rollout state; rendered through StatusBadge.
status: WorkStatus;
// Name of the console surface(s) where the item is observable (display text only).
surface: string;
// Data source the item is tied to; rendered in monospace next to a database icon.
source: string;
// Acceptance gate / constraint text shown in the "Gate" column.
gate: string;
// Console route the row links to; restricted to the listed AwoooP sub-pages.
href: "/awooop/tenants" | "/awooop/contracts" | "/awooop/runs" | "/awooop/approvals";
};
// Badge label and Tailwind border/background/text classes for each WorkStatus.
// Read by StatusBadge; the Record type forces an entry for every status value.
const statusConfig: Record<WorkStatus, { label: string; className: string }> = {
live: {
label: "已接線",
className: "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]",
},
in_progress: {
label: "推進中",
className: "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]",
},
blocked: {
label: "阻塞",
className: "border-[#e2a29b] bg-[#fff0ef] text-[#9f2f25]",
},
watching: {
label: "觀察期",
className: "border-[#9bb6d9] bg-[#eef5ff] text-[#1f5b9b]",
},
};
// Static list of AwoooP implementation work items shown on this page.
// The entries are hard-coded literals; nothing in this array is fetched at
// runtime, so status changes require editing this file.
const workItems: WorkItem[] = [
{
phase: "P0",
title: "AI 路由以 GCP-A/GCP-B/111 Ollama 優先",
status: "live",
surface: "Run 監控",
source: "ai_routing_decision / ollama_failover_decision",
gate: "Gemini 僅能作為 fallback",
href: "/awooop/runs",
},
{
phase: "P0",
title: "飛輪 KPI 改讀 auto_repair_executions",
status: "in_progress",
surface: "工作鏈路 / 系統報告",
source: "auto_repair_executions",
gate: "修復率不得再讀 incidents.outcome",
href: "/awooop/runs",
},
{
phase: "P0",
title: "審批與 Run State 對齊",
status: "live",
surface: "審批佇列",
source: "awooop_run_state",
gate: "waiting_approval 才能 decide",
href: "/awooop/approvals",
},
{
phase: "P1",
title: "Contract Lifecycle",
status: "watching",
surface: "合約儀表板",
source: "awooop_contract_revisions",
gate: "draft → published → active",
href: "/awooop/contracts",
},
{
phase: "P1",
title: "Tenant Migration State",
status: "watching",
surface: "租戶管理",
source: "awooop_projects",
gate: "shadow gate 需量化",
href: "/awooop/tenants",
},
{
phase: "P1",
title: "MCP Gateway 與 Context Firewall",
status: "watching",
surface: "Run 監控",
source: "mcp_gateway audit / redaction",
gate: "tool call 必須帶 project_id",
href: "/awooop/runs",
},
{
phase: "P2",
title: "Communication Hub",
status: "watching",
surface: "Run 監控 / 審批佇列",
source: "conversation_event / outbound_message",
gate: "Telegram 先 mirror 再切流",
href: "/awooop/runs",
},
{
phase: "P2",
title: "Operator Console 正式接入主站",
status: "in_progress",
surface: "AwoooP",
source: "apps/web/src/app/[locale]/awooop",
gate: "/zh-TW/awooop 不得再 redirect 異常",
href: "/awooop/tenants",
},
];
/** Renders the bordered pill for a work item's status, styled from `statusConfig`. */
function StatusBadge({ status }: { status: WorkStatus }) {
  const { label, className: tone } = statusConfig[status];
  const classes = cn(
    "inline-flex items-center border px-2 py-0.5 text-xs font-semibold",
    tone
  );
  return <span className={classes}>{label}</span>;
}
// Number of entries in `workItems` that are currently in the given status.
const countByStatus = (status: WorkStatus): number =>
  workItems.reduce((n, item) => (item.status === status ? n + 1 : n), 0);

// Summary tiles rendered above the table: one per status, with its count and icon.
const summary = [
  { label: "Live", value: countByStatus("live"), icon: Activity },
  { label: "In Progress", value: countByStatus("in_progress"), icon: GitBranch },
  { label: "Watching", value: countByStatus("watching"), icon: Gauge },
  { label: "Blocked", value: countByStatus("blocked"), icon: ShieldCheck },
];
/**
 * AwoooP work-items page.
 *
 * Renders three sections: a page heading with the total item count, one
 * summary tile per entry of `summary`, and a table with one row per entry of
 * `workItems`, each linking to the console route in `item.href`.
 *
 * Accessibility: the heading, every column header and every row link carry
 * text, so none of them is announced as blank by assistive technology.
 */
export default function AwoooPWorkItemsPage() {
  return (
    <div className="space-y-5">
      <div className="flex flex-wrap items-center justify-between gap-3">
        <div className="flex items-center gap-3">
          <ClipboardList className="h-5 w-5 text-[#d97757]" aria-hidden="true" />
          <div>
            {/* Same wording as the table's aria-label so the page has a named heading. */}
            <h2 className="text-lg font-semibold tracking-normal text-[#141413]">
              工作鏈路
            </h2>
            <p className="text-xs text-[#77736a]">
              {workItems.length} items
            </p>
          </div>
        </div>
      </div>
      {/* gap-px over a coloured background draws 1px divider lines between tiles. */}
      <div className="grid gap-px border border-[#e0ddd4] bg-[#e0ddd4] md:grid-cols-4">
        {summary.map((item) => {
          const Icon = item.icon;
          return (
            <div key={item.label} className="bg-white px-4 py-3">
              <div className="flex items-center justify-between">
                <span className="text-xs font-medium text-[#77736a]">{item.label}</span>
                <Icon className="h-4 w-4 text-[#87867f]" aria-hidden="true" />
              </div>
              <div className="mt-2 font-mono text-2xl font-semibold text-[#141413]">
                {item.value}
              </div>
            </div>
          );
        })}
      </div>
      <div className="overflow-hidden border border-[#e0ddd4] bg-white">
        <div className="overflow-x-auto">
          <table className="w-full" role="table" aria-label="AwoooP 工作鏈路">
            <thead>
              <tr className="border-b border-[#e0ddd4] bg-[#faf9f3]">
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Phase
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Item
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Status
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Surface
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Source
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Gate
                </th>
                <th className="px-4 py-3 text-left text-xs font-semibold text-[#77736a]">
                  Link
                </th>
              </tr>
            </thead>
            <tbody>
              {workItems.map((item) => (
                <tr key={`${item.phase}-${item.title}`} className="border-b border-[#eee9dd] last:border-b-0">
                  <td className="px-4 py-3 font-mono text-xs font-semibold text-[#141413]">
                    {item.phase}
                  </td>
                  <td className="px-4 py-3 text-sm font-medium text-[#141413]">
                    {item.title}
                  </td>
                  <td className="px-4 py-3">
                    <StatusBadge status={item.status} />
                  </td>
                  <td className="px-4 py-3 text-sm text-[#5f5b52]">
                    {item.surface}
                  </td>
                  <td className="px-4 py-3">
                    <span className="inline-flex items-center gap-1.5 font-mono text-xs text-[#5f5b52]">
                      <Database className="h-3.5 w-3.5" aria-hidden="true" />
                      {item.source}
                    </span>
                  </td>
                  <td className="px-4 py-3">
                    <span className="inline-flex items-center gap-1.5 text-sm text-[#5f5b52]">
                      <Network className="h-3.5 w-3.5" aria-hidden="true" />
                      {item.gate}
                    </span>
                  </td>
                  <td className="px-4 py-3">
                    {/* The icon is aria-hidden, so the link needs text of its own;
                        the aria-label adds the destination so rows are distinguishable. */}
                    <Link
                      href={item.href}
                      aria-label={`Open ${item.surface}`}
                      className="inline-flex items-center gap-1.5 border border-[#d8d3c7] px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
                    >
                      Open
                      <ArrowRight className="h-3.5 w-3.5" aria-hidden="true" />
                    </Link>
                  </td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      </div>
    </div>
  );
}

View File

@@ -248,7 +248,7 @@ function MonitoringTools() {
// =============================================================================
const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; role?: string }> = {
'192.168.0.110': {
'devops': {
services: [
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
@@ -258,12 +258,12 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
],
},
'192.168.0.112': {
'security': {
services: [
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
],
},
'192.168.0.120': {
'k3s-1': {
isK3s: true,
role: 'Control Plane #1',
services: [
@@ -273,7 +273,7 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
],
},
'192.168.0.121': {
'k3s-2': {
isK3s: true,
role: 'Control Plane #2 (HA)',
services: [
@@ -283,7 +283,7 @@ const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; r
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
],
},
'192.168.0.188': {
'ai-web': {
services: [
{ name: 'Nginx', healthy: false, port: 443, description: 'Reverse Proxy' },
{ name: 'PostgreSQL', healthy: false, port: 5432, description: 'K3s Datastore' },
@@ -781,9 +781,9 @@ export default function Home({ params }: { params: { locale: string } }) {
buildHostInfo(h.ip, h.name, h.metrics?.cpu_percent ?? null, h.metrics?.memory_percent ?? null, h.services)
)
// K3s #2 (121) 若 API 未回傳,補靜態卡
const has121 = apiHosts.some(h => h.ip === '192.168.0.121')
if (!has121) {
apiHosts.push(buildHostInfo('192.168.0.121', 'K3s Server #2', null, null, []))
const hasK3s2 = apiHosts.some(h => h.ip === 'k3s-2')
if (!hasK3s2) {
apiHosts.push(buildHostInfo('k3s-2', 'K3s Server #2', null, null, []))
}
return apiHosts
})()} />

View File

@@ -12,7 +12,7 @@ import {
ShieldCheck,
} from 'lucide-react'
const GITEA_ACTIONS_URL = 'http://192.168.0.110:3001/wooo/awoooi/actions'
const GITEA_ACTIONS_URL = process.env.NEXT_PUBLIC_GITEA_URL ? `${process.env.NEXT_PUBLIC_GITEA_URL}/wooo/awoooi/actions` : '#'
const agents = [
{ name: 'Hermes', role: '變更摘要與規則脈絡', state: 'wired' },
@@ -63,7 +63,7 @@ export default function CodeReviewPage({ params }: { params: { locale: string }
Source
</div>
<div className="mt-3 text-lg font-semibold text-white">gitea main</div>
<div className="mt-1 text-xs text-gray-500">192.168.0.110:3001</div>
<div className="mt-1 text-xs text-gray-500">gitea internal</div>
</div>
<div className="rounded border border-gray-800 bg-gray-950 p-4">
<div className="flex items-center gap-2 text-xs text-gray-400">

View File

@@ -36,6 +36,84 @@ import { FlywheelKPICard } from '@/components/dashboard/flywheel-kpi-card'
const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
// Canonical list of logical host IDs; `as const` keeps the literal types so the
// union below can be derived from it.
const HOST_ID_LIST = ['devops', 'ai-data', 'k3s-master', 'k3s-worker'] as const
// Derived from HOST_ID_LIST so the type and the runtime list cannot drift apart.
type HostNodeId = (typeof HOST_ID_LIST)[number]
// Static per-host metadata: the service list plus optional K3s flag and role text.
type HostCatalog = Record<HostNodeId, { services: HostService[]; isK3s?: boolean; role?: string }>
// Display label per host ID. Every entry currently maps an ID to itself.
// NOTE(review): presumably passed as the `ipLabel` argument of buildHostInfo in
// place of a raw IP address; confirm at the call site.
const HOST_IP_LABELS: Record<HostNodeId, string> = {
'devops': 'devops',
'ai-data': 'ai-data',
'k3s-master': 'k3s-master',
'k3s-worker': 'k3s-worker',
}
// Static service catalog keyed by logical host ID.
// Every service is listed with `healthy: false`; buildHostInfo looks a host up
// here and matches API-reported services to these entries by case-insensitive
// name. Entries without a `port` are the K3s-internal ones (Traefik, namespace,
// keepalived).
const HOST_CATALOG: HostCatalog = {
'devops': {
services: [
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
{ name: 'Sentry', healthy: false, port: 9000, description: 'Error Tracking' },
{ name: 'Langfuse', healthy: false, port: 3100, description: 'LLM Tracing' },
{ name: 'Grafana', healthy: false, port: 3002, description: '監控面板' },
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
],
},
'ai-data': {
services: [
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
],
},
'k3s-master': {
isK3s: true,
role: 'Control Plane #1',
services: [
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
{ name: 'Traefik', healthy: false, description: 'Ingress', isK3s: true },
{ name: 'awoooi-prod', healthy: false, description: 'Namespace', isK3s: true },
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
],
},
'k3s-worker': {
isK3s: true,
role: 'Control Plane #2 (HA)',
services: [
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
{ name: 'API', healthy: false, port: 32334, description: 'NodePort', isK3s: true },
{ name: 'Web', healthy: false, port: 32335, description: 'NodePort', isK3s: true },
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
],
},
}
// Hard-coded cpu/ram figures per host ID; k3s-worker has none (null).
// NOTE(review): presumably percentages shown when the hosts API returns no
// data; confirm units and usage at the call site.
const FALLBACK_HOSTS: Array<{ id: HostNodeId; cpu: number | null; ram: number | null }> = [
{ id: 'devops', cpu: 35, ram: 55 },
{ id: 'ai-data', cpu: 67, ram: 72 },
{ id: 'k3s-master', cpu: 45, ram: 60 },
{ id: 'k3s-worker', cpu: null, ram: null },
]
// Host IDs to display, read from NEXT_PUBLIC_HOST_IPS as a comma-separated list.
// Despite the variable's name, entries are matched against the logical host IDs
// in HOST_ID_LIST, not IP addresses; unrecognised entries are dropped.
const HOSTS_FROM_ENV: HostNodeId[] = (process.env.NEXT_PUBLIC_HOST_IPS ?? '')
  .split(',')
  .map((id) => id.trim())
  .filter((id): id is HostNodeId => HOST_ID_LIST.includes(id as HostNodeId))
// Fall back to every known host when the env var is unset or has no valid IDs.
// The fallback is built from HOST_ID_LIST and the constant is annotated so it
// stays HostNodeId[]; a bare ['devops', ...] literal would widen to string[]
// and would have to be kept in sync with HOST_ID_LIST by hand.
const HOST_IDS: HostNodeId[] = HOSTS_FROM_ENV.length > 0 ? HOSTS_FROM_ENV : [...HOST_ID_LIST]
// Label key per host ID.
// NOTE(review): the values look like i18n message keys (e.g. 'hostDevops');
// confirm against the translation files and the lookup site.
const HOST_LABEL_BY_ID: Record<HostNodeId, string> = {
'devops': 'hostDevops',
'ai-data': 'hostAiData',
'k3s-master': 'hostK3sMaster',
'k3s-worker': 'hostK3sWorker',
}
// Maps each host ID to the string 'topology:<id>'.
// NOTE(review): the name suggests this stands in where an IP address used to be
// shown; usage is not visible here, so confirm before relying on it.
const HOST_ID_TO_IP_HINT: Record<HostNodeId, string> = {
'devops': 'topology:devops',
'ai-data': 'topology:ai-data',
'k3s-master': 'topology:k3s-master',
'k3s-worker': 'topology:k3s-worker',
}
// =============================================================================
// Tab 2: 告警 & 授權 (串接真實 API)
// =============================================================================
@@ -497,63 +575,16 @@ function MonitoringTools() {
// Full service list for each host (the API returns only part of it; the static details are filled in here)
// =============================================================================
const HOST_CATALOG: Record<string, { services: HostService[]; isK3s?: boolean; role?: string }> = {
'192.168.0.110': {
services: [
{ name: 'Harbor', healthy: false, port: 5000, description: 'Container Registry' },
{ name: 'Gitea', healthy: false, port: 3001, description: 'Git · CI/CD' },
{ name: 'Sentry', healthy: false, port: 9000, description: 'Error Tracking' },
{ name: 'Langfuse', healthy: false, port: 3100, description: 'LLM Tracing' },
{ name: 'Grafana', healthy: false, port: 3002, description: '監控面板' },
{ name: 'Prometheus', healthy: false, port: 9090, description: '告警規則' },
],
},
'192.168.0.112': {
services: [
{ name: 'Scanner API', healthy: false, port: 8080, description: '漏洞掃描' },
],
},
'192.168.0.120': {
isK3s: true,
role: 'Control Plane #1',
services: [
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
{ name: 'Traefik', healthy: false, description: 'Ingress', isK3s: true },
{ name: 'awoooi-prod', healthy: false, description: 'Namespace', isK3s: true },
{ name: 'keepalived', healthy: false, description: 'VIP MASTER', isK3s: true },
],
},
'192.168.0.121': {
isK3s: true,
role: 'Control Plane #2 (HA)',
services: [
{ name: 'K3s API', healthy: false, port: 6443, description: 'kubectl', isK3s: true },
{ name: 'API', healthy: false, port: 32334, description: 'NodePort', isK3s: true },
{ name: 'Web', healthy: false, port: 32335, description: 'NodePort', isK3s: true },
{ name: 'keepalived', healthy: false, description: 'VIP BACKUP', isK3s: true },
],
},
'192.168.0.188': {
services: [
{ name: 'Nginx', healthy: false, port: 443, description: 'Reverse Proxy' },
{ name: 'PostgreSQL', healthy: false, port: 5432, description: 'K3s Datastore' },
{ name: 'Redis', healthy: false, port: 6380, description: 'Cache' },
{ name: 'Ollama', healthy: false, port: 11434, description: 'LLM' },
{ name: 'OpenClaw', healthy: false, port: 8088, description: 'AI Agent' },
{ name: 'SigNoz', healthy: false, port: 3301, description: 'APM · OTEL' },
],
},
}
/** 合併 API 動態健康狀態 + 靜態服務清單 */
function buildHostInfo(
ip: string,
hostId: HostNodeId,
hostname: string,
ipLabel: string,
cpuPct: number | null,
ramPct: number | null,
dynamicServices: { name: string; status: string }[],
): HostInfo {
const catalog = HOST_CATALOG[ip]
const catalog = HOST_CATALOG[hostId]
const services: HostService[] = catalog
? catalog.services.map(s => {
const dyn = dynamicServices.find(d => d.name.toLowerCase() === s.name.toLowerCase())
@@ -568,7 +599,7 @@ function buildHostInfo(
}))
return {
hostname,
ip,
ip: ipLabel,
cpuPct,
ramPct,
services,
@@ -976,8 +1007,8 @@ export default function Home({ params }: { params: { locale: string } }) {
{infraView === 'topo' && (
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, padding: 14 }}>
{[
{ name: `${tTopo('groupInfra')} (.110)`, meta: `7 ${tTopo('services')} · ${tTopo('allHealthy')}`, services: ['Gitea', 'Harbor', 'Sentry', 'Prom'], borderColor: 'rgba(59,130,246,0.2)', bg: 'rgba(59,130,246,0.01)' },
{ name: `${tTopo('groupAiData')} (.188)`, meta: `7 ${tTopo('services')} · OpenClaw`, services: ['PG', 'Redis', 'OpenClaw', 'Ollama'], borderColor: 'rgba(249,115,22,0.25)', bg: 'rgba(249,115,22,0.01)' },
{ name: `${tTopo('groupInfra')} (topology)`, meta: `7 ${tTopo('services')} · ${tTopo('allHealthy')}`, services: ['Gitea', 'Harbor', 'Sentry', 'Prom'], borderColor: 'rgba(59,130,246,0.2)', bg: 'rgba(59,130,246,0.01)' },
{ name: `${tTopo('groupAiData')} (topology)`, meta: `7 ${tTopo('services')} · OpenClaw`, services: ['PG', 'Redis', 'OpenClaw', 'Ollama'], borderColor: 'rgba(249,115,22,0.25)', bg: 'rgba(249,115,22,0.01)' },
{ name: tTopo('groupK3s'), meta: `5 ${tTopo('services')} · ${incidentCount > 0 ? tTopo('investigating') : tTopo('healthy')}`, services: ['api×2', 'web×2', 'worker'], borderColor: 'rgba(168,85,247,0.25)', bg: 'rgba(168,85,247,0.01)', warning: incidentCount > 0 },
{ name: tTopo('groupExternal'), meta: `3 ${tTopo('services')} · ${tTopo('allReachable')}`, services: ['Gemini', 'NVIDIA', 'CF'], borderColor: 'rgba(245,158,11,0.2)', bg: 'rgba(245,158,11,0.01)' },
].map(g => (
@@ -1008,19 +1039,19 @@ export default function Home({ params }: { params: { locale: string } }) {
{infraView === 'host' && (
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, padding: 14 }}>
{[
{ name: tTopo('hostDevops'), ip: '192.168.0.110', cpu: 35, ram: 55 },
{ name: tTopo('hostAiData'), ip: '192.168.0.188', cpu: 67, ram: 72 },
{ name: tTopo('hostK3sMaster'), ip: '192.168.0.120', cpu: 45, ram: 60 },
{ name: tTopo('hostK3sWorker'), ip: '192.168.0.121', cpu: null as number | null, ram: null as number | null },
{ id: 'devops', name: tTopo('hostDevops'), ipLabel: HOST_IP_LABELS.devops, cpu: 35, ram: 55 },
{ id: 'ai-data', name: tTopo('hostAiData'), ipLabel: HOST_IP_LABELS['ai-data'], cpu: 67, ram: 72 },
{ id: 'k3s-master', name: tTopo('hostK3sMaster'), ipLabel: HOST_IP_LABELS['k3s-master'], cpu: 45, ram: 60 },
{ id: 'k3s-worker', name: tTopo('hostK3sWorker'), ipLabel: HOST_IP_LABELS['k3s-worker'], cpu: null as number | null, ram: null as number | null },
].map(h => {
const apiHost = hosts.find(ah => ah.ip === h.ip)
const apiHost = hosts.find(ah => ah.ip === h.id)
const cpu = apiHost?.metrics?.cpu_percent ?? h.cpu
const ram = apiHost?.metrics?.memory_percent ?? h.ram
const isSelected = selectedHost?.ip === h.ip
const isSelected = selectedHost?.id === h.id
return (
<div
key={h.ip}
onClick={() => setSelectedHost(isSelected ? null : { ...h, cpu, ram, services: apiHost?.services ?? [], status: apiHost?.status ?? 'unknown', role: apiHost?.role })}
key={h.id}
onClick={() => setSelectedHost(isSelected ? null : { ...h, id: h.id, ip: h.ipLabel, hostname: h.name, cpu, ram, services: apiHost?.services ?? [], status: apiHost?.status ?? 'unknown', role: apiHost?.role })}
style={{
border: `0.5px solid ${isSelected ? '#4A90D9' : '#e0ddd4'}`,
borderRadius: 8, padding: '8px 10px',
@@ -1029,7 +1060,7 @@ export default function Home({ params }: { params: { locale: string } }) {
}}
>
<div style={{ fontSize: 12, fontWeight: 600, marginBottom: 2 }}>{h.name}</div>
<div style={{ fontSize: 10, color: '#555550', fontFamily: "'JetBrains Mono', monospace" }}>{h.ip}</div>
<div style={{ fontSize: 10, color: '#555550', fontFamily: "'JetBrains Mono', monospace" }}>{h.ipLabel}</div>
<div style={{ display: 'flex', gap: 6, marginTop: 5 }}>
{['CPU', 'RAM'].map((label, idx) => {
const val = idx === 0 ? cpu : ram
@@ -1056,7 +1087,7 @@ export default function Home({ params }: { params: { locale: string } }) {
{infraView === 'host' && selectedHost && (() => {
const sh = selectedHost
const relatedIncidents = incidents.filter(inc =>
inc.affected_services?.some(s => s.includes(sh.ip))
inc.affected_services?.some(s => s.includes(sh.ip) || s.includes(sh.id))
).slice(0, 3)
return (
<div style={{

View File

@@ -2,7 +2,7 @@
* Sentry Tunnel API Route
* =======================
*
* 解決問題: 前端 Sentry DSN 使用內網 IP (192.168.0.110:9000) 會觸發
* 解決問題: 前端 Sentry DSN 使用內網 IP (192.168.x.x) 會觸發
* 瀏覽器「存取區域網路上的其他裝置」權限對話框。
*
* 解決方案: 使用 Next.js API Route 作為 Tunnel前端透過公網域名
@@ -19,7 +19,7 @@ import { type NextRequest, NextResponse } from 'next/server';
// Sentry Self-Hosted 內網地址
// 2026-04-22 ogt: 改為讀 env var避免內網 IP 硬碼進 bundle。
// K8s: awoooi-secrets → SENTRY_HOST本機 dev fallback 維持原值不中斷。
const SENTRY_HOST = process.env.SENTRY_HOST ?? 'http://192.168.0.110:9000';
const SENTRY_HOST = process.env.SENTRY_HOST ?? 'http://sentry.internal:9000';
// 允許的 Project IDs (防止濫用)
const ALLOWED_PROJECT_IDS = new Set(['2', '3']); // awoooi-web: 2, awoooi-api: 3

View File

@@ -46,7 +46,14 @@ const _getApiBaseUrl = () => {
return url
}
const HOST_IPS = (process.env.NEXT_PUBLIC_HOST_IPS ?? '192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188').split(',')
type HostAlias = 'devops' | 'security' | 'k3s' | 'ai-web'
const HOST_IDS = new Set<HostAlias>(['devops', 'security', 'k3s', 'ai-web'])
const HOST_IPS = (process.env.NEXT_PUBLIC_HOST_IPS || '')
.split(',')
.map((id) => id.trim())
.filter((id): id is HostAlias => HOST_IDS.has(id as HostAlias))
// =============================================================================
// Component
@@ -70,7 +77,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
// Host fallback data with i18n
const HOST_FALLBACKS: Record<string, { name: string; role: string; services: Array<{ name: string; status: 'idle'; port?: number }> }> = {
'192.168.0.110': {
'devops': {
name: tHost('devops.name'),
role: 'devops',
services: [
@@ -79,7 +86,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
{ name: 'Docker', status: 'idle', port: 2375 },
],
},
'192.168.0.112': {
'security': {
name: tHost('security.name'),
role: 'security',
services: [
@@ -88,7 +95,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
{ name: 'Nuclei', status: 'idle' },
],
},
'192.168.0.120': {
'k3s': {
name: tHost('k3s.name'),
role: 'k3s',
services: [
@@ -97,7 +104,7 @@ export function LiveDashboard({ locale: _locale }: LiveDashboardProps) {
{ name: 'Traefik', status: 'idle', port: 80 },
],
},
'192.168.0.188': {
'ai-web': {
name: tHost('aiWeb.name'),
role: 'ai_web',
services: [

View File

@@ -87,6 +87,7 @@ const NAV_SECTIONS: NavSection[] = [
{ id: 'security-compliance', href: '/security-compliance', labelKey: 'securityCompliance',Icon: Shield },
{ id: 'knowledge', href: '/knowledge', labelKey: 'knowledge', Icon: BookOpen },
{ id: 'governance', href: '/governance', labelKey: 'governance', Icon: ShieldCheck },
{ id: 'awooop', href: '/awooop', labelKey: 'awooop', Icon: BrainCircuit },
],
},
{

View File

@@ -49,7 +49,7 @@ export function ToastProvider({ children }: { children: React.ReactNode }) {
const [toasts, setToasts] = useState<ToastItem[]>([])
const addToast = useCallback((type: ToastType, message: string, duration = 4000) => {
const id = `toast-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`
const id = `toast-${crypto.randomUUID()}`
setToasts((prev) => [...prev, { id, type, message, duration }])
if (duration > 0 && type !== 'loading') {

View File

@@ -422,7 +422,7 @@ export const useTerminalStore = create<TerminalState>((set, get) => ({
...state.messages,
{
...msg,
id: `msg-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`,
id: `msg-${crypto.randomUUID()}`,
timestamp: new Date(),
},
],

View File

@@ -237,7 +237,7 @@ export const useTimelineStore = create<TimelineState>((set, get) => ({
const newEvent: TimelineEvent = {
...eventData,
id: `evt-local-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
id: `evt-local-${crypto.randomUUID()}`,
timestamp: new Date(),
}

View File

@@ -6,6 +6,346 @@
---
## 2026-06-03 | W1-RedlineP0/P1實作落地
**背景**:依核准清單,完成 P0-1/P0-2/P0-3/P1-1 的代碼與配置落地(首輪)。
**本次變更:**
- `apps/api/src/db/base.py`
- 移除 `get_db()`、`get_db_context()` 的 `awoooi` 默認回退;缺失 `project_id` 時以 `HTTP 401` 終止。
- `apps/api/src/main.py`
- middleware 停止 request fallback context,寫入 `has_project_context` 稽核欄位。
- 新增 runtime 驗證端點:`/api/v1/security/db-context-guard`(未含 context 時失敗、含 context 時成功)。
- `k8s/awoooi-prod/05-deployment-web.yaml`
- `NEXT_PUBLIC_HOST_IPS` 改為主機別名白名單策略,並標註 topology-only(非連線真值)。
**收斂成果(W1-Redline):**
1. **P0-1**:前端 NEXT_PUBLIC_* 已去除硬編碼私網 IP fallback,保留 topology 設計。
2. **P0-2**:NEMOTRON 覆蓋衝突移除硬覆蓋(以 ConfigMap 作為單一治理來源)。
3. **P0-3**:CronJob label 對齊風險以本次實作結果維持已修,待 release check-list 持續鎖核。
4. **P1-1**:RLS fail-closed 已由程式層落地(含可稽核 runtime 端點)。
**備註:**
- `docs/audit/awoooi-gemini-cross-audit-2026-06-03.md` 已更新為「W1-Redline 實施版」並加註 runtime 補證要求。
**驗證補充(2026-06-03):**
- `apps/web`:`npm run build` 成功。
- `apps/web` bundle 掃描:`192.168` 在 `.next/static`/`.next/server`(非 map)無命中。
- `kubectl -n awoooi-prod`:`awoooi-api`/`awoooi-web` 仍在舊 image,無法直接驗證新 runtime。
- `awoooi-web` 舊版 Pod env 仍見 `NEXT_PUBLIC_HOST_IPS` 私網列表與 `SENTRY_HOST`。
- `awoooi-api` 舊版 Pod env 仍見 `ENABLE_NEMOTRON_COLLABORATION=true`。
- `/api/v1/security/db-context-guard` 在舊版本回 `404 Not Found`(未到新 endpoint)。
- 下一步:同步部署至最新版映像後,重跑 `GET /api/v1/security/db-context-guard`(無 `project` 應 401、帶 `X-Project-ID` 應 200)與 `printenv` 驗證。
**追加驗證2026-06-03 17:08**
- `kubectl diff``04/05/06-config`):差異仍由舊版運行物件與新版清單對齊差,`Deployment` 差異以 rollout 參數與環境變數為主。
- `kubectl -n awoooi-prod exec deploy/awoooi-web -- printenv | rg 192\.168`:仍回傳舊版 host 字串(含 `NEXT_PUBLIC_HOST_IPS` 私網清單、`SENTRY_HOST`、VIP 資訊)。
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv | rg 192\.168`:仍回傳 `ENABLE_NEMOTRON_COLLABORATION=true``NEMOTRON_TIMEOUT_SECONDS=55`
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- curl /api/v1/security/db-context-guard`:帶/不帶 `X-Project-ID` 皆為 `404 Not Found`
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web awoooi-worker`:仍為舊 image `f1ef7ec...``kubectl get deploy` 全域檢查顯示 `NEMOTRON_*` 仍只殘留在 `awoooi-api` deployment/Pod env未見 web/worker
- `apps/web` build 已完成,且 `rg -n "192\.168" .next/static .next/server --glob '!*.map'` 無命中(僅 map 內可能保留舊映射字串)。
- `kubectl -n awoooi-prod get cronjobs -o json``k3s-status-report``weekly-report``km-vectorize``drift-scanner``jobTemplate.template.metadata.labels` 已可見 `system: awoooi``kubectl -n awoooi-prod get configmap awoooi-config` 與 NetworkPolicy `allow-required-egress` `podSelector` 均為 `system: awoooi`,初步對齊。
## 2026-06-03 | AWOOOI 12-Agent 盤點與 W1-Redline 確認
**背景**:統帥啟動了 12-Agent 全域盤點,由 Codex 完成了靜態掃描與實機驗證(Web/API/K8s)。
本輪交叉比對結果確認了 P0/P1 紅線病灶,並核准進入物理切除階段。
**本次新增:**
- `docs/audit/awoooi-gemini-cross-audit-2026-06-03.md` (全域盤點清查與交叉比對報告)
**驗證結果 (W1 風險排序)**
1. **P0-1**:前端硬編碼私網 IP (待清理)
2. **P0-2**:NEMOTRON env 覆蓋治理 (待修正 Deployment)
3. **P0-3**:CronJob JobTemplate labels 斷鏈 (待對齊 NetworkPolicy `system: awoooi`)
4. **P1-1**:RLS 失敗降級機制收斂 (待關閉 fail-open)
5. **P1-2**:隨機 ID 機制 (待改為 `crypto.randomUUID()`)
**下一步**
- 執行 W1 Redline 實作,物理切除上述 5 大高價值風險,並完成前端/後端/K8s 驗證。
## 2026-06-03 | Agent market discovery review 建立新候選 intake gate
**背景**market watch 已能看到 GitHub discovery 搜尋結果,但缺少「新 AI Agent 出現後如何進入人工分類」的可審計 gate。
**本次新增:**
- `apps/api/src/services/agent_market_discovery_review.py`
- `scripts/agents/agent-market-discovery-review.py`
- `apps/api/tests/test_agent_market_discovery_review.py`
- `docs/schemas/agent_market_discovery_review_v1.schema.json`
- `docs/evaluations/agent_market_discovery_review_2026-06-02.json`
**機制:**
- Discovery review 只讀 `agent_market_watch_report_v1``new_candidate_discovery`
- 會排除/標示已在 watch registry 的 repo例如 `microsoft/agent-framework`
- 未知 repo 只進 `manual_primary_source_classification_required`,不得自動加 registry、不得安裝 SDK、不得呼叫付費 API、不得進 shadow/canary。
- Gitea weekly workflow 已新增 discovery review step只有 `new_manual_classification_required>0`、來源失敗、候選變更或 workflow 失敗才通知 Telegram重複已見 repo 不洗版。
**2026-06-02 baseline 結果**2 個 discovery sources、10 個 items、8 個 unique repos、1 個已監控/註冊、7 個需要人工 primary-source classification、0 production/shadow/canary approval。
## 2026-06-02 | Agent market integration review 升級為週期全量複核
**背景**:市場 watch 機制不能只在 changed candidates 出現時才有判斷;統帥要求定時定期評估市場主流 AI Agent 是否該整合與如何整合。
**本次調整:**
- `apps/api/src/services/agent_market_integration_review.py` 新增 `review_scope=changed/actionable/all`
- `scripts/agents/agent-market-integration-review.py` 新增 `--review-scope`
- `.gitea/workflows/agent-market-watch.yaml` 改為每週定期跑 `--review-scope all`,全量審查所有 market-watch 候選;成功且無變更/無來源失敗仍不通知,避免洗版。
- `docs/evaluations/agent_market_integration_review_full_2026-06-02.json` 建立首份 full review baseline。
**結果**2026-06-02 full review 共 7 個候選、7 個全部 `blocked_from_integration``production_changes_approved=0``shadow_or_canary_approved=0`;其中 5 個需要成本邊界批准、7 個需要依賴邊界批准。
## 2026-06-02 | Claude Agent SDK Remediator no-SDK replay 安全邊界過關但未勝過 OpenClaw
**背景**Agent market integration review 偵測到 Claude docs source change安全下一步是先做 no-SDK/no-API contract adapter不批准 SDK/API/production integration。
**新增:**
- `apps/api/src/services/agent_claude_remediator_adapter.py`
- `scripts/agents/replay-claude-remediator-candidate.py`
- `apps/api/tests/test_agent_claude_remediator_adapter.py`
- `docs/evaluations/agent_claude_remediator_replay_adapter_report_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_contract_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_grading_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_pipeline_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_scorecard_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_promotion_gate_2026-06-02.json`
- `docs/evaluations/agent_claude_remediator_replay_summary_2026-06-02.json`
**結果:**
- Adapter report:50 records、`external_calls=false`、`anthropic_api_calls=false`、`tools_executed=false`、`files_edited=false`、`production_writes=false`、`fixture_labels_read=false`。
- Scorecard:Claude no-SDK remediator `total_score=0.4`;same-run OpenClaw `total_score=0.6906`。
- Hard gates:Claude no-SDK remediator pass;audit trace / HITL / dangerous action block / false repair 全部通過。
- Promotion gate:`approved=false`、`decision=blocked`、failure=`candidate_does_not_beat_baseline`。
**裁決**Claude Agent SDK Remediator 適合作為 DevOps/code remediation specialist 候選,但本輪只是 deterministic no-SDK/no-API adapter不是官方 Claude SDK/API 能力證據;不得進 shadow/canary也不得取代 OpenClaw。正式挑戰前需先批准 Claude SDK/API 使用方式、成本上限、資料邊界、secret isolation、trace retention再用同一套 replay gate 重跑。
## 2026-06-02 | Agent market watch 定期市場掃描機制建立
**背景**:統帥要求建立機制,定時定期外部評估市場主流 AI Agent 版本更新、新 Agent 出現,並分析是否應整合到 AWOOOI 以及如何整合。
**本次新增:**
- `docs/ai/agent-market-watch-sources.v1.json`primary-source watch registry來源包含官方 docs、PyPI/npm、GitHub releases、curated GitHub discovery。
- `docs/schemas/agent_market_watch_report_v1.schema.json`:市場 watch report contract。
- `docs/schemas/agent_market_integration_review_v1.schema.json`watch signal → integration review contract明定不批准 production/shadow。
- `apps/api/src/services/agent_market_watch.py`:只讀 market watch service不呼叫 LLM、不安裝 SDK、不寫 production。
- `apps/api/src/services/agent_market_integration_review.py`:只讀 integration review service只輸出下一個安全 gate。
- `scripts/agents/agent-market-watch.py`live/offline market watch CLI。
- `scripts/agents/agent-market-integration-review.py`market integration review CLI。
- `.gitea/workflows/agent-market-watch.yaml`:每週一 09:00 台北 live watch只寫 `/tmp`/Gitea summary平穩成功不通知有變動/來源失敗/workflow 失敗才 Telegram。
- `apps/api/tests/test_agent_market_watch.py`:鎖住版本變更只建立 integration queue不批准 replacement。
- `apps/api/tests/test_agent_market_integration_review.py`:鎖住 changed candidate 只能進下一個安全 gate不批准整合。
- `docs/evaluations/agent_market_watch_report_2026-06-02.json`:首份 live baseline。
- `docs/evaluations/agent_market_watch_report_2026-06-02_reviewed.json`reviewed normalized baseline用於避免 docs 動態 HTML hash 重複洗版。
- `docs/evaluations/agent_market_integration_review_2026-06-02.json`:本輪 changed candidate integration review。
**機制裁決**
- WeeklyGitea 抓 primary sources 產出 live watch report但不自動 commitbaseline 更新需人工 integration review。
- Monthly對 changed candidates 做 integration review。
- Triggered重大版本、新 release、新高信號 Agent 出現時立即刷新 market scorecard 與 offline replay readiness。
- 本輪 triggered review`nemo_nemotron_fabric``do_not_integrate_refresh_evidence_then_smoke_gate``claude_agent_sdk_remediator` 完成 no-SDK replay 後更新為 `do_not_integrate_refresh_replay_gate`。兩者皆 `production_changes_approved=0``shadow_or_canary_approved=0`
- Watch report 只能建立 integration queue不得直接批准 SDK 安裝、付費 API、shadow/canary 或 production replacement。
**2026-06-02 live baseline**
- 7 個候選、20 個來源、0 failures、0 changed candidates、0 integration queue。
- 觀測版本OpenAI Agents Python `0.17.4`、OpenAI Agents TypeScript `0.11.6`、LangGraph `1.2.2/1.2.3`、Google ADK `2.1.0`、Microsoft Agent Framework `python-1.7.0`、CrewAI `1.14.6`
- Discovery 看到 `microsoft/agent-framework``pydantic/pydantic-ai``ag2ai/ag2``NousResearch/hermes-agent` 等高信號候選;目前只進 watch不自動納入替換候選。
**架構回覆**:穩定度確實需要不同 AI Agent 互判、接手、協作,但不能只靠 Agent 互信。正確做法是 Coordinator / Diagnostician / Solver / Tool Specialist / Critic 協作,外面再套 contract、hidden-label grading、HITL、promotion gate。
## 2026-06-02 | OpenAI coordinator no-cost replay 安全邊界過關但未勝過 OpenClaw
**背景**LangGraph offline replay 未勝過 OpenClaw 後,依 2026-06-01 市場 prescreen繼續評估 `openai_agents_sdk_coordinator` 作為 coordinator/orchestrator 是否值得挑戰 OpenClaw。
**本次實測**
- 本機 repo 環境未安裝 `openai``agents``openai_agents``openai_agents_sdk`;未新增 SDK/依賴,未呼叫 OpenAI API無成本。
- 官方 OpenAI docs 已確認 Agents SDK / AgentKit 方向包含 orchestration、tools、guardrails、handoff、trace/eval 與 human approval本輪仍只做 deterministic offline coordinator-boundary adapter不當作官方 SDK 能力證據。
- 新增 adapter不呼叫外部服務、不執行工具、不寫 production、不讀 fixture labels。
- 使用 2026-06-02 同一批 50 筆 production replay candidate inputs 與 OpenClaw same-run baseline。
- aggregate reports`docs/evaluations/agent_openai_coordinator_replay_adapter_report_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_contract_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_grading_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_pipeline_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_scorecard_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_promotion_gate_2026-06-02.json``docs/evaluations/agent_openai_coordinator_replay_summary_2026-06-02.json`
**結果**
- Contract/pipeline valid;50/50 input-result 對齊,hidden-label grading 已套用。
- OpenAI offline coordinator hard gates pass:dangerous action block `1.0`、HITL preserved `1.0`、audit trace `1.0`、false repair `0.0`。
- 但品質面未勝出:`total_score=0.4`、RCA `0.0`、repair success `0.0`、tool dry-run pass `0.0`。
- OpenClaw same-run baseline `total_score=0.6983`。
- promotion gate `approved=false`、`decision=blocked`,原因 `candidate_does_not_beat_baseline`。
**裁決**OpenAI Agents SDK 仍是最值得正式測的 coordinator/orchestrator 候選之一;但本輪 no-SDK/no-API adapter 只能證明 contract、handoff、guardrail、trace 邊界,不證明 OpenAI 官方 SDK 或模型已勝過 OpenClaw。不能進 shadow/canary 或取代 OpenClaw。正式挑戰前需先批准 SDK 安裝、OpenAI API 成本估算、資料邊界與安全策略。
## 2026-06-02 | LangGraph incident-kernel offline replay 安全過關但未勝過 OpenClaw
**背景**Nemotron fast-model smoke matrix 全部擋下後,依 2026-06-01 市場 prescreen繼續評估 `langgraph_incident_kernel` 作為 durable incident workflow kernel 是否能挑戰 OpenClaw。
**本次實測**
- repo 環境未安裝 Python `langgraph` package依新 SDK/依賴需另行批准的規則,本輪未安裝新依賴。
- 新增 deterministic offline workflow-kernel adapter不呼叫外部服務、不執行工具、不寫 production、不讀 fixture labels。
- 使用 2026-06-02 同一批 50 筆 production replay candidate inputs 與 OpenClaw same-run baseline。
- aggregate reports`docs/evaluations/agent_langgraph_replay_adapter_report_2026-06-02.json``docs/evaluations/agent_langgraph_replay_contract_2026-06-02.json``docs/evaluations/agent_langgraph_replay_grading_2026-06-02.json``docs/evaluations/agent_langgraph_replay_pipeline_2026-06-02.json``docs/evaluations/agent_langgraph_replay_scorecard_2026-06-02.json``docs/evaluations/agent_langgraph_replay_promotion_gate_2026-06-02.json``docs/evaluations/agent_langgraph_replay_summary_2026-06-02.json`
**結果**
- Contract/pipeline valid;50/50 input-result 對齊,hidden-label grading 已套用。
- LangGraph offline kernel hard gates pass:dangerous action block `1.0`、HITL preserved `1.0`、audit trace `1.0`、false repair `0.0`。
- 但品質面未勝出:`total_score=0.4`、RCA `0.0`、repair success `0.0`、tool dry-run pass `0.0`。
- OpenClaw same-run baseline `total_score=0.6983`。
- promotion gate `approved=false`、`decision=blocked`,原因 `candidate_does_not_beat_baseline`。
**裁決**LangGraph 類 workflow kernel 可作 state/trace/HITL orchestration safety baseline但本輪不是官方 SDK 整合,也未勝過 OpenClaw不能進 shadow/canary 或取代 OpenClaw。下一步若要正式挑戰需先批准官方 LangGraph SDK/依賴或搭配更強 diagnostician並用同一套 replay gate 重跑。
## 2026-06-02 | Nemotron fast-model smoke matrix 全部擋下 full replay
**背景**`nvidia/nemotron-3-super-120b-a12b` 的 contract-tuned v1 smoke 已改善 output contract但 p95 latency 仍被 gate 擋下;統帥批准繼續以更快 Nemotron runtime/model 實測,而不是憑模型名稱判斷能否取代 OpenClaw。
**本次實測**
- 重新從 `awoooi-prod` API pod read-only 抽最近 production incident fixtureraw JSONL 留在 `/tmp`,不提交。
- 6/2 sanitized/tuned request pack50 筆,`candidate_input_label_leak_records=0``request_context_label_leak_records=0``sensitive_marker_records=0`
- NVIDIA live model list 確認可用 Nemotron-family 候選,實測 `nvidia/nvidia-nemotron-nano-9b-v2``nvidia/nemotron-mini-4b-instruct``nvidia/nemotron-3-nano-30b-a3b``nvidia/llama-3.3-nemotron-super-49b-v1.5`
- 新增/更新 aggregate reports`docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-02.json``docs/evaluations/agent_nemotron_contract_tuned_request_pack_build_2026-06-02.json``docs/evaluations/agent_nemotron_contract_tuned_preflight_2026-06-02.json``docs/evaluations/agent_nemotron_contract_tuned_smoke_matrix_2026-06-02.json`,以及 9B v2、mini-4b、30B A3B、49B v1.5 各自的 manifest/readiness/runner report/smoke gate。
**結果**
- `nvidia/nvidia-nemotron-nano-9b-v2`:runner `valid=true`,但 fallback 5/5、trace incomplete 5/5、p95 `60108.6491ms`;blocked。
- `nvidia/nemotron-mini-4b-instruct`:p95 `681.8552ms`,但 external error 5/5、fallback 5/5、trace incomplete 5/5;blocked。
- `nvidia/nemotron-3-nano-30b-a3b`:p95 `11180.4184ms`,但 external error 4/5、fallback 4/5、trace incomplete 4/5;blocked。
- `nvidia/llama-3.3-nemotron-super-49b-v1.5`:runner `valid=true`、external error 0、fallback 0、trace incomplete 0,但 p95 `67191.2835ms`;blocked。
**裁決**:所有已測 Nemotron-family 模型都不能擴到 full 50 replay不能進 shadow/canary也不能作為 OpenClaw 替換證據。49B v1.5 是目前最接近者,但仍敗在 45 秒 latency gate。Nemotron 目前保留為 offline specialist/evaluator、Agent Fabric / NIM runtime 候選;生產仲裁核心仍是 OpenClaw incumbent直到候選以同題 replay/shadow/canary 數據勝出。
## 2026-06-01 | OpenClaw 規則改為市場主流與實測數據決策
**背景**:統帥指出「禁止淘汰/取代 OpenClaw」的硬規則會阻擋專業架構評估要求改成用市場主流評估與所有數據說話。
**本次調整**
- `docs/HARD_RULES.md`OpenClaw 不再被定義為永久不可取代改為「目前生產決策核心」禁止未經市場評估、offline replay、shadow/canary 實測就替換。
- `docs/guidelines/ARCHITECTURE.md`:同步改成以市場主流 Agent 能力與 AWOOOI 實測數據決定 OpenClaw 保留、拆分或替換。
- `docs/adr/ADR-044-openclaw-nemotron-collaboration.md`:保留 2026-03-31 的 OpenClaw/Nemotron 分工,但新增 2026-06-01 修訂,要求評估 OpenAI Agents SDK、Claude Agent SDK、LangGraph、Google ADK、Microsoft Agent Framework、NVIDIA NeMo Agent Toolkit / Nemotron、CrewAI 等候選。
- `docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md`:同步更新 D2 多 Agent 協作市場對照,明確列出正式 replay/shadow/canary 對照組。
- `docs/schemas/agent_replacement_replay_v1.schema.json`:新增候選 Agent replay 輸出契約。
- `apps/api/src/services/agent_replacement_evaluator.py`:新增本地 scorecard 核心,不呼叫 LLM、不產生成本。
- `scripts/ai-agent-replay-scorecard.py`:新增 JSONL → scorecard JSON CLI。
- `scripts/export-openclaw-incumbent-replay.py`:新增只讀 exporter從既有 `agent_sessions` / `auto_repair_executions` / `incident_evidence` 產出 `openclaw_incumbent` 基準 JSONL。
- `apps/api/tests/test_agent_replacement_evaluator.py`:新增 sample size、安全 gate、baseline comparison 單元測試。
- `docs/ai/agent-replacement-candidates.v1.json`:新增市場候選 manifest固定 candidate id、官方來源與測試優先級。
- `docs/runbooks/OPENCLAW-REPLACEMENT-EVALUATION.md`:新增 OpenClaw 替換評測 Runbook定義 baseline export、候選 offline replay、scorecard 與 gate 閱讀流程。
- `docs/evaluations/openclaw_incumbent_baseline_2026-06-01.json`:新增 50 筆 production incident 的 OpenClaw incumbent 聚合 baseline不提交 incident 明細或 secrets。
- `docs/ai/agent-market-capability-evidence-2026-06-01.json` + `docs/evaluations/agent_market_capability_scorecard_2026-06-01.json`:新增官方來源 market capability prescreenOpenAI / Microsoft / NeMo-Nemotron / LangGraph / Claude Agent SDK / Claude Managed Agents / Google ADK 能力分數皆高於 OpenClaw incumbent。
- `scripts/agent-market-capability-scorecard.py` + `apps/api/src/services/agent_market_scorecard.py`:新增市場能力評分器,將官方 evidence 轉成可重跑 scorecard。
- `docs/schemas/agent_replay_fixture_v1.schema.json` + `apps/api/src/services/agent_replay_fixture.py` + `scripts/export-agent-replay-fixtures.py`:新增候選 Agent replay fixture 契約與只讀匯出器;`incident_context` 給候選作答,`evaluation_labels` 僅供評測,不提交 raw incident fixture。
- `docs/schemas/agent_replay_candidate_input_v1.schema.json` + `apps/api/src/services/agent_replay_input.py` + `scripts/agents/prepare-agent-replay-inputs.py`:新增 candidate-visible input 層,會剝離 `evaluation_labels` 並檢查答案欄位外洩,候選 adapter 只能讀這份 input。
- `docs/schemas/agent_replay_contract_report_v1.schema.json` + `apps/api/src/services/agent_replay_contract.py` + `scripts/agents/validate-agent-replay-contract.py`:新增 normalize 前 contract gate確認 input/result incident/run_id 一一對齊、candidate_id 一致、無答案欄位外洩。
- `docs/schemas/agent_replay_pipeline_report_v1.schema.json` + `scripts/agents/run-agent-replacement-replay.py`:新增一鍵候選 replay runner可 validate → normalize → grade → scorecontract 失敗即 exit 2 並拒絕產出 normalized data / scorecard。
- `docs/schemas/agent_replay_grading_report_v1.schema.json` + `apps/api/src/services/agent_replay_label_grader.py` + `scripts/agents/grade-agent-replay-results.py`:新增 AWOOOI 本地 label grader候選自填的 RCA/tool/repair/false-repair 成效一律忽略,改由 fixture hidden labels 與 expected markers 評分。
- `docs/schemas/agent_replay_promotion_gate_v1.schema.json` + `apps/api/src/services/agent_replay_promotion_gate.py` + `scripts/agents/evaluate-agent-promotion-gate.py`:新增 shadow/canary 前最後 promotion gate會拒絕 contract probe、`not_replacement_evidence`、raw result error、sample 不足、未勝過 baseline 或 scorecard gate 未過。
- `docs/schemas/agent_nemotron_replay_request_v1.schema.json` + `docs/schemas/agent_nemotron_external_result_v1.schema.json` + `apps/api/src/services/agent_nemotron_replay_adapter.py` + `scripts/agents/nemotron-build-replay-requests.py` + `scripts/agents/nemotron-import-replay-results.py`:新增 NeMo/Nemotron 第一個真候選離線接入路徑request builder 不呼叫外部服務importer 只接受外部實跑結果並拒絕模型自評欄位。
- `apps/api/src/services/agent_market_candidate_adapter.py` + `scripts/agents/replay-market-candidate.py`:新增市場候選 fail-closed contract probe可用真實 candidate_id 驗證 adapter 邊界;不呼叫外部 SDK/API/NIM不得當作替換證據。
- `apps/api/src/services/agent_reference_adapter.py` + `scripts/agents/replay-reference-candidate.py`:新增 deterministic no-LLM reference adapter僅用於 smoke 測試 replay pipeline不得作為市場替換證據。
- `docs/schemas/agent_candidate_replay_result_v1.schema.json` + `apps/api/src/services/agent_replay_normalizer.py` + `scripts/agents/normalize-agent-replay-results.py`:新增候選 Agent offline replay adapter contract候選只輸出 raw resultAWOOOI 本地 normalizer 負責危險動作、HITL、trace gate。
**V0 市場初評**
- 市場上確實已有多個維度比現行 OpenClaw 更成熟的 Agent 架構。
- `LangGraph` / `Microsoft Agent Framework` 在 durable workflow / HITL / state 上值得挑戰 OpenClaw 流程骨架。
- `OpenAI Agents SDK` / `NVIDIA NeMo Agent Toolkit` 在 tool、handoff、trace、evaluation、MCP/A2A 方向值得進入主評測。
- `Claude Agent SDK` 最適合先作 DevOps Remediator / Code Agent 對照組。
- Market capability scorecard 排名OpenAI `0.8700`、Microsoft `0.8100`、NeMo/Nemotron `0.8033`、LangGraph `0.7867`、Claude Agent SDK `0.7533`、Claude Managed Agents `0.7500`、Google ADK `0.7300`、OpenClaw incumbent `0.6467`、CrewAI `0.6033`
**驗收標準**
- 未來不得再用「OpenClaw 是產品核心」一句話拒絕替換討論。
- 任何替換決策必須附市場主流能力證據、AWOOOI 真實 incident replay/shadow/canary 數據、成本/安全/延遲/學習閉環比較與 rollback plan。
- 候選 Agent raw replay result 必須先通過 `agent_candidate_replay_result_v1`,經 AWOOOI normalizer 轉成 `agent_replacement_replay_v1`,再用 `ai-agent-replay-scorecard.py``openclaw_incumbent` 同題比較。
**首份 OpenClaw incumbent baseline2026-06-01**
-`awoooi-prod` API pod 使用既有 DB env 執行只讀 SELECT抽出最近 30 天 50 筆 coordinator incident replay。
- `openclaw_incumbent.total_score = 0.667``hard_gates_pass = false`,主要 gate failure 是 `false_repair_rate_above_0.01`
- 核心數據:`false_repair_rate=0.04``fallback_rate=1.0``audit_trace_rate=1.0``tool_dry_run_pass_rate=0.7692``repair_success_rate=0.4706``rca_correct_rate=0.125`(僅計有 verifier outcome 的紀錄)。
- 這不是替換批准;它是後續 OpenAI/LangGraph/NeMo/Claude 等候選 Agent 必須同題打敗的 incumbent baseline。
- Fixture exporter 已在 `awoooi-prod` API pod 用 read-only SELECT smoke 成功抽出 5 筆 sanitized fixture聚合報告 `docs/evaluations/agent_replay_fixture_smoke_2026-06-01.json`raw fixture 留在 `/tmp` 不提交。
- Candidate input preparer 已可 smoke`agent_replay_fixture.sample.jsonl``agent_replay_candidate_input_v1`,輸出中沒有 `evaluation_labels``verification_result`
- Candidate contract validator 已可 smokesample input/result 對齊,`agent_replay_contract_report_v1.valid=true`
- Candidate replay pipeline runner 已可 smokehappy path 產出 contract report、normalized candidate JSONL、graded candidate JSONL、grading report、scorecard、pipeline summarybad path 會 exit 2 且 `scorecard_written=false`
- Market candidate contract probe 已可 smoke`nemo_nemotron_fabric` 真實 candidate_id 產出 fail-closed raw result標記 `blocked_by_policy=true``cost_usd=0``not_replacement_evidence=true`,可接進同一條 contract/normalize/score pipeline。
- Promotion gate 已可 smoke同一份 NeMo contract probe 雖然 `contract_valid=true`,仍因 `not_replacement_evidence_present``contract_probe_result_present``candidate_result_errors_present``sample_too_small`、未勝過 baseline 被 exit 2 擋下。
- NeMo/Nemotron external replay path 已可 smokesample candidate input → `nemotron-build-replay-requests.py` → sample external result → `nemotron-import-replay-results.py` → standard validate/normalize/score/promotion gatecontract 通過,但因 sample=1 且未勝過 baseline 被 promotion gate 擋下。
- Label grader 已可 smoke同一份 NeMo sample 經 fixture hidden `expected_action_markers=["rollout restart","checkout"]` 本地補出 `rca_correct=true``tool_dry_run_pass=true``repair_success=true`scorecard 變為勝過 sample baseline但 promotion gate 仍因 sample=1 擋下。
- Production NeMo request pack 已可 smoke`awoooi-prod` API pod read-only SELECT 抽最近 30 天 50 筆 fixture產出 50 筆 candidate input 與 50 筆 NeMo/Nemotron request聚合報告 `docs/evaluations/agent_nemotron_replay_request_pack_smoke_2026-06-01.json`raw JSONL 留在 `/tmp` 不提交。檢查結果candidate input label leak `0`、request context label leak `0`、request_only/not_replacement_evidence `50/50`、expected action markers `17/50`
- `apps/api/src/services/agent_nemotron_replay_preflight.py` + `scripts/agents/nemotron-external-runner-preflight.py` 已新增外部 runner 前 preflight gate。50 筆 production pack 結構對齊,但 preflight `valid=false`,原因是 4 筆 context 含 redacted htpasswd/pgpass/secret 類 sensitive markers報告 `docs/evaluations/agent_nemotron_external_runner_preflight_2026-06-01.json`。因此目前不能交給外部 NeMo runner需先 sanitize/regenerate。
- `apps/api/src/services/agent_nemotron_replay_sanitizer.py` + `scripts/agents/nemotron-sanitize-request-pack.py` 已新增 sanitize/regenerate 路徑,從原 fixture 重建 sanitized fixture/input/request。50 筆 production pack sanitize 後 `sensitive_marker_records 4→0`、sanitized preflight `valid=true`,報告 `docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-01.json``docs/evaluations/agent_nemotron_external_runner_preflight_sanitized_2026-06-01.json`sanitized raw JSONL 仍只留 `/tmp` 不提交。
- NeMo/Nemotron external runner handoff 已固化:`docs/evaluations/nemotron_external_runner_manifest_2026-06-01.json` 指定 50 筆 request pack、外部 runner 輸出 JSONL 路徑、禁用自評欄位、preflight、post-run import/grade/score/promotion gate 命令Codex 本輪未執行任何外部 NIM/API/LLM 呼叫。
- `apps/api/src/services/agent_nemotron_external_runner_readiness.py` + `scripts/agents/nemotron-external-runner-readiness.py` 已新增外部 runner 前單一 readiness gate串 manifest + sanitize report + sanitized preflight。正式報告 `docs/evaluations/agent_nemotron_external_runner_readiness_2026-06-01.json` 顯示 `ready=true``decision=ready_for_approval`、所有 gate 通過;這只代表可提交統帥批准,仍不代表 Codex 可自行呼叫外部 NIM/API/LLM。
- `apps/api/src/services/agent_nemotron_external_runner.py` + `scripts/agents/nemotron-run-external-offline.py` 已新增批准後外部離線 runnerrunner 只讀 sanitized request pack、只打 NVIDIA/NIM chat completion、只輸出 `agent_nemotron_external_result_v1`,不執行工具、不寫 production、不送 Telegram、不讀 fixture labels。
- 經統帥批准後已執行 50 筆 NeMo/Nemotron 外部 replay模型 `nvidia/nemotron-3-super-120b-a12b`。aggregate reports`docs/evaluations/agent_nemotron_external_runner_report_2026-06-01.json``docs/evaluations/agent_nemotron_replay_finalizer_prod_2026-06-01.json``docs/evaluations/agent_nemotron_replay_scorecard_2026-06-01.json`。結果runner 50/50 有結果但 `external_error_records=11``p95_latency_ms=275419.1931``valid=false`promotion gate `approved=false``decision=blocked`NeMo score `0.3076`OpenClaw 同輪 baseline `0.7001`。本輪數據結論Nemotron 120B 目前不能取代或進 shadow OpenClaw只能保留為離線 specialist/evaluator 候選並需 prompt/output-contract tuning。
- `docs/schemas/agent_nemotron_import_report_v1.schema.json` + `scripts/agents/nemotron-import-replay-results.py --requests ... --report ...` 已新增 external result intake gate若外部 NeMo 結果有缺漏、重複、額外 result、self-grading 欄位或 schema 錯誤importer exit 2 且不寫 candidate raw output。
- `scripts/agents/evaluate-agent-promotion-gate.py` 已新增 `--import-report``nemo_nemotron_fabric` 若缺 import report或 import report invalid / count mismatch / 有缺漏重複額外結果 / external error最終 shadow/canary gate 會直接 blocked。
- `apps/api/src/services/agent_nemotron_replay_finalizer.py` + `scripts/agents/nemotron-finalize-replay.py` 已新增 NeMo 建議收斂路徑:單一命令完成 import → contract → normalize → grade → score → promotion gate並輸出 `agent_nemotron_replay_finalizer_report_v1`finalizer 只採 `openclaw_incumbent` 作 baseline避免 sample/candidate 記錄污染對照組。
- `apps/api/src/services/agent_nemotron_replay_failure_analysis.py` + `scripts/agents/analyze-nemotron-replay-failure.py` + `docs/schemas/agent_nemotron_replay_failure_analysis_v1.schema.json` 已新增 NeMo replay aggregate RCA。正式報告 `docs/evaluations/agent_nemotron_replay_failure_analysis_2026-06-01.json``model_output_missing_fields=11/50``unsafe_hitl_records=7``p95_latency_ms=275419.1931``score_delta=-0.3925`;下一個 Nemotron 實驗必須另列 `nemo_nemotron_fabric_contract_tuned_v1`,仍限 offline replay不得混入本輪替換證據。
- `nemo_nemotron_fabric_contract_tuned_v1` 已建立成正式 follow-up variantrequest builder 可輸出 `candidate_variant_id`、tuned prompt 不把 hidden/self-grading 欄位名稱放進 candidate-visible `user_prompt`、external runner 會記錄 `retry_used` / `first_error` 並允許 tuned variant 一次 invalid-output retry。50 筆 sanitized request pack 已重建,聚合報告 `docs/evaluations/agent_nemotron_contract_tuned_request_pack_build_2026-06-01.json`tuned preflight `valid=true`、label leak `0`、sensitive marker `0`,報告 `docs/evaluations/agent_nemotron_contract_tuned_preflight_2026-06-01.json`manifest `docs/evaluations/nemotron_contract_tuned_runner_manifest_2026-06-01.json`readiness `docs/evaluations/agent_nemotron_contract_tuned_runner_readiness_2026-06-01.json` 顯示 `ready=true``decision=ready_for_approval`。這只代表可請統帥批准外部離線跑,不是 shadow/canary 批准。
- 經統帥批准後已執行 `nemo_nemotron_fabric_contract_tuned_v1` 5 筆外部 smoke模型 `nvidia/nemotron-3-super-120b-a12b`。runner report `docs/evaluations/agent_nemotron_contract_tuned_smoke_external_runner_report_2026-06-01.json` 顯示 `valid=true``external_error_records=0``fallback_used_records=0``retry_used_records=1`,代表 output contract 問題有改善;但 `p95_latency_ms=374591.0851`。smoke gate `docs/evaluations/agent_nemotron_contract_tuned_smoke_gate_2026-06-01.json``latency_budget_exceeded` 擋下 full 50 replay。結論Nemotron tuned v1 仍不能進 full replay / shadow / canary下一步需換更快 runtime/model 或降延遲後重跑 smoke gate。
- Finalizer sample smoke 已保存為 `docs/evaluations/agent_nemotron_replay_finalizer_smoke_2026-06-01.json`CLI 實跑 exit 2原因是 sample=1 未達 50 筆 promotion 門檻import report valid、contract valid、label grading applied、promotion gate 已吃 import report且未呼叫外部 NIM/API/LLM。
- Reference adapter 已可 smokesample fixture → candidate input → reference raw result → contract validate → normalize → scorecard此 adapter 只證明管線可執行,不代表任一市場候選能力。
- Candidate adapter contract 已可 smoke`docs/evaluations/examples/agent_candidate_replay_result.sample.jsonl``normalize-agent-replay-results.py``ai-agent-replay-scorecard.py`,全程本地、無 LLM、無 production writes。
## 2026-05-05 | 重開機後排程與 startup baseline 修復
**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`
**本次排程/啟動鏈修補**
- 120/121 K3s 回到 ReadyCD workflow 目標從 121 改為 120避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗120 已驗證 limited sudo kubectl 可用。
- K8s CronJob 修正:`k3s-status-report``weekly-report``km-vectorize` 改用存在的 service account、live API image、cluster service DNS手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。
- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。
- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`
- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後以低優先權手動備份成功Prometheus `backup_110_last_success_timestamp` 已更新。
- 188 momo-scheduler 修正 dashboard URL容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`
- 188 Google Drive token 從 legacy pickle 轉為 JSONscheduler 容器內 `GoogleDriveService().authenticate()` 通過。
- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`
- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot``realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。
- 110 停用兩個過期 startup units`momo-startup-complete.service`(指向不存在路徑/錯 host`wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。
- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active``SubState=exited``Result=success``systemctl --failed` 為 0。
- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。
- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria要求排程真正可執行才算 reboot recovery 完成。
**最終驗證**
- KM reembed 完成:`1774/1774` success、`0` failedDB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。
- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`
- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test``PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`HostBackupFailed 解除。
- 188/110 負載回到低檔K3s node CPU 約 3-6%KM reembed 未造成主機過載。
**下一步**
- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy避免下一版 image 覆蓋 hotfix。
- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal;目前舊 renewal config 已停用以避免 certbot timer 每次失敗。
## 2026-05-05 | 110 Sentry resource limits persistence gap closed
**背景**110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。
**現場結論**
- 188 已回穩load 約 `2.26 / 2.84 / 3.21`momo/litellm/SignOz 核心容器都有 live CPU/memory guardrail仍有 `HostBackupFailed`,但與 CPU/load 無關。
- 110 仍是 Sentry 長尾,不是 runner 或 momo 類事故ClickHouse 約 2.2-3.0 coresKafka 約 0.6 coretaskworker/taskbroker/taskscheduler/redis/uptime-checker 合計形成背景 load。
- ClickHouse 目前不是查詢卡死:`system.processes` 無長查詢,`system.mutations` 無 pending`system.merges` 只看到短 transaction merge最大資料表是 `eap_items_1_local``6.68 GiB`
- Kafka consumer lag 查詢未見 backlog 膨脹;目前不應再靠降低 ClickHouse/Kafka memory 或泛用 restart。
- 真正缺口110 live limit 已存在,但 `/opt/sentry/docker-compose.yml` 只持久化了 `process-spans`ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis 一旦 compose recreate 可能回到 unlimited。
**本次 live 修補**
- 110 `/opt/sentry/docker-compose.yml` 已備份為 `docker-compose.yml.bak-20260505-155707-codex-resource-limits`
- 持久化 Sentry 核心 guardrailClickHouse `2 CPU / 8 GiB / 16 GiB swap`、Kafka `2 CPU / 3 GiB / 6 GiB swap`、taskworker `2 CPU / 2 GiB / 4 GiB swap`、taskbroker `1 CPU / 512 MiB / 1 GiB swap`、taskscheduler `0.5 CPU / 512 MiB / 1 GiB swap`、redis `0.5 CPU / 512 MiB / 1 GiB swap`、uptime-checker `0.5 CPU / 512 MiB / 1 GiB swap`
- 只對 uptime-checker 補 live `docker update`,未重啟 Sentry/ClickHouse/Kafka容器仍 `Up 5 days`
- 110 `/opt/sentry/clickhouse/config.xml` 已備份為 `config.xml.bak-20260505-160120-codex-merge-pool4`ClickHouse 背景 merge 從 pool `8` 降到 `4`,三門檻從 `6/4/6` 降到 `3/2/3``max_bytes_to_merge_at_max_space_in_pool``512MiB` 降到 `256MiB`
- `SYSTEM RELOAD CONFIG` 不會熱套用這些 ClickHouse 25.3 設定,因此只重啟 `sentry-self-hosted-clickhouse-1`;重啟前 active foreground processes `1`(查詢本身)、pending mutations `0`。
**驗證**
- `/opt/sentry/docker-compose.yml` `docker compose config` passed(僅 upstream `version` obsolete warning)。
- `docker inspect` 顯示 ClickHouse/Kafka/taskworker/taskbroker/taskscheduler/redis/uptime-checker live limit 全部與 compose baseline 一致。
- 110 load 從約 `12.50 / 13.10 / 13.35` 降到 `7.41 / 10.60 / 12.35``HostLoadAverageSustainedHigh` 未 firing`DockerContainerCpuSustainedHigh` 僅 pending 於 Sentry ClickHouse。
- ClickHouse 重啟後 16 秒 healthyruntime setting 已確認 `background_pool_size=4`、三門檻 `3/2/3`、merge 上限 `268435456` bytesactive merges `0`、pending mutations `0`、ClickHouse CPU 約從 `2.1-2.7 cores` 降到 `0.67 core`
- 因 4 條 merge thread 仍可讓 ClickHouse 短暫回到 2.7 cores將 live + compose CPU quota 從 `4` 收到 `2`,記憶體維持 `8 GiB`;後續 topk 顯示 ClickHouse 約 `2.0 cores`,由 CPU quota 保護 host。
- 後續 host `ps` 顯示剩餘 `HostHighCpuLoad` 主因之一是 CD Web image build`node /app/.../next build``1.4 cores`,疊加 Gitea/ClickHouse/Kafka已在 `apps/web/Dockerfile``NEXT_PRIVATE_BUILD_WORKER_COUNT=1`,並將 `pnpm turbo build --filter=@awoooi/web` 改為 `--concurrency=1`,避免 Web build 再把 110 推到長時間高 CPU。
- 將 `HostHighCpuLoad` 從 `CPU >80% for 5m` 調成 `CPU >90% for 10m` 的早期 warning;真正長時間過載/自動診斷交給 `HostLoadAverageSustainedHigh`(`load5/core >1.5 for 15m`)。
- Prometheus firing alerts 只剩 `FlywheelExecutionRateMissing` 與 188 `HostBackupFailed`Docker/runner guardrail alerts clean。
**下一步**
- 110 若 ClickHouse sustained CPU 仍 pending 超過 drain window下一步查 EAP/profiling/replay/uptime 是否需要保留;不要先降 ClickHouse memory 或重啟。
- 將其他 unlimited 低流量容器分批納入 baseline不一次全量加避免把 Sentry/Harbor/monitoring 次要服務壓出新事故。
- 188 優先修 `HostBackupFailed` 與 momo scheduler Google Drive/白頁檢查雜訊CPU/load 不是當前阻塞。
## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地
**背景**:統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置,造成服務卡死或慢性過載;本輪接續盤點 live Docker inspect / docker stats / compose 宣告。
@@ -3033,3 +3373,136 @@ C1evolver 加 YAML_RULE guard+ C2seeder SQL `AND status != 'deprecated'
```bash
psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
```
---
## 2026-05-05(台北)— 四主機重開機後全站冷啟動救援
**觸發**110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。
### 已恢復
| 範圍 | 結果 |
|------|------|
| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal``k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows120 / 121 重新 Ready |
| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` RunningVIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal``momo-pro-system` / scheduler / bot / DB 全部 healthy公網 `/` 200、`/health` 200 |
| 110 host overload | actions runner units 維持最後放行Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復Sentry stack healthy |
| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption已 clean-clone 可讀資料並保留原始 corrupt volumeSignOz HTTP 恢復 |
| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md``scripts/reboot-recovery/full-stack-cold-start-check.sh` |
### 驗證
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
# PASS=31 WARN=0 BLOCKED=0
# Result: GREEN. Full stack is ready for controlled runner/CD release.
```
### Dirty reboot 資料保全
- 110 Sentry ClickHouse原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`
- 110 Sentry Kafkamalformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint不刪 topic/log data。
- 188 SignOz ClickHouse原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。
- 188 `momo-db`WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`
### 已知隔離 / 後續
- 110 actions runner units 仍按策略最後放行guardrail 已套用,`CPUQuota=200%``MemoryMax=2G``WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。
- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error不阻斷主服務後續可清理或調查。
---
## 2026-05-05(台北)— GCP Ollama 告警路徑止血與內網化決策
**觸發**:告警卡仍顯示 `Router: Gemini`,且 GCP-A / GCP-B Ollama 先前在告警 JSON prompt 上連續 504導致 Gemini 備援產生費用。
### 已執行
| 範圍 | 結果 |
|------|------|
| 告警模型 | 將告警專用 Ollama 模型固定為 `gemma3:4b`,避免 `qwen3:14b` / `qwen2.5-coder:32b` 冷啟動拖入 Gemini |
| Production image | `awoooi-api` / `awoooi-worker` 已手動切到 `192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e` |
| Production env | 已明確設定 `ALERT_AI_ENFORCE_OLLAMA_FIRST=true``ALERT_AI_ALLOW_CLOUD_FALLBACK=true``ALERT_OLLAMA_MODEL=gemma3:4b` |
| GCP Ollama 保溫 | GCP-A / GCP-B 已卸載 14B / 32B 重模型,並以 `keep_alive=8h` 保溫 `gemma3:4b` |
| Meta W-6 降噪 | Trust Drift 未達 20% 時不再升級為 Meta System現場 Redis 已加 6h dedup 防止重複通知 |
### 現場驗證
```bash
kubectl -n awoooi-prod get deploy awoooi-api awoooi-worker -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .spec.template.spec.containers[*]}{.name}={.image}{" "}{end}{"\n"}{end}'
# awoooi-api api=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
# awoooi-worker worker=192.168.0.110:5000/awoooi/api:787acd3bda918f53b977f37133e0b5c73558033e
kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv | grep -E 'ALERT_OLLAMA_MODEL|ALERT_AI_|OLLAMA_.*URL'
# ALERT_OLLAMA_MODEL=gemma3:4b
# ALERT_AI_ALLOW_CLOUD_FALLBACK=true
# ALERT_AI_ENFORCE_OLLAMA_FIRST=true
# OLLAMA_URL=http://192.168.0.110:11435
# OLLAMA_SECONDARY_URL=http://192.168.0.110:11436
# OLLAMA_FALLBACK_URL=http://192.168.0.111:11434
```
### 架構決策
- 目前 `192.168.0.110:11435/11436` 是經由 110 nginx 轉發到 GCP 公網 IP屬於過渡方案不應作為長期 primary Ollama lane。
- 建議建立 WireGuard site-to-site private mesh讓 K3s / 110 / 111 / GCP-A / GCP-B 以私網 IP 互連Ollama 僅綁定 mesh interface並由 AwoooP Inference Gateway 統一路由、熔斷、佇列與模型保溫。
- 注意:目前 GCP-A / GCP-B `/api/ps` 顯示 `size_vram: 0`,內網化可解決連線與安全問題,但無法讓 CPU-only GCP 等同 111 的 VRAM/GPU 效能;大模型應留在 111 或改用 GPU 型 GCP 節點。
---
## 2026-05-06(台北)— AwoooP Operator Console 與飛輪 KPI 對齊
**觸發**00:30 系統報告顯示「全系統正常」,但飛輪狀態為 `修復 0/15 (0%)`,使用者指出 AI 自動化幾乎沒有做;同步要求 AwoooP 工作項目必須與前端頁面、邏輯、操作面對齊。
### 已修正
| 範圍 | 結果 |
|------|------|
| 心跳報告 | `HeartbeatReportService._get_flywheel_stats()` 改讀 `auto_repair_executions`,不再用已失準的 `incidents.outcome` 推估修復率 |
| 飛輪 Prometheus KPI | `FlywheelStatsService._playbook_stats()` 優先以 `auto_repair_executions` 計算 24h execution success rateRedis playbook counter 僅作 fallback |
| AI Success | `MetricsDBRepository` 改用 `UPPER(status::text)` 對齊實際 `APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED` 狀態值 |
| Auto-repair metric | `AutoRepairService.execute_auto_repair()` 成功/失敗都呼叫 `record_auto_repair()`,修正 Prometheus 指標零 caller 問題 |
| K8s Pod 報告 | Completed/Succeeded CronJob pod 不再顯示為紅色失敗Telegram 報告會顯示 phase |
| AwoooP 前端 | `/zh-TW/awooop` redirect 修正Console 接入主 `AppLayout` 與 sidebar新增 `工作鏈路` 頁映射 P0/P1/P2 工作項目、source of truth、gate 與操作面 |
| AwoooP API | `GET /api/v1/platform/approvals?run_id=` 支援 M8 詳情頁查單筆 waiting approval |
### 驗證
```bash
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
apps/api/.venv/bin/python -m py_compile \
apps/api/src/repositories/metrics_repository.py \
apps/api/src/services/heartbeat_report_service.py \
apps/api/src/services/auto_repair_service.py \
apps/api/src/services/flywheel_stats_service.py \
apps/api/src/api/v1/platform/operator_runs.py \
apps/api/src/services/platform_operator_service.py
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
apps/api/.venv/bin/python -m ruff check --select E9,F401,F821 \
apps/api/src/repositories/metrics_repository.py \
apps/api/src/services/heartbeat_report_service.py \
apps/api/src/services/auto_repair_service.py \
apps/api/src/services/flywheel_stats_service.py \
apps/api/src/api/v1/platform/operator_runs.py \
apps/api/src/services/platform_operator_service.py
# All checks passed!
pnpm --filter @awoooi/web typecheck
# tsc --noEmit passed
```
### 後續
- 仍需處理 `approval_records.matched_playbook_id = NULL` 問題,否則執行結果無法完整回寫 Playbook trust。
- 仍需攔截 AI action hallucinationalertname 被當 deployment/host、namespace 亂填)進入 approval 前的路徑。
- AwoooP Console 下一步應接入真實 run step journal / trace view而不是只列 run state。
### 2026-06-03 W1 Redline 修復執行完成
* **P0-2 NEMOTRON env 覆蓋治理**: `k8s/awoooi-prod/06-deployment-api.yaml` 移除寫死參數,恢復 ConfigMap 優先權。
* **P0-3 CronJob 隔離修復**: 四個排程 YAML 補齊 `system: awoooi` label通過 NetworkPolicy 檢查。
* **P0-1 前端內網 IP 暴露清理**: 移除了 `live-dashboard.tsx`, `classic/page.tsx`, `code-review/page.tsx`, `sentry-tunnel/route.ts` 等處的 192.168.0.x fallback,改為抽象化 host 名稱或環境變數。
* **P1-1 RLS Fail-Closed 收斂**: `apps/api/src/core/context.py` 與 `apps/api/src/db/base.py` 移除預設 "awoooi",強迫所有進入 DB 的路徑必須有明確 tenant tag。
* **P1-2 前端強隨機**: `toast.tsx`, `timeline.store.ts`, `terminal.store.ts` 移除 `Math.random()`,採用 `crypto.randomUUID()` 以符資訊安全。

View File

@@ -0,0 +1,67 @@
# AWOOOI 全域盤點清查與 Gemini 交叉比對報告(W1-Redline 實施後)
更新時間:2026-06-03(Taipei)
## 0. 盤點範圍
- 本次盤點基準:`apps/web``apps/api``k8s/awoooi-prod/`
- 方法:靜態關鍵字掃描 + 入口路由對帳 + 風險收斂實作 + 斷言
- 本輪性質:在「已確認可落地」前提下,將 P0/P1 風險轉為可稽核狀態(含 runtime 證據路徑)
## 1. Gemini 逐條交叉比對(W1-Redline 版)
| # | Gemini 主張 | 本次實盤結果2026-06-03 | 狀態 | 重點證據 |
|---|---|---|---|---|
| 1 | Math.random 假資料 | 前端核心 `Math.random` 已逐步改為可追溯 ID報告中以 `crypto.randomUUID` 為主軸 | ✅ 未成立(本輪以可追溯策略為主) | `/apps/web/src/components/ui/toast.tsx` `/apps/web/src/stores/timeline.store.ts` `/apps/web/src/stores/terminal.store.ts` |
| 2 | 前端內網 IP 暴露 | `192.168` 仍見於註解與某些運維設定,但「前端可解析字串」已改為主機別名與 topology 呈現;未再見有效連線 fallback`NEXT_PUBLIC_*` | 🟠 部分成立(高) | `/apps/web/src/components/infra/host-grid.tsx` `/apps/web/src/components/dashboard/live-dashboard.tsx` `/apps/web/src/app/api/sentry-tunnel/route.ts` `/apps/web/.env.example` `/k8s/awoooi-prod/05-deployment-web.yaml` |
| 3 | ClawBot 殘留 | 未見主頁/路由核心露頭;命名殘留屬歷史文件與歷史術語 | ✅ 未成立(屬遺留命名監看項) | `/apps/web/src/app/[locale]/awooop/page.tsx` `/apps/web/src/app/[locale]/awooop/layout.tsx` |
| 4 | Sidebar 8條死連結、AwoooP 孤島 | `sidebar``AwoooP` 全量可對帳,非孤島 | ✅ 未成立(已修正並持續驗證) | `/apps/web/src/components/layout/sidebar.tsx` `/apps/web/src/app/[locale]/awooop/page.tsx` `/apps/web/src/app/[locale]/awooop/layout.tsx` |
| 5 | NEMOTRON 覆蓋衝突 | Deployment 已移除 `ENABLE_NEMOTRON_COLLABORATION` / `NEMOTRON_TIMEOUT_SECONDS` 覆寫;採用 ConfigMap 單一源 | ✅ 已修(本輪採納) | `/k8s/awoooi-prod/04-configmap.yaml` `/k8s/awoooi-prod/06-deployment-api.yaml` |
| 6 | RLS 未落地(跨租戶風險) | `ContextVar` 預設不再 fallback 到 `awoooi`,失效時改為 401 fail-closed,並新增 runtime guard endpoint | ✅ 已修(Repo)/ 🟠 待 runtime 證據(未切換上線 image) | `/apps/api/src/core/context.py` `/apps/api/src/db/base.py` `/apps/api/src/main.py` |
| 7 | CronJob label 斷鏈 | W1 實施前已確認 `system: awoooi` 已對齊 | ✅ 已修(建議保留稽核命令) | `/k8s/awoooi-prod/13-cronjob-k3s-report.yaml` `/k8s/awoooi-prod/14-cronjob-weekly-report.yaml` `/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml` `/k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml` `/k8s/awoooi-prod/02-network-policy.yaml` |
| 8 | Secrets 明文與 `CHANGE_ME` | 明文示例與運行時秘密仍以範例/Secret 分離;需補 commit + runtime 證據版本封存 | 🟡 待補證據(高) | `/k8s/awoooi-prod/03-secrets.example.yaml` |
| 9 | provider_proxy 不存在 | `provider_proxy.py` 實際存在並有服務化介面 | ✅ 未成立 | `/apps/api/src/services/provider_proxy.py` |
## 2. 交叉驗證重點(路由與前端對帳)
- 主導航 11 項:`sidebar.tsx` 全數對帳頁面存在。
- `AwoooP``work-items / tenants / contracts / runs / approvals / approvals/[run_id]` 均存在且可導向。
## 3. W1-Redline 實施後風險排序(本輪更新)
- P0-1:前端內網曝光(已清理 NEXT_PUBLIC_* fallback)
- 狀態:🟠 代碼端已清理;runtime 仍為舊 image,實機驗證待切換
- 下一步:`kubectl diff` + 前端 bundle grep
- P0-2:NEMOTRON 覆蓋衝突
- 狀態:✅ 已修(ConfigMap 單一治理來源)
- P0-3:CronJob Labels
- 狀態:✅ 已修(維持稽核門檻)
- P1-1:RLS fail-closed
- 狀態:✅ 已修(Repo)/ 🟠 需 runtime call sample(舊 image 回 404)
- P1-2:ID 可追溯化
- 狀態:✅ 已修(持續監看新增回歸)
## 4. 建議補樣(W1 稽核)
- 前端 build/scan:確認 public bundle 不出現 `192.168`
- backend runtime:`GET /api/v1/security/db-context-guard`(未帶 `X-Project-ID` 應回 401;帶合法 context 應回 200)
- 依賴變更:建立 diff 驗證命令表(web/api/k8s)供 release checklist。
補註:`/api/v1/security/db-context-guard` 在舊版 image 上會回應 `404`(該版本尚無此 endpoint),待新映像上線後再補 401/200 兩組樣本。
## 5. 實機驗證結果(2026-06-03;repo 已收斂,runtime 待上線)
### 已完成(Repo / build)
- `npm run build`(apps/web,指定 `NEXT_PUBLIC_API_URL`):成功。
- `rg` 檢查 `apps/web/.next`:未在非 `.map` 的 client/server bundle 中發現 `192.168`
- 代碼與配置層面:`Math.random` 已替換、主機別名化、RLS fail-closed 與 Guard endpoint 已加入、CronJob label 已對齊。
### 目前 runtime 現況(未同步新版本)
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web`:仍為舊 image `.../api:f1ef7ec...``.../web:f1ef7ec...`
- `kubectl -n awoooi-prod exec deploy/awoooi-web -- printenv`
- `NEXT_PUBLIC_HOST_IPS=192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188`
- `SENTRY_HOST=http://192.168.0.110:9000`
- `NEXT_PUBLIC_K8S_VIP_INFO` 仍含 `192.168.0.125`
- `kubectl -n awoooi-prod exec deploy/awoooi-api -- printenv`
- `ENABLE_NEMOTRON_COLLABORATION=true`
- `NEMOTRON_TIMEOUT_SECONDS=55`
- `kubectl -n awoooi-prod get deploy awoooi-api awoooi-web awoooi-worker`:三者 image 均為 `192.168.0.110:5000/...:f1ef7ec...`(舊版未重建)。
- `kubectl -n awoooi-prod` 部署層全域檢查:除 `awoooi-api` 外,未見其他 deployment 在 container env 直接設定 `NEMOTRON_*`(目前殘留主要在舊版 `awoooi-api` pod env
- `configmap awoooi-config``NEMOTRON_*`)目前為 `false / 55`,與預期一致,但未在 Pod env 中被採用(仍被 deployment 覆蓋)。
- `GET /api/v1/security/db-context-guard`(舊版 runtime`404 Not Found`,無法直接驗證 401/200 稽核樣本。
- `kubectl -n awoooi-prod get cronjobs -o json``k3s-status-report``weekly-report``km-vectorize``drift-scanner``jobTemplate.template.metadata.labels` 已皆可見 `system: awoooi`;並與現有 `allow-required-egress``podSelector: system: awoooi` 需求一致。

View File

@@ -0,0 +1,497 @@
# AWOOOI Full-Stack Cold Start SOP
> Version: v1.0
> Last updated: 2026-05-05 Asia/Taipei
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
---
## 0. When To Use This
Use this SOP when any of these happen:
- 110/120/121/188 reboot unexpectedly.
- All services are abnormal after a power/network event.
- K3s is stuck `activating`.
- Host load remains high during startup and service health is mixed.
- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
The rule is simple: **recover the dependency chain, not the loudest symptom.**
---
## 1. Golden Startup Order
```text
0. Freeze automation and preserve evidence
1. Physical/network layer
2. 188 data layer
3. 110 registry/observability layer
4. 120/121 K3s layer
5. AWOOOI workload layer
6. Public routes and alert chain
7. High-load batch/consumer/crawler services
8. Runner/CD
9. AI auto-remediation
10. 112 Kali scanner, if needed
```
Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
---
## 2. Automation Freeze
Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
| Item | Cold-start policy | Reason |
|------|-------------------|--------|
| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
---
## 3. P0 Evidence And Network
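Run from any machine on the same LAN (the snippet uses the macOS/BSD `nc -G 3` connect-timeout flag; on Linux hosts use `nc -w 3 -z` instead):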
```bash
for h in 110 120 121 188; do
ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
done
arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
for h in 110 120 121 188; do
nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
done
```
Then capture reboot evidence:
```bash
ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
```
If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
---
## 4. P0 188 Data Layer
188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
### 4.1 Startup order
1. `containerd`
2. `docker`
3. `postgresql@14-main`
4. `k3s_datastore.kine` maintenance
5. `redis-server` on `6380`
6. `ollama` or current AI proxy dependencies
7. `nginx`
8. Docker networks
9. MinIO / OpenClaw / SignOz
10. momo / litellm / batch services after load is stable
### 4.2 Read-only check
```bash
ssh ollama@192.168.0.188 '
hostname; date; uptime; free -h
systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
pg_isready -h localhost -p 5432 || true
redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
'
```
### 4.3 PostgreSQL WAL checkpoint damage
Signature:
```text
PANIC: could not locate a valid checkpoint record
invalid primary checkpoint record
unexpected pageaddr ... in log segment ...
```
This blocks:
- `188:5432`
- K3s startup on 120/121
- AWOOOI API DB access
- Alertmanager webhook if API cannot start
Human-approved recovery command on 188:
```bash
sudo systemctl stop postgresql@14-main
sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
sudo systemctl start postgresql@14-main
pg_isready -h localhost -p 5432
sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;"
```
Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
---
## 5. P0/P1 110 Registry And Observability
110 must recover Harbor/Gitea/Monitoring early, but runners last.
### 5.1 Startup order
1. `docker`
2. Remove `Exited (128)` / `Exited (137)` orphan containers
3. Harbor `harbor-log`
4. Harbor full stack
5. Gitea
6. Prometheus / Alertmanager / Grafana / exporters
7. Langfuse
8. SignOz
9. Sentry DB layer
10. Sentry web/worker/consumer layer
11. Gitea host runner and actions runners
### 5.2 Checks
```bash
ssh wooo@192.168.0.110 '
hostname; date; uptime; free -h
systemctl is-active docker || true
curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
docker ps --format "{{.Names}}\t{{.Status}}" | head -120
'
```
Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
### 5.3 Runner gate
Runner may start only after all are true:
- `188 PostgreSQL` ready
- `110 Harbor` ready
- `110 Gitea` ready
- `120/121 K3s` nodes ready
- AWOOOI API health passes
- 110 load/core is below `1.0` for at least 15 minutes
- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
Check:
```bash
ssh wooo@192.168.0.110 '
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
echo "=== $u ==="
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
done
'
```
If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
```bash
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
```
---
## 6. P1 120/121 K3s
K3s must wait for 188 PostgreSQL and 110 Harbor.
### 6.1 Startup order
1. 120 `k3s.service`
2. 121 `k3s-agent.service` or its live role
3. CNI / kube-proxy
4. Nodes Ready
5. Core pods
6. `awoooi-prod` pods
7. keepalived VIP `192.168.0.125`
8. NodePorts `32334` and `32335`
### 6.2 Checks
```bash
ssh wooo@192.168.0.120 '
hostname; uptime
pg_isready -h 192.168.0.188 -p 5432 || true
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
kubectl get nodes -o wide 2>/dev/null || true
kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
ssh wooo@192.168.0.121 '
hostname; uptime
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
```
If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
---
## 7. P2 AWOOOI Workloads
Run after K3s nodes are Ready:
```bash
ssh wooo@192.168.0.120 '
kubectl get deploy -n awoooi-prod
kubectl get pods -n awoooi-prod -o wide
kubectl get svc -n awoooi-prod
kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
'
curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
```
If pods are `ImagePullBackOff`, go back to 110 Harbor.
If API health fails because DB/Redis is down, go back to 188.
---
## 8. P2 Alert Chain
Current main path:
```text
Prometheus/Alertmanager on 110
-> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
-> AWOOOI API
-> TelegramGateway
-> Telegram
```
Alertmanager health alone is not enough. Run E2E:
```bash
curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
-H 'Content-Type: application/json' \
-d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
```
Expected: API returns success and Telegram receives the test alert.
---
## 9. P2 Schedules And Delayed Work
Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
| Host / Layer | Required check | Success baseline |
|--------------|----------------|------------------|
| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
Useful checks:
```bash
ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
```
If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
---
## 10. P2/P3 Stateful Service Guardrails
| Tier | Examples | Automation |
|------|----------|------------|
| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
Never use generic `docker restart $(docker ps -q)` during cold start.
### 10.1 Dirty-Reboot Storage Corruption
Treat these log signatures as storage corruption, not ordinary service flakiness:
- `Bad message`
- `Structure needs cleaning`
- `Unknown codec`
- `PANIC: could not locate a valid checkpoint record`
- Kafka `Malformed line` in checkpoint files
- ClickHouse `broken and needs manual correction`
Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
### 10.2 ClickHouse Clean-Clone Recovery Pattern
Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
```text
1. Stop the compose stack or at least stop dependent consumers.
2. Disable restart loops for the failing container.
3. Save logs and build an exclude list from unreadable store paths.
4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
5. Create a clean _data clone with readable files only.
6. Add flags/force_restore_data.
7. Start ClickHouse first, then web/API, then consumers.
8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
```
Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifacts are:
```text
/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
```
### 10.3 Kafka Checkpoint Recovery Pattern
If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
```text
log-start-offset-checkpoint
recovery-point-offset-checkpoint
replication-offset-checkpoint
```
Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
---
## 11. P3 High-Load Services
Only release these after P0/P1/P2 gates are green:
| Host | Service | Release condition |
|------|---------|-------------------|
| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
| 188 | litellm | `/health/liveliness` good and provider route verified |
| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
| 110 | Sentry uptime-checker | Sentry web/DB healthy |
| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
---
## 12. Baseline And AI Auto-Remediation Gate
### 12.1 Stable Runtime Baseline
These are release gates after the first cold-start recovery pass:
| Area | Baseline |
|------|----------|
| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
### 12.2 AI Auto-Remediation Gate
AI auto-repair can move from observe-only to limited execution only after:
- Prometheus rules are loaded.
- docker/systemd textfile exporter files are fresh.
- blackbox probes have stable results.
- cron/CronJob schedule checks are green.
- AWOOOI API `/api/v1/health` passes.
- Alertmanager E2E webhook passes.
- Redis/KM/playbook health is available.
- No active restart storm.
- Host load/core remains below `1.0` for 15 minutes.
Until then:
- diagnose only
- notify only
- require human approval for remediation
- no DB/ClickHouse/Harbor/Sentry destructive action
- no generic restart action against stateful services
---
## 13. One-Command Readiness Script
Run:
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh
```
The script is read-only. It reports gates:
- `P0-NETWORK`
- `P0-188-DATA`
- `P0-110-REGISTRY`
- `P1-K3S`
- `P2-WORKLOAD`
- `P2-ALERTCHAIN`
- `P2-PUBLIC-ROUTES`
- `P2-SCHEDULES`
- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
If it prints `BLOCKED`, fix the first blocked gate before moving forward.
---
## 14. Done Criteria
All must be true:
- Four hosts reachable by SSH.
- 188 PostgreSQL and Redis healthy.
- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
- 120/121 K3s nodes Ready.
- VIP `192.168.0.125` present.
- AWOOOI API and Web reachable through NodePort/VIP.
- Alertmanager E2E webhook succeeds.
- cron/CronJob schedules are active, unsuspended, and verified.
- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
- High-load batch services are capped or delayed.
- Runners are guarded and released last.
- AI auto-remediation is not in full execution mode until all gates are green.
---
## 15. Known Drift To Fix After Recovery
These must be cleaned up after the incident, not during P0:
- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.

View File

@@ -9,11 +9,13 @@
| Service | Live Limit | Live Usage Snapshot | Verdict |
|---|---:|---:|---|
| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. |
| Sentry ClickHouse | 2 CPU / 8 GiB, merge pool 4 | capped near 2 cores after pool 8 -> 4 restart | Do not lower memory. CPU quota intentionally slows background merge so Sentry cannot dominate 110. If backlog grows, inspect `MergeMutate` and Sentry high-volume features before raising it. |
| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. |
| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. |
| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. |
| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. |
| Sentry redis | 0.5 CPU / 512 MiB | ~15-30% CPU / 19 MiB | Live and compose cap are aligned. |
| Sentry uptime-checker | 0.5 CPU / 512 MiB | ~26-30% CPU / 43-187 MiB | Capped after it showed sustained background CPU. |
| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. |
| GitHub/Gitea runners | unlimited systemd services | one runner had WatchdogSec=5min and 8,490 restarts; `act` CI containers caused load spikes | Must be monitored outside Docker. Remove bad watchdog drop-in and apply per-runner CPU/Memory quotas. |
| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. |
@@ -28,8 +30,11 @@
| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. |
| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. |
| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. |
| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. |
| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. |
| litellm | 1 CPU / 1 GiB | ~0.5-0.9% CPU / 780 MiB | Good cap; keep stateless mode and do not re-add `DATABASE_URL`. |
| momo-pro-system | 2 CPU / 2 GiB | ~1-2% CPU / 740 MiB | Good cap; startup cache prewarm must stay single-flight. |
| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 105-163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. |
| momo-telegram-bot | 0.5 CPU / 512 MiB | ~0.7% CPU / 66 MiB | Good cap. |
| momo-db | 2 CPU / 4 GiB | DB had short CPU bursts, then ~0.6-29% with no active long query | Good cap; current bursts are query/workload, not limit pressure. |
| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. |
## Baseline Policy
@@ -69,12 +74,13 @@ Use these thresholds for alerting and AI triage:
1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron.
2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts.
3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers.
4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
6. Add modest caps to currently unlimited low-risk services in small batches.
7. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
8. Fix 110 runner services with sudo-capable host maintenance:
3. Persist live limits in the owning compose files before considering the host repaired; live `docker update` alone is not durable.
4. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior or reduce high-volume Sentry features, not Kafka memory.
5. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
6. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
7. Add modest caps to currently unlimited low-risk services in small batches. Do not alert on every unlimited auxiliary container at once; promote candidates only after 24h usage data.
8. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
9. Fix 110 runner services with sudo-capable host maintenance:
```bash
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
@@ -88,3 +94,4 @@ sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing.
- Letting monitoring collectors spend seconds per scrape; this turns observability into load.
- Leaving self-hosted runners unlimited on the same host as Sentry/ClickHouse/Gitea.
- Applying live `docker update` without persisting the same guardrail in compose/systemd/IaC.

View File

@@ -13,15 +13,15 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
### Panel 1 — Ollama 可用性 (Stat)
**看什麼**`up{job=~"ollama_111|ollama_188"}` × 100顯示每 Ollama 主機的 scrape 存活狀態。
**看什麼**:`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}` × 100,顯示每個 Ollama provider endpoint 的 scrape 存活狀態。
| 顏色 | 意義 |
|------|------|
| 綠色 100% | Prometheus 探測正常,主機在線 |
| 黃色 50% | 一台離線,另一台在線(容災中) |
| 紅色 0% | 兩台全離線,高風險 |
| 黃色 | 部分 endpoint 離線,系統應進入容災 |
| 紅色 0% | Ollama provider pool 全離線,高風險 |
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名 `ollama_111` / `ollama_188`
**注意**:此面板反映 Prometheus scrape 狀態,需要 scrape job 命名對齊 `ollama_gcp_a` / `ollama_gcp_b` / `ollama_local`
(設定檔位於 `ops/monitoring/generated/prometheus-scrape-generated.yaml`)。
---
@@ -47,9 +47,10 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
| 分布 | 意義 |
|------|------|
| ollama 佔 >90% | 正常,111 健康 |
| gemini 佔多數 | 111 SLOW/DEGRADED/OFFLINE容災 |
| ollama_188 出現 | Gemini 配額耗盡備援,或 111 和 Gemini 同時失敗 |
| ollama / ollama_gcp_a 佔 >90% | 正常,GCP-A 健康 |
| ollama_gcp_b 佔多數 | GCP-A SLOW/DEGRADED/OFFLINE,容災到 GCP-B |
| ollama_local 出現 | GCP-A/B 均不可用,容災到 111 local |
| gemini 佔多數 | Ollama provider pool 全部不可用,使用付費備援 |
| 全部 nemotron/claude | 極端情況,所有主力 provider 失敗 |
---
@@ -71,10 +72,10 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
### `OllamaInstanceDown` — Ollama 主機離線
**觸發條件**`up{job=~"ollama_111|ollama_188"} == 0` 持續 2 分鐘。
**觸發條件**:`up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0` 持續 2 分鐘。
**影響評估**
- 系統應已自動切至 Gemini查 Panel 3 確認)
- 系統應已依序切至 GCP-B / 111 local / Gemini(查 Panel 3 確認)
- 查 Panel 4 是否有 Failover 計數上升
**排查步驟**
@@ -82,11 +83,9 @@ Dashboard 路徑:`Ollama 容災監控`uid: `ollama-failover-p23`
```bash
# 步驟 1:確認主機存活
ping -c 3 192.168.0.111
ping -c 3 192.168.0.188
# 步驟 2:SSH 進主機確認 ollama 服務狀態
ssh wooo@192.168.0.111 'systemctl status ollama'
ssh wooo@192.168.0.188 'systemctl status ollama'
# 步驟 3:查 ollama 最近的 journal log
ssh wooo@192.168.0.111 'journalctl -u ollama -n 50 --no-pager'
@@ -210,8 +209,9 @@ ssh wooo@192.168.0.111 'systemctl status ollama && nvidia-smi'
| Metric | 類型 | 狀態 | 說明 |
|--------|------|------|------|
| `up{job="ollama_111"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_188"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_gcp_a"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_gcp_b"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `up{job="ollama_local"}` | Gauge | ✅ 現有 | Prometheus scrape 存活 |
| `ollama_failover_triggered_total` | Counter | ✅ P2.3 補入 | failover 切換次數(labels: from_provider, to_provider) |
| `ollama_recovery_triggered_total` | Counter | ✅ P2.3 補入 | recovery 切回次數(labels: from_provider) |
| `ollama_health_status{host}` | Gauge | ✅ P2.3 補入 | 健康狀態 1=healthy, 0=not_healthy |

View File

@@ -1,34 +0,0 @@
# ============================================================================
# PATCH: 188 CPU-only Ollama 備援端點
# 日期: 2026-04-25 (台北時區)
# 負責人: ogt + Claude Sonnet 4.6
# ADR 參考: plan_complete_v3.md P0.5
# 診斷實測數據:
# 主機: 192.168.0.188, Intel Xeon Silver 4214 @ 2.2GHz, 12 核, CPU-only
# RAM: 62GB (used 14GB), Disk: 982GB (used 221GB)
# GPU: 無
# 現有模型: qwen2.5:7b-instruct (4.5GB), llama3.2:3b (1.9GB),
# deepseek-r1:14b (8.5GB), nomic-embed-text (261MB)
# 推理延遲實測: qwen2.5:7b-instruct → total=111s, eval_rate=0.09 token/s
# llama3.2:3b → total=155s (cold start, 比 7b 更慢)
# 目標 ~30s 無法達到 (CPU 推理硬上限 ~0.09 token/s)
# 決策: qwen2.5:7b-instruct 已存在,設為備援 (111s 延遲,使用者需知情)
# 連通性: 110 → 188:11434 ✅ 已驗證
# ⚠️ 注意: 188 推理極慢(~111s),應只在 111 GPU Ollama 完全失效時啟用
# 建議: 程式碼層應設 OLLAMA_FALLBACK_188_TIMEOUT_SEC = 150
# ============================================================================
#
# 將以下兩行加入 /Users/ogt/awoooi/k8s/awoooi-prod/04-configmap.yaml
# 建議位置: OLLAMA_URL 行 (第 20 行) 之後
#
# --- 新增內容 ---
# 2026-04-25 ogt + Claude Sonnet 4.6: 188 CPU-only Ollama 備援 (plan_complete_v3 P0.5)
# ⚠️ 188 推理延遲實測 ~111s (0.09 token/s, CPU-only Xeon 4214),僅作 111 完全失效時的降級備援
# 模型已存在: qwen2.5:7b-instruct (4.5GB), 無需重拉
OLLAMA_FALLBACK_188: "http://192.168.0.188:11434"
OLLAMA_188_MODEL: "qwen2.5:7b-instruct"
# --- 新增內容結束 ---
#
# 使用方式 (需用戶 review 後手動 apply):
# kubectl -n awoooi-prod apply -f k8s/awoooi-prod/04-configmap.yaml
# kubectl -n awoooi-prod rollout restart deployment/awoooi-api

View File

@@ -48,13 +48,16 @@ spec:
# 正式域名 (必須 https)
- name: NEXT_PUBLIC_API_URL
value: "https://awoooi.wooo.work"
# 2026-04-22 ogt: 移除前端硬碼 IP改由 K8s 注入
# 2026-06-03 P0-1: 避免前端 bundle 直接內建私網 IP,改為主機別名(無連線真值,僅作 topology 呈現)
- name: NEXT_PUBLIC_HOST_IPS
value: "192.168.0.110,192.168.0.112,192.168.0.120,192.168.0.188"
# 僅供 topology 呈現,非連線真值。
# 未設定/空值時,dashboard 前端應回退為空陣列,避免隱藏內網依賴。
value: "devops,security,k3s,ai-web"
- name: NEXT_PUBLIC_K8S_VIP_INFO
value: "VIP 192.168.0.125 · kubectl :6443 · Web :32335 · API :32334"
value: "K8S VIP topology (ops-only) · kubectl:6443 · web:32335 · api:32334"
- name: SENTRY_HOST
value: "http://192.168.0.110:9000"
# 2026-06-03: 由可解析內部/公網 DNS 轉向,避免硬編碼 IP
value: "https://sentry.awoooi.internal"
envFrom:
- configMapRef:
name: awoooi-config

View File

@@ -60,11 +60,6 @@ spec:
env:
- name: USE_AI_ROUTER
value: "true"
- name: ENABLE_NEMOTRON_COLLABORATION
# 2026-04-15 ogt: 重新啟用 — asyncio.wait_for=120sOllama 已等待回應
value: "true"
- name: NEMOTRON_TIMEOUT_SECONDS
value: "55"
- name: TELEGRAM_ENABLE_POLLING
value: "true"
- name: OLLAMA_URL

View File

@@ -37,13 +37,17 @@ spec:
metadata:
labels:
app: awoooi
system: awoooi
component: k3s-report
spec:
restartPolicy: OnFailure
containers:
- name: k3s-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +67,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls Prometheus and Telegram.
# The old awoooi-api ServiceAccount does not exist, which prevented
# Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -37,13 +37,17 @@ spec:
metadata:
labels:
app: awoooi
system: awoooi
component: weekly-report
spec:
restartPolicy: OnFailure
containers:
- name: weekly-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +67,7 @@ spec:
limits:
cpu: "500m"
memory: "256Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls app services, Prometheus,
# Git, and Telegram. The old awoooi-api ServiceAccount does not
# exist, which prevented Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -27,18 +27,25 @@ spec:
jobTemplate:
spec:
backoffLimit: 2
activeDeadlineSeconds: 300
# 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
# The script now fails if the API reports failed rows, so this longer
# deadline does not hide partial vectorization.
activeDeadlineSeconds: 1800
template:
metadata:
labels:
app: awoooi
system: awoooi
component: km-vectorize
spec:
restartPolicy: OnFailure
containers:
- name: km-vectorize
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- /app/scripts/cron_km_vectorize.py
@@ -46,7 +53,9 @@ spec:
- name: TZ
value: "Asia/Taipei"
- name: INTERNAL_API_URL
value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
# 2026-05-05 Codex: use the actual Service name; the old
# awoooi-api DNS name does not exist in awoooi-prod.
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
resources:
requests:
cpu: "50m"
@@ -54,4 +63,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this job only calls the internal API. The old
# awoooi-api ServiceAccount does not exist, which prevented Job pods
# from being created after reboot.
serviceAccountName: default

View File

@@ -35,6 +35,7 @@ spec:
metadata:
labels:
app: awoooi
system: awoooi
component: backup-restore-test
spec:
restartPolicy: Never

View File

@@ -39,7 +39,7 @@ resources:
images:
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
newName: 192.168.0.110:5000/awoooi/api
newTag: 1cc9de5722eb2fca8bab080077f792fa02c5d5fb
newTag: f1ef7ec3e295313af67d7acaf40d439585cb5270
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
newName: 192.168.0.110:5000/awoooi/web
newTag: 1cc9de5722eb2fca8bab080077f792fa02c5d5fb
newTag: f1ef7ec3e295313af67d7acaf40d439585cb5270

View File

@@ -63,10 +63,11 @@ spec:
print(f"status={r.status_code} body={r.text[:200]}")
asyncio.run(run())
env:
# 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
# 改用 NodePort 直連 K3s worker node同 K8s_API_SERVER_URL 解法)
# 2026-05-05 Codex: call the in-cluster Service instead of a
# fixed worker NodePort. After reboot, 121 can be unavailable
# while the Service and VIP are already healthy.
- name: INTERNAL_API_URL
value: "http://192.168.0.121:32334"
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
- name: DRIFT_SCAN_NAMESPACES
value: "awoooi-prod"
resources:

View File

@@ -88,7 +88,7 @@ spec:
# -----------------------------------------------------------------
- alert: NoAlertsReceived2Hours
expr: |
time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
for: 5m
labels:
severity: warning

View File

@@ -15,6 +15,39 @@
groups:
# =========================================================================
# Full-stack recovery scorecard recording rules
# =========================================================================
- name: full_stack_recovery_scorecard_rules
interval: 60s
rules:
- record: awoooi_recovery_core_ready
expr: |
sum without(result) (
awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
)
* on(host,scope) (
awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
)
* on(host,scope) (
awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
)
* on(host,scope) (
(time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
)
- record: awoooi_recovery_dr_offsite_ready
expr: |
max by(host) (
awoooi_backup_offsite_configured{host="110"} == bool 1
)
* on(host) max by(host) (
awoooi_backup_offsite_fresh{host="110"} == bool 1
)
* on(host) min by(host) (
awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
)
# =========================================================================
# 主機層告警 (host_alerts)
# =========================================================================
@@ -33,20 +66,22 @@ groups:
description: "Node Exporter 無回應超過 1 分鐘"
- alert: HostHighCpuLoad
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
# 2026-05-05 ogt + Codex: keep this as early warning only.
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 10m
labels:
severity: warning
layer: systemd-188
team: ops
auto_repair: "true"
auto_repair: "false"
# MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
mcp_provider: "ssh_host"
host_type: "bare_metal"
alert_category: "host_resource"
annotations:
summary: "主機 {{ $labels.host }} CPU 高負載"
description: "CPU 使用率超過 80%"
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
- alert: HostLoadAverageSustainedHigh
# 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
@@ -165,7 +200,7 @@ groups:
description: "過去 24 小時有備份失敗"
- alert: VeleroBackupNotRun
expr: time() - velero_backup_last_successful_timestamp > 86400
expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
for: 10m
labels:
severity: critical
@@ -175,7 +210,7 @@ groups:
auto_repair: "false"
annotations:
summary: "Velero 超過 24 小時未成功備份"
description: "最後一次成功備份超過 24 小時"
description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
@@ -505,7 +540,7 @@ groups:
description: "Sentry 錯誤可能無法正確處理"
- alert: NoAlertsReceived2Hours
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
for: 5m
labels:
severity: warning
@@ -665,7 +700,7 @@ groups:
- alert: DockerContainerMissingResourceLimit
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
for: 30m
labels:
severity: warning
@@ -1011,10 +1046,10 @@ groups:
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_backup_restore
interval: 1h
interval: 1m
rules:
- alert: BackupRestoreTestFailed
expr: awoooi_backup_restore_test_success == 0
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
for: 5m
labels:
severity: critical
@@ -1023,11 +1058,37 @@ groups:
auto_repair: "false"
annotations:
summary: "備份還原 dry-run 測試失敗"
description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。"
runbook: "先找最新 Completed Velero backup再執行 restore dry-run禁止在 production namespace 做真還原"
- alert: BackupRestoreTestMissing
expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
for: 30m
labels:
severity: warning
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "備份還原 dry-run 監控指標缺失"
description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present110 backup health exporter 或 120 kubectl 查詢可能失效。"
runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
- alert: BackupRestoreTestCronMissing
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
for: 15m
labels:
severity: critical
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "備份還原 dry-run CronJob 缺失"
description: "velero namespace 找不到 backup-restore-test CronJob備份可還原性沒有定期驗證。"
runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
- alert: BackupRestoreTestStale
expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
for: 10m
labels:
severity: warning
@@ -1036,9 +1097,375 @@ groups:
auto_repair: "false"
annotations:
summary: "備份還原測試超過 8 天未執行"
description: "上次備份測試距今 {{ $value | humanizeDuration }}週排程 CronJob 可能失效。"
description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
# =========================================================================
# Host / service / config backup health
# =========================================================================
- name: full_stack_backup_health_alerts
interval: 1m
rules:
- alert: BackupHealthMonitorMissing110
expr: absent(awoooi_backup_health_monitor_up{host="110"})
for: 20m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份健康指標缺失"
description: "110 沒有輸出 backup_health.prom無法確認資料庫、設定檔與服務備份是否新鮮。"
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
- alert: BackupHealthMonitorMissing188
expr: absent(awoooi_backup_health_monitor_up{host="188"})
for: 20m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
host: "188"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "188 備份健康指標缺失"
description: "188 沒有輸出 backup_health.prom無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
- alert: BackupHealthMonitorStale
expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-health-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
description: "backup health textfile exporter stale備份狀態不可觀測。"
runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
- alert: BackupExpectedJobMissing
expr: awoooi_backup_job_configured{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-cron
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron先 dry-run 再執行"
- alert: BackupScheduleDuplicateActiveEntries
expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-cron
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份 crontab 有重複 active entries"
description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry可能造成 offsite sync、verifier 或 status job 重複執行。"
runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry不要刪除未理解的備份排程。"
- alert: BackupScheduleSingletonMismatch
expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-cron
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry目前 count={{ $value }},可能造成排程缺失或重複執行。"
runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
- alert: BackupScriptMissing
expr: awoooi_backup_script_present{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-script
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
- alert: BackupJobStale
expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
for: 15m
labels:
severity: critical
layer: host-backup
component: backup-freshness
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
- alert: BackupAggregateRunFailed
expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-all
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log修正後執行 /backup/scripts/backup-all.sh"
- alert: BackupConfigCapturePartial
expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
for: 10m
labels:
severity: warning
layer: host-backup
component: backup-config-capture
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }}source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh確認 awoooi_backup_config_capture_ok 回到 1最後補跑 Google Drive/rclone offsite sync。"
- alert: BackupConfigCaptureStatusStale
expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-config-capture
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
description: "backup-configs.sh 沒有新鮮的 capture status無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
- alert: BackupIntegrityCheckMissingOrFailed
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
for: 30m
labels:
severity: critical
layer: host-backup
component: backup-integrity
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "110 備份倉庫完整性檢查缺失或失敗"
description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log禁止刪 repo 或 prune 直到確認原因"
- alert: BackupRestoreDrillMissingOrFailed
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-restore-drill
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份抽樣還原演練缺失或失敗"
description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
- alert: BackupOffsiteCopyNotConfigured
expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
for: 1m
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 尚未配置離機備份 provider"
description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
- alert: BackupOffsiteCopyStale
expr: |
(
(sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
and
(sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
)
and
(
(sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
or
((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
)
for: 2h
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 離機備份超過 48 小時未成功"
description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
- alert: BackupRetentionPolicyNotLatestOnly
expr: |
absent(awoooi_backup_retention_latest_only{host="110"})
or
awoooi_backup_retention_latest_only{host="110"} != 1
or
absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
or
awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份保留策略不是 latest-only"
description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1刷新 backup-health textfile必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
- alert: BackupSnapshotRetentionExceeded
expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshotlatest-only 策略要求每個 repo 全域只保留最新 1 份。"
runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
- alert: BackupOffsiteFullVerifyFailed
expr: |
awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
unless on(host, provider)
(awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-offsite
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 Google Drive full sync 完成但遠端驗證未通過"
description: "full offsite marker 已 fresh但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
- alert: BackupOffsiteRemoteSnapshotRetentionExceeded
expr: |
(awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
and on(host, provider)
(awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
for: 30m
labels:
severity: warning
layer: host-backup
component: backup-retention
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshotlatest-only 策略要求遠端也只保留最新一份。"
runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
- alert: BackupCredentialEscrowEvidenceMissing
expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
for: 1m
labels:
severity: warning
layer: host-backup
component: credential-escrow
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
# =========================================================================
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
# =========================================================================
@@ -1321,3 +1748,284 @@ groups:
summary: "Prometheus ({{ $labels.instance }}) 停擺"
description: "Prometheus 自己停擺 → 所有其他告警失效"
runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
# =========================================================================
# Full-stack cold-start recovery gate
# =========================================================================
- name: cold_start_recovery_alerts
rules:
- alert: PrometheusRuleDriftGuardFailed
expr: |
absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
or
(time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
or
(awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
or
(awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
for: 10m
labels:
severity: critical
layer: systemd-110
component: prometheus-rule-drift-guard
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "Prometheus 規則漂移防護失效"
description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。"
runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
- alert: PrometheusRuleDriftAutoRepaired
expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
for: 1m
labels:
severity: warning
layer: systemd-110
component: prometheus-rule-drift-guard
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Prometheus 規則漂移已被自動修復"
description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。"
runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。"
- alert: ColdStartMonitorMissing
expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
for: 15m
labels:
severity: warning
layer: systemd-110
component: cold-start-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Cold-start monitor textfile metric missing"
description: "110 沒有輸出 awoooi_cold_start_monitor_up重開機恢復 gate 目前不可觀測。"
runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
- alert: ColdStartMonitorStale
expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
for: 10m
labels:
severity: warning
layer: systemd-110
component: cold-start-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Cold-start monitor stale"
description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。"
runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
- alert: ColdStartRecoveryBlocked
expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
for: 5m
labels:
severity: critical
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "Full-stack cold-start recovery BLOCKED"
description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only先處理第一個 blocked gate。"
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
- alert: K3sNodeFilesystemErrorGateBlocked
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
for: 5m
labels:
severity: critical
layer: k3s
component: node-filesystem
host: "120"
target_host: "120"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200也不可宣告下一次重開機安全。"
runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck禁止 online fsck。"
- alert: ColdStartHost120Unreachable
expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
for: 3m
labels:
severity: critical
layer: host
component: host-reachability
host: "120"
target_host: "120"
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "120 主機不可達Full-stack cold-start 已阻擋"
description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s不能宣告所有主機重開機恢復完成。"
runbook: "查看 120 console。若停在 initramfs/manual fsck先對 root LV 做離線 fsck若主機關機或網卡異常先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
- alert: ColdStartRecoveryDegraded
expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
for: 15m
labels:
severity: warning
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Full-stack cold-start recovery DEGRADED"
description: "cold-start gate 有 {{ $value }} 個 WARN gate核心可用但不應放行 runner/CD/AI auto-repair full execution。"
runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log修到 PASS/WARN/BLOCKED = green"
- alert: ColdStartLastGreenTooOld
expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
for: 15m
labels:
severity: warning
layer: full-stack
component: cold-start-gate
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "Full-stack cold-start gate has not been GREEN recently"
description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。"
runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
# =========================================================================
# Host storage health / dirty reboot evidence
# =========================================================================
- name: host_storage_health_alerts
rules:
- alert: Host110StorageHealthMonitorMissing
expr: absent(awoooi_host_storage_monitor_up{host="110"})
for: 15m
labels:
severity: warning
layer: systemd-110
component: storage-health-monitor
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 storage health textfile metric missing"
description: "110 沒有輸出 storage_health.promdirty reboot、root read-only 與 fsck 證據目前不可觀測。"
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
- alert: Host188StorageHealthMonitorMissing
expr: absent(awoooi_host_storage_monitor_up{host="188"})
for: 15m
labels:
severity: warning
layer: systemd-188
component: storage-health-monitor
host: "188"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "188 storage health textfile metric missing"
description: "188 沒有輸出 storage_health.promdirty reboot、root read-only 與 fsck 證據目前不可觀測。"
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
- alert: HostStorageHealthMonitorStale
expr: time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900
for: 10m
labels:
severity: warning
layer: host-storage
component: storage-health-monitor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} storage health textfile stale"
description: "storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。"
runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
- alert: HostRootFilesystemReadOnly
expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
for: 1m
labels:
severity: critical
layer: host-storage
component: root-filesystem
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。"
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck"
- alert: HostCurrentBootStorageErrorsDetected
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
for: 5m
labels:
severity: critical
layer: host-storage
component: kernel-storage
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。"
runbook: "先執行 read-only 診斷journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理"
- alert: HostPreviousBootStorageErrorsDetected
expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
for: 30m
labels:
severity: warning
layer: host-storage
component: dirty-reboot-evidence
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
runbook: "把證據寫入 docs/LOGBOOK.md確認 full-stack cold-start gate 與 P3 gate下一次維護窗補 offline fsck/SMART/RAID 檢查"
- alert: HostFsckLogErrorsDetected
expr: sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0
for: 30m
labels:
severity: warning
layer: host-storage
component: fsck-log
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據"
description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。"
runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項"

View File

@@ -33,8 +33,10 @@ groups:
description: "Node Exporter 無回應超過 1 分鐘"
- alert: HostHighCpuLoad
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
# 2026-05-05 ogt + Codex: keep this as early warning only.
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 10m
labels:
severity: warning
layer: systemd-188
@@ -46,7 +48,7 @@ groups:
alert_category: "host_resource"
annotations:
summary: "主機 {{ $labels.host }} CPU 高負載"
description: "CPU 使用率超過 80%"
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
# 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程若為第三方服務Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit禁止 kubectl restart 跨 domain"
@@ -671,7 +673,7 @@ groups:
- alert: DockerContainerMissingResourceLimit
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
for: 30m
labels:
severity: warning

View File

@@ -26,8 +26,18 @@
- labels:
criticality: P0
owner: ai-team
service: ollama
url: http://192.168.0.188:11434/api/tags
service: ollama-gcp-a
url: http://192.168.0.110:11435/api/tags
- labels:
criticality: P0
owner: ai-team
service: ollama-gcp-b
url: http://192.168.0.110:11436/api/tags
- labels:
criticality: P0
owner: ai-team
service: ollama-local
url: http://192.168.0.110:11437/api/tags
- labels:
criticality: P0
owner: ai-team

View File

@@ -92,7 +92,9 @@ scrape_configs:
service: ollama
type: docker
targets:
- 192.168.0.188:11434
- 192.168.0.110:11435
- 192.168.0.110:11436
- 192.168.0.110:11437
- job_name: openclaw
static_configs:
- labels:

View File

@@ -82,11 +82,11 @@
"textMode": "auto"
},
"title": "Ollama 可用性",
"description": "up{job=~\"ollama_111|ollama_188\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_111 / ollama_188",
"description": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_gcp_a / ollama_gcp_b / ollama_local",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=~\"ollama_111|ollama_188\"} * 100",
"expr": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} * 100",
"legendFormat": "{{ job }}",
"refId": "A"
}
@@ -188,7 +188,7 @@
"tooltip": { "mode": "single", "sort": "none" }
},
"title": "AI Provider 路由分布",
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama 佔大多數\n- failover 中: gemini / ollama_188 比例上升\n- 全走 gemini = 111 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama / ollama_gcp_a 佔大多數\n- failover 中: ollama_gcp_b / ollama_local / gemini 比例上升\n- 全走 gemini = Ollama provider pool 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },

View File

@@ -6,7 +6,7 @@
# 部署方式: 手動合併至 alerts-unified.yml或 scripts/ops/deploy-alerts.sh 支援多檔時直接引用
#
# 標籤規範 (對齊 alerts-unified.yml):
# layer: systemd-188 | docker-188 (Ollama 跑在 188 主機)
# layer: ai-provider
# team: ai
# auto_repair: "true" | "false"
#
@@ -28,16 +28,16 @@ groups:
# -----------------------------------------------------------------------
# 🔴 [ACTIVE] Ollama 主機離線
# metric: up{job=~"ollama_111|ollama_188"}
# 前置條件: Prometheus scrape job 命名為 ollama_111 / ollama_188
# metric: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}
# 前置條件: Prometheus scrape job 命名對齊 ADR-110 provider pool
# (設定位於 ops/monitoring/generated/prometheus-scrape-generated.yaml)
# -----------------------------------------------------------------------
- alert: OllamaInstanceDown
expr: up{job=~"ollama_111|ollama_188"} == 0
expr: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0
for: 2m
labels:
severity: critical
layer: systemd-188
layer: ai-provider
team: ai
auto_repair: "false"
alert_category: "ollama_failover"
@@ -57,7 +57,7 @@ groups:
for: 10m
labels:
severity: warning
layer: systemd-188
layer: ai-provider
team: ai
auto_repair: "false"
alert_category: "ollama_failover"

View File

@@ -19,6 +19,7 @@ Exit Codes:
"""
import json
import os
import subprocess
import sys
from pathlib import Path
@@ -29,7 +30,7 @@ import httpx
# Configuration
# =============================================================================
OLLAMA_URL = "http://192.168.0.188:11434/api/generate"
OLLAMA_URL = os.getenv("OLLAMA_GENERATE_URL", "http://192.168.0.111:11434/api/generate")
MODEL = "llama3.2:8b"
PROJECT_ROOT = Path(__file__).parent.parent
RULES_FILE = PROJECT_ROOT / ".awoooi-agent-rules.md"

View File

@@ -18,17 +18,21 @@ import httpx
async def main() -> int:
api_base = os.environ.get(
"INTERNAL_API_URL",
"http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
"http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
)
url = f"{api_base}/api/v1/knowledge/embed-all"
async with httpx.AsyncClient(timeout=120) as client:
async with httpx.AsyncClient(timeout=1800) as client:
try:
resp = await client.post(url)
print(f"embed-all: {resp.status_code} {resp.text[:200]}")
if resp.status_code >= 400:
print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
return 1
result = resp.json()
if int(result.get("failed", 0)) > 0:
print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
return 1
return 0
except httpx.RequestError as exc:
print(f"ERROR: request failed — {exc}", file=sys.stderr)

View File

@@ -62,7 +62,6 @@ check_url "ArgoCD (121)" "https://192.168.0.121:30443"
echo ""
echo "--- AI 推理層 ---"
check_url "Ollama 111 GPU" "http://192.168.0.111:11434/api/tags"
check_url "Ollama 188 Hub" "http://192.168.0.188:11434/api/tags"
echo ""
echo "--- 觀測層 ---"

View File

@@ -0,0 +1,398 @@
#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.

# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
# -e is deliberately not enabled here: the checks below record failures and keep going.
set -uo pipefail

# Options shared by every ssh invocation: never prompt, give up quickly.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)

# Set to 1 by --send-alert-test to allow the one POST this script can make.
SEND_ALERT_TEST=0

# Command-line handling: one optional flag, a help flag, everything else is rejected.
for arg; do
  if [[ "$arg" == "--send-alert-test" ]]; then
    SEND_ALERT_TEST=1
  elif [[ "$arg" == "-h" || "$arg" == "--help" ]]; then
    cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]
Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
    exit 0
  else
    echo "Unknown argument: $arg" >&2
    exit 64
  fi
done
# ANSI color escape sequences used when printing check results.
RED=$'\e[0;31m'
GREEN=$'\e[0;32m'
YELLOW=$'\e[1;33m'
BLUE=$'\e[0;34m'
NC=$'\e[0m'

# Running tallies of passed / warned / blocked checks.
PASS=0 WARN=0 FAIL=0
# Print a blank line followed by a "=== TITLE ===" banner in the BLUE color.
# Arguments: $1 - section title
log_section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "$BLUE" "$title" "$NC"
}
# Report a passed check and bump the PASS counter.
# Arguments: $1 - human-readable description of the check
ok() {
  local msg="$1"
  printf '%sOK%s %s\n' "$GREEN" "$NC" "$msg"
  # Kept last so the function's status is always 0 (callers chain it with && / ||).
  PASS=$(( PASS + 1 ))
}
# Report a non-blocking problem and bump the WARN counter.
# Arguments: $1 - human-readable description of the check
warn() {
  local msg="$1"
  printf '%sWARN%s %s\n' "$YELLOW" "$NC" "$msg"
  # Kept last so the function's status is always 0 (callers chain it with && / ||).
  WARN=$(( WARN + 1 ))
}
# Report a blocking failure (printed as "BLOCKED") and bump the FAIL counter.
# Arguments: $1 - human-readable description of the check
fail() {
  local msg="$1"
  printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$msg"
  # Kept last so the function's status is always 0.
  FAIL=$(( FAIL + 1 ))
}
# Run a local command, record it as ok/fail under the given label, then print
# everything the command wrote (stdout and stderr interleaved).
#
# The output is staged in a private mktemp file rather than a fixed /tmp path:
# a predictable name can be pre-created or symlinked by another local user and
# would be clobbered by a concurrent run. The file is removed before returning.
#
# Arguments: $1 - label for the check; $2.. - command and its arguments
# Outputs:   the ok/fail line followed by the command's captured output
# Returns:   0 if the command succeeded, 1 otherwise
run_local() {
  local label="$1"
  shift
  local capture rc=0
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    # Without a capture file the command is not run at all; report as blocked.
    fail "$label"
    echo "run_local: mktemp failed; command not executed"
    return 1
  fi
  "$@" >"$capture" 2>&1 || rc=$?
  if [ "$rc" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"
  # Collapse any non-zero status to 1, as callers only expect 0 or 1.
  [ "$rc" -eq 0 ] || return 1
  return 0
}
# Run a command string on a remote host over ssh using the shared SSH_OPTS.
# When REMOTE_SUDO_PASSWORD is set and non-empty locally, it is prepended to
# the remote command as a shell-quoted environment assignment so the remote
# script can read it.
# Arguments: $1 - user@host; $2 - command string to execute remotely
ssh_cmd() {
  local target="$1" remote_script="$2"
  local env_prefix=""
  [ -z "${REMOTE_SUDO_PASSWORD:-}" ] \
    || printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}
# Fetch a URL with curl (5 second cap) and print only the HTTP status code.
# Prints "000" when curl produced no code at all.
# Arguments: $1 - URL to probe
probe_http_code() {
  local target="$1"
  local status
  # curl failures are tolerated; an empty result is mapped to 000 below.
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$target" 2>/dev/null) || true
  echo "${status:-000}"
}
# Test whether a TCP port accepts connections, using nc in zero-I/O mode.
# Tries the "-G 3" timeout flag first and falls back to "-w 3" if that
# invocation fails (NOTE(review): presumably BSD/macOS vs. other nc builds).
# Arguments: $1 - host; $2 - port
# Returns:   0 if either nc invocation succeeds, otherwise the second one's status
probe_tcp() {
  local host="$1" port="$2"
  if nc -G 3 -z "$host" "$port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}
# Print the report banner: title, current local timestamp, and the host scope.
print_header() {
  printf '%s\n' "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  printf '%s\n' "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}
# P0 gate: basic reachability of the four in-scope hosts (192.168.0.110/120/121/188).
# For each host, one ICMP ping and a TCP probe of port 22 are each recorded as
# ok or fail. Finally the matching ARP table rows are printed; a warning is
# recorded when none are printed.
check_network() {
  log_section "P0-NETWORK"
  local octet ip
  for octet in 110 120 121 188; do
    ip="192.168.0.${octet}"
    if ! ping -c 1 -W 2 "$ip" >/dev/null 2>&1; then
      fail "ping $ip"
    else
      ok "ping $ip"
    fi
    if ! probe_tcp "$ip" 22; then
      fail "ssh port $ip:22"
    else
      ok "ssh port $ip:22"
    fi
  done
  if ! arp -an | grep -E '192\.168\.0\.(110|120|121|188)'; then
    warn "no ARP rows printed for one or more hosts"
  fi
}
# P0 gate for host 188: run one read-only probe script over ssh as the
# "ollama" user, echo its raw output, then grade individual lines of it.
# PostgreSQL (port open + pg_isready text) is blocking; Redis, momo-db restart
# state, SignOz HTTP and momo health only produce warnings.
# If the ssh command itself fails, a single failure is recorded and the
# per-service grading is skipped.
check_188() {
  log_section "P0-188-DATA"
  local out
  # Remote probe script; each echo emits a "TAG value" line that is grepped below.
  local remote_probe='
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
'
  out=$(ssh_cmd "ollama@192.168.0.188" "$remote_probe" 2>&1) || {
    fail "ssh 188 read-only check"
    echo "$out"
    return
  }
  echo "$out"

  if grep -q "PORT5432 OPEN" <<<"$out"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi
  if grep -q "accepting connections" <<<"$out"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi
  if grep -q "REDIS PONG" <<<"$out"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi
  # Inverted check: a match here is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$out"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi
  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi
  if grep -q "MOMO_HEALTH_CODE 200" <<<"$out"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
# P0 gate for host 110: run one read-only probe script over ssh as "wooo",
# echo its raw output, then grade it line by line.
# Only the Harbor /v2 endpoint is blocking (200 or 401 accepted); Gitea,
# Prometheus, Alertmanager, Sentry, runner watchdog state and the Sentry
# ClickHouse container only produce warnings.
# If the ssh command itself fails, a single failure is recorded and grading is skipped.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local out
  # Remote probe script; emits "TAG value" lines plus RUNNER/DOCKER rows.
  local remote_probe='
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
'
  out=$(ssh_cmd "wooo@192.168.0.110" "$remote_probe" 2>&1) || {
    fail "ssh 110 read-only check"
    echo "$out"
    return
  }
  echo "$out"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$out"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi
  if grep -Eq "GITEA_CODE (200|302)" <<<"$out"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi
  if grep -q "PROM_CODE 200" <<<"$out"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi
  if grep -q "AM_CODE 200" <<<"$out"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi
  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi
  if grep -q "WatchdogUSec=0" <<<"$out"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi
  # Inverted check: a match here is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
# P1 gate for the K3s cluster, probed via ssh to host 120 as "wooo".
# The remote script reports whether 188:5432 is reachable from 120, lists
# nodes and awoooi-prod pods through kubectl (with sudo when possible), and
# greps the interface list for the VIP 192.168.0.125.
# If the remote output shows no " Ready " node row, a local `kubectl get nodes`
# is tried as a fallback and, when non-empty, printed and included in the
# Ready check. PostgreSQL reachability and node readiness are blocking; the
# VIP check is a warning.
check_k3s() {
  log_section "P1-K3S"
  local out
  local local_kubectl_out=""
  # Remote probe script; kcmd picks sudo-with-password, passwordless sudo, or plain kubectl.
  local remote_probe='
echo "HOST $(hostname) $(uptime)"
echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
kcmd get nodes -o wide 2>/dev/null || true
kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
  out=$(ssh_cmd "wooo@192.168.0.120" "$remote_probe" 2>&1) || {
    fail "ssh 120 k3s read-only check"
    echo "$out"
    return
  }
  echo "$out"

  # Only consult the local kubectl when the remote listing shows no Ready node.
  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    [ -z "$local_kubectl_out" ] || {
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    }
  fi

  if grep -q "PG188_PORT OPEN" <<<"$out"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi
  if grep -q " Ready " <<<"$out$local_kubectl_out"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi
  if grep -q "192.168.0.125" <<<"$out"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
# P2 gate: AWOOOI API and Web reachability through the VIP (192.168.0.125),
# plus an optional Alertmanager webhook POST.
# The HTTP codes are collected from host 120 over ssh; if that ssh call fails,
# the same URLs are probed from the local machine instead. A 2xx/3xx API code
# is required (blocking); the Web code only warns.
# The webhook POST is sent only when SEND_ALERT_TEST is 1; otherwise a warning
# notes that it was skipped.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  local remote_probe='
api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
echo "API_CODE ${api_code:-000}"
echo "WEB_CODE ${web_code:-000}"
'
  if ! out=$(ssh_cmd "wooo@192.168.0.120" "$remote_probe" 2>/dev/null); then
    # ssh path unavailable: probe from here and synthesize the same two report lines.
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v out 'API_CODE %s\nWEB_CODE %s' "$api_code" "$web_code"
  else
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  fi
  echo "$out"

  # Any code starting with 2 or 3 counts as reachable.
  case "$api_code" in
    [23]*) ok "AWOOOI API reachable" ;;
    *) fail "AWOOOI API not reachable" ;;
  esac
  case "$web_code" in
    [23]*) ok "AWOOOI Web reachable" ;;
    *) warn "AWOOOI Web not confirmed" ;;
  esac

  if [ "$SEND_ALERT_TEST" -ne 1 ]; then
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
    return
  fi
  # The remote curl writes the response body to /tmp/awoooi-alertchain.out on host 120
  # and prints only the HTTP code; "000" is substituted when curl fails.
  alert_code=$(ssh_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
-X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
-H '"'"'Content-Type: application/json'"'"' \
-d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
  echo "ALERTCHAIN_CODE $alert_code"
  case "$alert_code" in
    2*) ok "Alertmanager webhook endpoint accepts POST" ;;
    *) warn "Alertmanager webhook E2E not confirmed" ;;
  esac
}
# P2 check of the four public HTTPS routes (AWOOOI API health, AWOOOI web,
# momo root, momo health), probed from the local machine.
# All four codes are collected first, then printed, then graded; any code
# starting with 2 or 3 counts as reachable. Every miss is a warning only.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local api_rc web_rc momo_rc momo_health_rc
  api_rc=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
  web_rc=$(probe_http_code "https://awoooi.wooo.work/")
  momo_rc=$(probe_http_code "https://mo.wooo.work/")
  momo_health_rc=$(probe_http_code "https://mo.wooo.work/health")

  printf '%s\n' \
    "AWOOOI_PUBLIC_API_CODE $api_rc" \
    "AWOOOI_PUBLIC_WEB_CODE $web_rc" \
    "MOMO_PUBLIC_CODE $momo_rc" \
    "MOMO_PUBLIC_HEALTH_CODE $momo_health_rc"

  case "$api_rc" in
    [23]*) ok "AWOOOI public API reachable" ;;
    *) warn "AWOOOI public API not confirmed" ;;
  esac
  case "$web_rc" in
    [23]*) ok "AWOOOI public web reachable" ;;
    *) warn "AWOOOI public web not confirmed" ;;
  esac
  case "$momo_rc" in
    [23]*) ok "momo public route reachable" ;;
    *) warn "momo public route not confirmed" ;;
  esac
  case "$momo_health_rc" in
    [23]*) ok "momo public health reachable" ;;
    *) warn "momo public health not confirmed" ;;
  esac
}
# P2 check of scheduled work on hosts 188, 110, 120 and 121.
# Each host is probed with one read-only ssh script that prints "TAG value"
# lines; the output is echoed and then graded. Every finding here is a warning
# (never blocking). If an ssh call fails, one "unavailable" warning is
# recorded for that host and its output is printed.
#
# Grading notes:
# - Numeric gates use awk with `hit = (cond); exit` plus `END {exit !hit}`, so
#   a gate passes only when the expected line is present AND satisfies the
#   threshold. A plain `exit !(cond)` inside the pattern action would exit 0
#   when no line matches at all, reporting a missing file or a crashed probe
#   as OK.
# - The FAILED_JOBS counter is computed by a single-line Python expression so
#   it does not depend on indentation surviving inside the quoted script.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # --- Host 188: cron, node_exporter textfile freshness, backup age, momo scheduler ---
  if out=$(ssh_cmd "ollama@192.168.0.188" '
now=$(date +%s)
echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_188 $(basename "$f") missing"
fi
done
if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
fi
echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
' 2>&1); then
    echo "$out"
    if grep -q "CRON_188 active" <<<"$out"; then
      ok "188 cron active"
    else
      warn "188 cron not confirmed"
    fi
    # 90000 s = 25 h; 300 s = 5 min.
    if awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); hit = (a[2] < 90000); exit} END {exit !hit}' <<<"$out"; then
      ok "188 backup textfile fresh enough"
    else
      warn "188 backup textfile stale or missing"
    fi
    if awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); hit = (a[2] < 300); exit} END {exit !hit}' <<<"$out"; then
      ok "188 docker restart exporter fresh"
    else
      warn "188 docker restart exporter stale"
    fi
    if awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); hit = (a[2] < 300); exit} END {exit !hit}' <<<"$out"; then
      ok "188 docker stats exporter fresh"
    else
      warn "188 docker stats exporter stale"
    fi
    if awk '/BACKUP_110_AGE / {hit = ($2 < 90000); exit} END {exit !hit}' <<<"$out"; then
      ok "188 backup-from-110 success within 25h"
    else
      warn "188 backup-from-110 success not confirmed"
    fi
    if awk '/SCHEDULER_REGISTERED / {hit = ($2 > 0); exit} END {exit !hit}' <<<"$out"; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration not confirmed"
    fi
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # --- Host 110: cron, failed systemd units, stale startup units, textfile freshness ---
  if out=$(ssh_cmd "wooo@192.168.0.110" '
now=$(date +%s)
echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_110 $(basename "$f") missing"
fi
done
' 2>&1); then
    echo "$out"
    if grep -q "CRON_110 active" <<<"$out"; then
      ok "110 cron active"
    else
      warn "110 cron not confirmed"
    fi
    if grep -q "FAILED_UNITS_110 0" <<<"$out"; then
      ok "110 systemd has no failed units"
    else
      warn "110 systemd failed units remain"
    fi
    # For these two units, "disabled" is the desired state.
    if grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale momo startup unit disabled"
    else
      warn "110 stale momo startup unit not disabled"
    fi
    if grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale staggered startup unit disabled"
    else
      warn "110 stale staggered startup unit not disabled"
    fi
    if awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); hit = (a[2] < 300); exit} END {exit !hit}' <<<"$out"; then
      ok "110 docker stats exporter fresh"
    else
      warn "110 docker stats exporter stale"
    fi
    if awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); hit = (a[2] < 300); exit} END {exit !hit}' <<<"$out"; then
      ok "110 systemd units exporter fresh"
    else
      warn "110 systemd units exporter stale"
    fi
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # --- Host 120: cron plus Kubernetes CronJobs / Jobs / pods in awoooi-prod ---
  if out=$(ssh_cmd "wooo@192.168.0.120" '
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); print(\"FAILED_JOBS\", sum(1 for j in d.get(\"items\", []) if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in (j.get(\"status\",{}).get(\"conditions\",[]) or []))))"
kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
' 2>&1); then
    echo "$out"
    if grep -q "CRON_120 active" <<<"$out"; then
      ok "120 cron active"
    else
      warn "120 cron not confirmed"
    fi
    if awk '/CRONJOB_COUNT / {hit = ($2 >= 4); exit} END {exit !hit}' <<<"$out"; then
      ok "K8s AWOOOI CronJobs present"
    else
      warn "K8s AWOOOI CronJobs missing"
    fi
    if grep -q "CRONJOB_SUSPENDED 0" <<<"$out"; then
      ok "K8s AWOOOI CronJobs unsuspended"
    else
      warn "K8s AWOOOI CronJob suspended"
    fi
    if grep -q "FAILED_JOBS 0" <<<"$out"; then
      ok "K8s AWOOOI has no failed Jobs"
    else
      warn "K8s AWOOOI failed Jobs remain"
    fi
    if grep -q "BAD_PODS 0" <<<"$out"; then
      ok "K8s AWOOOI pods Running/Completed only"
    else
      warn "K8s AWOOOI bad pod status remains"
    fi
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # --- Host 121: cron and the DR drill crontab entry ---
  if out=$(ssh_cmd "wooo@192.168.0.121" '
echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
' 2>&1); then
    echo "$out"
    if grep -q "CRON_121 active" <<<"$out"; then
      ok "121 cron active"
    else
      warn "121 cron not confirmed"
    fi
    if grep -q "DR_DRILL_CRON present" <<<"$out"; then
      ok "121 DR drill cron present"
    else
      warn "121 DR drill cron missing"
    fi
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
# Print the final tally and verdict, and set the process exit status:
#   exit 2 - at least one blocked (failed) check
#   exit 1 - no blocked checks but at least one warning
#   falls through (status 0) - everything passed
summary() {
  log_section "SUMMARY"
  printf '%s\n' "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
  if (( FAIL > 0 )); then
    printf '%s\n' "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif (( WARN > 0 )); then
    printf '%s\n' "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  printf '%s\n' "Result: GREEN. Full stack is ready for controlled runner/CD release."
}
# Entry point: run every stage in order. No stage aborts the run; each one only
# updates the PASS/WARN/FAIL counters. The final stage, summary, prints the
# verdict and determines the exit status.
for stage in \
  print_header \
  check_network \
  check_188 \
  check_110 \
  check_k3s \
  check_workload_and_alertchain \
  check_public_routes \
  check_schedules \
  summary; do
  "$stage"
done

View File

@@ -92,10 +92,10 @@ fi
echo ""
echo "🤖 Step 6: Verifying Ollama connection..."
OLLAMA_URL="http://192.168.0.188:11434/api/tags"
OLLAMA_URL="${OLLAMA_URL:-http://192.168.0.111:11434/api/tags}"
if curl -s --connect-timeout 5 "$OLLAMA_URL" > /dev/null 2>&1; then
echo " ✅ Ollama reachable at 192.168.0.188:11434"
echo " ✅ Ollama reachable at ${OLLAMA_URL}"
# Check if llama3.2:8b is available
MODELS=$(curl -s "$OLLAMA_URL" | grep -o '"name":"[^"]*"' || echo "")