From f8e44971c152f09bd2274114194b8480803a72ff Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 1 May 2026 14:20:16 +0800 Subject: [PATCH] feat(aiops): enable read-only agent loop canary --- .agents/skills/04-awoooi-devops-commander.md | 18 ++ .agents/skills/07-tool-integration-expert.md | 4 +- .gitea/workflows/cd.yaml | 31 +++- apps/api/src/core/config.py | 10 ++ apps/api/src/services/openclaw.py | 156 ++++++++++++++++++ apps/api/tests/test_agent_loop_foundation.py | 70 ++++++++ docs/LOGBOOK.md | 17 ++ docs/adr/ADR-105-mcp-agent-loop-governance.md | 12 ++ k8s/awoooi-prod/04-configmap.yaml | 5 + 9 files changed, 315 insertions(+), 8 deletions(-) diff --git a/.agents/skills/04-awoooi-devops-commander.md b/.agents/skills/04-awoooi-devops-commander.md index 3931d176..89aafc06 100644 --- a/.agents/skills/04-awoooi-devops-commander.md +++ b/.agents/skills/04-awoooi-devops-commander.md @@ -39,6 +39,7 @@ | v2.6 | 2026-04-11 | Claude Sonnet 4.6 | **Sprint B-1 Ansible IaC 骨架 + Architecture Review 安全修復** | | v2.7 | 2026-04-11 | Claude Sonnet 4.6 | **Sprint B-2/B-3 ArgoCD GitOps + Sprint C Velero/rsync DR + ADR-070 MCP Phase 1-4 全自動 AIOps 閉環 + ADR-071 告警通知四類型** | | v2.8 | 2026-04-25 | Claude Sonnet 4.6 | **🔴 Prometheus 記憶體指標選擇規範(working_set vs usage_bytes)+ Gitea HMAC Webhook 規範** | +| v2.9 | 2026-05-01 | Codex | **ArgoCD deploy revision gate:CD 不得以舊 revision Synced/Healthy 誤判成功** | --- @@ -624,6 +625,23 @@ concurrency: - Session Conflict 錯誤 - set_output 檔案遺失 +### ArgoCD Deploy Revision Gate (2026-05-01) + +GitOps CD 在 `kustomization.yaml` commit/push 後,禁止只用 `Synced + Healthy` 判定完成;那可能是上一個 revision 已同步。正確條件: + +```bash +DEPLOY_REVISION=$(git rev-parse HEAD) # chore(cd): deploy ... commit +kubectl annotate application awoooi-prod -n argocd \ + argocd.argoproj.io/refresh=hard --overwrite + +# 必須同時成立 +status.sync.status == Synced +status.health.status == Healthy +status.sync.revision == DEPLOY_REVISION +``` + +超時必須 `exit 1`,不可繼續 rollout/health check 舊 image,否則會把「舊版健康」誤報成「新版已部署」。 + --- ## 🚨 Runner 殭屍進程修復 (2026-03-26 教訓) diff --git a/.agents/skills/07-tool-integration-expert.md b/.agents/skills/07-tool-integration-expert.md index fcb057d0..03134a9a 100644 --- a/.agents/skills/07-tool-integration-expert.md +++ b/.agents/skills/07-tool-integration-expert.md @@ -10,7 +10,7 @@ | 欄位 | 值 | |------|-----| -| **版本** | v1.4 | +| **版本** | v1.5 | | **建立日期** | 2026-03-25 23:30 (台北) | | **建立者** | Claude Code | | **最後修改** | 2026-05-01 15:45 (台北) | @@ -20,6 +20,7 @@ | 版本 | 日期 | 執行者 | 變更內容 | |------|------|--------|----------| +| v1.5 | 2026-05-01 | Codex | OpenClaw Agent Loop read-only shadow canary + prod feature flag | | v1.4 | 2026-05-01 | Codex | MCP Agent Loop governance、audit schema、Agent role tool permissions | | v1.3 | 2026-03-26 18:00 | Claude Code | 新增 Grafana MCP (#83) + SignOz query_logs | | v1.2 | 2026-03-26 23:30 | Claude Code | 新增 Filesystem MCP Tool (#82 已完成) | @@ -57,6 +58,7 @@ Phase 13.2 Tool 實作 (P0 最優先): - OpenClaw / NemoTron / Hermes / ElephantAlpha 的工具白名單必須由 `ai_providers/permissions.py` 控制。 - Internal RAG/MCP 知識層沿用 PostgreSQL + pgvector + Redis hot cache;不得為「MCP RAG」另建孤立資料庫,除非已有量級、隔離或延遲證據。 - `incident_id` 在 MCP audit schema 中使用 `VARCHAR(64)`,因為 AWOOOI incident 是 `INC-*` 字串,不是 UUID。 +- OpenClaw Agent Loop 初期只可用 shadow canary:`ENABLE_OPENCLAW_AGENT_LOOP_SHADOW=true` 時,先給 read-only tools 且不改主決策;確認 `mcp_audit_log`、latency、LLM quality 後才允許升級成 decisive path。 ### 已完成 Tool 功能 diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 0ca1b414..bcef1eed 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -635,6 +635,7 @@ jobs: git config user.email "cd@awoooi.internal" git config user.name "AWOOOI CD" git add k8s/awoooi-prod/kustomization.yaml + DEPLOY_REVISION="" git diff --cached --quiet && echo "⚡ kustomization.yaml 無變化,跳過 push" || { git commit -m "chore(cd): deploy ${IMAGE_TAG::7} [skip ci]" # 用 token 推送(避免 SSH key 需要額外設定 push 權限) @@ -644,26 +645,42 @@ jobs: # 2026-04-17 ogt: -X theirs — kustomization.yaml 衝突時採用當次部署的 image tag git fetch gitea main git rebase -X theirs gitea/main + DEPLOY_REVISION=$(git rev-parse HEAD) git push gitea main - echo "✅ kustomization.yaml 已 push,等待 ArgoCD sync..." + echo "✅ kustomization.yaml 已 push,等待 ArgoCD sync 到 ${DEPLOY_REVISION:0:8}..." } # ─── Step 4: 等待 ArgoCD sync + rollout ─── - ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'ARGOCD_WAIT' + ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \ + "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT' set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml - # 等待 ArgoCD Application Synced(最多 120s) + # 等待 ArgoCD Application Synced(最多 180s)。只看 + # Synced/Healthy 可能誤判成上一個 revision 已同步,因此有 + # deploy commit 時必須同時確認 status.sync.revision。 echo "⏳ 等待 ArgoCD sync..." - for i in $(seq 1 24); do + sudo kubectl annotate application awoooi-prod -n argocd \ + argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true + for i in $(seq 1 36); do SYNC=$(sudo kubectl get application awoooi-prod -n argocd \ -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") HEALTH=$(sudo kubectl get application awoooi-prod -n argocd \ -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - echo " ArgoCD: sync=$SYNC health=$HEALTH" + REVISION=$(sudo kubectl get application awoooi-prod -n argocd \ + -o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown") + SHORT_REVISION=$(echo "$REVISION" | cut -c1-8) + SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8) + echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}" if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then - echo "✅ ArgoCD Synced + Healthy" - break + if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then + echo "✅ ArgoCD Synced + Healthy" + break + fi + fi + if [ "$i" = "36" ]; then + echo "❌ ArgoCD 未在期限內同步到目標 revision" + exit 1 fi sleep 5 done diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index f9137a4a..79e909df 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -61,6 +61,16 @@ class Settings(BaseSettings): default=False, description="Phase 24: True=新 AIRouter 路由, False=舊 openclaw.py fallback chain", ) + ENABLE_OPENCLAW_AGENT_LOOP_SHADOW: bool = Field( + default=False, + description="ADR-105 P1: True=OpenClaw 生成 proposal 後用本地 Agent Loop 做 read-only shadow investigation, False=不執行", + ) + OPENCLAW_AGENT_LOOP_MAX_ITERATIONS: int = Field( + default=3, + ge=1, + le=5, + description="ADR-105 P1: OpenClaw Agent Loop shadow 最大 tool_use 輪數", + ) # ========================================================================== # W1 PR-P1: Playbook 匹配 Feature Flag (2026-04-28 ogt + Claude Sonnet 4.6) diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index c472ca92..e4db0452 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -1709,6 +1709,153 @@ Focus on: # 2026-03-31 Claude Code: 統帥批准實作 # ========================================================================= + async def _maybe_run_openclaw_agent_loop_shadow( + self, + *, + proposal: dict, + incident_id: str, + severity: str, + signals: list[dict], + affected_services: list[str], + expert_context: dict | None = None, + ) -> None: + """ + ADR-105 P1: read-only Agent Loop shadow investigation. + + This is intentionally non-decisive: it proves MCP tool_use/audit wiring + with local models and read-only tools, then falls back silently to the + regular proposal when disabled or unavailable. + """ + if not settings.ENABLE_OPENCLAW_AGENT_LOOP_SHADOW: + return + + try: + from src.plugins.mcp.registry import get_provider_registry + from src.services.ai_providers.agent_loop import AgentToolExecutor + from src.services.ai_providers.permissions import is_read_only_tool + from src.services.ai_router import get_ai_registry + + ai_registry = get_ai_registry() + provider = ai_registry.get("ollama") or ai_registry.get("ollama_188") + if provider is None or not hasattr(provider, "analyze_with_tools"): + logger.warning( + "openclaw_agent_loop_shadow_skipped", + incident_id=incident_id, + reason="no_local_tool_provider", + ) + return + + mcp_registry = get_provider_registry() + providers = {p.name: p for p in mcp_registry.all()} + allowed_servers = {"kubernetes", "prometheus", "signoz", "database", "rag", "grafana"} + available_tools = [] + for mcp_provider in providers.values(): + if mcp_provider.name not in allowed_servers: + continue + try: + provider_tools = await mcp_provider.list_tools() + except Exception as exc: + logger.warning( + "openclaw_agent_loop_tool_list_failed", + incident_id=incident_id, + provider=mcp_provider.name, + error=str(exc), + ) + continue + available_tools.extend( + tool for tool in provider_tools + if tool.server_name in allowed_servers and is_read_only_tool(tool) + ) + + if not available_tools: + logger.warning( + "openclaw_agent_loop_shadow_skipped", + incident_id=incident_id, + reason="no_readonly_tools", + ) + return + + executor = AgentToolExecutor( + available_tools=available_tools, + providers=providers, + agent_role="openclaw", + incident_id=incident_id, + flywheel_node="reason", + ) + shadow_prompt = self._build_agent_loop_shadow_prompt( + proposal=proposal, + incident_id=incident_id, + severity=severity, + signals=signals, + affected_services=affected_services, + expert_context=expert_context, + ) + result = await provider.analyze_with_tools( + prompt=shadow_prompt, + available_tools=available_tools, + tool_executor=executor.execute, + max_iterations=settings.OPENCLAW_AGENT_LOOP_MAX_ITERATIONS, + agent_role="openclaw", + context={ + "incident_id": incident_id, + "severity": severity, + "task_type": "diagnose", + }, + ) + proposal["agent_loop_shadow"] = { + "enabled": True, + "success": result.success, + "provider": result.provider, + "tokens": result.tokens, + "latency_ms": round(result.latency_ms, 1), + "error": result.error, + "preview": (result.raw_response or "")[:700], + } + logger.info( + "openclaw_agent_loop_shadow_complete", + incident_id=incident_id, + provider=result.provider, + success=result.success, + tools_available=len(available_tools), + latency_ms=round(result.latency_ms, 1), + ) + except Exception as exc: + logger.warning( + "openclaw_agent_loop_shadow_failed", + incident_id=incident_id, + error=str(exc), + ) + + def _build_agent_loop_shadow_prompt( + self, + *, + proposal: dict, + incident_id: str, + severity: str, + signals: list[dict], + affected_services: list[str], + expert_context: dict | None = None, + ) -> str: + """Build a compact read-only investigation prompt for Agent Loop shadow mode.""" + return f"""你是 OpenClaw 的唯讀 shadow investigator。你可以使用 MCP 工具查證,但不得要求任何寫入、重啟、刪除或通知動作。 + +請只回傳 JSON: +{{ + "root_cause_check": "你對目前根因的查證結論", + "evidence_used": ["最多 5 條具體證據"], + "confidence_delta": -0.1, + "missing_evidence": ["還缺什麼證據"], + "human_or_ai_next_step": "下一個安全步驟" +}} + +Incident: {incident_id} +Severity: {severity} +Affected services: {json.dumps(affected_services, ensure_ascii=False)} +Signals: {json.dumps(signals[:5], ensure_ascii=False, default=str)} +Current proposal: {json.dumps(proposal, ensure_ascii=False, default=str)} +Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=str)} +""" + async def generate_incident_proposal_with_tools( self, incident_id: str, @@ -1762,6 +1909,15 @@ Focus on: if not success or proposal is None: return proposal, provider, success + await self._maybe_run_openclaw_agent_loop_shadow( + proposal=proposal, + incident_id=incident_id, + severity=severity, + signals=signals, + affected_services=affected_services, + expert_context=expert_context, + ) + # Step 2: 判斷是否需要 Nemotron risk_level = proposal.get("risk_level", "low").lower() if risk_level == "low": diff --git a/apps/api/tests/test_agent_loop_foundation.py b/apps/api/tests/test_agent_loop_foundation.py index 84505391..fbaacd7a 100644 --- a/apps/api/tests/test_agent_loop_foundation.py +++ b/apps/api/tests/test_agent_loop_foundation.py @@ -2,6 +2,7 @@ import pytest from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult from src.plugins.mcp.registry import AuditedMCPToolProvider +from src.services.ai_providers.interfaces import AIResult from src.services.ai_providers.agent_loop import AgentToolExecutor from src.services.ai_providers.permissions import filter_tools_for_agent, is_tool_allowed from src.services.ai_providers.tool_schema import ( @@ -119,3 +120,72 @@ async def test_agent_tool_executor_blocks_disallowed_tool(): assert result.success is False assert "not allowed" in (result.error or "") assert provider.calls == [] + + +@pytest.mark.asyncio +async def test_openclaw_agent_loop_shadow_uses_read_only_tools(monkeypatch): + from src.core.config import settings + from src.services.openclaw import OpenClawService + + monkeypatch.setattr(settings, "ENABLE_OPENCLAW_AGENT_LOOP_SHADOW", True) + monkeypatch.setattr(settings, "OPENCLAW_AGENT_LOOP_MAX_ITERATIONS", 2) + + class FakeAIProvider: + def __init__(self): + self.seen_tools = [] + + async def analyze_with_tools( + self, + prompt, + available_tools, + tool_executor, + max_iterations=5, + agent_role="openclaw", + context=None, + ): + self.seen_tools = available_tools + return AIResult( + raw_response='{"root_cause_check":"ok"}', + success=True, + provider="ollama_agent_loop", + latency_ms=12.3, + ) + + class FakeAIRegistry: + def __init__(self, provider): + self.provider = provider + + def get(self, name): + return self.provider if name == "ollama" else None + + class FakeMCPRegistry: + def all(self): + return [FakeProvider("database"), FakeProvider("ssh_host")] + + async def fake_list_tools(self): + if self.name == "database": + return [ + _tool("database", "list_incidents"), + _tool("database", "execute_sql"), + ] + return [_tool("ssh_host", "run_command")] + + fake_ai_provider = FakeAIProvider() + monkeypatch.setattr(FakeProvider, "list_tools", fake_list_tools) + monkeypatch.setattr("src.services.ai_router.get_ai_registry", lambda: FakeAIRegistry(fake_ai_provider)) + monkeypatch.setattr("src.plugins.mcp.registry.get_provider_registry", lambda: FakeMCPRegistry()) + + service = object.__new__(OpenClawService) + proposal = {"risk_level": "medium", "action": "investigate"} + + await service._maybe_run_openclaw_agent_loop_shadow( + proposal=proposal, + incident_id="INC-1", + severity="P2", + signals=[{"alertname": "ApiErrorRateHigh"}], + affected_services=["awoooi-api"], + expert_context={"source": "test"}, + ) + + assert proposal["agent_loop_shadow"]["success"] is True + assert [tool.name for tool in fake_ai_provider.seen_tools] == ["list_incidents"] diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 00605e7b..b422d9cd 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,23 @@ --- +## 2026-05-01 | Agent Loop P1 canary + CD Argo revision gate + SSH MCP 四節點閉環 + +承接 ADR-105 地基與 production 驗證後的待辦:CD 會在 push deploy commit 後誤判上一個 Argo revision 已 Synced/Healthy;SSH MCP key 尚未授權 120/121;Agent Loop 仍只停在 provider capability,尚未有 production canary。 + +### 完成 +- Gitea CD `Deploy to K8s (ArgoCD GitOps)` 在 push `chore(cd): deploy ... [skip ci]` 後,會記錄 `DEPLOY_REVISION`,先 annotate `argocd.argoproj.io/refresh=hard`,再等待 `.status.sync.revision == DEPLOY_REVISION` 且 Synced/Healthy;超時直接 fail,不再讓舊 revision rollout 假成功。 +- `ssh-mcp-key` public key 已以一次性 privileged pod 追加到 `mon(192.168.0.120)` 與 `mon1(192.168.0.121)` 的 `wooo/.ssh/authorized_keys`;臨時 pod 已刪除。 +- API pod 內使用 `/run/secrets/ssh_mcp_key` + `/etc/ssh-mcp/known_hosts` 驗證 `wooo@192.168.0.120`、`wooo@192.168.0.121` 均回 `OK`。 +- 新增 `ENABLE_OPENCLAW_AGENT_LOOP_SHADOW` / `OPENCLAW_AGENT_LOOP_MAX_ITERATIONS`;production configmap 開啟 read-only canary,最多 2 輪,本地 Ollama tool_use,不改主決策。 +- `OpenClawService.generate_incident_proposal_with_tools()` 在原 proposal 成功後執行 read-only Agent Loop shadow investigation,只給 Kubernetes/Prometheus/SignOz/Database/RAG/Grafana 的 read-only MCP tools,結果附加 `agent_loop_shadow` metadata。 +- Agent Loop shadow 失敗只 log warning,不阻塞原本 PreDecision/Nemotron/Playbook 路徑。 + +### 驗證 +- `python3 -m py_compile apps/api/src/core/config.py apps/api/src/services/openclaw.py apps/api/src/services/ai_providers/permissions.py` 通過。 +- `cd apps/api && pytest tests/test_agent_loop_foundation.py tests/test_openclaw_cache_key.py -q` → 8 passed。 +- Production 前置檢查:最新 image `b6cf6167` 健康;ArgoCD `Synced Healthy 33a71489`;四節點 SSH MCP key 驗證完成。 + ## 2026-05-01 | LLM 鬼循環治理 — in-flight lock + stable cache + no-retry 2xx Claude Code 成本評估指出真正瓶頸不是外部 AI 費用,而是同一告警 0 秒重入、20 秒週期反覆呼叫 LLM、以及 HTTP 500 讓 Alertmanager 立即重試。結論:先修飛輪,再談 Gemini/Groq/Claude 訂閱;健康狀態下外部 provider 只應作為 capped fallback。 diff --git a/docs/adr/ADR-105-mcp-agent-loop-governance.md b/docs/adr/ADR-105-mcp-agent-loop-governance.md index 04763fe7..6a922022 100644 --- a/docs/adr/ADR-105-mcp-agent-loop-governance.md +++ b/docs/adr/ADR-105-mcp-agent-loop-governance.md @@ -64,6 +64,18 @@ Alert -> Agent Loop -> LLM tool_use -> MCP tool_result -> Final Decision 3. P2: OpenClaw/NemoTron/Hermes/ElephantAlpha 逐一接線,先 read-only / diagnose,再開執行類工具。 4. P3: Langfuse spans、Grafana MCP dashboard、audit replay。 +## 2026-05-01 P1 Canary + +OpenClaw 先接 read-only shadow investigation,而不是直接替換主決策: + +- Feature flag:`ENABLE_OPENCLAW_AGENT_LOOP_SHADOW` +- 輪數上限:`OPENCLAW_AGENT_LOOP_MAX_ITERATIONS`(prod canary 設 2) +- 觸發點:`OpenClawService.generate_incident_proposal_with_tools()` 原 proposal 成功後 +- 允許工具:Kubernetes / Prometheus / SignOz / Database / RAG / Grafana 的 read-only tools +- Provider:本地 Ollama 優先,不新增 Gemini/Claude 付費呼叫 +- 影響面:只附加 `agent_loop_shadow` metadata,不覆蓋 `action`、`risk_level`、`confidence` 或 Nemotron tool result +- 失敗策略:log warning 後回到既有 proposal / Nemotron / Playbook 路徑 + ## 驗收 - `mcp_audit_log` 24h call count > 0。 diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index a9b9f13a..16ce9152 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -52,6 +52,11 @@ data: # 回滾: kubectl set env deployment/awoooi-api -n awoooi-prod USE_AI_ROUTER=false # ============================================================================ USE_AI_ROUTER: "true" + # ADR-105 P1: OpenClaw Agent Loop shadow canary. + # 只給 read-only MCP tools,使用本地 Ollama,結果只附加 metadata 不改決策。 + # 回滾: kubectl set env deployment/awoooi-api -n awoooi-prod ENABLE_OPENCLAW_AGENT_LOOP_SHADOW=false + ENABLE_OPENCLAW_AGENT_LOOP_SHADOW: "true" + OPENCLAW_AGENT_LOOP_MAX_ITERATIONS: "2" # ============================================================================ # Phase 22: OpenClaw + Nemotron 協作 (ADR-044)