Compare commits

..

9 Commits

Author SHA1 Message Date
Your Name
337b2df60d chore(cd): deploy latest image tag for prod manifests 2026-06-04 00:13:51 +08:00
Your Name
ab21d8bad2 chore: execute W1-redline convergence updates and evidence log 2026-06-03 20:10:14 +08:00
Your Name
2d37383fc6 fix(monitoring): fix false positive NoAlertsReceived2Hours by filtering only alertmanager source 2026-05-28 15:33:17 +08:00
Your Name
3779f6f1e0 fix(metrics): 串入飛輪指標到 /metrics 主端點,修復 FlywheelExecutionRateMissing 死告警
INC-20260507-99ADF2 根因(feedback_full_chain_first_then_fix.md 全鏈分析):

【鏈路斷點】規則層(5/3 加)vs 指標層(5/6 改)vs scrape 層(從沒同步)
- 577250a6(5/3)「反消音化」commit 加了 FlywheelExecutionRateMissing
  rule,要求 110 Prom scrape 到 awoooi_flywheel_execution_success_rate;
- a2c4b3d4(5/6)Codex 改 FlywheelStatsService 用 auto_repair_executions
  作 source of truth(24h 樣本 1-9 筆回 None 給 W-3b watchdog 接管);
- 但 awoooi_flywheel_* 指標自始至終只在 /api/v1/stats/flywheel/metrics
  暴露,而 110 Prom 的 awoooi-api job 抓的是 /metrics → absent() 永遠回傳 1
  → 告警自 2026-05-06T04:14 UTC 起持續 firing 超過 26 小時,屬於 dead alert

【修法】只動 awoooi-api 一處,不碰 Codex 設計、不碰 110 Prom 配置:
- main.py /metrics endpoint 改 async,在 generate_latest() 後串入
  FlywheelStatsService.compute() → to_prometheus_lines()。
- 既有 awoooi-api scrape job 自動拿到飛輪指標。
- 完全保留 Codex a2c4b3d4 設計:1-9 筆回 None 讓 W-3b watchdog 雙保險。

【不碰的部分】
- flywheel_stats_service.py 不動:Codex 5/6 LOGBOOK 已明確說明
  「Redis playbook counter 失準 → 用 auto_repair_executions 為唯一信任源」,
  1-9 筆 return None 是配合 ai_slo_watchdog_job W-3b grace+30min 設計的
  反消音化雙保險,不是 bug。

驗證計畫(部署後):
1. curl /metrics | grep awoooi_flywheel  → 看到飛輪指標
2. Prom query awoooi_flywheel_execution_success_rate  → 非空
3. ALERTS{alertname="FlywheelExecutionRateMissing"}  → resolved
4. 持續觀察 30 分鐘,確認 Telegram 不再收到 INC-20260507-99ADF2 告警

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 15:32:47 +08:00
Your Name
c38227e945 fix(ai): remove 188 ollama provider 2026-05-06 14:33:16 +08:00
Your Name
1b4a6c1e8c fix(awooop): align console with flywheel execution metrics 2026-05-06 00:44:53 +08:00
Your Name
894174da5b fix(ops): harden cold-start schedule recovery 2026-05-05 22:14:54 +08:00
Your Name
10cd9fc025 fix(openclaw): gate alert cloud fallback behind flag 2026-05-05 20:53:12 +08:00
Your Name
8161ccf83f fix(ops): persist host resource guardrails 2026-05-05 16:13:02 +08:00
623 changed files with 7046 additions and 223311 deletions

832
.claude/settings.json Normal file
View File

@@ -0,0 +1,832 @@
{
"permissions": {
"allow": [
"Read(**)",
"Glob(**)",
"Grep(**)",
"Bash(curl *)",
"Bash(kubectl get *)",
"Bash(kubectl describe *)",
"Bash(kubectl logs *)",
"Bash(kubectl rollout status *)",
"Bash(docker ps *)",
"Bash(docker logs *)",
"Bash(ls *)",
"Bash(cat *)",
"Bash(head *)",
"Bash(tail *)",
"Bash(grep *)",
"Bash(find *)",
"Bash(pwd)",
"Bash(which *)",
"Bash(echo *)",
"Bash(git status *)",
"Bash(git log *)",
"Bash(git diff *)",
"Bash(git branch *)",
"Bash(git remote *)",
"Edit(**)",
"Write(apps/**)",
"Write(packages/**)",
"Write(docs/**)",
"Write(.agents/**)",
"Write(k8s/**)",
"Write(scripts/**)",
"Bash(pnpm *)",
"Bash(npm *)",
"Bash(npx *)",
"Bash(node *)",
"Bash(python *)",
"Bash(python3 *)",
"Bash(pip *)",
"Bash(cd *)",
"Bash(mkdir *)",
"Bash(touch *)",
"Bash(cp *)",
"Bash(mv *)",
"Bash(chmod *)",
"Bash(pytest *)",
"Bash(playwright *)",
"Bash(git add *)",
"Bash(git commit *)",
"Bash(git stash *)",
"Bash(ssh *)",
"Bash(scp *)",
"Bash(export KUBECONFIG=*)",
"Bash(git push:*)",
"Bash(claude --version)",
"Bash(git check-ignore:*)",
"WebSearch",
"Bash(claude plugin:*)",
"Bash(claude --channels)",
"Bash(claude --channels plugin:telegram@claude-plugins-official --help)",
"Bash(bash)",
"Bash(source ~/.zshrc)",
"Bash(~/.bun/bin/bun --version)",
"Bash(env)",
"Bash(claude upgrade:*)",
"Bash(/Users/ogt/.local/bin/claude --help)",
"Bash(CLAUDE_CODE_EXPERIMENTAL_CHANNELS=1 claude --help)",
"Bash(claude --channels plugin:telegram@claude-plugins-official --print \"hello\")",
"Bash(mkdir -p ~/.claude/channels/telegram)",
"Bash(~/.claude/channels/telegram/.env)",
"Bash(~/.bun/bin/bun run:*)",
"Bash(sudo ln:*)",
"Bash(ln -sf ~/.bun/bin/bun /opt/homebrew/bin/bun)",
"Bash(xargs python:*)",
"Bash(uv --version)",
"Bash(pip3 install:*)",
"Bash(pip3 show:*)",
"Bash(ruff *)",
"Bash(mypy *)",
"Bash(black *)",
"Bash(isort *)",
"Bash(timeout *)",
"Bash(wc *)",
"Bash(sort *)",
"Bash(uniq *)",
"Bash(awk *)",
"Bash(sed *)",
"Bash(tr *)",
"Bash(tee *)",
"Bash(xargs *)",
"Bash(test *)",
"Bash([ *)",
"Bash(true)",
"Bash(false)",
"Bash(date *)",
"Bash(sleep *)",
"Bash(kill *)",
"Bash(pkill *)",
"Bash(ps *)",
"Bash(top *)",
"Bash(htop *)",
"Bash(df *)",
"Bash(du *)",
"Bash(free *)",
"Bash(uname *)",
"Bash(hostname *)",
"Bash(whoami)",
"Bash(id *)",
"Bash(groups *)",
"Bash(stat *)",
"Bash(file *)",
"Bash(realpath *)",
"Bash(dirname *)",
"Bash(basename *)",
"Bash(type *)",
"Bash(command *)",
"Bash(hash *)",
"Bash(alias *)",
"Bash(set *)",
"Bash(unset *)",
"Bash(printenv *)",
"Bash(diff *)",
"Bash(cmp *)",
"Bash(comm *)",
"Bash(join *)",
"Bash(paste *)",
"Bash(cut *)",
"Bash(rev *)",
"Bash(nl *)",
"Bash(fmt *)",
"Bash(fold *)",
"Bash(pr *)",
"Bash(expand *)",
"Bash(unexpand *)",
"Bash(od *)",
"Bash(xxd *)",
"Bash(hexdump *)",
"Bash(strings *)",
"Bash(base64 *)",
"Bash(md5sum *)",
"Bash(sha256sum *)",
"Bash(jq *)",
"Bash(yq *)",
"Bash(gh *)",
"Bash(docker build *)",
"Bash(docker run *)",
"Bash(docker exec *)",
"Bash(docker compose *)",
"Bash(docker-compose *)",
"Bash(docker images *)",
"Bash(docker inspect *)",
"Bash(docker network *)",
"Bash(docker volume *)",
"Bash(kubectl apply *)",
"Bash(kubectl create *)",
"Bash(kubectl exec *)",
"Bash(kubectl port-forward *)",
"Bash(kubectl config *)",
"Bash(helm *)",
"Bash(terraform *)",
"Bash(ansible *)",
"Bash(bun *)",
"Bash(deno *)",
"Bash(cargo *)",
"Bash(rustc *)",
"Bash(go *)",
"Bash(java *)",
"Bash(javac *)",
"Bash(gradle *)",
"Bash(mvn *)",
"Bash(make *)",
"Bash(cmake *)",
"Bash(ninja *)",
"Bash(uv *)",
"Bash(poetry *)",
"Bash(pipx *)",
"Bash(virtualenv *)",
"Bash(venv *)",
"Bash(conda *)",
"Bash(brew *)",
"Bash(apt *)",
"Bash(apt-get *)",
"Bash(yum *)",
"Bash(dnf *)",
"Bash(pacman *)",
"Bash(snap *)",
"Bash(flatpak *)",
"Bash(systemctl status *)",
"Bash(journalctl *)",
"Bash(service * status)",
"Bash(nc *)",
"Bash(netstat *)",
"Bash(ss *)",
"Bash(lsof *)",
"Bash(nmap *)",
"Bash(dig *)",
"Bash(nslookup *)",
"Bash(host *)",
"Bash(ping *)",
"Bash(traceroute *)",
"Bash(mtr *)",
"Bash(wget *)",
"Bash(http *)",
"Bash(httpie *)",
"Bash(hadolint apps/api/Dockerfile)",
"Bash(docker info:*)",
"Bash(kubectl cluster-info:*)",
"Read(//var/run/**)",
"Bash(open -a Docker)",
"Bash(git rm:*)",
"Bash(git reset:*)",
"Bash(kubectl --kubeconfig ~/.kube/config get pods -n awoooi -o wide)",
"Bash(kubectl scale:*)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollam@192.168.0.188 \"docker ps -a | grep -i claw\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps -a | grep -i claw\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker start clawbot && sleep 3 && docker logs clawbot --tail=10\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps | grep clawbot && docker port clawbot\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --tail=30\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/clawbot/.env | grep -E ''\\(TG_|TELEGRAM\\)'' | head -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker inspect clawbot --format=''{{range .Mounts}}{{.Source}}:{{.Destination}} {{end}}''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker inspect clawbot --format=''{{range .Config.Env}}{{println .}}{{end}}'' | grep -E ''\\(TG_|TELEGRAM|ENABLED\\)''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -i ''logout\\\\|log.out\\\\|shutdown\\\\|stop'' | tail -20\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -E ''\\(getMe|getUpdates|sendMessage\\).*200'' | tail -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -i ''success\\\\|started\\\\|初始化'' | head -20\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -E ''2026-03-\\(19|20|21\\)'' | grep -i ''error\\\\|fail\\\\|logout\\\\|400\\\\|401'' | head -20\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker stop clawbot && docker rm clawbot && echo ''✅ OpenClaw 已永久停用''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker-compose ps 2>/dev/null || ls -la docker-compose.yml 2>/dev/null || find /home/ollama -name ''docker-compose*.yml'' -type f 2>/dev/null | head -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker-compose up -d && sleep 3 && docker-compose ps\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker compose up -d 2>&1 || docker run -d --name clawbot --restart unless-stopped -p 8088:8088 -v /var/run/docker.sock:/var/run/docker.sock 192.168.0.110:5000/library/clawbot:stable-v6 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --tail=15 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps --format ''table {{.Names}}\\\\t{{.Status}}'' | grep -E ''clawbot|litellm''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && sed -i ''s|TELEGRAM_BOT_TOKEN=.*|TELEGRAM_BOT_TOKEN=8569720657:AAHrJ5CMOb4rP0IYJrCUiDViLsnpK69uEUI|'' .env && grep TELEGRAM_BOT_TOKEN .env\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker compose down && docker compose up -d && sleep 5 && docker logs clawbot --tail=10\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps --format ''{{.Names}}'' | grep -i alert\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker stop alertmanager && docker rm alertmanager && echo ''✅ 舊 AIOPS Alertmanager 已停用''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps --format ''table {{.Names}}\\\\t{{.Image}}\\\\t{{.Status}}''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/momo-pro/monitoring/prometheus/alert_rules.yml 2>/dev/null | grep -A5 ''ClawbotDown\\\\|telegram\\\\|AIOPS'' | head -30\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"find /home/ollama -name ''*.yml'' -type f 2>/dev/null | xargs grep -l ''ClawbotDown\\\\|telegram'' 2>/dev/null | head -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec clawbot grep -r ''協同警報\\\\|ClawbotDown'' /app 2>/dev/null | head -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec prometheus cat /etc/prometheus/prometheus.yml 2>/dev/null | grep -A10 ''alerting\\\\|alertmanager''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps | grep -i alert || echo ''✅ 沒有 alertmanager 在運行''\")",
"Bash(jq -r '.status, .components | to_entries[] | \"\"\"\"\\\\\\(.key\\): \\\\\\(.value.status\\)\"\"\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps --format ''table {{.Names}}\\\\t{{.Status}}'' | grep clawbot && docker logs clawbot --tail=15\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker inspect clawbot --format=''{{range .Config.Env}}{{println .}}{{end}}'' | grep TELEGRAM\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && sed -i ''s|TELEGRAM_BOT_TOKEN=.*|TELEGRAM_BOT_TOKEN=8569720657:AAFjDyjAN94QQrjn1gBnFXAyS20EUyozH8c|'' .env && docker compose down && docker compose up -d && sleep 5 && docker logs clawbot --tail=10\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec clawbot grep -r ''ClawBotDown\\\\|ClawbotDown'' /app 2>/dev/null | head -5 || echo ''在程式碼中找不到''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec prometheus cat /etc/prometheus/alerts.yml 2>/dev/null | grep -A10 ''ClawBot\\\\|clawbot'' | head -30\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec prometheus cat /etc/prometheus/alerts.yml 2>/dev/null | grep -i ''clawbot\\\\|claw'' -A5 -B5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --since=5m 2>&1 | grep -i ''clawbot\\\\|incident\\\\|alert'' | tail -20\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --tail 50 2>&1\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -i ''telegram\\\\|polling\\\\|bot'' | tail -20\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps --format ''table {{.Names}}\\\\t{{.Status}}\\\\t{{.Ports}}'' | grep -E ''claw|NAME''\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -E ''telegram|Telegram|error|Error'' | tail -20\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps | grep ollama\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps -a --format ''table {{.Names}}\\\\t{{.Status}}'' | head -20\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"sed -i ''s|host.docker.internal|172.17.0.1|g'' /home/ollama/clawbot-v5/.env && cat /home/ollama/clawbot-v5/.env | grep OLLAMA\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker-compose restart clawbot && sleep 3 && docker logs clawbot --tail 30 2>&1\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker compose restart clawbot && sleep 5 && docker logs clawbot --tail 30 2>&1\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec clawbot curl -s http://172.17.0.1:11434/api/tags | head -c 200\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | tail -10\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -iE ''error|telegram|polling|alert|send'' | tail -30\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/clawbot-v5/.env | grep OLLAMA\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd /home/ollama/clawbot-v5 && docker compose up -d --force-recreate clawbot && sleep 5 && docker logs clawbot 2>&1 | tail -20\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec clawbot curl -s http://172.17.0.1:11434/api/tags | head -c 100\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --since 5m 2>&1 | tail -30\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec momo-db psql -U postgres -d clawbot -c \"\"SELECT enum_range\\(NULL::approvalstatus\\);\"\"\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec -e PGPASSWORD=clawbot123 momo-db psql -U clawbot -d clawbot -c \"\"SELECT enum_range\\(NULL::approvalstatus\\);\"\"\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps | grep -E ''postgres|db''\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker exec momo-db env | grep -i postgres\")",
"Bash(sshpass -p \"0936223270\" ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"PGPASSWORD=AwoooiProd2026 psql -h localhost -U awoooi -d awoooi_prod -c \"\"SELECT enum_range\\(NULL::approvalstatus\\);\"\"\")",
"Bash(KUBECONFIG=~/.kube/config kubectl config get-contexts)",
"Bash(docker tag:*)",
"Bash(docker push:*)",
"Bash(ssh ollama@192.168.0.188 \"cd ~/awoooi-build && find apps/web/src -name ''''*.ts'''' -o -name ''''*.tsx'''' | head -30 | xargs md5sum\")",
"Bash(rsync -avz --exclude 'node_modules' --exclude '.next' --exclude '.turbo' --exclude '*.log' /Users/ogt/awoooi/ ollama@192.168.0.188:~/awoooi-build/)",
"Bash(gh run:*)",
"Bash(APPROVAL_ID=\"ea43578e-17cd-40b9-b4c3-8fe8e92f225c\" __NEW_LINE_76dc92b2699cd7d5__ echo \"=== 檢查 Approval Metadata ===\" curl -s \"https://awoooi.wooo.work/api/v1/approvals/pending\")",
"Bash(APPROVAL_ID=\"865ab726-c3b9-447e-86a9-65a6227516e6\" __NEW_LINE_db14ef76ca26af32__ echo \"=== 簽核 ===\" curl -s -X POST \"https://awoooi.wooo.work/api/v1/approvals/$APPROVAL_ID/sign\" -H \"Content-Type: application/json\" -d '{\"\"\"\"signer_id\"\"\"\":\"\"\"\"commander\"\"\"\",\"\"\"\"signer_name\"\"\"\":\"\"\"\"Commander\"\"\"\",\"\"\"\"comment\"\"\"\":\"\"\"\"Test resolution\"\"\"\"}')",
"Read(//Users/ogt/awoooi/**)",
"Bash(APPROVAL_ID=\"e9445e68-6c3e-4899-b507-3b9b7bcaf0a7\" __NEW_LINE_680ad94d4896e58a__ echo \"=== 簽核 ===\" curl -s -X POST \"https://awoooi.wooo.work/api/v1/approvals/$APPROVAL_ID/sign\" -H \"Content-Type: application/json\" -d '{\"\"\"\"signer_id\"\"\"\":\"\"\"\"commander\"\"\"\",\"\"\"\"signer_name\"\"\"\":\"\"\"\"Commander\"\"\"\",\"\"\"\"comment\"\"\"\":\"\"\"\"Final test\"\"\"\"}')",
"Bash(APPROVAL_ID=\"eb0afb4e-834b-4af7-9ae0-3c58232fdd99\" INCIDENT=\"INC-20260323-F05CD6\" __NEW_LINE_47f1c3803a64b43c__ echo \"=== 簽核前 Incident 狀態 ===\" curl -s \"https://awoooi.wooo.work/api/v1/incidents/$INCIDENT\")",
"Bash(mkdir -p /Users/ogt/awoooi/.claude/hooks)",
"Bash(/Users/ogt/awoooi/.claude/hooks/pre-commit-check.sh:*)",
"Bash(git -C /Users/ogt/awoooi status packages/lewooogo-core/)",
"Bash(git -C /Users/ogt/awoooi ls-files packages/lewooogo-core/src/)",
"Bash(git -C /Users/ogt/awoooi status --short)",
"Bash(git -C /Users/ogt/awoooi add apps/api/pyproject.toml apps/api/scripts/ apps/api/src/ apps/web/.eslintrc.js apps/web/src/ packages/lewooogo-core/.eslintrc.js)",
"Bash(git -C /Users/ogt/awoooi diff --cached --stat)",
"Bash(git -C:*)",
"Bash(for wf:*)",
"Bash(do)",
"Bash(done)",
"Bash(jq 'if type == \"\"\"\"array\"\"\"\" then .[0] | {incident_id, status, decision} else . end')",
"Bash(PYTHONPATH=. python -c \"from src.api.v1.stats import router; print\\(''✅ stats.py 載入成功,路由數:'', len\\(router.routes\\)\\)\")",
"Bash(PYTHONPATH=. pytest tests/ -v --tb=short)",
"Bash(PYTHONPATH=. pytest tests/test_stats_api.py -v --tb=short)",
"Bash(PYTHONPATH=. pytest tests/test_webhook_telegram_integration.py::TestNewAlertTelegramPush -v --tb=long)",
"Bash(PYTHONPATH=. pytest tests/test_webhook_telegram_integration.py::TestNewAlertTelegramPush -v --tb=short)",
"Bash(PYTHONPATH=. pytest tests/test_webhook_telegram_integration.py -v --tb=short)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get pods -n awoooi')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get ns awoooi && kubectl get all -n awoooi')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get ns | head -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get pods -n awoooi-prod')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl logs awoooi-worker-bb89b5ffc-bpf45 -n awoooi-prod --tail=50')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl logs awoooi-worker-bb89b5ffc-bpf45 -n awoooi-prod --tail=100 | grep -i telegram')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl logs awoooi-api-8c9489b6c-cm8g5 -n awoooi-prod --tail=50 | grep -i webhook')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl logs awoooi-api-8c9489b6c-cm8g5 -n awoooi-prod --tail=30')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get pods -n monitoring | grep alertmanager')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl get configmap alertmanager-config -n monitoring -o jsonpath=''{.data.alertmanager\\\\.yml}'' | head -50\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get svc -n awoooi-prod')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl patch configmap alertmanager-config -n monitoring --type merge -p ''{\"\"data\"\":{\"\"alertmanager.yml\"\":\"\"global:\\\\n resolve_timeout: 5m\\\\n\\\\nroute:\\\\n group_by: [\\\\\"\"alertname\\\\\"\", \\\\\"\"severity\\\\\"\"]\\\\n group_wait: 30s\\\\n group_interval: 5m\\\\n repeat_interval: 4h\\\\n receiver: \\\\\"\"awoooi-webhook\\\\\"\"\\\\n routes:\\\\n - match:\\\\n severity: critical\\\\n receiver: \\\\\"\"awoooi-webhook\\\\\"\"\\\\n group_wait: 10s\\\\n repeat_interval: 1h\\\\n - match:\\\\n severity: warning\\\\n receiver: \\\\\"\"awoooi-webhook\\\\\"\"\\\\n group_wait: 1m\\\\n repeat_interval: 4h\\\\n\\\\nreceivers:\\\\n - name: \\\\\"\"awoooi-webhook\\\\\"\"\\\\n webhook_configs:\\\\n - url: \\\\\"\"http://192.168.0.120:32334/api/v1/webhook/alertmanager\\\\\"\"\\\\n send_resolved: true\\\\n\\\\ninhibit_rules:\\\\n - source_match:\\\\n severity: \\\\\"\"critical\\\\\"\"\\\\n target_match:\\\\n severity: \\\\\"\"warning\\\\\"\"\\\\n equal: [\\\\\"\"alertname\\\\\"\", \\\\\"\"instance\\\\\"\"]\\\\n\"\"}}''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl rollout restart deployment/alertmanager -n monitoring && kubectl rollout status deployment/alertmanager -n monitoring')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl get configmap alertmanager-config -n monitoring -o jsonpath=''{.data.alertmanager\\\\.yml}'' | grep -A 3 ''url:''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get pods -n awoooi-prod -o jsonpath=\"\"{range .items[*]}{.metadata.name}{\\\\\"\" \\\\\"\"}{.spec.containers[*].image}{\\\\\"\"\\\\\\\\n\\\\\"\"}{end}\"\"')",
"Bash(git mv:*)",
"Bash(for file:*)",
"Bash(do echo:*)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 wooo@192.168.0.120 \"echo ''Connected''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl get deployment -n awoooi-prod -o jsonpath=''{range .items[*]}{.metadata.name}{\"\" selector: \"\"}{.spec.selector.matchLabels}{\"\"\\\\n\"\"}{end}''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl delete deployment awoooi-api awoooi-web awoooi-worker -n awoooi-prod\")",
"WebFetch(domain:awoooi.wooo.work)",
"WebFetch(domain:api.awoooi.wooo.work)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get pods -n awoooi-prod -o wide')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get svc,ingress -n awoooi-prod')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl exec -n awoooi-prod deploy/awoooi-api -- curl -sf http://localhost:8000/api/v1/health 2>&1')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'curl -sf http://10.43.125.201:8000/api/v1/health 2>&1 || echo \"\"FAILED\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'sudo nginx -t 2>&1 && sudo cat /etc/nginx/sites-enabled/awoooi* 2>/dev/null || sudo cat /etc/nginx/conf.d/awoooi* 2>/dev/null || echo \"\"No awoooi nginx config found\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'cat /etc/nginx/sites-enabled/* 2>/dev/null | grep -A5 awoooi || cat /etc/nginx/conf.d/* 2>/dev/null | grep -A5 awoooi || ls -la /etc/nginx/ 2>/dev/null || echo \"\"No nginx on this host\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'ls /etc/nginx/sites-enabled/ 2>/dev/null && cat /etc/nginx/sites-enabled/*awoooi* 2>/dev/null || echo \"\"Checking conf.d...\"\" && ls /etc/nginx/conf.d/ 2>/dev/null')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'grep -l awoooi /etc/nginx/sites-enabled/* 2>/dev/null || grep -r \"\"awoooi\"\" /etc/nginx/sites-enabled/ 2>/dev/null | head -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'grep -r \"\"awoooi\\\\|32334\\\\|32335\"\" /etc/nginx/ 2>/dev/null | head -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S cp /tmp/awoooi-prod.conf /etc/nginx/conf.d/ && echo \"\"Config copied\"\" && sudo nginx -t 2>&1')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S ls -la /etc/nginx/ssl/ 2>/dev/null || echo \"\"No ssl dir\"\" && sudo ls -la /etc/letsencrypt/live/ 2>/dev/null | head -10')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S sed -i \"\"s|/etc/nginx/ssl/awoooi.crt|/etc/letsencrypt/live/awoooi.wooo.work/fullchain.pem|g\"\" /etc/nginx/conf.d/awoooi-prod.conf && sudo sed -i \"\"s|/etc/nginx/ssl/awoooi.key|/etc/letsencrypt/live/awoooi.wooo.work/privkey.pem|g\"\" /etc/nginx/conf.d/awoooi-prod.conf && echo \"\"Paths fixed\"\" && sudo nginx -t 2>&1')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S nginx -s reload && echo \"\"Nginx reloaded!\"\" && sleep 2')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'grep -r \"\"awoooi\"\" /etc/nginx/sites-enabled/ 2>/dev/null | head -5')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S grep -rl \"\"awoooi.wooo.work\"\" /etc/nginx/ 2>/dev/null')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'curl -sf http://192.168.0.121:32334/api/v1/health 2>&1 || echo \"\"FAILED to reach 121\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S rm /etc/nginx/conf.d/awoooi-prod.conf && sudo nginx -t && sudo nginx -s reload && echo \"\"Cleaned up duplicate config\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S tail -30 /var/log/nginx/error.log 2>/dev/null')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'grep -r \"\"api.awoooi\"\" /etc/nginx/ 2>/dev/null || echo \"\"No api.awoooi config found\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get configmap awoooi-config -n awoooi-prod -o yaml | grep -A5 NEXT_PUBLIC')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl get deployment awoooi-web -n awoooi-prod -o yaml | grep -A20 \"\"env:\"\" | head -25')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S tail -10 /var/log/nginx/access.log 2>/dev/null | grep awoooi')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S tail -5 /var/log/nginx/error.log 2>/dev/null')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S stat /etc/nginx/sites-available/awoooi.wooo.work.conf 2>/dev/null | grep -E \"\"Modify|Change|Birth\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl logs -n awoooi-prod -l app=awoooi-web --tail=30 2>/dev/null | grep -i \"\"api\\\\|error\\\\|fetch\"\" | head -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S tail -20 /var/log/nginx/access.log 2>/dev/null | grep -E \"\"awoooi.*api\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S tail -20 /var/log/nginx/awoooi-prod-access.log 2>/dev/null')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl exec -n awoooi-prod deploy/awoooi-web -- env | grep -i api')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl exec -n awoooi-prod deploy/awoooi-web -- sh -c \"\"grep -r \\\\\"\"NEXT_PUBLIC_API_URL\\\\|api.awoooi\\\\\"\" /app/.next/static/chunks/*.js 2>/dev/null | head -5 || grep -r \\\\\"\"awoooi.wooo.work\\\\\"\" /app/.next/static/chunks/*.js 2>/dev/null | head -3\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'kubectl exec -n awoooi-prod deploy/awoooi-web -- sh -c \"\"find /app/.next -name \\\\\"\"*.js\\\\\"\" -exec grep -l \\\\\"\"awoooi\\\\\"\" {} \\\\; 2>/dev/null | head -3\"\"')",
"Bash(./scripts/qa-zero-touch.sh)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S cat /etc/nginx/sites-available/awoooi.wooo.work.conf')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S cp /tmp/awoooi.wooo.work.conf /etc/nginx/sites-available/awoooi.wooo.work.conf && sudo nginx -t 2>&1')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'echo \"\"0936223270\"\" | sudo -S nginx -s reload && echo \"\"✅ Nginx reloaded with load balancing!\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'cd /opt && sudo ls -la sentry 2>/dev/null || echo \"\"Sentry 目錄不存在,需要建立\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'sudo mkdir -p /opt/sentry && sudo chown wooo:wooo /opt/sentry && cd /opt/sentry && git clone https://github.com/getsentry/self-hosted.git . 2>&1 | tail -5')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"0936223270\"\" | sudo -S mkdir -p /opt/sentry && echo \"\"0936223270\"\" | sudo -S chown wooo:wooo /opt/sentry && cd /opt/sentry && git clone https://github.com/getsentry/self-hosted.git . 2>&1 | tail -10')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'cd /opt/sentry && ls -la 2>&1 | head -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'cd /opt/sentry && git describe --tags 2>/dev/null || git rev-parse --short HEAD')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'cd /opt/sentry && ./install.sh --help 2>&1 | head -30 || echo \"\"No help available, checking script...\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'cd /opt/sentry && nohup ./install.sh --skip-user-creation --no-report-self-hosted-issues > /tmp/sentry-install.log 2>&1 &')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'tail -30 /tmp/sentry-install.log 2>/dev/null || echo \"\"日誌檔案尚未建立,等待中...\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'grep -E \"\"^\\\\▶|^Creating|^Starting|^Error|^✓|Pulling\"\" /tmp/sentry-install.log 2>/dev/null | tail -40')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"=== 日誌行數 ===\"\" && wc -l /tmp/sentry-install.log && echo \"\"\"\" && echo \"\"=== 最近進度 ===\"\" && tail -10 /tmp/sentry-install.log')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"=== 日誌行數 ===\"\" && wc -l /tmp/sentry-install.log && echo \"\"\"\" && echo \"\"=== 關鍵階段 ===\"\" && grep -E \"\"^▶|✓|Error|Creating|Starting\"\" /tmp/sentry-install.log | tail -20')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"=== 日誌行數 ===\"\" && wc -l /tmp/sentry-install.log && echo \"\"\"\" && echo \"\"=== 最近 20 行 ===\"\" && tail -20 /tmp/sentry-install.log')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"=== 日誌行數 ===\"\" && wc -l /tmp/sentry-install.log && echo \"\"\"\" && echo \"\"=== 關鍵階段 ===\"\" && grep -E \"\"^▶|✓|Error|Creating|Starting|Building|DONE\"\" /tmp/sentry-install.log | tail -30')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'echo \"\"=== 日誌行數 ===\"\" && wc -l /tmp/sentry-install.log && echo \"\"\"\" && echo \"\"=== 最近關鍵階段 ===\"\" && grep -E \"\"^▶|✓|Error|Creating|Starting|DONE|Completed|success\"\" /tmp/sentry-install.log | tail -25')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.110 'grep -E \"\"^▶|✓|Error|Completed|success|fail\"\" /tmp/sentry-install.log | tail -15')",
"Bash(redis-cli -h 192.168.0.188 -p 6380 KEYS incident:*)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/momo-pro/monitoring/alertmanager.yml 2>/dev/null || cat /etc/alertmanager/alertmanager.yml 2>/dev/null || echo ''Config not found''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --tail 30 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker logs clawbot --tail 20 2>&1 | grep -iE ''telegram|send|alert|incident|error''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/clawbot-v5/.env | grep -E ''TELEGRAM|TG_'' | head -5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cat /home/ollama/clawbot-v5/.env | grep -E ''REDIS|POSTGRES|DATABASE'' | head -5\")",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9093/api/v2/alerts?active=true\"\" | python3 -c \"\"import sys,json; alerts=json.load\\(sys.stdin\\); print\\(f\\\\\"\"Active alerts: {len\\(alerts\\)}\\\\\"\"\\)\"\"')",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9093/api/v2/alerts\"\" | python3 -c \"\"import sys,json; alerts=json.load\\(sys.stdin\\); print\\(f\\\\\"\"Total alerts: {len\\(alerts\\)}\\\\\"\"\\); [print\\(a[\\\\\"\"labels\\\\\"\"][\\\\\"\"alertname\\\\\"\"]\\) for a in alerts[:5]]\"\"')",
"Bash(ssh ollama@192.168.0.188 'redis-cli -p 6380 -n 0 GET incident:INC-20260324-36AF55 | python3 -c \"\"import sys,json; d=json.load\\(sys.stdin\\); print\\(f\\\\\"\"Status: {d.get\\(\\\\\"\"status\\\\\"\"\\)}\\\\\"\"\\); print\\(f\\\\\"\"message_id: {d.get\\(\\\\\"\"message_id\\\\\"\", \\\\\"\"NONE\\\\\"\"\\)}\\\\\"\"\\); print\\(f\\\\\"\"chat_id: {d.get\\(\\\\\"\"chat_id\\\\\"\", \\\\\"\"NONE\\\\\"\"\\)}\\\\\"\"\\)\"\"')",
"Bash(ssh ollama@192.168.0.188 'redis-cli -p 6380 -n 0 GET incident:INC-20260324-36AF55 | python3 -c \"\"import sys,json; d=json.load\\(sys.stdin\\); print\\(f\\\\\"\"status: {d.get\\('status'\\)}\\\\\"\"\\); print\\(f\\\\\"\"message_id: {d.get\\('message_id'\\)}\\\\\"\"\\); print\\(f\\\\\"\"created_at: {d.get\\('created_at'\\)}\\\\\"\"\\)\"\"')",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 0 KEYS *approval*)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 0 KEYS *incident*)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 0 KEYS *pending*)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 0 KEYS *)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/k3s-prod.yaml kubectl get pods -n awoooi-prod -o wide)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/k3s-prod.yaml kubectl get deployment awoooi-api -n awoooi-prod -o jsonpath='{.spec.template.spec.containers[0].image}')",
"Bash(kubectl --kubeconfig=/Users/ogt/awoooi/k3s-prod.yaml get deployment awoooi-api -n awoooi-prod -o jsonpath='{.spec.template.spec.containers[0].image}')",
"Bash(python3 -c \":*)",
"Bash(/tmp/awoooi-tg-secret.yaml:*)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/k3s-prod.yaml kubectl apply -f /tmp/awoooi-tg-secret.yaml)",
"Bash(for pod:*)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.188 \"curl -fsSL https://ollama.com/install.sh | sh\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no -o PreferredAuthentications=password wooo@192.168.0.188 \"echo connected && ollama --version\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no -o PreferredAuthentications=password ollama@192.168.0.188 \"curl -fsSL https://ollama.com/install.sh | sh\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S curl -fsSL https://ollama.com/install.sh | sudo -S sh\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"ollama --version\")",
"Bash(__NEW_LINE_95e9df111552805b__ echo:*)",
"Bash(sshpass -p '0936223270' scp /Users/ogt/awoooi/k8s/nginx/awoooi-prod.conf ollama@192.168.0.188:/tmp/awoooi-prod.conf)",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S cp /tmp/awoooi-prod.conf /etc/nginx/conf.d/awoooi-prod.conf && echo ''0936223270'' | sudo -S nginx -t 2>&1\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S ls -la /etc/nginx/ssl/ 2>/dev/null || echo ''No ssl dir''; echo ''0936223270'' | sudo -S ls -la /etc/nginx/conf.d/ 2>/dev/null | head -10\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S grep -r ''ssl_certificate'' /etc/nginx/ 2>/dev/null | head -5\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S grep -A 20 ''server_name awoooi'' /etc/nginx/sites-enabled/all-sites.conf 2>/dev/null | head -30\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S ls -la /etc/nginx/sites-enabled/ 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S cat /etc/nginx/sites-available/awoooi.wooo.work.conf 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S rm /etc/nginx/conf.d/awoooi-prod.conf && echo ''0936223270'' | sudo -S nginx -t 2>&1\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S nginx -s reload 2>&1\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S systemctl reload nginx 2>&1\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker logs openclaw 2>&1 | tail -30\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker ps -a --format ''table {{.Names}}\\\\t{{.Status}}\\\\t{{.Image}}'' 2>&1 | head -15\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -i telegram | tail -20\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker logs clawbot 2>&1 | tail -30\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker exec alertmanager cat /etc/alertmanager/alertmanager.yml 2>&1 | head -30\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"curl -sf ''http://localhost:9093/api/v1/alerts'' | jq ''.data | length'' 2>/dev/null || curl -sf ''http://localhost:9093/api/v2/alerts'' | jq ''length'' 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker exec alertmanager wget -qO- ''http://localhost:9093/api/v2/alerts'' 2>&1 | head -100\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n awoooi-prod logs -l app=awoooi-worker --tail=50 2>&1\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"cat /home/ollama/alertmanager/alertmanager.yml 2>/dev/null || docker exec alertmanager cat /etc/alertmanager/alertmanager.yml\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker cp /tmp/alertmanager.yml alertmanager:/etc/alertmanager/alertmanager.yml && docker exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml && docker kill -s SIGHUP alertmanager\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker inspect alertmanager --format ''{{range .Mounts}}{{.Source}} -> {{.Destination}}{{println}}{{end}}''\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker exec alertmanager cat /etc/alertmanager/alertmanager.yml\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker restart alertmanager && sleep 3 && docker exec alertmanager cat /etc/alertmanager/alertmanager.yml\")",
"Bash(sshpass -p '0936223270' ssh ollama@192.168.0.188 \"docker logs clawbot 2>&1 | grep -i ''telegram\\\\|webhook\\\\|alert'' | tail -10\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=30 2>/dev/null | grep -E ''''POST|webhook|alertmanager|ManualTest''''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=30 2>/dev/null | grep -iE ''''POST|webhook''''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=50 2>/dev/null | grep -iE ''''POST.*webhook|alertmanager_webhook|NewFingerprint''''\")",
"Bash(kustomize build:*)",
"Bash(KUBECONFIG=~/.kube/config kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data}')",
"Bash(KUBECONFIG=/Users/ogt/.kube/config kubectl exec deploy/awoooi-api -n awoooi-prod -- env)",
"Bash(git checkout:*)",
"Bash(jq -r '.status // \"\"\"\"failed\"\"\"\"')",
"Bash(jq -r '.total // \"\"\"\"error\"\"\"\"')",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 10 XLEN awoooi:signals)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 10 XRANGE awoooi:signals - + COUNT 5)",
"Bash(SENTRY_TOKEN=\"2b73050606d2b32f54095b4e177f4842f2bfe69d4b17da25f6daa4739148a972\" curl -s \"http://192.168.0.110:9000/api/0/organizations/\" -H \"Authorization: Bearer $SENTRY_TOKEN\")",
"Bash(SENTRY_TOKEN=\"2b73050606d2b32f54095b4e177f4842f2bfe69d4b17da25f6daa4739148a972\" curl -s \"http://192.168.0.110:9000/api/0/organizations/sentry/projects/\" -H \"Authorization: Bearer $SENTRY_TOKEN\")",
"Bash(SENTRY_TOKEN=\"2b73050606d2b32f54095b4e177f4842f2bfe69d4b17da25f6daa4739148a972\" curl -s \"http://192.168.0.110:9000/api/0/projects/sentry/awoooi-api/rules/\" -H \"Authorization: Bearer $SENTRY_TOKEN\")",
"Bash(SENTRY_TOKEN=\"2b73050606d2b32f54095b4e177f4842f2bfe69d4b17da25f6daa4739148a972\" __NEW_LINE_583db0bbb6875db0__ echo \"=== Alert Rules ===\" curl -s \"http://192.168.0.110:9000/api/0/projects/sentry/awoooi-api/rules/\" -H \"Authorization: Bearer $SENTRY_TOKEN\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get nodes -o wide && echo ''---'' && kubectl top nodes 2>/dev/null || echo ''metrics-server not installed''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod -o wide && echo ''---'' && kubectl get pvc -n awoooi-prod 2>/dev/null && echo ''---'' && kubectl get sc 2>/dev/null && echo ''---'' && kubectl get deploy -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get ns && echo ''---'' && kubectl get svc -A | grep -E ''prometheus|grafana|metrics|signoz|longhorn|argocd'' || echo ''No monitoring/gitops services found''\")",
"Bash(ssh wooo@192.168.0.120 \"cat /etc/rancher/k3s/config.yaml 2>/dev/null || echo ''--- K3s default config \\(no custom config.yaml\\) ---'' && echo ''---'' && sudo k3s check-config 2>/dev/null | head -30 || echo ''check-config not available''\")",
"Bash(ssh wooo@192.168.0.120 \"free -h && echo ''---'' && swapon --show && echo ''---'' && df -h /var/lib/rancher/k3s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n cnpg-system && echo ''---'' && kubectl get svc -n monitoring\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get all -n awoooi-prod -o wide 2>/dev/null && echo ''---QUOTA---'' && kubectl describe quota -n awoooi-prod 2>/dev/null && echo ''---EVENTS---'' && kubectl get events -n awoooi-prod --sort-by=''.lastTimestamp'' 2>/dev/null | tail -20\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get helmcharts -A 2>/dev/null || echo ''No HelmCharts'' && echo ''---'' && kubectl get helmreleases -A 2>/dev/null || echo ''No HelmReleases'' && echo ''---'' && kubectl api-resources | grep -E ''argo|flux|velero|longhorn'' || echo ''No GitOps/Backup CRDs''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get ds -A && echo ''---'' && kubectl get cm -n kube-system | grep -E ''traefik|coredns'' && echo ''---REGISTRIES---'' && sudo cat /etc/rancher/k3s/registries.yaml 2>/dev/null || echo ''No registries.yaml''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get ingress -A 2>/dev/null || echo ''No Ingress'' && echo ''---HPA---'' && kubectl get hpa -A 2>/dev/null || echo ''No HPA'' && echo ''---PDB---'' && kubectl get pdb -A 2>/dev/null || echo ''No PDB'' && echo ''---SYSCTL---'' && cat /proc/sys/net/core/somaxconn && cat /proc/sys/fs/file-max\")",
"Bash(ssh wooo@192.168.0.120 \"systemctl status k3s | head -20 && echo ''---K3S-VERSION---'' && k3s --version && echo ''---ETCD-STATUS---'' && sudo k3s etcd-snapshot list 2>/dev/null | head -5 || echo ''No etcd snapshots''\")",
"Bash(ssh wooo@192.168.0.121 \"free -h && swapon --show && echo ''---DISK---'' && df -h /var/lib/rancher/k3s 2>/dev/null\")",
"Bash(ssh wooo@192.168.0.120 \"sudo ls -la /var/lib/rancher/k3s/server/db/ 2>/dev/null && echo ''---TOKEN---'' && sudo cat /var/lib/rancher/k3s/server/token 2>/dev/null | head -1 | cut -c1-20\")",
"Bash(ssh -o ConnectTimeout=10 wooo@192.168.0.120 \"ps aux | grep k3s | grep -v grep | head -3 && echo ''---'' && sudo cat /etc/systemd/system/k3s.service 2>/dev/null | grep -E ''ExecStart|datastore''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"echo ''0936223270'' | sudo -S mkdir -p /backup/k3s_etcd 2>/dev/null && echo ''0936223270'' | sudo -S chown ollama:ollama /backup/k3s_etcd 2>/dev/null && echo ''=== 188 備份目錄 ==='' && ls -la /backup/\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"mkdir -p ~/.ssh && chmod 700 ~/.ssh && echo ''ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCnTnbjtSPwrI/pN6DByDxsFDOR4+sVnk7hb+eOr+Pb4e7o7QGbyKaJC2eKP7uRBilPqeScuvNKZhwmY8ZOuhjId+ZyLK0jZXHdq3a6tjsQ4MwPGyT2aMaD7x2jKzPbFojR0P5lmQWH2zjxeVuB7UeBIejaYk3gQEMFVES8Xh84yxFvy9jlwKmZFAI0gIhx0nPOTPB7onTyb8L5snUbwQQntoHWYFbb83+wui/kM15aLT5r8uvS2yZdsWWrDvAyuIShde1ceTBevwwqxezH1egXGoGkvZYYF7vHFu3X6jF7Nfp4qVfo0EfFV3omy90HzoFvoEXCC+jIWU0TjUqdEgGIEj2b+YXw3bIs+k+g/0/iJzA5LLUNb2vHVHoUmah4ZNlfiGU7e6hTYXjLjoXJlz9gfv6LYywhgktdThi9sUCn6rzbatlMrY0HNUE6uOwRTugMq1YUEJCvRqeFmtX5yF6xGp+FbOjIr1kMmplbRQRqKIrpQoqEn0+UBXC7OwJNCk8= wooo@mon'' >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && echo ''SSH key 已加入''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"echo ''ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCs3nQ11B+V/VEchNR9Uzj57JoKXOJ8S1UVjCTHkUDL8FnrbdPFr0zvpYgX0a/Ipj9wHkqU6z6Ho6MQj3X2+HaK5fC0fZ3aZE1QT2df/x0xXdyka9XSaTFaymKzNTvfmum40koBkNccKyO5SLSjTcoTZCDHP4RqHHu/MYjQMejG7yeyCFmgumrHh5T/0DXPf5zl0Ff1C5U3VCLPxz5vq63JB2dTfrjQLg3sO0ZI3KTZE8aFj3txKz5snDZX3nE1tHZMKLecwwEqi130BtVZcm8zXDqX83gtUDp/WLfPyKCmzZzGf6YgEofIsyrVup8XnD9xNoFmbEeBdFocGWeoIVIn+faOpU22fvQ34L57GHhNQwygZOPKsZa9XNKjayKdKQl3gcAA2wnkZgN0cyIEYvTd3O+Z5Xvff2dat+0sDMK571V+0JEdAMOpQjFO7DkwjKHn/gHLmvRjYLiUOItX9JysFgYuHs8omad2LmeUIkQrBD2I2hyvY49HaJKWctk4Jm0= root@mon'' >> ~/.ssh/authorized_keys && echo ''Root SSH key added''\")",
"Bash(grep -r \"\"\"zod\"\"\" /Users/ogt/awoooi/package.json /Users/ogt/awoooi/apps/*/package.json /Users/ogt/awoooi/packages/*/package.json)",
"Bash(__NEW_LINE_144503b060dfd3dd__ echo:*)",
"Bash(__NEW_LINE_ae2a22b14586d7aa__ echo:*)",
"Bash(__NEW_LINE_e17561a4e55f74d4__ echo:*)",
"Bash(ssh wooo@192.168.0.120 \"echo ''''0936223270'''' | sudo -S cat /etc/rancher/k3s/k3s.yaml 2>/dev/null | sed ''''s|https://127.0.0.1:6443|https://192.168.0.125:6443|g''''\")",
"Bash(KUBECONFIG=/tmp/kubeconfig-vip.yaml kubectl get nodes)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get rs -n awoooi-prod)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get pods -A --no-headers)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get jobs -A --no-headers)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get rs -n awoooi-prod --no-headers)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml delete job api-watchdog-29556380 -n wooo-aiops-uat)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get pods -n awoooi-prod)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get pods -A)",
"Bash(kubectl --kubeconfig=/tmp/kubeconfig-vip.yaml get svc -A)",
"Bash(PGPASSWORD=changeme psql -h 192.168.0.188 -U awoooi -d awoooi_prod -f /Users/ogt/awoooi/apps/api/scripts/migrate_phase18_audit_logs.sql)",
"Bash(PLAYWRIGHT_BASE_URL=http://192.168.0.125:32335 npx playwright test phase11-conversational.spec.ts --reporter=list)",
"Bash(PLAYWRIGHT_BASE_URL=http://192.168.0.125:32335 npx playwright test phase11-conversational.spec.ts --reporter=list --workers=1)",
"Bash(KUBECONFIG=~/.kube/config kubectl get nodes --server=https://192.168.0.125:6443 --insecure-skip-tls-verify)",
"Bash(source .venv/bin/activate)",
"Read(//etc/postgresql/14/main/**)",
"Bash(for port:*)",
"Bash(kubectl top:*)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl top pods -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n awoooi-prod -o wide)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get svc -n awoooi-prod)",
"Bash(jq -r '.components | to_entries[] | \"\"\"\"\\\\\\(.key\\): \\\\\\(.value.status\\)\"\"\"\"')",
"Bash(tar -xzf velero-v1.13.0-darwin-arm64.tar.gz)",
"Bash(sudo mv:*)",
"Bash(velero version:*)",
"Bash(mkdir -p ~/bin)",
"Bash(mv velero-v1.13.0-darwin-arm64/velero ~/bin/)",
"Bash(~/bin/velero version:*)",
"Bash(k8s/velero/00-namespace.yaml:*)",
"Bash(k8s/velero/01-credentials.yaml:*)",
"Bash(k8s/velero/02-velero-install.yaml:*)",
"Bash(tar -xzf velero.tar.gz)",
"Bash(/tmp/velero-credentials:*)",
"Bash(__NEW_LINE_e85d95513fc16492__ ~/bin/velero install --provider aws --plugins velero/velero-plugin-for-aws:v1.9.0 --bucket velero-backups --secret-file /tmp/velero-credentials --backup-location-config region=minio,s3ForcePathStyle=true,s3Url=http://192.168.0.188:9000 --use-volume-snapshots=false --dry-run -o yaml)",
"Bash(__NEW_LINE_e85d95513fc16492__ head:*)",
"Bash(k8s/velero/README.md:*)",
"Bash(KUBECONFIG=/Users/ogt/.kube/config kubectl apply -f /Users/ogt/awoooi/k8s/velero/velero-install-full.yaml)",
"Bash(sshpass -p '09362233270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"whoami && hostname && cat /etc/sudoers.d/* 2>/dev/null | head -5 || echo ''no sudoers.d files''\")",
"Bash(sshpass -p '09362233270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"kubectl get nodes 2>&1 || echo ''kubectl failed, checking k3s kubeconfig...'' && ls -la /etc/rancher/k3s/k3s.yaml 2>&1\")",
"Bash(sshpass -p '09362233270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"sudo -l 2>&1 | head -20\")",
"Bash(sshpass -p '09362233270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''09362233270'' | sudo -S -l 2>&1\")",
"Bash(sshpass -p '09362233270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get nodes 2>&1\")",
"Bash(sshpass -p '0936223270' scp /Users/ogt/awoooi/k8s/velero/velero-install-full.yaml wooo@192.168.0.120:/tmp/velero-install-full.yaml)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''''0936223270'''' | sudo -S kubectl apply -f /tmp/velero-install-full.yaml 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get pods -n velero 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get backupstoragelocation -n velero 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl logs -n velero deploy/velero --tail=30 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl logs -n velero deploy/velero --tail=10 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get secret cloud-credentials -n velero -o jsonpath=''{.data.cloud}'' 2>&1 | base64 -d\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S curl -s http://192.168.0.188:9000/velero-backups/ 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl rollout restart deployment/velero -n velero 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get backups -n velero 2>&1\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl describe backup test-backup-20260328-2114 -n velero 2>&1 | tail -30\")",
"Bash(sshpass -p:*)",
"Read(//Users/ogt/awoooi/=== 測試 /approvals/**)",
"Bash(kubectl --kubeconfig=/Users/ogt/.kube/config get svc -n velero -o wide)",
"Bash(kubectl --kubeconfig=/Users/ogt/.kube/config get pods -n velero -o wide)",
"Bash(KUBECONFIG=/Users/ogt/.kube/config kubectl get svc -n velero)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'echo \"\"0936223270\"\" | sudo -S sh -c \"\"kubectl get pods -A | grep -E \\\\\"\"kube-state|state-metrics\\\\\"\"\"\"')",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'echo \"\"0936223270\"\" | sudo -S sh -c \"\"kubectl get ns | grep -E \\\\\"\"wooo|aiops|legacy|old\\\\\"\"\"\"')",
"Bash(KUBECONFIG=~/.kube/config kubectl get ns --no-headers)",
"WebFetch(domain:build.nvidia.com)",
"WebFetch(domain:ollama.com)",
"WebFetch(domain:docs.api.nvidia.com)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"curl -s ''http://admin:admin@localhost:3002/api/search?type=dash-db'' | python3 -c \"\"import sys,json; d=json.load\\(sys.stdin\\); print\\(f''Dashboard 數量: {len\\(d\\)}''\\); [print\\(f\\\\\"\" - {i[''title'']}\\\\\"\"\\) for i in d[:10]]\"\"\")",
"Bash(jq '.ai_provider // .data.ai_provider // \"\"\"\"not found\"\"\"\"')",
"Bash(KUBECONFIG=~/.kube/config kubectl logs -n awoooi-prod deployment/awoooi-api --tail=50)",
"Bash(export NVIDIA_API_KEY=\"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\")",
"Bash(curl -s -X POST \"https://integrate.api.nvidia.com/v1/chat/completions\" -H \"Content-Type: application/json\" -H \"Authorization: Bearer $NVIDIA_API_KEY\" -d '{:*)",
"Bash(/tmp/fix-network-policy.yaml:*)",
"Bash(__NEW_LINE_acde7a92ceae01f6__ scp:*)",
"Bash(curl -s -X POST https://awoooi.wooo.work/api/v1/webhooks/alertmanager -H 'Content-Type: application/json' -d '{:*)",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9090/api/v1/targets\"\" 2>/dev/null | grep -o \"\"\\\\\"\"health\\\\\"\":\\\\\"\"[^\\\\\"\"]*\\\\\"\"\"\" | sort | uniq -c')",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9090/api/v1/rules\"\" 2>/dev/null | grep -o \"\"\\\\\"\"name\\\\\"\":\\\\\"\"[^\\\\\"\"]*\\\\\"\"\"\" | sort | uniq')",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9090/api/v1/targets\"\" 2>/dev/null | grep -o \"\"\\\\\"\"job\\\\\"\":\\\\\"\"[^\\\\\"\"]*\\\\\"\"\"\" | sort | uniq -c | sort -rn')",
"Bash(ssh ollama@192.168.0.188 'curl -s \"\"http://localhost:9090/api/v1/query?query=up\"\" 2>/dev/null | grep -o \"\"\\\\\"\"instance\\\\\"\":\\\\\"\"[^\\\\\"\"]*\\\\\"\"\"\" | sort | uniq')",
"Bash(for i:*)",
"Bash(do sleep:*)",
"Bash(kubectl patch:*)",
"Bash(ssh wooo@192.168.0.110 \"cat /tmp/runner_clean.log 2>/dev/null; echo ''---''; ps aux | grep ''Runner.Listener'' | grep -v grep | wc -l\")",
"Bash(KUBECONFIG=~/.kube/config kubectl logs -n awoooi-prod -l app=awoooi-api --tail=200)",
"Bash(/Users/ogt/awoooi/ops/monitoring/deploy-exporters.sh:*)",
"WebFetch(domain:github.com)",
"WebFetch(domain:docs.ollama.com)",
"Skill(telegram:configure)",
"Skill(telegram:configure:*)",
"Bash(USE_NEW_ENGINE=true pytest tests/test_incident*.py -v --tb=short -x)",
"Bash(USE_NEW_ENGINE=true pytest tests/test_approval_field_alignment.py tests/test_learning_service.py -v --tb=short)",
"Bash(/tmp/debug_approval.py:*)",
"Bash(/tmp/debug_approval2.py:*)",
"Bash(/tmp/bulk_sign.sh:*)",
"Bash(bash /tmp/bulk_sign.sh)",
"Bash(/tmp/check_deploy.py:*)",
"Bash(/tmp/check_buttons.py:*)",
"Bash(ssh ollama@192.168.0.188 \"docker logs openclaw --since=10s 2>&1 | grep -Ev ''\\(GET|POST\\) /health'' | tail -10 && echo ''---'' && docker exec openclaw env | grep OPENAI_API_KEY | cut -c1-30\")",
"Read(//Users/ogt/awoooi/https:/awoooi.wooo.work/_next/static/chunks/app/%5Blocale%5D/**)",
"Bash(find /Users/ogt/awoooi/apps/web -type f \\\\\\(-name *.spec.ts -o -name *.spec.tsx \\\\\\))",
"Bash(kubectl -n awoooi-prod get pods)",
"Bash(kubectl -n production get pods)",
"Bash(ssh -o StrictHostKeyChecking=no wooo@192.168.0.121 \"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl get deployment awoooi-web -n awoooi-prod -o jsonpath=''{.spec.template.spec.containers[0].image}'' && echo '''' && sudo kubectl get pods -n awoooi-prod -l app=awoooi-web --no-headers\")",
"Bash(KUBECONFIG=/Users/ogt/.kube/config kubectl get pods -n awoooi-prod)",
"Bash(for run_id in 166 165)",
"mcp__plugin_playwright_playwright__browser_navigate",
"mcp__plugin_playwright_playwright__browser_take_screenshot",
"Bash(open \"http://192.168.0.110:3001/wooo/awoooi/actions\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=5\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/166/jobs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=10\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runners\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/admin/runners\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=3\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/169/jobs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/179/logs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" JOB_ID=180 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=2\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" JOB_ID=181 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/172/jobs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/182/logs\" -H \"Authorization: token $TOKEN\")",
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/178\" -H \"Authorization: token $TOKEN\")",
"mcp__plugin_playwright_playwright__browser_snapshot",
"mcp__plugin_playwright_playwright__browser_fill_form",
"mcp__plugin_playwright_playwright__browser_click",
"Bash(GITEA_TOKEN=\"e6c9fecb1f0148939493ae0fa30407d28c91279d\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=5\" -H \"Authorization: token $GITEA_TOKEN\")",
<<<<<<< Updated upstream
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 /tmp/a4_smoke.py)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.repositories.aider_event_repository import AiderEventRepository; print\\('import OK'\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_service.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_service.py -v --tb=short)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.services.aider_event_service import classify_severity, should_create_incident, build_signal_data; print\\('✓ All imports successful'\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_service.py::test_build_signal_data_redacts_secrets_in_annotations -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_events_api.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_processor.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_processor.py tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.workers.aider_event_processor import AiderEventProcessor, get_aider_event_processor, run_aider_event_processor_loop; print\\('✓ All imports successful'\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_processor.py -v --tb=short)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_processor.py tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py --tb=short)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_ai_router_feedback.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py tests/test_aider_event_processor.py tests/test_ai_router_feedback.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.services.ai_router import AIRouter; from src.db.base import get_session_factory; print\\('✓ Imports successful, no circular imports'\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_ai_router_feedback.py tests/test_aider_event_service.py -v --tb=short)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.api.v1 import aider_events; from src.workers.aider_event_processor import run_aider_event_processor_loop; from src.core.config import settings; print\\('AIDER_WEBHOOK_SECRET' in settings.__fields__, 'USE_AIDER_FEEDBACK' in settings.__fields__\\)\")",
"Bash(AIDER_WEBHOOK_SECRET=testsecret /Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.main import app; print\\('app OK; title:', app.title\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_action_parsing.py tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py tests/test_aider_event_processor.py tests/test_ai_router_feedback.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_action_parsing.py tests/test_aider_event_service.py tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_secret_redactor.py tests/test_aider_event_processor.py tests/test_ai_router_feedback.py -q)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pip install -e .[dev] --quiet)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pip install -e '.[dev]' --quiet)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/ -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from aider_watch_client.aiderw import main as awmain; from aider_watch_client.cli import main as climain; print\\('✓ imports ok'\\)\")",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pip show aider-watch-client)",
"Bash(tailscale status *)",
"Bash(kubectl rollout *)",
"Bash(bash /Users/ogt/awoooi/scripts/aider_watch_client/scripts/install.sh)",
"Bash(git rebase *)",
"Bash(/opt/homebrew/bin/aiderw --message \"add docstring to hello function\" --exit)",
"Bash(kubectl -n awoooi-prod get pod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}')",
"Bash(kubectl -n awoooi-prod exec awoooi-api-7b9464c969-8ml88 -- python -c ' *)",
"Bash(kubectl -n awoooi-prod rollout restart deployment/awoooi-api)",
"Bash(kubectl -n awoooi-prod get pod -l app=awoooi-api --no-headers)",
"Bash(kubectl -n awoooi-prod rollout status deployment/awoooi-api --timeout=120s)",
"Bash(/opt/homebrew/bin/aider-watch flush *)",
"Bash(kubectl -n awoooi-prod get pod -l app=awoooi-api -o wide)",
"Bash(kubectl -n awoooi-prod rollout status deployment/awoooi-api --timeout=30s)",
"Bash(kubectl -n awoooi-prod exec awoooi-api-6657fb9cf7-47lcg -- python -c \"import src.services.telegram_gateway as tg; import inspect; lines = inspect.getsource\\(tg\\); idx = lines.find\\('response_body=e.response.text'\\); print\\('FOUND' if idx >= 0 else 'NOT FOUND'\\)\")",
"Read(//opt/gitea/**)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/ -q)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/unit/test_aider_event_service.py tests/unit/test_aider_model.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_events_api.py tests/test_aider_event_models.py tests/test_aider_event_service.py tests/test_aider_event_processor.py -v)",
"Bash(kubectl -n awoooi-prod get svc)",
"Bash(kubectl -n openclaw get pod)",
"Bash(kubectl -n awoooi-prod exec awoooi-api-7cd784c875-r4qkz -- python -c ' *)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2 --since=10m)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2 --since=15m)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2 --since=20m)",
"Bash(kubectl -n awoooi-prod get secret awoooi-secrets -o yaml)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2 --since=30m)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2 --since=2h)",
"Bash(kubectl -n awoooi-prod logs awoooi-api-7cd784c875-qt6j2)",
"Bash(kubectl -n awoooi-prod get pod -l app=awoooi-api -o jsonpath='{range .items[*]}{.metadata.name} {.status.containerStatuses[0].imageID}{\"\\\\n\"}{end}')",
"Bash(kubectl -n awoooi-prod get ingress)",
"Bash(kubectl -n awoooi-prod get svc awoooi-api-svc)",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --since=60s --prefix)",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --since=5m --prefix)",
"Bash(kubectl -n awoooi-prod logs pod/awoooi-api-86bc79766d-dn5ll --since=5m)",
"Bash(kubectl -n awoooi-prod logs pod/awoooi-api-86bc79766d-dn5ll --since=10m)",
"Bash(kubectl -n awoooi-prod logs pod/awoooi-api-86bc79766d-dn5ll)",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --since=90s --prefix)",
"Bash(kubectl -n awoooi-prod logs pod/awoooi-api-86bc79766d-4x69p --since=5m)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 10 SCAN 0 MATCH \"playbook:PB-*\" COUNT 500)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 -n 10 DBSIZE)",
"Bash(wait)",
"Read(//Users/**)",
"Read(//Users/ooo/.claude/**)",
"Bash(mkdir -p /Users/ogt/awoooi/.claude/agents)",
"Bash(cp /Users/ogt/.claude/agents/*.md /Users/ogt/awoooi/.claude/agents/)",
"Bash(kubectl -n awoooi-prod logs --tail=400 -l app=awoooi-api --prefix=true)",
"Bash(kubectl -n awoooi-prod logs --tail=300 awoooi-api-65c69fd649-bxbwp)",
"Bash(kubectl -n awoooi-prod logs --tail=20000 -l app=awoooi-api --prefix=false --since=24h)",
"Bash(kubectl -n awoooi-prod logs --since=24h awoooi-api-65c69fd649-bxbwp)",
"Bash(kubectl -n awoooi-prod logs --since=24h -l app=awoooi-api --prefix=false)",
"Bash(kubectl -n awoooi-prod logs --since=24h awoooi-api-65c69fd649-fmbxd)",
"Bash(kubectl -n awoooi-prod logs --since=3h awoooi-api-65c69fd649-fmbxd)",
"Bash(kubectl -n awoooi-prod logs --since=3h awoooi-api-65c69fd649-bxbwp)",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=30 --since=30m)",
"Bash(kubectl -n awoooi-prod get pods -o wide)",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api -o jsonpath='{.items[0].metadata.creationTimestamp}')",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=5 --since=5m)",
"Bash(kubectl -n awoooi-prod describe pod -l app=awoooi-api)",
"Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=20 --since=10m)",
"Bash(kubectl -n awoooi-prod exec deployment/awoooi-api -- python3 -c ' *)",
"Bash(PGPASSWORD=\"\" psql -h 188.188.188.188 -U aiops -d aiops -c \"\\\\d timeline_events\")",
"Bash(kubectl -n awoooi-prod get deploy awoooi-api -o yaml)",
"Bash(PGPASSWORD=\"\" psql --version)",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- env)",
"Bash(kubectl -n awoooi-prod logs --tail=500 deploy/awoooi-api)",
"Bash(kubectl cp *)",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=up\" 2>&1 | head -c 400')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'for q in \"sum\\(rate\\(http_requests_total{status=~\\\\\"5..\\\\\"}[5m]\\)\\) / sum\\(rate\\(http_requests_total[5m]\\)\\)\" \"avg\\(rate\\(container_cpu_usage_seconds_total{namespace=\\\\\"awoooi-prod\\\\\",container=\\\\\"awoooi-api\\\\\"}[5m]\\)\\)\" \"pg_stat_activity_count{datname=\\\\\"awoooi\\\\\"}\" \"increase\\(kube_pod_container_status_restarts_total{namespace=\\\\\"awoooi-prod\\\\\"}[15m]\\)\"; do echo \"---- $q\"; curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=$q\" 2>&1 | head -c 250; echo; done')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'PGPASSWORD=as0V1mohktaFbGIx3R0iCatbMJ6XxFDL psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c \"SELECT metric_name, count\\(*\\), max\\(trained_at\\) FROM dynamic_baseline_record GROUP BY metric_name;\" 2>&1 | head -20')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'PGPASSWORD=as0V1mohktaFbGIx3R0iCatbMJ6XxFDL psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c \"SELECT count\\(*\\) as asset_count FROM asset_inventory; SELECT count\\(*\\) as coverage_count FROM asset_coverage_snapshot; SELECT count\\(*\\) as host_cap_count FROM host_capacity_snapshot; SELECT count\\(*\\) as compl_count FROM asset_compliance_snapshot; SELECT count\\(*\\) as rule_cat FROM alert_rule_catalog; SELECT count\\(*\\) as log_cluster FROM log_cluster_record;\" 2>&1')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'python3 -c \" *)",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- python3 -c ' *)",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'for q in \"http_requests_total\" \"container_cpu_usage_seconds_total\" \"container_memory_usage_bytes\" \"kube_pod_container_status_restarts_total\" \"pg_stat_activity_count\" \"node_cpu_seconds_total\" \"node_load1\"; do echo -n \"$q => \"; curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=count\\($q\\)\" 2>&1 | head -c 180; echo; done')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=container_cpu_usage_seconds_total\" 2>&1 | python3 -c \"import json,sys; d=json.load\\(sys.stdin\\); rs=d[\\\\\"data\\\\\"][\\\\\"result\\\\\"][:3]; [print\\(r[\\\\\"metric\\\\\"]\\) for r in rs]; print\\(\\\\\"total series:\\\\\", len\\(d[\\\\\"data\\\\\"][\\\\\"result\\\\\"]\\)\\)\"')",
"Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'which kubectl 2>&1; kubectl version --client 2>&1 | head -3; kubectl -n awoooi-prod get deploy awoooi-api 2>&1 | head -3')",
"Bash(kubectl -n awoooi-prod logs --tail=2000 deploy/awoooi-api)",
"Bash(psql --version)",
"WebFetch(domain:core.telegram.org)",
"mcp__plugin_context7_context7__resolve-library-id",
"mcp__plugin_context7_context7__query-docs",
"WebFetch(domain:docs.claude.com)",
"Bash(git tag *)",
"Read(//usr/**)",
"Bash(psql -h 192.168.0.110 -U awoooi_user -d awoooi -c \"SELECT id, alertname, status, confidence, description, created_at FROM approval_records WHERE status='PENDING' AND DATE\\(created_at AT TIME ZONE 'Asia/Taipei'\\) = CURRENT_DATE AT TIME ZONE 'Asia/Taipei' ORDER BY created_at DESC LIMIT 10;\")",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.spec.template.spec.containers[0].image}')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.spec.template.spec.containers[0].imagePullPolicy}{\"\\\\n\"}{.spec.template.metadata.labels}{\"\\\\n\"}')",
"Bash(kubectl kustomize *)",
"Bash(kubectl -n awoooi-prod rollout status deployment/awoooi-api --timeout=60s)",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api --no-headers)",
"Bash(kubectl -n awoooi-prod patch deployment awoooi-api -p '{\"spec\":{\"template\":{\"spec\":{\"containers\":[{\"name\":\"api\",\"image\":\"192.168.0.110:5000/awoooi/api:cbd28e29a08435deb8c66af51654d8fa65120a14\"}]}}}}')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.spec.template.spec.containers[0].image}{\"\\\\n\"}')",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api -o jsonpath='{range .items[*]}{.metadata.name}{\"\\\\t\"}{.spec.containers[0].image}{\"\\\\n\"}{end}')",
"Bash(kubectl -n awoooi-prod get pdb awoooi-api-pdb -o jsonpath='{.spec.minAvailable}')",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api -o wide)",
"Bash(kubectl -n awoooi-prod describe rs -l app=awoooi-api)",
"Bash(kubectl -n awoooi-prod get events --sort-by='.lastTimestamp')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.spec.replicas}{\"\\\\n\"}{.status.replicas}{\"\\\\n\"}{.status.readyReplicas}{\"\\\\n\"}{.status.updatedReplicas}{\"\\\\n\"}')",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api --sort-by=.metadata.creationTimestamp -o jsonpath='{range .items[*]}{.metadata.name}{\":\"}{.metadata.creationTimestamp}{\"\\\\n\"}{end}')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.status.conditions[*]}')",
"Bash(kubectl -n awoooi-prod describe deployment awoooi-api)",
"Bash(kubectl -n awoooi-prod get rs -l app=awoooi-api -o jsonpath='{range .items[*]}{.metadata.name}{\":\"}{.spec.template.spec.containers[0].image}{\"\\\\n\"}{end}')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o yaml)",
"Bash(kubectl -n awoooi-prod rollout status deployment/awoooi-api --timeout=180s)",
"Bash(kubectl -n awoooi-prod set image deployment/awoooi-api api=192.168.0.110:5000/awoooi/api:cbd28e29a08435deb8c66af51654d8fa65120a14 --record=false)",
"Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api -o jsonpath='{range .items[*]}{.metadata.name}{\"\\\\t\"}{.spec.containers[0].image}{\"\\\\t\"}{.status.phase}{\"\\\\n\"}{end}')",
"Bash(kubectl -n awoooi-prod get deployment awoooi-api -o jsonpath='{.status.replicas}{\"\\\\t\"}{.status.readyReplicas}{\"\\\\t\"}{.status.updatedReplicas}')",
"Bash(bash /tmp/diagnostic.sh)",
"WebFetch(domain:docs.github.com)",
"WebFetch(domain:docs.sonarsource.com)",
"WebFetch(domain:gitea.com)",
"WebFetch(domain:docs.gitea.com)",
"WebFetch(domain:www.sonarsource.com)",
"WebFetch(domain:golangci-lint.run)",
"WebFetch(domain:www.uber.com)",
"Bash(bash scripts/ops/deploy-alerts.sh --dry-run)",
"Bash(bash scripts/ops/deploy-alerts.sh)",
"Bash(promtool check *)",
"WebFetch(domain:openrouter.ai)",
"WebFetch(domain:qwenlm.github.io)",
"WebFetch(domain:aclanthology.org)",
"WebFetch(domain:datanorth.ai)",
"WebFetch(domain:www.infoq.com)",
"WebFetch(domain:aws.amazon.com)",
"WebFetch(domain:artificialanalysis.ai)",
"WebFetch(domain:www.alibabacloud.com)",
"WebFetch(domain:docs.langchain.com)",
"WebFetch(domain:arxiv.org)",
"WebFetch(domain:blog.kilo.ai)",
"WebFetch(domain:www.siliconflow.com)",
"WebFetch(domain:aicompetence.org)",
"Bash(redis-cli -h 192.168.0.188 -p 6380 ping)",
"Bash(redis-cli ping *)"
=======
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest apps/api/tests/test_aider_event_models.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_action_parsing.py -v --collect-only)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_action_parsing.py --collect-only)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -m pytest tests/test_aider_event_models.py tests/test_secret_redactor.py -v)",
"Bash(/Users/ogt/.pyenv/versions/3.11.7/bin/python3 -c \"from src.repositories.aider_event_repository import AiderEventRepository; print\\('import OK'\\)\")"
>>>>>>> Stashed changes
],
"deny": [
"Bash(rm -rf *)",
"Bash(git push --force *)",
"Bash(git reset --hard *)",
"Bash(kubectl delete *)",
"Bash(docker rm -f *)"
],
"additionalDirectories": [
"/Users/ogt/.claude/projects/-Users-ogt-awoooi/memory",
"/Users/ogt/awoooi/.claude/hooks",
"/Users/ogt/.claude/channels/telegram",
"/Users/ogt",
"/Users/ogt/.claude",
"/Users/ogt/awoooi/apps/web/src/app/[locale]/aiops",
"/Users/ogt/aider-watch"
]
},
"hooks": {
"PreToolUse": [
{
"matcher": "",
"hooks": [
{
"type": "command",
"command": "node $CLAUDE_PROJECT_DIR/.claude/hooks/awoooi-guard.js 2>/dev/null || true"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/branch-protection.js"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/commit-quality.js"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/large-file-warner.js"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/mcp-health.js"
}
]
}
],
"PostToolUse": [
{
"matcher": "",
"hooks": [
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/audit-log.js"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/suggest-compact.js"
}
]
}
],
"Stop": [
{
"matcher": "",
"hooks": [
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/cost-tracker.js"
},
{
"type": "command",
"command": "node /Users/ogt/.claude/hooks/session-summary.js"
}
]
}
]
}
}

View File

@@ -0,0 +1,827 @@
{
"permissions": {
"allow": [
"Bash(pnpm install:*)",
"Bash(npm --version)",
"Bash(npm install:*)",
"Bash(pnpm --version)",
"Bash(pnpm dev:*)",
"Bash(pnpm add:*)",
"Bash(ls -la /Users/ogt/awoooi/apps/web/next.config.*)",
"Bash(pkill -f \"next dev\")",
"Bash(curl -sL http://localhost:3000/zh-TW)",
"Bash(curl -s http://localhost:3000/zh-TW)",
"Bash(pnpm --filter web build)",
"Bash(curl -s http://localhost:3001/zh-TW)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:3000/zh-TW)",
"Bash(kubectl apply:*)",
"Bash(chmod +x /Users/ogt/awoooi/deploy-infra.sh)",
"Bash(./deploy-infra.sh)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"mkdir -p /tmp/awoooi-k8s\")",
"Bash(sshpass -p '0936223270' scp -o StrictHostKeyChecking=no /Users/ogt/awoooi/k8s/awoooi-prod/01-namespace-quota.yaml /Users/ogt/awoooi/k8s/awoooi-prod/02-network-policy.yaml /Users/ogt/awoooi/k8s/awoooi-prod/04-configmap.yaml wooo@192.168.0.120:/tmp/awoooi-k8s/)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"sudo kubectl apply -f /tmp/awoooi-k8s/01-namespace-quota.yaml\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl apply -f /tmp/awoooi-k8s/01-namespace-quota.yaml 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl apply -f /tmp/awoooi-k8s/02-network-policy.yaml 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl apply -f /tmp/awoooi-k8s/04-configmap.yaml 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get ns awoooi-prod -o wide 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get networkpolicy -n awoooi-prod 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get resourcequota,limitrange,configmap -n awoooi-prod 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"rm -rf /tmp/awoooi-k8s\")",
"Bash(PYTHONPATH=. python -c \"from src.main import app; print\\(''Import OK''\\)\")",
"Bash(curl -s http://localhost:8000/api/v1/health/ready)",
"Bash(curl -s http://localhost:8000/api/v1/health/live)",
"Bash(curl -s http://localhost:8000/)",
"Bash(pkill -f \"uvicorn src.main:app\")",
"Bash(pkill -f \"node.*next\")",
"Bash(curl -s http://localhost:8000/api/v1/health)",
"Read(//Users/ogt/awoooi/apps/api/**)",
"Bash(pnpm typecheck:*)",
"Read(//Users/ogt/awoooi/apps/web/**)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/dashboard/demo/spike/clear)",
"Read(//Users/ogt/awoooi/=== 驗證英文頁面 \\(/en/**)",
"Bash(jq \".devDependencies | keys | map\\(select\\(startswith\\(\"\"@playwright\"\"\\) or startswith\\(\"\"playwright\"\"\\)\\)\\)\")",
"Bash(npx playwright:*)",
"Bash(curl -s http://localhost:3000/zh-TW/demo -o /dev/null -w \"Frontend: HTTP %{http_code}\\\\n\")",
"Bash(__NEW_LINE_ef548029029cdfac__ echo:*)",
"Bash(curl -s http://localhost:8000/api/v1/health -o /dev/null -w \"Backend: HTTP %{http_code}\\\\n\")",
"Bash(echo '=== 已產出的截圖 ===' find /Users/ogt/awoooi/apps/web/test-results -name *.png)",
"Bash(echo '=== Playwright E2E 測試結果 ===' echo echo '📸 截圖證據 \\(test-results/screenshots/\\):' ls -la /Users/ogt/awoooi/apps/web/test-results/screenshots/ __NEW_LINE_db74e5f56e34db17__ echo echo '🎬 錄影證據 \\(.webm\\):' find /Users/ogt/awoooi/apps/web/test-results -name *.webm -exec ls -la {})",
"Bash(__NEW_LINE_db74e5f56e34db17__ echo:*)",
"Bash(source .venv/bin/activate)",
"Bash(python scripts/demo_multisig.py)",
"Bash(python -c \"from src.api.v1.approvals import router; print\\(''✅ Approvals router loaded:'', len\\(router.routes\\), ''routes''\\)\")",
"Bash(npx tsc:*)",
"Bash(chmod +x /Users/ogt/awoooi/scripts/demo-multisig-flow.sh)",
"Bash(python -c \"from src.main import app; print\\(''✅ API loads successfully''\\)\")",
"Bash(jq)",
"Bash(/Users/ogt/awoooi/scripts/demo-multisig-flow.sh)",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/approvals\" -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s http://localhost:8000/api/v1/openapi.json)",
"Bash(python -c \":*)",
"Bash(curl -s http://localhost:3000 -o /dev/null -w \"%{http_code}\")",
"Bash(lsof -ti:3000,3001,8000)",
"Bash(curl -s http://localhost:8000/health)",
"Bash(curl -s http://localhost:8000/api/v1/approvals/pending)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:3001/zh-TW/demo)",
"Bash(ls -la test-results/*.png)",
"Bash(cp test-results/cpo102-*.png /Users/ogt/awoooi/docs/screenshots/)",
"Bash(ssh ogt@192.168.0.120 'cat /etc/rancher/k3s/k3s.yaml')",
"Bash(python -c \"from src.main import app; print\\(''✅ main.py imports OK''\\)\")",
"Bash(curl -s http://localhost:8000/api/v1/approvals/k8s-test)",
"Bash(sqlite3 awoooi.db \".tables\")",
"Bash(sshpass -p 0936223270 ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 'sudo cat /etc/rancher/k3s/k3s.yaml')",
"Bash(kubectl --kubeconfig=/Users/ogt/awoooi/apps/api/k3s-prod.yaml get deployments -n awoooi-prod)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get deployments -n awoooi-prod 2>/dev/null\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get deployments -A 2>/dev/null\")",
"Bash(curl -s -X POST http://localhost:8000/api/v1/approvals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(APPROVAL_ID=\"b58a0d86-fa4e-43ca-881c-02e978cd7943\")",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/approvals/$APPROVAL_ID/sign\" -H \"Content-Type: application/json\" -d '{:*)",
"Bash(sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT operation_type, target_resource, namespace, success, dry_run_passed, dry_run_message, error_message, execution_duration_ms, created_at FROM audit_logs ORDER BY created_at DESC LIMIT 1;\" -header -column)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S kubectl get pods -n monitoring -l app=grafana 2>/dev/null\")",
"Bash(curl -s http://192.168.0.188:11434/api/tags)",
"Bash(python -c \"from src.main import app; print\\(''✅ Compile OK''\\)\")",
"Bash(curl -s http://localhost:8000/api/v1/ai/status)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/ai/analyze-and-propose -H \"Content-Type: application/json\" -d '{}')",
"Bash(curl -s -X POST http://192.168.0.188:11434/api/generate -H \"Content-Type: application/json\" -d '{\"\"\"\"model\"\"\"\":\"\"\"\"llama3.2:1b\"\"\"\",\"\"\"\"prompt\"\"\"\":\"\"\"\"Output only JSON: {\\\\\"\"\"\"action\\\\\"\"\"\":\\\\\"\"\"\"test\\\\\"\"\"\"}\"\"\"\",\"\"\"\"stream\"\"\"\":false,\"\"\"\"format\"\"\"\":\"\"\"\"json\"\"\"\"}' --max-time 30)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/ai/analyze-and-propose -H \"Content-Type: application/json\" -d '{}' --max-time 60)",
"Bash(PROMPT='你是 ClawBot AI。分析以下監控數據輸出純 JSON無其他文字。:*)",
"Bash(curl -s -X POST http://192.168.0.188:11434/api/generate -H \"Content-Type: application/json\" -d \"{\"\"model\"\":\"\"llama3.2:1b\"\",\"\"prompt\"\":\"\"$PROMPT\"\",\"\"stream\"\":false,\"\"format\"\":\"\"json\"\",\"\"options\"\":{\"\"num_predict\"\":256,\"\"temperature\"\":0.1}}\" --max-time 60)",
"Bash(curl -s -X POST http://192.168.0.188:11434/api/generate -H \"Content-Type: application/json\" -d '{\"\"\"\"model\"\"\"\":\"\"\"\"llama3.2:1b\"\"\"\",\"\"\"\"prompt\"\"\"\":\"\"\"\"Harbor service returning 404. Output JSON: {\\\\\"\"\"\"suggested_action\\\\\"\"\"\":\\\\\"\"\"\"RESTART_DEPLOYMENT\\\\\"\"\"\",\\\\\"\"\"\"target_resource\\\\\"\"\"\":\\\\\"\"\"\"harbor\\\\\"\"\"\",\\\\\"\"\"\"namespace\\\\\"\"\"\":\\\\\"\"\"\"default\\\\\"\"\"\",\\\\\"\"\"\"risk_level\\\\\"\"\"\":\\\\\"\"\"\"medium\\\\\"\"\"\",\\\\\"\"\"\"reasoning\\\\\"\"\"\":\\\\\"\"\"\"Service down\\\\\"\"\"\",\\\\\"\"\"\"confidence\\\\\"\"\"\":0.8,\\\\\"\"\"\"affected_services\\\\\"\"\"\":[]}\"\"\"\",\"\"\"\"stream\"\"\"\":false,\"\"\"\"format\"\"\"\":\"\"\"\"json\"\"\"\",\"\"\"\"options\"\"\"\":{\"\"\"\"num_predict\"\"\"\":128,\"\"\"\"temperature\"\"\"\":0.1}}' --max-time 30)",
"Bash(curl -v -X POST http://192.168.0.188:11434/api/generate -H \"Content-Type: application/json\" -d '{\"\"\"\"model\"\"\"\":\"\"\"\"llama3.2:1b\"\"\"\",\"\"\"\"prompt\"\"\"\":\"\"\"\"Say hello\"\"\"\",\"\"\"\"stream\"\"\"\":false}' --max-time 30)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/ai/analyze-and-propose -H \"Content-Type: application/json\" -d '{}' --max-time 120)",
"Bash(curl -s http://localhost:8000/api/v1/ai/analyze-and-propose -X POST -H \"Content-Type: application/json\")",
"Bash(curl -s http://localhost:8000/api/v1/dashboard)",
"Bash(ls -la ~/Downloads/image*.png)",
"Bash(ls -la ~/Desktop/image*.png)",
"Bash(ls -la /Users/ogt/awoooi/apps/web/public/*.png)",
"WebFetch(domain:openclaw.ai)",
"Bash(ls -la /Users/ogt/Downloads/*.png)",
"Bash(ls -la /Users/ogt/.gemini/antigravity/brain/*/image*.png)",
"Bash(ls -lat /Users/ogt/Downloads/*.png)",
"Bash(curl -s http://localhost:8000/api/v1/approvals)",
"Bash(curl -s -X GET http://localhost:8000/api/v1/approvals/)",
"Bash(APPROVAL_ID=\"4989729e-e518-4e7e-8dff-5c3269e0c82b\")",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/approvals/$APPROVAL_ID/sign\" -H \"Content-Type: application/json\" -d '{\"\"\"\"signer_id\"\"\"\": \"\"\"\"ciso-001\"\"\"\", \"\"\"\"signer_name\"\"\"\": \"\"\"\"Demo CISO\"\"\"\", \"\"\"\"comment\"\"\"\": \"\"\"\"資安確認,核准執行\"\"\"\"}')",
"Bash(curl -s http://localhost:8000/api/v1/webhooks/health)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/webhooks/alerts -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s http://localhost:3000)",
"Bash(ls -la apps/web/test-results/*.png)",
"Bash(curl -s http://localhost:3000/zh-TW/demo)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:3333/zh-TW/demo)",
"Bash(curl -s http://localhost:8001/api/v1/approvals/pending)",
"Bash(curl -s -X POST http://localhost:8001/api/v1/approvals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s http://localhost:8001/openapi.json)",
"Bash(curl -s http://localhost:8001/docs)",
"Bash(curl -s http://localhost:8001/api/v1/webhooks/grafana -X OPTIONS)",
"Bash(pnpm run:*)",
"Bash(node scripts/screenshot-rbac.mjs)",
"Bash(pnpm exec:*)",
"Bash(curl -s http://localhost:3333 -o /dev/null -w \"%{http_code}\")",
"Bash(curl -s http://localhost:3333/zh-TW/demo -o /dev/null -w \"%{http_code}\")",
"Bash(python3 -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f''''Count: {d[count]}''''\\); [print\\(f''''- {a[id][:8]}... risk={a[risk_level]}''''\\) for a in d[''''approvals''''][:3]]\")",
"Bash(curl -s http://localhost:3000/zh-TW/demo -o /dev/null -w \"%{http_code}\")",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f'''' Connected: {d[\"\"success\"\"]}''''\\); print\\(f'''' Namespaces: {d[\"\"namespaces\"\"][:3]}...''''\\)\" __NEW_LINE_57ae1c1c812968e7__ echo \"\" echo \"3. 資料庫持久化:\" sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT COUNT\\(*\\) as approvals FROM approval_records;\" sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT COUNT\\(*\\) as timeline FROM timeline_events;\" sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT COUNT\\(*\\) as audits FROM audit_logs;\")",
"Bash(head -2 __NEW_LINE_9bf9481fbdf30d4e__ echo \"\" echo \"2. 告警收斂跳過 LLM 日誌 \\(應該有 4 次\\):\" grep -c \"alert_converged_skip_llm\" /tmp/api-server.log)",
"Bash(python -m json.tool)",
"Bash(__NEW_LINE_7463bff94cecc20f__ echo:*)",
"Bash(__NEW_LINE_13846c8488c5fa9a__ echo:*)",
"Bash(__NEW_LINE_13846c8488c5fa9a__ ls:*)",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f'''' Status: {d[\"\"status\"\"]}''''\\)\" __NEW_LINE_32366ca1bb050259__ echo \"\" echo \"2. 待簽核記錄 \\(含 hit_count\\):\" curl -s http://localhost:8000/api/v1/approvals/pending)",
"Read(//Users/ogt/awoooi/**)",
"Bash(curl -s http://localhost:8000/api/v1/timeline/events?limit=10)",
"Bash(curl -s http://localhost:8000/api/v1/timeline/events?limit=5)",
"Bash(ls -la /Users/ogt/awoooi/apps/api/*.txt /Users/ogt/awoooi/apps/api/*.toml)",
"Bash(ls -la /Users/ogt/awoooi/docker-compose*.yml)",
"Bash(ls /Users/ogt/awoooi/k8s/awoooi-prod/*rbac* /Users/ogt/awoooi/k8s/awoooi-prod/*service-account*)",
"Bash(kubectl kustomize:*)",
"Bash(docker compose:*)",
"Bash(docker info:*)",
"Bash(python3 -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(''''API Status:'''', d.get\\(''''status'''', ''''unknown''''\\)\\)\")",
"Bash(pkill -9 -f uvicorn)",
"Bash(lsof -ti:8000)",
"Bash(open -a Docker)",
"Bash(docker stop:*)",
"Bash(lsof -ti:3000)",
"Bash(docker start:*)",
"Bash(docker ps:*)",
"Bash(curl -s http://localhost:3000 -o /dev/null -w 'HTTP Status: %{http_code}\\\\n')",
"Bash(curl -I http://localhost:8000/api/v1/dashboard/stream)",
"Bash(curl -s http://localhost:8000/openapi.json)",
"Bash(curl -s http://localhost:8000/api/v1/dashboard/stream --max-time 3 -w \"\\\\n--- HTTP Status: %{http_code} ---\\\\n\")",
"Bash(curl -s http://localhost:8000/api/v1/dashboard/stream --max-time 3)",
"Bash(curl -s http://localhost:3000/zh-TW -o /dev/null -w \"HTTP Status: %{http_code}\\\\n\")",
"Bash(curl -s -D - http://localhost:8000/api/v1/dashboard/stream --max-time 2)",
"Bash(chmod +x /Users/ogt/awoooi/scripts/deploy-infra.sh)",
"Bash(./scripts/deploy-infra.sh)",
"Bash(pnpm --filter @awoooi/web build)",
"Bash(timeout 10 env MOCK_MODE=true OTEL_ENABLED=false uvicorn src.main:app --host 0.0.0.0 --port 8099)",
"Bash(timeout 8 pnpm --filter @awoooi/web dev)",
"Bash(git diff:*)",
"Bash(curl -s -I http://localhost:8000/api/v1/dashboard/stream)",
"Bash(timeout 3 curl -s -N http://localhost:8000/api/v1/dashboard/stream)",
"Bash(grep -n \"NEXT_PUBLIC\\\\|API_URL\\\\|localhost\" /Users/ogt/awoooi/apps/web/.env*)",
"Bash(timeout 2 curl -s -D - -N http://localhost:8000/api/v1/dashboard/stream)",
"Bash(curl -s http://localhost:3000/)",
"Bash(python -m py_compile scripts/fire_test_alert.py)",
"Bash(python -m scripts.fire_test_alert --help)",
"Bash(python -m scripts.fire_test_alert)",
"Bash(python -m scripts.fire_test_alert --type k8s_pod_crash)",
"Bash(timeout 3 curl -s -N -H \"Origin: http://localhost:3000\" http://localhost:8000/api/v1/dashboard/stream)",
"Bash(python -m scripts.fire_test_alert --type disk_full)",
"Bash(docker restart:*)",
"Bash(curl -s -w \"\\\\nHTTP_CODE: %{http_code}\\\\n\" http://localhost:3000)",
"Bash(docker exec:*)",
"Bash(docker rmi:*)",
"Bash(timeout 5 curl -s -N http://localhost:8000/api/v1/dashboard/stream)",
"Bash(curl -s http://localhost:3000 -w \"\\\\nHTTP: %{http_code}\\\\n\")",
"Bash(timeout 120 docker logs awoooi-api -f --since 1s)",
"Bash(curl -s -I -H \"Origin: http://localhost:3000\" http://localhost:8000/api/v1/dashboard/stream)",
"Bash(curl -s -X OPTIONS -H \"Origin: http://localhost:3000\" -H \"Access-Control-Request-Method: GET\" http://localhost:8000/api/v1/dashboard/stream -I)",
"Bash(node /Users/ogt/awoooi/scripts/verify-sse.js)",
"Bash(python -m scripts.fire_test_alert --type db_connection_timeout)",
"Bash(npm run:*)",
"Bash(docker-compose down:*)",
"Bash(docker-compose build:*)",
"Bash(docker-compose up:*)",
"Bash(pkill -f 'next dev')",
"Bash(node /Users/ogt/awoooi/scripts/test-approval-flow.js)",
"Bash(python -m scripts.fire_test_alert --type pod_crash)",
"Bash(node /Users/ogt/awoooi/scripts/test-k8s-executor.js)",
"Bash(kubectl cluster-info:*)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl cluster-info)",
"Bash(ls -la /Users/ogt/awoooi/apps/web/src/app/[locale]/)",
"Bash(python -c \"from src.api.v1 import audit_logs; print\\(''API module loads OK''\\)\")",
"Bash(curl -s http://localhost:3000/zh-TW/action-logs)",
"Bash(pnpm build:*)",
"Bash(curl -s http://localhost:8000/api/v1/audit-logs)",
"Bash(xargs -r kill -9 2)",
"Bash(/dev/null source:*)",
"Bash(python -c \"from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor; print\\(''''httpx ok''''\\)\")",
"Bash(sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT * FROM audit_logs ORDER BY created_at DESC LIMIT 5;\")",
"Bash(sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT name FROM sqlite_master WHERE type=''table'';\")",
"Bash(sqlite3 /Users/ogt/awoooi/apps/api/awoooi.db \"SELECT id, event_type, status, title, created_at FROM timeline_events ORDER BY created_at DESC LIMIT 5;\")",
"Bash(curl -s http://localhost:8000/api/v1/audit-logs/stats)",
"Bash(curl -s http://localhost:8000/api/v1/timeline?limit=10)",
"Bash(curl -s \"http://localhost:8000/api/v1/timeline\")",
"Bash(curl -s http://localhost:8000/api/v1/docs)",
"Bash(chmod +x /Users/ogt/awoooi/scripts/setup-guardrails.sh /Users/ogt/awoooi/scripts/ai_code_reviewer.py)",
"Bash(ls -la /Users/ogt/awoooi/apps/web/.eslintrc*)",
"Bash(ls -la scripts/*.py scripts/*.sh .pre-commit-config.yaml .secrets.baseline apps/web/.eslintrc.js)",
"Bash(python -m src.services.test_context_gatherer)",
"Bash(python -m pytest src/services/test_context_gatherer.py -v)",
"Bash(grep -r \"ClawBot\\\\|clawbot\\\\|CLAWBOT\" --include=*.py --include=*.ts --include=*.tsx apps/)",
"Bash(python scripts/e2e_openclaw_test.py)",
"Bash(python -m pytest tests/e2e_network_test.py -v --tb=short)",
"Bash(chmod +x /Users/ogt/awoooi/apps/api/scripts/apply_prometheus_config.sh /Users/ogt/awoooi/apps/api/scripts/fire_live_alert.py)",
"Bash(./scripts/apply_prometheus_config.sh)",
"Bash(python scripts/fire_live_alert.py oomkilled)",
"Bash(python scripts/fire_live_alert.py oomkilled --api-url http://localhost:8000)",
"Bash(python scripts/fire_live_alert.py highcpu --api-url http://localhost:8000)",
"Bash(python scripts/fire_live_alert.py podcrash --api-url http://localhost:8000)",
"Bash(python -m pytest tests/test_webhook_telegram_integration.py -v)",
"Bash(ls -la /Users/ogt/awoooi/apps/api/.env*)",
"Bash(ls -la /Users/ogt/wooo-aiops/.env*)",
"Bash(ls -la /Users/ogt/AIOps/.env*)",
"Bash(/Users/ogt/awoooi/apps/api/.env:*)",
"Bash(/tmp/deploy-188-home.sh:*)",
"Bash(chmod +x /tmp/deploy-188-home.sh)",
"Bash(scp /tmp/awoooi-api-deploy.tar.gz /tmp/deploy-188-home.sh ollama@192.168.0.188:/tmp/)",
"Bash(ssh ollama@192.168.0.188 \"bash /tmp/deploy-188-home.sh\")",
"Bash(ssh ollama@192.168.0.188 \"curl -s http://localhost:8000/api/v1/webhooks/health\")",
"Bash(ssh ollama@192.168.0.188 \"tail -50 /tmp/openclaw.log\")",
"Bash(ssh ollama@192.168.0.188 \"cd /home/ollama/awoooi-api && source .venv/bin/activate && pip install sqlalchemy aiosqlite -q && pip install httpx python-dotenv pydantic-settings -q\")",
"Bash(ssh ollama@192.168.0.188 \"cd /home/ollama/awoooi-api && pkill -f ''uvicorn src.main:app'' 2>/dev/null; sleep 1; source .venv/bin/activate && nohup uvicorn src.main:app --host 0.0.0.0 --port 8000 > /tmp/openclaw.log 2>&1 & sleep 3 && curl -s http://localhost:8000/api/v1/webhooks/health\")",
"Bash(ssh ollama@192.168.0.188:*)",
"Bash(pkill -f ngrok)",
"Bash(pkill -f \"ssh -fN.*8001\")",
"Bash(ssh -fN -L 8001:localhost:8000 ollama@192.168.0.188)",
"Bash(curl -s http://localhost:8001/api/v1/webhooks/health)",
"Bash(BOT_TOKEN=\"8569720657:AAHdvKf_P2ms-QKFTyqTLtLiqEggz8cpjMk\" curl -s \"https://api.telegram.org/bot$BOT_TOKEN/getWebhookInfo\")",
"Bash(curl -s https://api.telegram.org/bot$BOT_TOKEN/getWebhookInfo)",
"Bash(curl -s http://localhost:8001/api/v1/webhooks/)",
"Bash(curl -s http://localhost:8001/)",
"Bash(curl -s http://localhost:8001/api/v1/health)",
"Bash(scp /tmp/awoooi-api-v7.tar.gz ollama@192.168.0.188:/tmp/)",
"Bash(tar -czvf /tmp/awoooi-api-v7.1.tar.gz src/ requirements.txt pyproject.toml)",
"Bash(scp /tmp/awoooi-api-v7.1.tar.gz ollama@192.168.0.188:/tmp/)",
"Bash(ssh ollama@192.168.0.188 \"tail -10 /tmp/openclaw.log | grep -E ''''clickhouse|signoz_gold''''\")",
"Bash(ssh ogt@192.168.0.188 \"cd /home/ollama/awoooi-api && tail -50 nohup.out 2>/dev/null || journalctl -u awoooi-api --no-pager -n 50 2>/dev/null || echo ''請手動檢查日誌''\")",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8123/ -d \"SELECT 1 FORMAT JSONEachRow\")",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:11434/api/tags)",
"Bash(ssh -o StrictHostKeyChecking=no -o PasswordAuthentication=no -o BatchMode=yes -o ConnectTimeout=5 ollama@192.168.0.188 \"echo ok\")",
"Bash(ssh -o StrictHostKeyChecking=no -o PasswordAuthentication=no -o BatchMode=yes -o ConnectTimeout=5 wooo@192.168.0.188 \"echo ok\")",
"Bash(ssh -o StrictHostKeyChecking=no -o PasswordAuthentication=no -o BatchMode=yes -o ConnectTimeout=5 root@192.168.0.188 \"echo ok\")",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8001/health)",
"Bash(ssh root@192.168.0.188 \"cat /tmp/openclaw.log 2>/dev/null | tail -100 || echo ''Log file not found''\")",
"Bash(ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=5 ollama@192.168.0.188 \"echo ok\")",
"Bash(ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=5 wooo@192.168.0.188 \"echo ok\")",
"Bash(scp /Users/ogt/awoooi/apps/api/src/services/signoz_client.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/services/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/services/openclaw.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/services/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/services/telegram_gateway.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/services/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/api/v1/webhooks.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/api/v1/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/models/ai.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/models/)",
"Bash(ssh ollama@192.168.0.188 \"cd /home/ollama/awoooi-api && pkill -f ''''uvicorn src.main:app'''' && sleep 2 && nohup .venv/bin/python3 -m uvicorn src.main:app --host 0.0.0.0 --port 8000 > nohup.out 2>&1 &\")",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8000/health)",
"Bash(curl -s --connect-timeout 10 http://192.168.0.188:8000/health)",
"Bash(curl -s -X POST http://192.168.0.188:8000/api/v1/webhooks/alerts -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s -X POST http://192.168.0.188:8000/api/v1/webhooks/alerts -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"high_cpu\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"api-gateway\"\",\"\"namespace\"\":\"\"awoooi-prod\"\",\"\"message\"\":\"\"CPU 92% test\"\"}')",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"high_cpu\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"api-gateway\"\",\"\"namespace\"\":\"\"awoooi-prod\"\",\"\"message\"\":\"\"CPU 92% - 統帥全自主驗收 v2\"\"}')",
"Bash(curl -s --connect-timeout 30 --max-time 120 -X POST http://192.168.0.188:8000/api/v1/webhooks/alerts -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s --connect-timeout 30 --max-time 180 -X POST http://192.168.0.188:8000/api/v1/webhooks/alerts -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"k8s_pod_crash\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"inventory-api\"\",\"\"namespace\"\":\"\"commerce\"\",\"\"message\"\":\"\"Pod crash - 統帥終極驗收\"\"}' --connect-timeout 30 --max-time 180)",
"Bash(ssh -o ConnectTimeout=10 ollama@192.168.0.188 \"echo OK && ps aux | grep uvicorn | grep -v grep | head -2\")",
"Bash(curl -s http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"ssl_expiry\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"nginx-ingress\"\",\"\"namespace\"\":\"\"ingress\"\",\"\"message\"\":\"\"SSL 即將過期 - 終極驗收\"\"}' --connect-timeout 30 --max-time 180)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"db_connection_timeout\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"postgres-primary\"\",\"\"namespace\"\":\"\"database\"\",\"\"message\"\":\"\"DB 連線逾時 - SignOz 整合終極測試\"\"}' --connect-timeout 30 --max-time 180)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"service_404\"\",\"\"severity\"\":\"\"critical\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"auth-service\"\",\"\"namespace\"\":\"\"identity\"\",\"\"message\"\":\"\"Service 404 - SignOz + Ollama 整合終極測試\"\"}' --connect-timeout 30 --max-time 180)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/webhooks/alerts -X POST -H \"Content-Type: application/json\" -d '{\"\"alert_type\"\":\"\"high_cpu\"\",\"\"severity\"\":\"\"warning\"\",\"\"source\"\":\"\"signoz\"\",\"\"target_resource\"\":\"\"recommendation-engine\"\",\"\"namespace\"\":\"\"ml\"\",\"\"message\"\":\"\"CPU 78% - Ollama 最終測試\"\"}' --connect-timeout 30 --max-time 200)",
"Bash(scp apps/api/src/services/openclaw.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/services/openclaw.py)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/core/http_client.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/core/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/main.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/core/config.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/core/)",
"Bash(scp /Users/ogt/awoooi/apps/api/src/api/v1/health.py ollama@192.168.0.188:/home/ollama/awoooi-api/src/api/v1/)",
"Bash(ssh -o ConnectTimeout=5 ollama@192.168.0.188 \"ps aux | grep uvicorn | grep -v grep\")",
"Bash(curl -s -H \"Origin: http://localhost:3000\" -H \"Access-Control-Request-Method: GET\" -X OPTIONS http://192.168.0.188:8000/api/v1/health -v)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/health)",
"Bash(curl -s -N --max-time 3 http://192.168.0.188:8000/api/v1/dashboard/stream)",
"Bash(curl -s http://localhost:3000/zh-TW -o /dev/null -w \"%{http_code}\")",
"Bash(open http://localhost:3000/zh-TW)",
"Bash(open http://localhost:3001/zh-TW)",
"Bash(curl -s -H \"Origin: http://localhost:3001\" http://192.168.0.188:8000/api/v1/dashboard/stream --max-time 3)",
"Bash(curl -s -I -H \"Origin: http://localhost:3001\" http://192.168.0.188:8000/api/v1/health)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/approvals/pending)",
"Bash(curl -s http://192.168.0.188:8000/api/v1/approvals)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/approvals?status=pending_approval\")",
"Bash(xargs sed:*)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/approvals/history?limit=5\")",
"Bash(curl -s http://192.168.0.188:8000/api/v1/approvals/approved)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/timeline?limit=10\")",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/action-logs\")",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/timeline/events?limit=10\")",
"Bash(ssh ogt@192.168.0.188 \"kubectl get nodes\")",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/approvals/k8s-test\")",
"Bash(scp /Users/ogt/awoooi/apps/api/k3s-prod.yaml ogt@192.168.0.188:~/awoooi-api/k3s-prod.yaml)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/timeline/events?limit=5\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.120 \"cat /etc/rancher/k3s/k3s.yaml\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no wooo@192.168.0.188 \"echo ''SSH OK'' && pwd\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"echo ''SSH OK'' && pwd && ls -la ~/awoooi-api/ 2>/dev/null || echo ''Directory not found''\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"sshpass -p ''0936223270'' scp -o StrictHostKeyChecking=no wooo@192.168.0.120:/etc/rancher/k3s/k3s.yaml ~/awoooi-api/k3s-prod.yaml && sed -i ''s/127.0.0.1/192.168.0.120/g'' ~/awoooi-api/k3s-prod.yaml && echo ''Kubeconfig deployed!'' && head -10 ~/awoooi-api/k3s-prod.yaml\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd ~/awoooi-api && pkill -f ''uvicorn'' 2>/dev/null; sleep 1; nohup .venv/bin/uvicorn src.main:app --host 0.0.0.0 --port 8000 --reload > nohup.out 2>&1 & sleep 3; echo ''=== API Restarted ==='' && tail -20 nohup.out\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 \"cd ~/awoooi-api && pkill -f ''uvicorn src.main'' || true\")",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/health\" --connect-timeout 5)",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 ollama@192.168.0.188 \"cd ~/awoooi-api && source .venv/bin/activate && nohup uvicorn src.main:app --host 0.0.0.0 --port 8000 > nohup.out 2>&1 &\")",
"Bash(sshpass -p:*)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/health\" --connect-timeout 10)",
"Bash(curl -s \"http://192.168.0.188:8000/api/v1/timeline/events?limit=8\")",
"Bash(curl -s http://localhost:3000/zh-TW -o /dev/null -w \"Frontend: HTTP %{http_code}\\\\n\")",
"Bash(sshpass -p '0936223270' ssh -o StrictHostKeyChecking=no ollama@192.168.0.188 'curl -s http://localhost:8000/api/v1/approvals/pending | jq -r \"\".approvals[] | \\\\\"\"ID: \\\\\\(.id\\) | Action: \\\\\\(.action\\)\\\\\"\"\"\"')",
"Bash(curl -s --connect-timeout 5 https://awoooi.wooo.tw/api/v1/health)",
"Bash(curl -s --connect-timeout 5 https://awoooi.wooo.tw/api/v1/approvals/pending)",
"Bash(ssh ollama@192.168.70.188 \"ps aux | grep uvicorn | grep -v grep | head -3\")",
"Bash(ssh -o ConnectTimeout=10 ollama@192.168.70.188 \"echo ''SSH Connected''\")",
"Bash(ping -c 2 -t 5 192.168.70.188)",
"Bash(curl -s --connect-timeout 10 https://awoooi.wooo.tw/api/v1/health)",
"Bash(ssh -o ConnectTimeout=10 ollama@192.168.0.188 \"echo ''SSH Connected to 188 Base''\")",
"Bash(grep -B 5 -A 30 \"async def add_signature\" /Users/ogt/awoooi/apps/api/src/services/*.py)",
"Bash(ssh ogt@192.168.0.188 \"cd /home/ogt/awoooi && docker compose ps\")",
"Bash(ls -la .env*)",
"Bash(.env:*)",
"Bash(timeout 15 python -m uvicorn src.main:app --host 0.0.0.0 --port 8001)",
"Bash(timeout 20 python -m uvicorn src.main:app --host 0.0.0.0 --port 8001)",
"Bash(timeout 25 python -m uvicorn src.main:app --host 0.0.0.0 --port 8001)",
"Bash(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no ogt@192.168.0.188 \"cd /home/ogt/wooo-aiops && docker compose ps clawbot 2>/dev/null || docker ps | grep -i claw\")",
"Bash(ls -la ~/.ssh/*.pub)",
"Bash(ssh -i ~/.ssh/id_rsa -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o PasswordAuthentication=no ogt@192.168.0.188 \"echo connected\")",
"Bash(curl -s \"https://api.telegram.org/bot8569720657:AAHdvKf_P2ms-QKFTyqTLtLiqEggz8cpjMk/logOut\")",
"Bash(curl -s \"https://api.telegram.org/bot8569720657:AAHdvKf_P2ms-QKFTyqTLtLiqEggz8cpjMk/close\")",
"Bash(curl -s \"https://api.telegram.org/bot8569720657:AAHdvKf_P2ms-QKFTyqTLtLiqEggz8cpjMk/getUpdates?timeout=3&limit=1\")",
"Bash(ping -c 1 192.168.0.188)",
"Bash(python -m tests.test_redis_multisig)",
"Bash(curl -v -X POST http://localhost:8000/api/v1/webhooks/signals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(python3 -c \":*)",
"Bash(echo ' 無法連線' __NEW_LINE_8fc87454f9798a7d__ echo echo [結論]: echo ' /signals 端點尚未部署到 .188' echo ' 程式碼已完成,需要執行:' echo \" cd apps/api && docker build -t awoooi-api . && docker-compose up -d\")",
"Bash(__NEW_LINE_dc88f37970737861__ cd:*)",
"Bash(__NEW_LINE_dc88f37970737861__ echo:*)",
"Read(//Users/**)",
"Bash(tail -20 __NEW_LINE_8b049957a9782734__ echo \"\" echo \"[Step 2] 等待容器啟動 \\(10 秒\\)...\" sleep 10 __NEW_LINE_8b049957a9782734__ echo \"\" echo \"[Step 3] 檢查容器狀態...\" docker compose ps)",
"Bash(tail -5 __NEW_LINE_275e0094e9dcb44a__ echo \"\" echo \"[1.2] 重建 API 容器 \\(含 Signal Worker\\)...\" docker compose build api)",
"Bash(1 __NEW_LINE_275e0094e9dcb44a__ echo \"\" echo \"[1.4] 等待服務就緒 \\(15 秒\\)...\" sleep 15 __NEW_LINE_275e0094e9dcb44a__ echo \"\" echo \"[1.5] 檢查容器狀態...\" docker compose ps)",
"Bash(__NEW_LINE_f4c8301ec5249760__ echo:*)",
"Bash(__NEW_LINE_21ba3cf3700d942d__ cd:*)",
"Bash(1 __NEW_LINE_9a14b79fc58c11ba__ echo \"\" echo \"[1.3] 等待服務就緒 \\(15 秒\\)...\" sleep 15 __NEW_LINE_9a14b79fc58c11ba__ echo \"\" echo \"[1.4] 檢查容器狀態...\" docker compose ps api)",
"Bash(1 __NEW_LINE_6b654ca5be87c137__ echo \"\" echo \"[2] 等待服務就緒 \\(15 秒\\)...\" sleep 15 __NEW_LINE_6b654ca5be87c137__ echo \"\" echo \"[3] 發送測試 Signal...\" curl -s -X POST http://localhost:8000/api/v1/webhooks/signals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(__NEW_LINE_564908ddf866c081__ echo:*)",
"Bash(chmod +x /Users/ogt/awoooi/apps/api/scripts/test_phase63_aggregation.py)",
"Bash(python scripts/test_phase63_aggregation.py)",
"Bash(xargs -r docker exec -i awoooi-redis redis-cli DEL)",
"Bash(chmod +x /Users/ogt/awoooi/apps/api/scripts/test_race_condition.py)",
"Bash(python scripts/test_race_condition.py)",
"Bash(chmod +x /Users/ogt/awoooi/apps/api/scripts/test_phase64_proposal.py)",
"Bash(python scripts/test_phase64_proposal.py)",
"Bash(python agent.py --alert FINAL_PHASE_6_TEST)",
"Bash(AWOOOI_REDIS_URL=\"redis://localhost:6379/0\" python agent.py --alert FINAL_PHASE_6_TEST)",
"Bash(curl -s http://localhost:8000/api/v1/incidents)",
"Bash(curl -s -X POST http://localhost:8000/api/v1/incidents/INC-20260322-06085B/proposal)",
"Bash(grep -r \"mock\\\\|Mock\\\\|MOCK\\\\|fake\\\\|Fake\\\\|dummy\\\\|hardcode\" /Users/ogt/awoooi/apps/web/src --include=*.tsx --include=*.ts -l)",
"Bash(NEXT_PUBLIC_API_URL=http://localhost:8000 pnpm next build --no-lint)",
"Bash(grep -v \"Traceback\\\\|File \"\"/usr\\\\|^\\\\s*$\")",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f''''Signal Count: {len\\(d[\"\"signals\"\"]\\)}''''\\); [print\\(f'''' - {s[\"\"alert_name\"\"]} \\({s[\"\"signal_id\"\"]}\\)''''\\) for s in d[''''signals'''']]\")",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:3003/zh-TW)",
"Bash(curl -s -X GET \"http://localhost:8000/api/v1/incidents\" -H \"Origin: http://localhost:3003\" -H \"Access-Control-Request-Method: GET\" -v)",
"Bash(grep -r TELEGRAM /Users/ogt/awoooi/apps/api/.env*)",
"Bash(grep -r TELEGRAM_BOT_TOKEN /Users/ogt/awoooi --include=*.env* --include=*.yaml --include=*.yml)",
"Bash(curl -s -I -X OPTIONS \"http://localhost:8000/api/v1/incidents\" -H \"Origin: http://localhost:3000\" -H \"Access-Control-Request-Method: GET\")",
"Bash(curl -s \"http://localhost:8000/api/v1/incidents\" -H \"Origin: http://localhost:3000\")",
"Bash(python /tmp/e2e_drill.py)",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); i=[x for x in d[''''incidents''''] if x[''''incident_id'''']==''''INC-20260322-06085B''''][0]; print\\(f\"\"Incident: {i[''''incident_id'''']}\"\"\\); print\\(f\"\"Signals: {i[''''signal_count'''']}\"\"\\); print\\(f\"\"Updated: {i[''''updated_at'''']}\"\"\\)\")",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/telegram/test\")",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/telegram/test-push\" -H \"Content-Type: application/json\" -d '{\"\"\"\"approval_id\"\"\"\": \"\"\"\"15ab6844-ca4e-4a13-aead-dc71cd342445\"\"\"\", \"\"\"\"risk_level\"\"\"\": \"\"\"\"critical\"\"\"\", \"\"\"\"resource_name\"\"\"\": \"\"\"\"api-gateway\"\"\"\", \"\"\"\"root_cause\"\"\"\": \"\"\"\"E2E DRILL - PodCrashLoopBackOff\"\"\"\", \"\"\"\"suggested_action\"\"\"\": \"\"\"\"RESTART_DEPLOYMENT\"\"\"\", \"\"\"\"estimated_downtime\"\"\"\": \"\"\"\"5-15 min\"\"\"\"}')",
"Bash(curl -s -o /dev/null -w \"HTTP Status: %{http_code}\\\\n\" http://localhost:3000/zh-TW)",
"Bash(curl -s -I \"http://localhost:8000/api/v1/incidents\" -H \"Origin: http://localhost:3000\")",
"Bash(curl -s -X POST http://localhost:8000/api/v1/incidents/INC-20260322-19DF60/proposal)",
"Bash(curl -s -X POST \"http://localhost:8000/api/v1/telegram/test-push\" -H \"Content-Type: application/json\" -d '{\"\"\"\"approval_id\"\"\"\": \"\"\"\"942e762e-fb97-480f-b21a-d3be67fa70b1\"\"\"\", \"\"\"\"risk_level\"\"\"\": \"\"\"\"critical\"\"\"\", \"\"\"\"resource_name\"\"\"\": \"\"\"\"core-system\"\"\"\", \"\"\"\"root_cause\"\"\"\": \"\"\"\"E2E DRILL TAKE 2 - 二次實彈演習\"\"\"\", \"\"\"\"suggested_action\"\"\"\": \"\"\"\"INVESTIGATE_SERVICE\"\"\"\", \"\"\"\"estimated_downtime\"\"\"\": \"\"\"\"5-15 min\"\"\"\"}')",
"Bash(curl -s \"http://localhost:8000/api/v1/incidents\" -H \"Origin: http://localhost:3000\" -H \"Accept: application/json\")",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f''''Incidents: {d[\"\"count\"\"]}''''\\); [print\\(f'''' - {i[\"\"incident_id\"\"]} | {i[\"\"severity\"\"]} | {i[\"\"signal_count\"\"]} signals | {i[\"\"affected_services\"\"]}''''\\) for i in d[''''incidents'''']]\")",
"Bash(curl -s \"http://localhost:8000/api/v1/approvals/pending\" -H \"Origin: http://localhost:3000\")",
"Bash(python -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\(f''''Pending: {d[\"\"count\"\"]} approvals''''\\); [print\\(f'''' - {a[\"\"id\"\"][:8]}... | {a[\"\"risk_level\"\"]} | {a[\"\"action\"\"][:30]}...''''\\) for a in d[''''approvals''''][:3]]\")",
"Bash(mkdir -p /Users/ogt/awoooi/apps/web/public/fonts)",
"Bash(curl -sL -o DSEG7Classic-Bold.woff2 \"https://cdn.jsdelivr.net/npm/dseg@0.46.0/fonts/DSEG7-Classic/DSEG7Classic-Bold.woff2\")",
"Bash(curl -sL -o DSEG7Classic-Bold.woff \"https://cdn.jsdelivr.net/npm/dseg@0.46.0/fonts/DSEG7-Classic/DSEG7Classic-Bold.woff\")",
"Bash(curl -sL -o DSEG7Classic-Regular.woff2 \"https://cdn.jsdelivr.net/npm/dseg@0.46.0/fonts/DSEG7-Classic/DSEG7Classic-Regular.woff2\")",
"Bash(curl -sL -o DSEG7Classic-Regular.woff \"https://cdn.jsdelivr.net/npm/dseg@0.46.0/fonts/DSEG7-Classic/DSEG7Classic-Regular.woff\")",
"Bash(pnpm next:*)",
"Bash(chmod +x /Users/ogt/awoooi/scripts/bootstrap_prod.sh)",
"Bash(/Users/ogt/awoooi/.env:*)",
"Bash(grep -E \"^\\\\.env$|03-secrets\\\\.yaml\" .gitignore)",
"Bash(echo 'Adding to .gitignore...' if ! grep -q ^.env$ .gitignore)",
"Bash(then echo:*)",
"Bash(git add:*)",
"Bash(git commit:*)",
"Bash(git push:*)",
"Bash(git remote:*)",
"Bash(gh repo:*)",
"Bash(gh api:*)",
"Bash(gh run:*)",
"Bash(ls -la pnpm-*.yaml package.json turbo.json)",
"Bash(git status:*)",
"Bash(gh workflow:*)",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-77545758fc-xnncc -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-77545758fc-xnncc -n awoooi-prod 2>&1 | grep -i ''cors'' -A 5 -B 5\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-79948cbbbf-b8cgj -n awoooi-prod --tail=100\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod -l app=awoooi-api --sort-by=.metadata.creationTimestamp -o name | tail -1 | xargs kubectl logs -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath=''{.data.OPENCLAW_TG_USER_WHITELIST}'' | base64 -d\")",
"Bash(ssh wooo@192.168.0.120 'kubectl patch secret awoooi-secrets -n awoooi-prod --type='\"''\"'json'\"''\"' -p='\"''\"'[:*)",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout restart deployment/awoooi-api -n awoooi-prod && kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout restart deployment/awoooi-worker -n awoooi-prod && kubectl rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-747967b787-fcx2r -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.110 \"ps aux | grep -E ''actions-runner|Runner'' | grep -v grep\")",
"Bash(curl -sf http://192.168.0.120:32334/api/v1/health)",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-fd795cd87-rdpgn -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf http://192.168.0.120:32334/api/v1/health | jq .status\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf http://192.168.0.120:32334/api/v1/health\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf http://localhost:32334/api/v1/health\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get svc -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf http://10.43.125.201:8000/api/v1/health\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf http://10.43.105.105:3000/ -o /dev/null && echo ''Web OK''\")",
"Bash(ssh ogt@192.168.0.188 \"ls -la /etc/nginx/sites-available/\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-795c95ff76-wch2p -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod && ss -tlnp | grep 32334\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf http://127.0.0.1:32334/api/v1/health | head -c 200\")",
"Bash(ssh wooo@192.168.0.120 \"sudo ufw status 2>/dev/null || sudo iptables -L INPUT -n | head -20\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf --connect-timeout 5 http://192.168.0.120:32334/api/v1/health | head -c 100\")",
"Bash(ssh wooo@192.168.0.110 \"curl -v --connect-timeout 5 http://192.168.0.120:32334/api/v1/health 2>&1 | head -30\")",
"Bash(ssh wooo@192.168.0.120 \"cat /etc/systemd/system/k3s.service 2>/dev/null | grep -i exec || ps aux | grep k3s | head -3\")",
"Bash(ssh wooo@192.168.0.120 \"cat /etc/systemd/system/k3s.service\")",
"Bash(ssh wooo@192.168.0.120 \"netstat -tlnp 2>/dev/null | grep 32334 || ss -tlnp | grep 32334\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf --connect-timeout 5 http://192.168.0.120:31234/health 2>&1 | head -c 100\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy allow-nginx-ingress -n awoooi-prod -o yaml\")",
"Bash(curl -sk https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -sk -I -X OPTIONS https://awoooi.wooo.work/api/v1/health -H \"Origin: https://awoooi.wooo.work\" -H \"Access-Control-Request-Method: GET\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sI --connect-timeout 3 http://127.0.0.1:32334/api/v1/health 2>&1 | head -5\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sI --connect-timeout 3 http://127.0.0.1:32335/ 2>&1 | head -5\")",
"Bash(ssh wooo@192.168.0.121 \"curl -sI --connect-timeout 3 http://127.0.0.1:32334/api/v1/health 2>&1 | head -5\")",
"Bash(ssh wooo@192.168.0.121 \"curl -sI --connect-timeout 3 http://127.0.0.1:32335/ 2>&1 | head -5\")",
"Bash(ssh wooo@192.168.0.120 \"sudo iptables -t nat -L KUBE-NODEPORTS -n 2>/dev/null | head -20\")",
"Bash(ssh wooo@192.168.0.120 \"sudo netstat -tlnp | grep -E ''32334|32335''\")",
"Bash(ssh wooo@192.168.0.120 \"ss -tlnp 2>/dev/null | grep -E ''32334|32335'' || netstat -tln | grep -E ''32334|32335''\")",
"Bash(ssh wooo@192.168.0.120 \"ss -tln | grep -E ''32334|32335|:323''\")",
"Bash(ssh wooo@192.168.0.120 \"ss -tln\")",
"Bash(ssh wooo@192.168.0.120 \"export KUBECONFIG=/home/wooo/.kube/config-120; /home/wooo/bin/kubectl get svc -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"which kubectl || find /usr -name kubectl 2>/dev/null | head -1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get svc -n awoooi-prod && kubectl get pods -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"export KUBECONFIG=/home/wooo/.kube/config-120 && kubectl logs awoooi-api-546b88465d-lb8zm -n awoooi-prod --tail 80\")",
"Bash(ssh wooo@192.168.0.120 \"KUBECONFIG=/home/wooo/.kube/config-120 kubectl logs awoooi-api-546b88465d-lb8zm -n awoooi-prod --tail 80 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"ls -la /home/wooo/.kube/ && cat /home/wooo/.kube/config-120 2>/dev/null | head -20 || cat /etc/rancher/k3s/k3s.yaml 2>/dev/null | head -20\")",
"Bash(ssh wooo@192.168.0.120 \"sudo cat /etc/rancher/k3s/k3s.yaml | head -20\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && kubectl logs awoooi-api-546b88465d-lb8zm -n awoooi-prod --tail 100 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"which kubectl 2>/dev/null || find /home/wooo -name kubectl 2>/dev/null | head -1 || ls -la /home/wooo/bin/\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs awoooi-api-546b88465d-lb8zm -n awoooi-prod --tail 100 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl describe pod awoooi-api-546b88465d-lb8zm -n awoooi-prod | tail -40\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get svc -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl exec -n awoooi-prod deploy/awoooi-api -- curl -sf http://localhost:8000/api/v1/health 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl exec -n awoooi-prod deploy/awoooi-api -- wget -qO- http://localhost:8000/api/v1/health 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 20 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf http://192.168.0.120:32334/api/v1/health 2>&1 || echo ''FAILED to connect to 120:32334''\")",
"Bash(ssh wooo@192.168.0.110 \"curl -sf http://192.168.0.121:32334/api/v1/health 2>&1 || echo ''FAILED to connect to 121:32334''\")",
"Bash(ssh wooo@192.168.0.110 \"ssh wooo@192.168.0.120 ''cat /etc/rancher/k3s/k3s.yaml 2>/dev/null || echo No k3s.yaml''\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get pods -n awoooi-prod -o wide | grep Running\")",
"Bash(ssh -o ConnectTimeout=5 wooo@192.168.0.120 \"ufw status 2>/dev/null || firewall-cmd --state 2>/dev/null || echo ''No firewall command found''\")",
"Bash(ssh -o ConnectTimeout=5 wooo@192.168.0.121 \"ufw status 2>/dev/null || firewall-cmd --state 2>/dev/null || echo ''No firewall command found''\")",
"Bash(pip3 show:*)",
"Bash(docker build:*)",
"Bash(docker version:*)",
"Bash(docker run:*)",
"Bash(curl -vI -H \"Origin: https://awoooi.wooo.work\" http://localhost:8889/api/v1/health)",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get endpoints awoooi-api-svc -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get pods -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"sudo -n ufw status 2>/dev/null || sudo -n iptables -L INPUT -n 2>/dev/null | head -20 || echo ''Need sudo for firewall check''\")",
"Bash(ssh wooo@192.168.0.120 \"ss -tln | grep -E ''32334|32335|:323'' || echo ''No NodePort listeners found''\")",
"Bash(ssh wooo@192.168.0.121 \"ss -tln | grep -E ''32334|32335|:323'' || echo ''No NodePort listeners found''\")",
"Bash(ssh wooo@192.168.0.120 \"ps aux | grep -E ''kube-proxy|k3s'' | grep -v grep | head -5\")",
"Bash(ssh wooo@192.168.0.120 \"cat /proc/sys/net/ipv4/ip_forward\")",
"Bash(ssh wooo@192.168.0.120 \"systemctl status k3s 2>/dev/null | head -15 || ps aux | grep ''k3s server'' | grep -v grep\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf --connect-timeout 5 http://127.0.0.1:32334/api/v1/health 2>&1 || echo ''LOCALHOST NodePort FAILED''\")",
"Bash(ssh wooo@192.168.0.120 \"curl -sf --connect-timeout 5 http://192.168.0.120:32334/api/v1/health 2>&1 || echo ''EXTERNAL IP NodePort FAILED''\")",
"Bash(ssh wooo@192.168.0.120 \"cat /etc/iptables/rules.v4 2>/dev/null || iptables-save 2>/dev/null | grep -E ''DROP|REJECT|32334|32335'' | head -10 || echo ''Cannot read iptables without sudo''\")",
"Bash(ssh wooo@192.168.0.121 \"curl -sf --connect-timeout 5 http://192.168.0.120:32334/api/v1/health 2>&1 || echo ''Worker->Master NodePort FAILED''\")",
"Bash(ssh wooo@192.168.0.120 \"cat /etc/rancher/k3s/config.yaml 2>/dev/null || ls -la /etc/rancher/k3s/ 2>/dev/null || echo ''No K3s config found''\")",
"Bash(ssh wooo@192.168.0.120 \"netstat -an 2>/dev/null | grep 32334 || ss -an | grep 32334 || echo ''No socket found for 32334''\")",
"Bash(ssh wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S iptables -L INPUT -n 2>&1 | head -20\")",
"Bash(ssh wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S iptables -t nat -L KUBE-NODEPORTS -n 2>&1 | head -20\")",
"Bash(ssh wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S iptables -L KUBE-ROUTER-INPUT -n 2>&1 | head -30\")",
"Bash(ssh wooo@192.168.0.120 \"echo ''0936223270'' | sudo -S iptables -t nat -L KUBE-NODEPORTS -n 2>&1 | grep -i awoooi || echo ''NO AWOOOI RULES FOUND''\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get svc awoooi-api-svc -n awoooi-prod -o yaml | grep -A5 ''spec:''\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get networkpolicy -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl apply -f - 2>&1\")",
"Bash(curl -sf --connect-timeout 10 https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -skf --connect-timeout 10 https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -sI https://awoooi.wooo.work/)",
"Bash(curl -skI https://awoooi.wooo.work/)",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 50 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl rollout restart deployment/awoooi-api -n awoooi-prod && /home/wooo/kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s\")",
"Bash(curl -sf https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -skf https://awoooi.wooo.work/api/v1/health)",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 40 2>&1\")",
"Bash(for i:*)",
"Bash(do curl:*)",
"Bash(echo \"Request $i sent\")",
"Bash(done)",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 100 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 30 2>&1\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get configmap awoooi-config -n awoooi-prod -o yaml | grep OTEL\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl exec deployment/awoooi-api -n awoooi-prod -- env | grep OTEL\")",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl exec deployment/awoooi-api -n awoooi-prod -- python -c \"\"import socket; s=socket.socket\\(\\); s.settimeout\\(5\\); s.connect\\(\\(''192.168.0.188'', 24317\\)\\); print\\(''✅ Connection to 24317 OK''\\); s.close\\(\\)\"\" 2>&1\")",
"Bash(curl -vI https://awoooi.wooo.work)",
"Bash(curl -vI https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -sf -X POST https://awoooi.wooo.work/api/v1/webhooks/signals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(curl -s -X POST https://awoooi.wooo.work/api/v1/webhooks/signals -H \"Content-Type: application/json\" -d '{\"\"source\"\": \"\"prometheus\"\", \"\"severity\"\": \"\"P1\"\", \"\"message\"\": \"\"Test alert from CLI\"\"}')",
"Bash(curl -s -X POST https://awoooi.wooo.work/api/v1/webhooks/signals -H \"Content-Type: application/json\" -d '{:*)",
"Bash(ssh wooo@192.168.0.110 \"export KUBECONFIG=/home/wooo/.kube/config-120 && /home/wooo/kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath=''''{.data.WEBHOOK_HMAC_SECRET}'''' 2>/dev/null\")",
"Bash(timeout 15 curl -N -s https://awoooi.wooo.work/api/v1/dashboard/stream)",
"Bash(bash:*)",
"Bash(curl -s https://awoooi.wooo.work/api/v1/metrics/gold)",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT DISTINCT metric_name FROM signoz_metrics.distributed_samples_v4 WHERE unix_milli > \\(toUnixTimestamp\\(now\\(\\)\\) - 1800\\) * 1000 LIMIT 20 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT count\\(\\) as trace_count FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 30 MINUTE FORMAT TabSeparated\")",
"Bash(ssh wooo@192.168.0.120 \"KUBECONFIG=/home/wooo/.kube/config-120 /home/wooo/bin/kubectl get configmap awoooi-config -n awoooi-prod -o jsonpath=''{.data}'' | python3 -m json.tool 2>/dev/null | head -30\")",
"Bash(ssh wooo@192.168.0.120 \"KUBECONFIG=/home/wooo/.kube/config-120 /home/wooo/bin/kubectl logs deployment/awoooi-api -n awoooi-prod --tail 50 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"which kubectl || ls -la ~/bin/kubectl 2>/dev/null || ls -la /usr/local/bin/kubectl 2>/dev/null || echo ''kubectl not found''\")",
"Bash(ssh wooo@192.168.0.120 \"export KUBECONFIG=/home/wooo/.kube/config-120 && kubectl get configmap awoooi-config -n awoooi-prod -o jsonpath=''{.data}'' 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"ls -la ~/.kube/ 2>/dev/null; cat ~/.kube/config 2>/dev/null | head -20 || echo ''checking k3s default...''; sudo cat /etc/rancher/k3s/k3s.yaml 2>/dev/null | head -5 || echo ''no k3s config''\")",
"Bash(ssh wooo@192.168.0.120 \"sudo k3s kubectl get configmap awoooi-config -n awoooi-prod -o yaml 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"sudo k3s kubectl logs deployment/awoooi-api -n awoooi-prod --tail 100 2>&1\")",
"Bash(nc -zv 192.168.0.188 24317)",
"Bash(curl -s http://192.168.0.188:24318/v1/traces -X POST -H \"Content-Type: application/json\" -d '{}')",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT DISTINCT serviceName, count\\(\\) as cnt FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 24 HOUR GROUP BY serviceName ORDER BY cnt DESC LIMIT 20 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"DESCRIBE TABLE signoz_traces.distributed_signoz_index_v2 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, count\\(\\) as cnt FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 5 MINUTE GROUP BY serviceName ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, count\\(\\) as cnt FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 10 MINUTE GROUP BY serviceName ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT service_name, count\\(\\) as cnt FROM signoz_logs.distributed_logs WHERE timestamp > now\\(\\) - INTERVAL 30 MINUTE GROUP BY service_name ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SHOW TABLES FROM signoz_logs FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT count\\(\\) as total FROM signoz_logs.distributed_logs_v2 WHERE timestamp > now\\(\\) - INTERVAL 30 MINUTE FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT JSONExtractString\\(resources_string, ''service.name''\\) as svc, count\\(\\) as cnt FROM signoz_logs.distributed_logs_v2 WHERE timestamp > now\\(\\) - INTERVAL 5 MINUTE GROUP BY svc ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"DESCRIBE TABLE signoz_logs.distributed_logs_v2 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT resources_string[''service.name''] as svc, count\\(\\) as cnt FROM signoz_logs.distributed_logs_v2 WHERE timestamp > \\(toUnixTimestamp64Nano\\(now64\\(\\)\\) - 300000000000\\) GROUP BY svc ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT body, resources_string FROM signoz_logs.distributed_logs_v2 WHERE timestamp > \\(toUnixTimestamp64Nano\\(now64\\(\\)\\) - 60000000000\\) LIMIT 1 FORMAT JSONEachRow\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, count\\(\\) as cnt FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 2 MINUTE GROUP BY serviceName ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, name, timestamp FROM signoz_traces.distributed_signoz_index_v2 WHERE timestamp > now\\(\\) - INTERVAL 5 MINUTE ORDER BY timestamp DESC LIMIT 5 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, name, formatDateTime\\(timestamp, ''%Y-%m-%d %H:%M:%S''\\) as ts FROM signoz_traces.distributed_signoz_index_v2 ORDER BY timestamp DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT count\\(\\) FROM signoz_traces.distributed_signoz_index_v2 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT count\\(\\) FROM signoz_traces.distributed_signoz_spans FORMAT TabSeparated\")",
"Bash(ssh wooo@192.168.0.188 \"docker ps | grep -E ''otel|signoz''\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT metric_name, sum\\(value\\) as total FROM signoz_metrics.distributed_samples_v4 WHERE metric_name LIKE ''otelcol%span%'' AND unix_milli > \\(toUnixTimestamp\\(now\\(\\)\\) - 300\\) * 1000 GROUP BY metric_name FORMAT TabSeparated\")",
"Bash(for t:*)",
"Bash(do)",
"Bash(echo -n \"$t: \")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT count\\(\\) FROM signoz_traces.$t FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"SELECT serviceName, count\\(\\) as cnt FROM signoz_traces.distributed_signoz_index_v3 WHERE timestamp > now\\(\\) - INTERVAL 10 MINUTE GROUP BY serviceName ORDER BY cnt DESC LIMIT 10 FORMAT TabSeparated\")",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \":*)",
"Bash(curl -s 'http://192.168.0.188:8123/' --data \"DESCRIBE TABLE signoz_traces.distributed_signoz_index_v3 FORMAT TabSeparated\")",
"Bash(AWOOOI_API_URL=https://awoooi.wooo.work WEBHOOK_HMAC_SECRET=\"CHANGE_ME_TO_RANDOM_64_CHARS\" python scripts/fire_live_alert.py oomkilled)",
"Bash(timeout 10 curl -sN https://awoooi.wooo.work/api/v1/dashboard/stream)",
"Bash(curl -s https://awoooi.wooo.work/api/v1/dashboard)",
"Bash(npm list:*)",
"Bash(node scripts/verify-frontend.js)",
"Bash(node /Users/ogt/awoooi/scripts/verify-frontend.js)",
"Bash(python -c \"from src.services.proposal_service import ProposalService; print\\(''''✅ ProposalService OK''''\\)\")",
"Bash(python -c \"from src.services.openclaw import OpenClawService; print\\(''''✅ OpenClawService OK''''\\)\")",
"Bash(curl -s http://192.168.0.120:32334/api/v1/incidents)",
"Bash(jq -r \".incidents[:2] | .[] | \"\"\\\\\\(.incident_id\\) - \\\\\\(.status\\) - \\\\\\(.severity\\)\"\"\")",
"Bash(curl -s -X POST \"http://192.168.0.120:32334/api/v1/incidents/INC-20260322-4B3152/propose\" -H \"Content-Type: application/json\")",
"Bash(kubectl logs:*)",
"Bash(ssh ogt@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail 30\")",
"Bash(curl -sv -X POST \"http://192.168.0.120:32334/api/v1/incidents/INC-20260322-4B3152/propose\" -H \"Content-Type: application/json\")",
"Bash(curl -s http://192.168.0.120:32334/api/v1/health)",
"Bash(curl -s \"http://192.168.0.120:32334/api/v1/incidents/INC-20260322-4B3152\")",
"Bash(curl -sv \"http://192.168.0.120:32334/api/v1/incidents\")",
"Bash(curl -s --retry 3 --retry-delay 2 \"http://192.168.0.120:32334/api/v1/health\")",
"Bash(curl -s --retry 3 --retry-delay 2 http://192.168.0.120:32334/api/v1/health)",
"Bash(do echo:*)",
"Bash(curl -s -X POST \"https://awoooi.wooo.work/api/v1/incidents/INC-20260322-4B3152/propose\" -H \"Content-Type: application/json\")",
"Bash(curl -s -X POST \"https://awoooi.wooo.work/api/v1/incidents/INC-20260322-4B3152/proposal\" -H \"Content-Type: application/json\")",
"Bash(curl -s -X POST \"https://awoooi.wooo.work/api/v1/incidents/INC-20260322-D6C6A0/proposal\" -H \"Content-Type: application/json\")",
"Bash(curl -s http://192.168.0.120:32334/api/v1/approvals/pending)",
"Bash(kubectl get:*)",
"Bash(curl -s -w \"\\\\nHTTP_CODE: %{http_code}\\\\n\" http://192.168.0.120:32334/api/v1/health)",
"Bash(curl -s http://awoooi.wooo.work/api/v1/health)",
"Bash(curl -s http://awoooi.wooo.work/api/v1/approvals/pending)",
"Bash(curl -sL https://awoooi.wooo.work/api/v1/approvals/pending -k)",
"Bash(ssh root@192.168.0.120 \"kubectl get pods -n awoooi-prod -o wide\")",
"Bash(ssh root@192.168.0.120 \"kubectl logs -n awoooi-prod -l app=awoooi-api --tail=30\")",
"Bash(curl -sL https://awoooi.wooo.work/api/v1/timeline -k)",
"Bash(curl -sL https://awoooi.wooo.work/api/v1/incidents -k)",
"Bash(curl -sL \"https://awoooi.wooo.work/api/v1/approvals?include_history=true\" -k)",
"Bash(curl -sL \"https://awoooi.wooo.work/api/v1/incidents/INC-20260322-4B3152\" -k)",
"Bash(curl -sL \"https://awoooi.wooo.work/api/v1/audit-logs?limit=10\" -k)",
"Bash(curl -sL https://awoooi.wooo.work/api/v1/audit-logs?limit=10 -k)",
"Bash(ssh ogt@192.168.0.120 \"kubectl logs -n awoooi-prod -l app=awoooi-api --tail=100\")",
"Bash(ssh ogt@192.168.0.120 \"kubectl logs -n awoooi-prod -l app=awoooi-web --tail=50\")",
"Bash(ssh ogt@192.168.0.188 \"kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml logs -n awoooi-prod -l app=awoooi-api --tail=100 2>/dev/null || docker logs awoooi-api --tail=100 2>/dev/null\")",
"Bash(curl -sL \"https://awoooi.wooo.work/api/v1/approvals/pending\" -k -w \"\\\\n\\\\nHTTP: %{http_code}\\\\nTime: %{time_total}s\\\\n\")",
"Bash(curl -sL -X POST https://awoooi.wooo.work/api/v1/approvals/182e07c1-118a-49d7-b71c-7d33c5484d9b/sign -H 'Content-Type: application/json' -d '{\"\"\"\"signer_id\"\"\"\": \"\"\"\"test-debug\"\"\"\", \"\"\"\"signer_name\"\"\"\": \"\"\"\"Debug Test\"\"\"\", \"\"\"\"comment\"\"\"\": \"\"\"\"Testing\"\"\"\"}' -k)",
"Bash(curl -s https://wwooo.aiops.tw/api/v1/health)",
"Bash(curl -s https://wwooo.aiops.tw/api/v1/incidents?limit=5)",
"Bash(curl -s https://wwooo.aiops.tw/api/v1/approvals/pending)",
"Bash(curl -v -s \"https://wwooo.aiops.tw/api/v1/health\")",
"Bash(curl -s \"https://wwooo.aiops.tw/\")",
"Bash(curl -s --connect-timeout 5 \"http://192.168.0.120:32334/api/v1/health\")",
"Bash(curl -s --connect-timeout 5 \"http://192.168.0.120:32334/api/v1/incidents?limit=5\")",
"Bash(ssh -o ConnectTimeout=5 wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-867f67f55d-kvdl2 -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod | grep -E ''NAME|worker''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod | grep worker\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-5bdc5699bb-kcv9q -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod --show-labels | grep worker\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy allow-required-egress -n awoooi-prod -o yaml\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl patch networkpolicy allow-required-egress -n awoooi-prod --type=''json'' -p=''[{\"\"op\"\": \"\"replace\"\", \"\"path\"\": \"\"/spec/podSelector/matchLabels\"\", \"\"value\"\": {\"\"system\"\": \"\"awoooi\"\"}}]''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout restart deployment/awoooi-worker -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-5bdc5699bb-kcv9q -n awoooi-prod --tail=15\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=40\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod 2>&1 | grep -E ''signal_worker|redis_pool|INFO'' | tail -10\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s http://localhost:32334/api/v1/health\")",
"Bash(ssh wooo@192.168.0.120 'curl -s -X POST \"\"http://localhost:32334/api/v1/webhooks/signals\"\" -H \"\"Content-Type: application/json\"\" -d \"\"{:*)",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod | grep -E ''NAME|worker|api''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod && echo ''==='' && kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s http://localhost:32334/api/v1/incidents?limit=5\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s http://localhost:32334/api/v1/approvals/pending\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod 2>&1 | head -50\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s http://localhost:32334/api/v1/health | jq ''.components''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get secret -n awoooi-prod -o name\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath=''{.data.WEBHOOK_HMAC_SECRET}'' | base64 -d\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=20 2>&1 | grep -E ''signal|incident|telegram|INFO''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s ''http://localhost:32334/api/v1/incidents?limit=5''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod 2>&1 | grep -iE ''telegram|notification|send'' | tail -10\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s ''http://localhost:32334/api/v1/approvals/pending''\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s ''http://localhost:32334/api/v1/incidents?limit=2'' && echo ''---'' && curl -s ''http://localhost:32334/api/v1/approvals/pending''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod | grep worker && echo ''---'' && kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-6b8cc94d9c-xjdwr -n awoooi-prod --tail=40\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy allow-required-egress -n awoooi-prod -o jsonpath=''{.spec.podSelector}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl patch networkpolicy allow-required-egress -n awoooi-prod --type=''json'' -p=''[{\"\"op\"\": \"\"replace\"\", \"\"path\"\": \"\"/spec/podSelector\"\", \"\"value\"\": {\"\"matchLabels\"\": {\"\"system\"\": \"\"awoooi\"\"}}}]''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl delete pod awoooi-worker-6b8cc94d9c-xjdwr -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-6b8cc94d9c-pmzj7 -n awoooi-prod --tail=30\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-6b8cc94d9c-pmzj7 -n awoooi-prod --tail=20\")",
"Bash(ls -la /Users/ogt/awoooi/apps/api/scripts/fire*.py)",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s ''http://localhost:32334/api/v1/incidents?limit=3''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod 2>&1 | grep -iE ''proposal|approval|llm|ai|ollama|generate'' | tail -20\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deployment awoooi-worker -n awoooi-prod -o jsonpath=''{.spec.template.spec.containers[0].envFrom}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deployment awoooi-api -n awoooi-prod -o jsonpath=''{.spec.template.spec.containers[0].envFrom}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get configmap awoooi-config -n awoooi-prod -o jsonpath=''''{.data}''''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath=''{.data}'' | tr '','' ''\\\\n''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl exec deployment/awoooi-api -n awoooi-prod -- python -c ''import os; print\\(os.getenv\\(\"\"DATABASE_URL\"\", \"\"NOT SET\"\"\\)[:50]\\)''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-75ffbfb88b-2htfh -n awoooi-prod --tail=50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl exec awoooi-api-6687db5564-rv755 -n awoooi-prod -- env | grep DATABASE\")",
"Bash(ssh wooo@192.168.0.120 \"PGPASSWORD=''CHANGE_ME'' psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c ''SELECT 1'' 2>&1 || echo ''Connection failed''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod\")",
"Bash(curl -sv http://192.168.0.120:32334/api/v1/health)",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-75ffbfb88b-2htfh -n awoooi-prod --tail=20 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-7fb7d5b55f-n48gk -n awoooi-prod --tail=20 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get rs -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl scale rs awoooi-api-75ffbfb88b -n awoooi-prod --replicas=0\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl scale rs awoooi-worker-7fb7d5b55f -n awoooi-prod --replicas=0\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=10\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deploy -n awoooi-prod -o wide\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deploy awoooi-api -n awoooi-prod -o jsonpath=''{.spec.replicas}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deploy awoooi-worker -n awoooi-prod -o jsonpath=''{.spec.replicas}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=5s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout history deployment/awoooi-api -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout undo deployment/awoooi-api -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout undo deployment/awoooi-worker -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=30s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get rs awoooi-api-6687db5564 -n awoooi-prod -o jsonpath=''{.metadata.annotations.deployment\\\\.kubernetes\\\\.io/revision}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl delete pod awoooi-api-7f487f7cbb-5f88g -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout undo deployment/awoooi-api -n awoooi-prod --to-revision=46\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --tail=15\")",
"Bash(curl -s http://192.168.0.120:32334/api/v1/incidents?limit=3)",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --since=2m\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --since=2m | grep -i webhook\")",
"Bash(curl -sv -X POST http://192.168.0.120:32334/api/v1/webhooks/alertmanager -H \"Content-Type: application/json\" -d '{:*)",
"Bash(ssh wooo@192.168.0.120 \"kubectl get endpoints -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"curl -s http://localhost:32334/api/v1/health | jq ''{status}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-worker -n awoooi-prod --since=30s\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-fc4744758-7wfv5 -n awoooi-prod --tail=30 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-worker-6fc548887b-b9mtf -n awoooi-prod --tail=30 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get configmap awoooi-config -n awoooi-prod -o yaml\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath=''''{.data}''''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pod awoooi-worker-6fc548887b-b9mtf -n awoooi-prod -o jsonpath=''{.metadata.labels}''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get networkpolicy -n awoooi-prod -o yaml\")",
"Bash(ssh wooo@192.168.0.120 'kubectl patch networkpolicy allow-required-egress -n awoooi-prod --type=json -p=\"\"[{\\\\\"\"op\\\\\"\": \\\\\"\"replace\\\\\"\", \\\\\"\"path\\\\\"\": \\\\\"\"/spec/podSelector/matchLabels\\\\\"\", \\\\\"\"value\\\\\"\": {\\\\\"\"system\\\\\"\": \\\\\"\"awoooi\\\\\"\"}}]\"\"')",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout restart deployment/awoooi-api deployment/awoooi-worker -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs awoooi-api-6c69b77894-d6jqq -n awoooi-prod --tail=20\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl run nc-test --rm -it --restart=Never --image=busybox -- nc -zv 192.168.0.188 5432\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get pods -n awoooi-prod -o=custom-columns=''NAME:.metadata.name,IMAGE:.spec.containers[0].image''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl exec awoooi-api-6687db5564-rv755 -n awoooi-prod -- ls -la *.db 2>/dev/null || echo ''No SQLite files''\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl exec awoooi-api-6687db5564-rv755 -n awoooi-prod -- env | grep -E ''MOCK|DATABASE|SQLITE''\")",
"Bash(curl -s \"http://192.168.0.120:32334/api/v1/approvals\")",
"Bash(python -m py_compile src/lewooogo_brain/engines/incident_engine.py src/lewooogo_brain/engines/proposal_engine.py src/lewooogo_brain/skills/loader.py)",
"Bash(python packages/lewooogo-brain/tests/test_skill_loader.py)",
"Bash(python packages/lewooogo-brain/tests/test_incident_engine.py)",
"Bash(python packages/lewooogo-brain/tests/test_guardrails.py)",
"Bash(python -m py_compile src/lewooogo_brain/engines/proposal_engine.py src/lewooogo_brain/engines/incident_engine.py src/lewooogo_brain/skills/loader.py)",
"Bash(PYTHONPATH=/Users/ogt/awoooi/packages/lewooogo-brain/src python -c \":*)",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8000/api/v1/health)",
"Bash(curl -s \"https://awoooi.wooo.work/api/v1/approvals/pending\")",
"Bash(curl -s \"https://awoooi.wooo.work/api/v1/approvals?status=pending\")",
"Bash(curl -s \"https://awoooi.wooo.work/api/v1/incidents\")",
"Bash(uv sync:*)",
"Bash(python -c \"from src.routers.proposals import router; print\\(''✅ Router 語法驗證通過''\\)\")",
"Bash(curl -s -X GET \"https://awoooi.wooo.work/api/v1/health\" --connect-timeout 10)",
"Bash(curl -s -X GET \"https://awoooi.wooo.work/api/v1/incidents\" --connect-timeout 10)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" \"https://awoooi.wooo.work\" --connect-timeout 10)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" -L \"https://awoooi.wooo.work\" --connect-timeout 10)",
"Bash(curl -s -X POST \"https://awoooi.wooo.work/api/v1/incidents/test-123/propose\" -H \"Content-Type: application/json\" -d '{\"\"require_dry_run\"\": true}' --connect-timeout 10)",
"Bash(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no ollama@192.168.0.120 \"kubectl get pods -n awoooi-prod -o wide\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs awoooi-api-64c8659cff-grslz -n awoooi-prod --tail=50)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.DATABASE_URL}')",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl rollout restart deployment/awoooi-api -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n awoooi-prod -l app=awoooi-api)",
"Bash(curl -s \"https://awoooi.wooo.work/api/v1/health\" --connect-timeout 10)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" -L \"https://awoooi.wooo.work/zh-TW\" --connect-timeout 10)",
"Bash(python -c \"from src.routers.proposals import router; print\\(''✅ Router import successful''\\)\")",
"Bash(PGPASSWORD=postgres psql -h 192.168.0.188 -U awoooi -d awoooi_dev -c \"SELECT incident_id, status, severity FROM incidents LIMIT 5;\")",
"Bash(PGPASSWORD=AwoooiProd2026 psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c \"SELECT incident_id, status, severity FROM incidents LIMIT 5;\")",
"Bash(curl -sf http://192.168.0.120:32334/api/v1/incidents)",
"Bash(curl -v \"http://192.168.0.120:32334/api/v1/incidents\")",
"Bash(export KUBECONFIG=/Users/ogt/.kube/config-120)",
"Bash(curl -sI \"http://awoooi.wooo.work/\")",
"Bash(openssl s_client -servername awoooi.wooo.work -connect awoooi.wooo.work:443)",
"Bash(openssl x509:*)",
"Bash(curl -s -X POST \"http://192.168.0.120:32334/api/v1/incidents/INC-20260323-7DE10B/propose\" -H \"Content-Type: application/json\" -d '{\"\"\"\"require_dry_run\"\"\"\": true}')",
"Bash(python -c \"from src.services.executor import execute_approved_proposal, get_executor, ActionExecutor; print\\(''✅ Import successful''\\)\")",
"Bash(curl -s https://awoooi.woooo.cc/api/v1/incidents)",
"Bash(curl -s https://awoooi.woooo.cc/api/v1/health)",
"Bash(curl -s --connect-timeout 10 https://awoooi.woooo.cc/api/v1/health)",
"Bash(ssh ogt@192.168.70.202 \"sudo kubectl get pods -n awoooi 2>/dev/null\")",
"Bash(curl -s --connect-timeout 5 http://192.168.70.200:8000/api/v1/health)",
"Bash(ssh ogt@192.168.70.202 \"sudo kubectl get pods -n awoooi-prod\")",
"Bash(ssh -o StrictHostKeyChecking=no ogt@192.168.70.202 \"sudo kubectl get pods -n awoooi-prod\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -A)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-worker-7479556d76-jbbps --tail 30)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod -l app=awoooi-api --tail 20)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n awoooi-prod deployment/awoooi-api -- curl -s http://localhost:8000/api/v1/incidents)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n awoooi-prod deployment/awoooi-api -- python -c \"import httpx; r = httpx.get\\(''http://localhost:8000/api/v1/incidents''\\); print\\(r.text\\)\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get ingress -n awoooi-prod -o wide)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get svc -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get deployment awoooi-worker -n awoooi-prod -o jsonpath='{.spec.template.spec.containers[0].env}')",
"Bash(curl -s --connect-timeout 5 http://192.168.70.202:32334/api/v1/health)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl describe deployment awoooi-worker -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get configmap -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl describe deployment awoooi-api -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get configmap awoooi-config -n awoooi-prod -o yaml)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get secrets -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data}')",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.REDIS_URL}')",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl rollout restart deployment/awoooi-worker -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n awoooi-prod -l app=awoooi-worker)",
"Bash(curl -s --connect-timeout 5 https://awoooi.wooo.work/api/v1/health)",
"Bash(curl -s https://awoooi.wooo.work/api/v1/incidents)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod -l app=awoooi-worker --tail 10)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get svc -n wooo-aiops-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get svc -A)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-worker-76bdf9786d-rvtmz --tail 15)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n awoooi-prod deployment/awoooi-api -- python -c \"import os; print\\(os.getenv\\(''REDIS_URL'', ''NOT_SET''\\)\\)\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get deployment awoooi-api -n awoooi-prod -o yaml)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl rollout restart deployment/awoooi-api deployment/awoooi-worker -n awoooi-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-api-865cdc97db-6mpzz --tail 20)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n wooo-aiops-prod -l app=redis)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get pods -n wooo-aiops-prod)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n wooo-aiops-prod redis-6c6fcd64b8-8wznx -- redis-cli ping)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n awoooi-prod awoooi-api-6445c76797-mrl7p -- python -c \"import redis; r=redis.Redis\\(host=''10.43.239.47'', port=6379, db=10\\); print\\(r.ping\\(\\)\\)\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get networkpolicy -A)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get networkpolicy allow-required-egress -n awoooi-prod -o yaml)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl patch networkpolicy allow-required-egress -n awoooi-prod --type='json' -p='[{\"\"op\"\": \"\"add\"\", \"\"path\"\": \"\"/spec/egress/0/ports/-\"\", \"\"value\"\": {\"\"port\"\": 6379, \"\"protocol\"\": \"\"TCP\"\"}}]')",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-api-5fcc484b85-qpwt6 --tail 15)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl exec -n awoooi-prod awoooi-api-6445c76797-mrl7p -- python -c \"import os; print\\(''REDIS_URL:'', os.getenv\\(''REDIS_URL''\\)\\); import redis; r=redis.Redis.from_url\\(os.getenv\\(''REDIS_URL''\\)\\); print\\(''PING:'', r.ping\\(\\)\\)\")",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-worker-59d7588d75-p5tht --tail 20)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod -l app=awoooi-worker --tail 30)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get deployment awoooi-worker -n awoooi-prod -o yaml)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get networkpolicy -n awoooi-prod -o wide)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl apply -f -)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs -n awoooi-prod awoooi-worker-6cd7dcbc9-5mtfq --tail 15)",
"Bash(jq .incidents[0])",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl get configmap awoooi-config -n awoooi-prod -o jsonpath='{.data.OPENCLAW_URL}')",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8088/health)",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8088/)",
"Bash(nc -zv 192.168.0.188 8088 -w 5)",
"Bash(ping -c 2 192.168.0.188)",
"Bash(ping -c 2 192.168.70.202)",
"Bash(grep -n \"mapToDualState\" /Users/ogt/awoooi/apps/web/src/app/[locale]/page.tsx -A 30)",
"Bash(head -40 /Users/ogt/awoooi/apps/web/src/app/[locale]/page.tsx)",
"Bash(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ollama@192.168.0.188 \"docker ps -a | grep -i claw; docker start openclaw 2>/dev/null || docker start clawbot 2>/dev/null || echo ''Container not found, listing all:'' && docker ps -a --format ''table {{.Names}}\\\\t{{.Status}}'' | head -10\")",
"Bash(curl -s --connect-timeout 5 http://192.168.0.188:8089/health)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl rollout status deployment/awoooi-web -n awoooi-prod --timeout=60s)",
"Bash(grep -rn \"clawbot\\\\|ClawBot\" /Users/ogt/awoooi/ --include=*.yaml --include=*.yml --include=*.json)",
"Bash(grep -rn \"ClawBot\\\\|clawbot\" /Users/ogt/awoooi/apps/ --include=*.py --include=*.ts --include=*.tsx)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs deployment/awoooi-api -n awoooi-prod --tail=100)",
"Bash(KUBECONFIG=/Users/ogt/awoooi/apps/api/k3s-prod.yaml kubectl logs deployment/awoooi-api -n awoooi-prod --tail=200)",
"Bash(export KUBECONFIG=/Users/ogt/awoooi/k3s-prod.yaml)",
"Bash(ssh root@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=200 2>&1 | grep -iE ''error|fail|exception|execute|background|parse'' | tail -40\")",
"Bash(curl -s https://awoooi.wooo.work/api/v1/approvals)",
"Bash(ssh k3s@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=200 2>&1 | grep -iE ''error|fail|execute|background|parse'' | tail -40\")",
"Bash(ssh ubuntu@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=200 2>&1 | grep -iE ''error|fail|execute|background|parse'' | tail -40\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=200 2>&1 | grep -iE ''error|fail|execute|background|parse|skip'' | tail -50\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=500 2>&1 | grep -iE ''background_execution|approve_action|reject|k8s_executor'' | tail -30\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl get deploy,sts -n awoooi-prod\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s 2>&1\")",
"Bash(ssh wooo@192.168.0.120 \"kubectl logs deployment/awoooi-api -n awoooi-prod --tail=50 2>&1 | grep -iE ''background_execution|k8s_executor|parse'' | tail -10\")"
],
"additionalDirectories": [
"/Users/ogt/awoooi/docs",
"/Users/ogt/.claude/projects/-Users-ogt-awoooi/memory",
"/Users/ogt/awoooi/apps/web/src/app",
"/Users/ogt/awoooi/apps/api",
"/Users/ogt/awoooi/apps/api/http:/localhost:8000/api/v1",
"/Users/ogt/awoooi/apps/web/public",
"/Users/ogt/Downloads",
"/Users/ogt/awoooi/apps/web/test-results",
"/Users/ogt/awoooi",
"/Users/ogt/awoooi/apps/web/src/app/[locale]",
"/tmp"
]
}
}

View File

@@ -19,18 +19,10 @@
# 文件與腳本(不需要進 image)
# 注意: docs/runbooks/, docs/adr/, .agents/skills/ 供 RAG 索引 (ADR-067 Phase 33)
# scripts/ 大部分不需要進 image,僅白名單 production runtime/ops 種子腳本
# scripts/ 大部分不需要進 image,但 CronJob 腳本需要
# 2026-04-12 ogt (ADR-073 P2-1): 白名單允許 cron_km_vectorize.py
# 2026-05-13 codex: 白名單 T16 auto-repair canary PlayBook seed script
# 2026-05-31 codex: MOMO backup Ansible playbook copies the backup script from
# the controller image; keep only this backup script in the runtime context.
scripts/**
!scripts/
scripts
!scripts/cron_km_vectorize.py
!scripts/backup/
!scripts/backup/backup-momo-188-pg.sh
!scripts/ops/
!scripts/ops/awooop-seed-auto-repair-canary-playbook.py
# Node 快取(monorepo 根目錄)
node_modules

View File

@@ -10,7 +10,7 @@ on:
jobs:
lint:
runs-on: ubuntu-latest
runs-on: self-hosted
steps:
- uses: actions/checkout@v4

View File

@@ -43,19 +43,10 @@ jobs:
├ 📝 ${{ steps.commit.outputs.message }}
├ 🔖 <code>${{ steps.commit.outputs.short_sha }}</code>
└ 🌿 dev branch"
if AWOOI_CICD_STATUS=running \
AWOOI_CICD_STAGE=dev-deploy \
AWOOI_CICD_JOB_NAME="[DEV] 部署開始" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_SUMMARY="${{ steps.commit.outputs.message }}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Dev deploy start notification mirrored through AWOOI API"
else
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"
fi
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"
# API 測試 (同 prod CI,確保 dev 也通過)
- name: Run API Tests
@@ -87,18 +78,11 @@ jobs:
echo "✅ API 測試通過"
- name: Login to Harbor
run: |
HARBOR_USERNAME="$(cat <<'AWOOOI_SECRET_HARBOR_USERNAME'
${{ secrets.HARBOR_USERNAME }}
AWOOOI_SECRET_HARBOR_USERNAME
)"
HARBOR_PASSWORD="$(cat <<'AWOOOI_SECRET_HARBOR_PASSWORD'
${{ secrets.HARBOR_PASSWORD }}
AWOOOI_SECRET_HARBOR_PASSWORD
)"
printf '%s' "$HARBOR_PASSWORD" | docker login "${{ env.HARBOR }}" \
-u "$HARBOR_USERNAME" \
--password-stdin
uses: docker/login-action@v3
with:
registry: ${{ env.HARBOR }}
username: ${{ secrets.HARBOR_USERNAME }}
password: ${{ secrets.HARBOR_PASSWORD }}
# Dev API 鏡像:強制重建,不用 cache(確保 models.json 等配置文件更新)
- name: Build and Push API (Dev)
@@ -114,37 +98,16 @@ jobs:
# 注入 Dev K8s Secrets
- name: Inject Dev K8s Secrets
env:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
run: |
secret_b64() {
python3 -c 'import base64, sys; data=sys.stdin.buffer.read(); data=data[:-1] if data.endswith(b"\n") else data; sys.stdout.write(base64.b64encode(data).decode())'
}
write_deploy_key() {
mkdir -p ~/.ssh
umask 077
cat > ~/.ssh/deploy_key <<'AWOOOI_DEPLOY_KEY'
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
chmod 600 ~/.ssh/deploy_key
}
TG_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_BOT_TOKEN'
${{ secrets.TELEGRAM_BOT_TOKEN }}
AWOOOI_SECRET_TG_BOT_TOKEN
)"
TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_CHAT_ID'
${{ secrets.TELEGRAM_CHAT_ID }}
AWOOOI_SECRET_TG_CHAT_ID
)"
NVIDIA_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_NVIDIA_API_KEY'
${{ secrets.NVIDIA_API_KEY }}
AWOOOI_SECRET_NVIDIA_API_KEY
)"
GEMINI_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_GEMINI_API_KEY'
${{ secrets.GEMINI_API_KEY }}
AWOOOI_SECRET_GEMINI_API_KEY
)"
mkdir -p ~/.ssh
write_deploy_key
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
# worker and its local kubeconfig points at 127.0.0.1:6443.
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
@@ -152,19 +115,19 @@ jobs:
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"${TG_BOT_TOKEN_B64}"},
{"op":"replace","path":"/data/OPENCLAW_TG_CHAT_ID","value":"${TG_CHAT_ID_B64}"}
{"op":"replace","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"'"$(echo -n "${TG_BOT_TOKEN}" | base64 -w 0)"'"},
{"op":"replace","path":"/data/OPENCLAW_TG_CHAT_ID","value":"'"$(echo -n "${TG_CHAT_ID}" | base64 -w 0)"'"}
]' || echo "⚠️ Telegram Secrets patch 跳過"
if [ -n "${NVIDIA_API_KEY_B64}" ]; then
if [ -n "${NVIDIA_API_KEY}" ]; then
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/NVIDIA_API_KEY","value":"${NVIDIA_API_KEY_B64}"}
{"op":"replace","path":"/data/NVIDIA_API_KEY","value":"'"$(echo -n "${NVIDIA_API_KEY}" | base64 -w 0)"'"}
]' && echo "✅ NVIDIA_API_KEY 已注入 dev"
fi
if [ -n "${GEMINI_API_KEY_B64}" ]; then
if [ -n "${GEMINI_API_KEY}" ]; then
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/GEMINI_API_KEY","value":"${GEMINI_API_KEY_B64}"}
{"op":"replace","path":"/data/GEMINI_API_KEY","value":"'"$(echo -n "${GEMINI_API_KEY}" | base64 -w 0)"'"}
]' && echo "✅ GEMINI_API_KEY 已注入 dev"
fi
@@ -173,6 +136,8 @@ jobs:
# 部署到 awoooi-dev
- name: Deploy to Dev K8s
env:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
run: |
cat k8s/awoooi-dev/02-configmap.yaml | \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
@@ -219,20 +184,10 @@ jobs:
├ 🔖 <code>${{ steps.commit.outputs.short_sha }}</code>
├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s
└ 🩺 http://192.168.0.125:32344/api/v1/health"
if AWOOI_CICD_STATUS=success \
AWOOI_CICD_STAGE=dev-deploy \
AWOOI_CICD_JOB_NAME="[DEV] 部署完成" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
AWOOI_CICD_SUMMARY="${{ steps.commit.outputs.message }}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Dev deploy success notification mirrored through AWOOI API"
else
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"
fi
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"
- name: Notify Dev Deploy Failure
if: failure()
@@ -241,16 +196,7 @@ jobs:
├ 📝 ${{ steps.commit.outputs.message }}
├ 🔖 <code>${{ steps.commit.outputs.short_sha }}</code>
└ 🔗 <a href=\"http://192.168.0.110:3001/wooo/awoooi/actions\">查看日誌</a>"
if AWOOI_CICD_STATUS=failed \
AWOOI_CICD_STAGE=dev-deploy \
AWOOI_CICD_JOB_NAME="[DEV] 部署失敗" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_SUMMARY="${{ steps.commit.outputs.message }}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Dev deploy failure notification mirrored through AWOOI API"
else
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"
fi
printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text@-"

File diff suppressed because it is too large Load Diff

View File

@@ -30,9 +30,6 @@ jobs:
with:
fetch-depth: 50
- name: Guard Workflow Secret Surfaces
run: node scripts/ci/check-gitea-step-env-secrets.js
- name: Skip Stale Main Push
id: stale
run: |
@@ -105,6 +102,7 @@ jobs:
- name: Notify Code Review Start
if: steps.stale.outputs.skip != 'true'
env:
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }}
SHORT_SHA: ${{ steps.ctx.outputs.short_sha }}
BRANCH: ${{ steps.ctx.outputs.branch }}
@@ -112,33 +110,18 @@ jobs:
FILES_DISPLAY: ${{ steps.ctx.outputs.files_display }}
run: |
set -euo pipefail
TG_BOT_TOKEN="$(cat <<'AWOOOI_SECRET_TG_BOT_TOKEN'
${{ secrets.TELEGRAM_BOT_TOKEN }}
AWOOOI_SECRET_TG_BOT_TOKEN
)"
if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then
echo "Telegram secret missing; skip start notification"
exit 0
fi
html_escape() { sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g'; }
COMMIT_ESC="$(printf '%s' "$COMMIT_MSG" | html_escape)"
FILES_ESC="$(printf '%s\n' "$FILES_DISPLAY" | html_escape)"
MSG="$(printf '🔍 <b>Code Review 啟動</b>\n──────────────────────\n📦 Commit <code>%s</code> 🌿 <code>%s</code>\n📝 <code>%s</code>\n📁 <b>變更檔案:</b>\n%s\n──────────────────────\n🤖 <b>Hermes → OpenClaw → Elephant Alpha → NemoTron</b>\n📊 即時進度:<a href=\"%s\">%s</a>' "$SHORT_SHA" "$BRANCH" "$COMMIT_ESC" "$FILES_ESC" "$REPORT_URL" "$REPORT_URL")"
if AWOOI_CICD_STATUS=running \
AWOOI_CICD_STAGE=code-review \
AWOOI_CICD_JOB_NAME="Code Review 啟動" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${GITHUB_ACTOR:-CI}" \
AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
AWOOI_CICD_WORKFLOW_URL="${REPORT_URL}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Code review start notification mirrored through AWOOI API"
else
if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then
echo "Telegram secret missing and AWOOI API notify failed; skip start notification"
exit 0
fi
curl -fsS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
-H "Content-Type: application/json" \
-d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \
>/dev/null
fi
curl -fsS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
-H "Content-Type: application/json" \
-d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \
>/dev/null
- name: Run Deterministic Review
if: steps.stale.outputs.skip != 'true'
@@ -156,14 +139,15 @@ jobs:
- name: Notify Code Review Completion
if: always() && steps.stale.outputs.skip != 'true'
env:
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }}
SHORT_SHA: ${{ steps.ctx.outputs.short_sha }}
run: |
set -euo pipefail
TG_BOT_TOKEN="$(cat <<'AWOOOI_SECRET_TG_BOT_TOKEN'
${{ secrets.TELEGRAM_BOT_TOKEN }}
AWOOOI_SECRET_TG_BOT_TOKEN
)"
if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then
echo "Telegram secret missing; skip completion notification"
exit 0
fi
REPORT=/tmp/code-review-report.json
if [ ! -s "$REPORT" ]; then
cat > "$REPORT" <<'JSON'
@@ -196,25 +180,7 @@ jobs:
TOP_ESC="$(printf '%s' "$TOP_ISSUE" | html_escape)"
MSG="$(printf '%s <b>Code Review 完成・%s</b>\n──────────────────────\n🔴 CRITICAL <code>%s</code> 🟠 HIGH <code>%s</code> 🟡 MEDIUM <code>%s</code> 🟢 LOW <code>%s</code>\n──────────────────────\n⚠ <b>主要問題</b>\n%s\n\n🔍 <b>整體風險等級</b>\n%s%s\n\n⚠ <b>最高關注問題</b>\n1. %s\n──────────────────────\n🤖 Elephant Alpha<b>%s</b> ✅ %s\n📊 完整報告:<a href=\"%s\">%s</a>' "$STATUS" "$SHORT_SHA" "$CRITICAL" "$HIGH" "$MEDIUM" "$LOW" "$ISSUE_LINE" "$RISK" "$SUMMARY_ESC" "$TOP_ESC" "$RISK" "$ACTION_ESC" "$REPORT_URL" "$REPORT_URL")"
CICD_STATUS=success
if [ "$RISK" = "MEDIUM" ]; then CICD_STATUS=pending; fi
if [ "$RISK" = "HIGH" ] || [ "$RISK" = "CRITICAL" ]; then CICD_STATUS=failed; fi
if AWOOI_CICD_STATUS="${CICD_STATUS}" \
AWOOI_CICD_STAGE=code-review \
AWOOI_CICD_JOB_NAME="Code Review 完成・${RISK}" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${GITHUB_ACTOR:-CI}" \
AWOOI_CICD_SUMMARY="CRITICAL=${CRITICAL}; HIGH=${HIGH}; MEDIUM=${MEDIUM}; LOW=${LOW}; ${SUMMARY}" \
AWOOI_CICD_WORKFLOW_URL="${REPORT_URL}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Code review completion notification mirrored through AWOOI API"
else
if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then
echo "Telegram secret missing and AWOOI API notify failed; skip completion notification"
exit 0
fi
curl -fsS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
-H "Content-Type: application/json" \
-d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \
>/dev/null
fi
curl -fsS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
-H "Content-Type: application/json" \
-d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \
>/dev/null

View File

@@ -1,7 +1,7 @@
# =============================================================================
# Deploy Prometheus Alert Rules (獨立 workflow)
# 2026-04-05 Claude Code (ADR-039 I3): 從 cd.yaml 分離
# 觸發條件: ops/monitoring/alerts-unified.yml / slo-rules.yml 有變更 或 workflow_dispatch
# 觸發條件: ops/monitoring/alerts-unified.yml 有變更 或 workflow_dispatch
# 說明: 告警規則部署不依賴應用構建,獨立觸發以加快響應速度
# =============================================================================
@@ -12,8 +12,6 @@ on:
branches: [main]
paths:
- 'ops/monitoring/alerts-unified.yml'
- 'ops/monitoring/slo-rules.yml'
- 'scripts/ops/deploy-alerts.sh'
workflow_dispatch:
env:
@@ -32,15 +30,11 @@ jobs:
run: |
pip3 install -q pyyaml 2>/dev/null || pip install -q pyyaml
python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/alerts-unified.yml')); print('YAML OK')"
python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/slo-rules.yml')); print('SLO YAML OK')"
- name: Setup SSH key
run: |
mkdir -p ~/.ssh
umask 077
cat > ~/.ssh/id_ed25519 <<'AWOOOI_DEPLOY_KEY'
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
echo "${{ secrets.DEPLOY_SSH_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh-keyscan 192.168.0.110 >> ~/.ssh/known_hosts
@@ -56,17 +50,6 @@ jobs:
SHORT_SHA="${{ github.sha }}"
SHORT_SHA="${SHORT_SHA:0:7}"
MSG="${EMOJI} Prometheus 告警規則部署 ${STATUS} (${SHORT_SHA})"
CICD_STATUS="success"
[ "$STATUS" != "success" ] && CICD_STATUS="failed"
if AWOOI_CICD_STATUS="${CICD_STATUS}" \
AWOOI_CICD_STAGE=deploy-alerts \
AWOOI_CICD_JOB_NAME="Prometheus 告警規則部署" \
AWOOI_CICD_COMMIT_SHA="${{ github.sha }}" \
AWOOI_CICD_SUMMARY="${MSG}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Alert rule deploy notification mirrored through AWOOI API"
else
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
--data-urlencode "text=${MSG}" || true
fi
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
--data-urlencode "text=${MSG}" || true

View File

@@ -51,52 +51,10 @@ jobs:
echo "status=failed" >> $GITHUB_OUTPUT
exit 1
- name: Source Provider Freshness Smoke
run: |
SOURCE_CANARY_RUN_REF="gitea-e2e-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}"
echo "SOURCE_CANARY_RUN_REF=${SOURCE_CANARY_RUN_REF}" >> "$GITHUB_ENV"
echo "SOURCE_LINK_CANARY_WORK_ITEM_ID=source-evidence:sentry:upstream_canary:awoooi-source-link-canary-${SOURCE_CANARY_RUN_REF}" >> "$GITHUB_ENV"
OPERATOR_KEY="$(cat <<'AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY'
${{ secrets.AWOOOP_OPERATOR_API_KEY }}
AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY
)"
AWOOOP_OPERATOR_API_KEY="${OPERATOR_KEY}" \
AWOOOP_OPERATOR_ID=gitea-e2e-health \
python3 scripts/alert_chain_smoke_test.py \
--api-url https://awoooi.wooo.work \
--metrics-api-url http://192.168.0.125:32334 \
--source-provider-heartbeat \
--source-provider-upstream-canary \
--run-ref "${SOURCE_CANARY_RUN_REF}" \
--source-link-canary-target-incident-id INC-20260505-25E744 \
--json
- name: Source Correlation Applied-Link Smoke
run: |
python3 scripts/awooop_source_correlation_apply_smoke.py \
--api-url https://awoooi.wooo.work \
--target-incident-id INC-20260505-25E744 \
--allow-existing-apply \
--refresh-if-stale-days 6 \
--refresh-work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
--verify-refresh-candidate \
--reviewer-id gitea_e2e_source_link_canary \
--operator-note "T124 dedicated source-link canary refresh; append-only status-chain proof"
- name: Notify Telegram on Failure
if: failure()
run: |
MSG="E2E Health Check 失敗API 健康檢查未通過"
if AWOOI_CICD_STATUS=failed \
AWOOI_CICD_STAGE=e2e-health \
AWOOI_CICD_JOB_NAME="E2E Health Check" \
AWOOI_CICD_COMMIT_SHA="${{ github.sha }}" \
AWOOI_CICD_SUMMARY="${MSG}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "E2E failure notification mirrored through AWOOI API"
else
curl -s -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
-d chat_id="${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d parse_mode="HTML" \
-d text="🔴 <b>[E2E Health Check]</b> 失敗%0A%0A📅 $(TZ=Asia/Taipei date '+%Y-%m-%d %H:%M')%0A🔗 API 健康檢查未通過%0A%0A請檢查 K3s 叢集狀態"
fi
curl -s -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
-d chat_id="${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d parse_mode="HTML" \
-d text="🔴 <b>[E2E Health Check]</b> 失敗%0A%0A📅 $(TZ=Asia/Taipei date '+%Y-%m-%d %H:%M')%0A🔗 API 健康檢查未通過%0A%0A請檢查 K3s 叢集狀態"

View File

@@ -17,7 +17,6 @@ on:
branches: [main]
paths:
- 'apps/api/migrations/*.sql'
workflow_dispatch:
env:
TELEGRAM_ALERT_CHAT_ID: "-1003711974679"
@@ -57,101 +56,45 @@ jobs:
- name: Identify new migrations
id: diff
run: |
ALL_NEW_FILES=$(git diff --no-renames --name-only --diff-filter=A HEAD~1 HEAD -- 'apps/api/migrations/*.sql' || true)
NEW_FILES=$(echo "$ALL_NEW_FILES" | grep -Ev '(_down|rollback)\.sql$' || true)
SKIPPED_ROLLBACK_FILES=$(echo "$ALL_NEW_FILES" | grep -E '(_down|rollback)\.sql$' || true)
NEW_FILES=$(git diff --name-only --diff-filter=A HEAD~1 HEAD -- 'apps/api/migrations/*.sql' || true)
echo "new_files<<EOF" >> $GITHUB_OUTPUT
echo "$NEW_FILES" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
echo "=== New migration files ==="
echo "$NEW_FILES"
if [ -n "$SKIPPED_ROLLBACK_FILES" ]; then
echo "=== Rollback/down migrations skipped by design ==="
echo "$SKIPPED_ROLLBACK_FILES"
fi
- name: Apply new migrations
if: steps.diff.outputs.new_files != ''
env:
# 從 Gitea secrets 取,不直接明碼
PGURL: ${{ secrets.MIGRATION_DATABASE_URL }}
run: |
set -euo pipefail
# 從 Gitea secrets 取,不放 step-level env,避免 runner log 展開。
# MIGRATION_DATABASE_URL 是限權帳號;DATABASE_URL 只在 PostgreSQL
# 明確回報「必須是 table owner」時作為受控 fallback。
PGURL="$(cat <<'AWOOOI_SECRET_MIGRATION_DATABASE_URL'
${{ secrets.MIGRATION_DATABASE_URL }}
AWOOOI_SECRET_MIGRATION_DATABASE_URL
)"
OWNER_PGURL="$(cat <<'AWOOOI_SECRET_DATABASE_URL'
${{ secrets.DATABASE_URL }}
AWOOOI_SECRET_DATABASE_URL
)"
if [ -z "$PGURL" ]; then
echo "::error::MIGRATION_DATABASE_URL secret not set in Gitea"
exit 1
fi
PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
apply_migration() {
local url="$1"
local file="$2"
psql "$url" \
-v ON_ERROR_STOP=1 \
--single-transaction \
-f "$file"
}
# 套用每個新檔 (single transaction per file)
echo "${{ steps.diff.outputs.new_files }}" | while IFS= read -r file; do
[ -z "$file" ] && continue
echo "=== Applying: $file ==="
migration_err="$(mktemp)"
if ! apply_migration "$PGURL_PSQL" "$file" 2>"$migration_err"; then
if grep -Eq "(must be owner of table|permission denied for table)" "$migration_err"; then
if [ -z "$OWNER_PGURL_PSQL" ]; then
cat "$migration_err" >&2
echo "::error::migration requires table owner but DATABASE_URL secret is not set"
exit 1
fi
echo "::warning::migration requires table owner; retrying with owner connection"
apply_migration "$OWNER_PGURL_PSQL" "$file"
else
cat "$migration_err" >&2
exit 1
fi
fi
rm -f "$migration_err"
psql "$PGURL_PSQL" \
-v ON_ERROR_STOP=1 \
--single-transaction \
-f "$file"
echo "=== OK: $file ==="
done
- name: Seed asset_discovery_run (audit)
if: steps.diff.outputs.new_files != ''
env:
PGURL: ${{ secrets.MIGRATION_DATABASE_URL }}
run: |
set -euo pipefail
PGURL="$(cat <<'AWOOOI_SECRET_MIGRATION_DATABASE_URL'
${{ secrets.MIGRATION_DATABASE_URL }}
AWOOOI_SECRET_MIGRATION_DATABASE_URL
)"
OWNER_PGURL="$(cat <<'AWOOOI_SECRET_DATABASE_URL'
${{ secrets.DATABASE_URL }}
AWOOOI_SECRET_DATABASE_URL
)"
if [ -z "$PGURL" ]; then
echo "::error::MIGRATION_DATABASE_URL secret not set in Gitea"
exit 1
fi
PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
FILES_JSON=$(echo "${{ steps.diff.outputs.new_files }}" | jq -Rn '[inputs | select(length > 0)]')
SUMMARY_JSON=$(jq -cn \
--arg commit_sha "${{ github.sha }}" \
--argjson files "$FILES_JSON" \
'{type: "ci_migration", commit_sha: $commit_sha, files: $files}')
SUMMARY_JSON_SQL=${SUMMARY_JSON//\'/\'\'}
seed_audit() {
local url="$1"
psql "$url" -v ON_ERROR_STOP=1 <<SQL
psql "$PGURL_PSQL" -c "
INSERT INTO asset_discovery_run (
run_id, triggered_by, scope, scan_depth, status,
started_at, ended_at, tools_used, summary
@@ -163,51 +106,23 @@ jobs:
'success',
NOW(),
NOW(),
'{"psql": 1, "gitea_ci": 1}'::jsonb,
'${SUMMARY_JSON_SQL}'::jsonb
'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb,
jsonb_build_object(
'type', 'ci_migration',
'commit_sha', '${{ github.sha }}',
'files', $FILES_JSON
)
);
SQL
}
audit_err="$(mktemp)"
if ! seed_audit "$PGURL_PSQL" 2>"$audit_err"; then
if grep -q "permission denied for table asset_discovery_run" "$audit_err"; then
if [ -z "$OWNER_PGURL_PSQL" ]; then
cat "$audit_err" >&2
echo "::error::audit requires table insert privilege but DATABASE_URL secret is not set"
exit 1
fi
echo "::warning::audit requires owner connection; retrying with owner connection"
seed_audit "$OWNER_PGURL_PSQL"
else
cat "$audit_err" >&2
exit 1
fi
fi
rm -f "$audit_err"
"
- name: Notify Telegram (if configured)
if: always()
env:
TG_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT: ${{ env.TELEGRAM_ALERT_CHAT_ID }}
run: |
TG_TOKEN="$(cat <<'AWOOOI_SECRET_TG_TOKEN'
${{ secrets.TELEGRAM_BOT_TOKEN }}
AWOOOI_SECRET_TG_TOKEN
)"
STATUS="${{ job.status }}"
CICD_STATUS="success"
[ "$STATUS" != "success" ] && CICD_STATUS="failed"
if AWOOI_CICD_STATUS="${CICD_STATUS}" \
AWOOI_CICD_STAGE=run-migration \
AWOOI_CICD_JOB_NAME="Migration CI" \
AWOOI_CICD_COMMIT_SHA="${{ github.sha }}" \
AWOOI_CICD_SUMMARY="Migration CI: ${STATUS}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "Migration notification mirrored through AWOOI API"
exit 0
fi
if [ -n "$TG_TOKEN" ] && [ -n "$TG_CHAT" ]; then
STATUS="${{ job.status }}"
MSG="🗄️ Migration CI: \`${STATUS}\` — commit ${{ github.sha }}"
curl -s -X POST "https://api.telegram.org/bot${TG_TOKEN}/sendMessage" \
-d chat_id="${TG_CHAT}" \

View File

@@ -13,10 +13,12 @@
name: CD
# 2026-05-12 Codex: GitHub 僅保留唯讀備份;生產 CI/CD 只能從 Gitea 執行。
# 本 workflow 曾可 push / workflow_dispatch 後 build、patch secret、kubectl apply
# 會和 `.gitea/workflows/cd.yaml` 競爭 K3s production 狀態,因此硬停用。
on:
push:
branches: [main]
paths-ignore:
- 'docs/**'
- '*.md'
workflow_dispatch:
inputs:
force_deploy:
@@ -58,7 +60,6 @@ jobs:
# ==================== Pre-flight Check (10s Fail-Fast) ====================
pre-flight-check:
name: "Pre-flight Check"
if: ${{ false }}
runs-on: [self-hosted, harbor, k8s]
timeout-minutes: 1
steps:
@@ -132,7 +133,6 @@ jobs:
# 2026-03-29 Claude Code: 確保監控覆蓋率 >= 90%
monitoring-coverage:
name: "Monitoring Coverage"
if: ${{ false }}
runs-on: [self-hosted, harbor, k8s]
needs: pre-flight-check
timeout-minutes: 2
@@ -152,7 +152,6 @@ jobs:
# ==================== 路徑偵測 (使用 dorny/paths-filter) ====================
detect-changes:
name: Detect Changes
if: ${{ false }}
runs-on: [self-hosted, harbor, k8s]
needs: [pre-flight-check, monitoring-coverage]
timeout-minutes: 1
@@ -198,7 +197,11 @@ jobs:
runs-on: [self-hosted, harbor, k8s]
needs: [detect-changes, build-web]
timeout-minutes: 20
if: ${{ false }}
if: |
!inputs.skip_api && (
needs.detect-changes.outputs.api == 'true' ||
(needs.detect-changes.outputs.api == 'false' && needs.detect-changes.outputs.web == 'false')
)
outputs:
image_tag: ${{ steps.tag.outputs.tag }}
steps:
@@ -235,7 +238,11 @@ jobs:
runs-on: [self-hosted, harbor, k8s]
needs: detect-changes
timeout-minutes: 20
if: ${{ false }}
if: |
!inputs.skip_web && (
needs.detect-changes.outputs.web == 'true' ||
(needs.detect-changes.outputs.api == 'false' && needs.detect-changes.outputs.web == 'false')
)
outputs:
image_tag: ${{ steps.tag.outputs.tag }}
steps:
@@ -286,7 +293,7 @@ jobs:
concurrency:
group: runner-awoooi-cd-mutex
cancel-in-progress: false
if: ${{ false }}
if: always() && (needs.build-api.result == 'success' || needs.build-api.result == 'skipped') && (needs.build-web.result == 'success' || needs.build-web.result == 'skipped')
environment: production
steps:
# 2026-03-29: Runner 診斷檔案清理 (防止並行衝突)

View File

@@ -14,10 +14,15 @@
name: Deploy to Production
# 2026-05-12 Codex: GitHub 是唯讀備份;production deploy 只能從 Gitea 進入。
# 這份歷史 workflow 仍含 Harbor build/push 與 kubectl apply/rollout,會和 Gitea CD 競爭。
# 保留檔案供稽核,但停用所有 job。
on:
push:
branches:
- main
paths:
- 'apps/api/**'
- 'apps/web/**'
- 'k8s/awoooi-prod/**'
- '.github/workflows/deploy-prod.yml'
workflow_dispatch:
inputs:
deploy_api:
@@ -65,7 +70,6 @@ jobs:
# ===========================================================================
build:
name: "Build Images"
if: ${{ false }}
runs-on: [self-hosted, harbor, k8s]
outputs:
image_tag: ${{ steps.meta.outputs.tag }}
@@ -134,7 +138,6 @@ jobs:
deploy:
name: "Deploy to K3s"
needs: build
if: ${{ false }}
runs-on: [self-hosted, harbor, k8s]
steps:
@@ -207,7 +210,7 @@ jobs:
smoke-test:
name: "Smoke Tests"
needs: deploy
if: ${{ false }}
if: ${{ !inputs.skip_tests }}
runs-on: [self-hosted, harbor, k8s]
steps:
@@ -245,7 +248,7 @@ jobs:
notify:
name: "Send Notification"
needs: [build, deploy, smoke-test]
if: ${{ false }}
if: always()
runs-on: [self-hosted, harbor, k8s]
steps:

1
.gitignore vendored
View File

@@ -93,4 +93,3 @@ tsconfig.tsbuildinfo
!.aiderignore
.claude/settings.local.json
.claude/settings.json
.claude/settings.json.bak*

View File

@@ -31,9 +31,6 @@
## 🔴 絕對禁止 → [HARD_RULES.md](docs/HARD_RULES.md)
## 🔴 文件語言鐵律 → [文件語言規範](docs/HARD_RULES.md#文件語言規範)
Markdown、ADR、LOGBOOK、Runbook、交接文件與計畫文件一律使用繁體中文;程式符號、API、指令、錯誤碼、服務名稱與原始 log 可保留英文。
## 🔴 紅區治理 → [RED_ZONES.md](docs/RED_ZONES.md)
Tier 3 核心檔案 (decision_manager, trust_engine, config 等) 修改需首席架構師授權

View File

@@ -1 +1 @@
# 2026-05-20 source-provider-heartbeat deploy trigger
# 2026-04-05 warm-up deploy triggered

View File

@@ -44,6 +44,28 @@ FROM python:3.11-slim
WORKDIR /app
# Copy installed packages from builder
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# 2026-04-01 ogt: CACHE_BUST 強制失效 src/ 和 models.json 層
# deps 層 (pip install) 仍可 cache;代碼/配置變更必須重建
ARG CACHE_BUST=none
COPY apps/api/src/ ./src/
COPY apps/api/models.json ./models.json
# 2026-04-09 ogt: 規則引擎配置 — alert_rule_engine.py 從此檔載入規則
COPY apps/api/alert_rules.yaml ./alert_rules.yaml
# 2026-04-10 Claude Sonnet 4.6: drift_detector 需要 k8s/ YAML 做 Git state 比對
COPY k8s/ ./k8s/
# 2026-04-10 Claude Sonnet 4.6: RAG 知識庫索引來源 (ADR-067 Phase 33)
COPY docs/ ./docs/
COPY .agents/skills/ ./.agents/skills/
# 2026-05-04 Claude Sonnet 4.6 (Task 1.2): hermes agent_loader 的 system prompt 來源
# agent_loader.py 預設讀 /app/.claude/agents/,對應 K8s AGENTS_DIR 環境變數
COPY .claude/agents/ ./.claude/agents/
# 2026-04-12 ogt (ADR-073 P2-1): CronJob 腳本 — 獨立腳本取代 inline Python
COPY scripts/ ./scripts/
# Install openssh-client + curl — SSH_COMMAND Playbook + healthcheck
# Install kubectl — drift_detector 需要 kubectl 讀取 K8s 實際狀態
# (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #6 修正 — python:3.11-slim 無 openssh-client)
@@ -53,38 +75,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends openssh-client
chmod +x kubectl && mv kubectl /usr/local/bin/kubectl && \
rm -rf /var/lib/apt/lists/*
# Create non-root user before copying app artifacts so COPY --chown can avoid
# an expensive full-tree chown layer on every source-only rebuild.
RUN useradd -m -u 1000 appuser
# Copy installed packages from builder
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# 2026-04-01 ogt: CACHE_BUST 強制失效 src/ 和 models.json 層
# deps 層 (pip install) 仍可 cache;代碼/配置變更必須重建
ARG CACHE_BUST=none
COPY --chown=appuser:appuser apps/api/src/ ./src/
# 2026-04-09 ogt: 規則引擎配置 — alert_rule_engine.py 從此檔載入規則
COPY --chown=appuser:appuser apps/api/models.json ./models.json
COPY --chown=appuser:appuser apps/api/alert_rules.yaml ./alert_rules.yaml
# 2026-04-10 Claude Sonnet 4.6: drift_detector 需要 k8s/ YAML 做 Git state 比對
COPY --chown=appuser:appuser k8s/ ./k8s/
# 2026-05-24 Codex: truth-chain / Ansible readiness needs the repo-known
# playbook catalog in the API image.
# 2026-05-31 Codex: ansible-core is now installed through pyproject.toml so
# this catalog can graduate from visibility-only to check-mode runtime-ready
# once repair SSH material is mounted and readable. This still does not enable
# automatic apply; approval/execution code remains the gate.
COPY --chown=appuser:appuser infra/ansible/ ./infra/ansible/
# 2026-04-10 Claude Sonnet 4.6: RAG 知識庫索引來源 (ADR-067 Phase 33)
COPY --chown=appuser:appuser docs/ ./docs/
COPY --chown=appuser:appuser .agents/skills/ ./.agents/skills/
# 2026-05-04 Claude Sonnet 4.6 (Task 1.2): hermes agent_loader 的 system prompt 來源
# agent_loader.py 預設讀 /app/.claude/agents/,對應 K8s AGENTS_DIR 環境變數
COPY --chown=appuser:appuser .claude/agents/ ./.claude/agents/
# 2026-04-12 ogt (ADR-073 P2-1): CronJob 腳本 — 獨立腳本取代 inline Python
COPY --chown=appuser:appuser scripts/ ./scripts/
# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser
# Expose port

View File

@@ -1,49 +0,0 @@
-- ADR-090 capacity_violation_event metric violation types
-- Date: 2026-05-07 (Taipei)
-- Purpose: make the fine-grained cpu/mem/swap violations written by
-- capacity_scanner_job.py satisfy the DB constraint.
--
-- Background:
-- capacity_scanner_job.py writes:
-- - cpu_over_threshold
-- - mem_over_threshold
-- - swap_over_threshold
-- but the original ADR-090 DDL only allowed the coarser host_saturation, which
-- caused capacity_violation_event_type_valid check violations in production
-- and capacity-governance events went unrecorded.
--
-- The allowlist below also admits load_over_threshold in addition to the three
-- types listed above.
-- DROP and ADD run inside one transaction, so the table is never left without
-- the CHECK constraint if the ADD fails.
BEGIN;
ALTER TABLE capacity_violation_event
DROP CONSTRAINT IF EXISTS capacity_violation_event_type_valid;
ALTER TABLE capacity_violation_event
ADD CONSTRAINT capacity_violation_event_type_valid
CHECK (violation_type IN (
'no_limit_set',
'over_request',
'over_limit',
'host_saturation',
'over_sla_budget',
'unauthorized_new_deploy',
'cpu_over_threshold',
'mem_over_threshold',
'swap_over_threshold',
'load_over_threshold'
));
COMMIT;
-- Rollback (run only after manual confirmation). It restores the original
-- six-value allowlist, so the ADD will fail while rows with the four newer
-- violation types still exist.
-- BEGIN;
-- ALTER TABLE capacity_violation_event
-- DROP CONSTRAINT IF EXISTS capacity_violation_event_type_valid;
-- ALTER TABLE capacity_violation_event
-- ADD CONSTRAINT capacity_violation_event_type_valid
-- CHECK (violation_type IN (
-- 'no_limit_set',
-- 'over_request',
-- 'over_limit',
-- 'host_saturation',
-- 'over_sla_budget',
-- 'unauthorized_new_deploy'
-- ));
-- COMMIT;

View File

@@ -1,36 +0,0 @@
-- ADR-090-D: automation_operation_log.operation_type adds Ansible executor audit states
-- Created: 2026-05-12 Taipei
--
-- Purpose:
-- T3 Ansible declarative executor visibility. These operation types allow
-- the AI automation truth chain to record that Ansible was matched,
-- check-mode executed, applied, rolled back, or explicitly skipped.
--
-- Safety:
-- This migration only expands the CHECK allowlist. It does not execute
-- Ansible, change approval behavior, or create auto-remediation rows.
--
-- Atomicity:
-- The DROP / ADD / COMMENT sequence runs in a single transaction. Without it,
-- a failure of ADD CONSTRAINT (for example an existing row whose
-- operation_type is outside the allowlist) would leave the table with no
-- CHECK constraint at all, because the DROP had already been committed.
BEGIN;

ALTER TABLE automation_operation_log
    DROP CONSTRAINT IF EXISTS automation_operation_log_type_valid;

ALTER TABLE automation_operation_log
    ADD CONSTRAINT automation_operation_log_type_valid CHECK (operation_type IN (
        'monitor_configured','monitor_removed',
        'alert_fired','alert_suppressed','alert_routed',
        'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated',
        'playbook_generated','playbook_updated','playbook_executed',
        'remediation_executed','remediation_verified','remediation_rolled_back',
        'self_correction_attempted',
        'km_created','km_updated','km_linked',
        'asset_discovered','coverage_recalculated',
        'capacity_recommendation','quota_enforced',
        'notification_formatted',
        -- New in ADR-090-D: first-class Ansible executor audit states.
        'ansible_candidate_matched',
        'ansible_check_mode_executed',
        'ansible_apply_executed',
        'ansible_rollback_executed',
        'ansible_execution_skipped'
    ));

COMMENT ON CONSTRAINT automation_operation_log_type_valid ON automation_operation_log IS
    'ADR-090-D: allow first-class Ansible executor audit states for AwoooP truth-chain visibility.';

COMMIT;

View File

@@ -1,19 +0,0 @@
-- ADR-090-D rollback: remove Ansible executor audit states from operation_type allowlist.
-- Only apply after confirming no automation_operation_log rows use ansible_* operation types.
--
-- Atomicity:
-- DROP and ADD run in one transaction. If any ansible_* row still exists the
-- ADD CONSTRAINT fails; the transaction then rolls back and the current
-- (wider) constraint stays in place instead of the table being left with no
-- CHECK constraint.
BEGIN;

ALTER TABLE automation_operation_log
    DROP CONSTRAINT IF EXISTS automation_operation_log_type_valid;

ALTER TABLE automation_operation_log
    ADD CONSTRAINT automation_operation_log_type_valid CHECK (operation_type IN (
        'monitor_configured','monitor_removed',
        'alert_fired','alert_suppressed','alert_routed',
        'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated',
        'playbook_generated','playbook_updated','playbook_executed',
        'remediation_executed','remediation_verified','remediation_rolled_back',
        'self_correction_attempted',
        'km_created','km_updated','km_linked',
        'asset_discovered','coverage_recalculated',
        'capacity_recommendation','quota_enforced',
        'notification_formatted'
    ));

COMMIT;

View File

@@ -1,164 +0,0 @@
-- T9: approved SSH execution MCP Gateway seed
-- Purpose: let SSH repair actions that were already approved via
-- Telegram/Approval pass the five AwoooP Gateway gates.
-- Boundary: grants are issued to approval_executor only; write/admin still
-- require a short-lived Gate 5 approval key.
-- Sets app.project_id for the rest of the session (third argument FALSE means
-- the setting is not transaction-local).
-- NOTE(review): presumably read by row-level-security policies — confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Publishes the approval_executor agent contract (v1.0) and points the
-- project's active-revision pointer at it. Safe to re-run: the insert is
-- DO NOTHING on conflict and the pointer is upserted.
WITH agent_body AS (
-- Contract body: read/write/admin are allowed, write/admin are flagged as
-- requiring Gate 5.
SELECT jsonb_build_object(
'schema_version', 'awooop_agent_contract_v1',
'agent_id', 'approval_executor',
'display_name', 'Approval Executor',
'project_id', 'awoooi',
'purpose', 'Approved SSH execution through AwoooP MCP Gateway',
'allowed_scopes', jsonb_build_array('read', 'write', 'admin'),
'requires_gate5_for_scopes', jsonb_build_array('write', 'admin'),
'stage', 't9_ssh_approval_gateway'
) AS body_json
),
-- Inserts revision 1.0; returns a row only when the revision did not exist yet.
inserted_revision AS (
INSERT INTO awooop_contract_revisions (
project_id,
contract_family,
contract_id,
version_major,
version_minor,
lifecycle_status,
body_json,
body_hash,
body_schema_version,
publisher_id,
published_at
)
SELECT
'awoooi',
'agent',
'approval_executor',
1,
0,
'active',
body_json,
-- digest() is not a core PostgreSQL function; it comes from the pgcrypto
-- extension, which must be installed for this script to run.
encode(digest(body_json::text, 'sha256'), 'hex'),
'v1.0',
'migration:t9_ssh_approval_gateway',
NOW()
FROM agent_body
ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
DO NOTHING
RETURNING revision_id, project_id, contract_family, contract_id
),
-- Either the freshly inserted revision or the pre-existing *active* 1.0
-- revision. The table scan below runs on the statement's snapshot and cannot
-- see the row inserted above, so the two branches do not both yield the same
-- revision.
-- NOTE(review): if 1.0 already exists but is no longer 'active' (for example
-- after the T9 rollback marked it 'revoked'), this CTE is empty and no pointer
-- is written, while the grants statement later in this script still re-enables
-- grants — confirm that is intended.
chosen_revision AS (
SELECT revision_id, project_id, contract_family, contract_id
FROM inserted_revision
UNION ALL
SELECT revision_id, project_id, contract_family, contract_id
FROM awooop_contract_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id = 'approval_executor'
AND version_major = 1
AND version_minor = 0
AND lifecycle_status = 'active'
),
-- Upserts the active-revision pointer; DISTINCT ON keeps one row per contract.
upsert_pointer AS (
INSERT INTO awooop_active_revisions (
project_id,
contract_family,
contract_id,
active_revision_id,
updated_at
)
SELECT DISTINCT ON (project_id, contract_family, contract_id)
project_id,
contract_family,
contract_id,
revision_id,
NOW()
FROM chosen_revision
ORDER BY project_id, contract_family, contract_id, revision_id
ON CONFLICT (project_id, contract_family, contract_id)
DO UPDATE SET
active_revision_id = EXCLUDED.active_revision_id,
updated_at = NOW()
RETURNING contract_id
)
-- Summary row: number of pointers written (0 or 1).
SELECT 'approval_executor_active_contracts', count(*) FROM upsert_pointer;
-- Registers the eight SSH tools below and grants each of them to
-- approval_executor with exactly the tool's required scope.
WITH gateway_tools(tool_name, description, required_scope) AS (
VALUES
('ssh_diagnose', 'SSH host diagnosis read', 'read'),
('ssh_docker_restart', 'Approved Docker container restart over SSH', 'write'),
('ssh_docker_compose_restart', 'Approved Docker Compose service restart over SSH', 'write'),
('ssh_systemctl_restart', 'Approved systemd service restart over SSH', 'write'),
('ssh_clear_docker_logs', 'Approved Docker log truncation over SSH', 'write'),
('ssh_renew_ssl', 'Approved certbot renewal over SSH', 'write'),
('ssh_reload_nginx', 'Approved nginx config test and reload over SSH', 'write'),
('ssh_docker_prune', 'Approved Docker prune over SSH with provider disk guard', 'admin')
),
-- Upserts the registry rows. An existing row for the same (project_id,
-- tool_name) has its description, scopes and tags overwritten and is
-- re-activated.
upsert_tools AS (
INSERT INTO awooop_mcp_tool_registry (
project_id,
tool_name,
tool_type,
description,
allowed_scopes,
environment_tags,
is_active,
updated_at
)
SELECT
'awoooi',
tool_name,
'mcp_server',
description,
-- Single-element array: each tool allows only its one required scope.
jsonb_build_array(required_scope),
'{"env": "prod"}'::jsonb,
TRUE,
NOW()
FROM gateway_tools
ON CONFLICT (project_id, tool_name)
DO UPDATE SET
description = EXCLUDED.description,
allowed_scopes = EXCLUDED.allowed_scopes,
environment_tags = EXCLUDED.environment_tags,
is_active = TRUE,
updated_at = NOW()
RETURNING tool_id, tool_name, allowed_scopes
),
-- Grants every upserted tool to approval_executor. granted_scopes copies the
-- tool's allowed_scopes, and expires_at is NULL. On conflict the grant is
-- taken over by this migration (granted_by) and any revocation is cleared.
upsert_grants AS (
INSERT INTO awooop_mcp_grants (
project_id,
agent_id,
tool_id,
granted_by,
granted_scopes,
expires_at,
is_revoked,
revoked_at,
revoked_by
)
SELECT
'awoooi',
'approval_executor',
tool_id,
'migration:t9_ssh_approval_gateway',
allowed_scopes,
NULL,
FALSE,
NULL,
NULL
FROM upsert_tools
ON CONFLICT (project_id, agent_id, tool_id)
DO UPDATE SET
granted_by = EXCLUDED.granted_by,
granted_scopes = EXCLUDED.granted_scopes,
expires_at = NULL,
is_revoked = FALSE,
revoked_at = NULL,
revoked_by = NULL
RETURNING grant_id
)
-- Summary row: how many tool rows and grant rows were written.
SELECT
'approval_executor_ssh_gateway',
(SELECT count(*) FROM upsert_tools) AS tool_rows,
(SELECT count(*) FROM upsert_grants) AS grant_rows;

View File

@@ -1,43 +0,0 @@
-- Rollback for T9 approved SSH execution MCP Gateway seed.
-- Contract revisions are append-only; rollback revokes approval_executor grants
-- and deactivates only the write/admin tools introduced here.
-- Sets app.project_id for the session (third argument FALSE = not transaction-local).
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Step 1: revoke only grants that this migration issued and that are still
-- live, so earlier revocations keep their original revoked_at / revoked_by.
UPDATE awooop_mcp_grants
SET
is_revoked = TRUE,
revoked_at = NOW(),
revoked_by = 'rollback:t9_ssh_approval_gateway'
WHERE project_id = 'awoooi'
AND agent_id = 'approval_executor'
AND granted_by = 'migration:t9_ssh_approval_gateway'
AND is_revoked = FALSE;
-- Step 2: deactivate the seven write/admin tools. ssh_diagnose (the read tool
-- the seed also upserts) is deliberately absent from this list and stays active.
UPDATE awooop_mcp_tool_registry
SET
is_active = FALSE,
updated_at = NOW()
WHERE project_id = 'awoooi'
AND tool_name IN (
'ssh_docker_restart',
'ssh_docker_compose_restart',
'ssh_systemctl_restart',
'ssh_clear_docker_logs',
'ssh_renew_ssl',
'ssh_reload_nginx',
'ssh_docker_prune'
);
-- Step 3: drop the active-revision pointer for the approval_executor contract.
DELETE FROM awooop_active_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id = 'approval_executor';
-- Step 4: mark the revision published by this migration as revoked; the row
-- itself is kept (append-only).
UPDATE awooop_contract_revisions
SET lifecycle_status = 'revoked'
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id = 'approval_executor'
AND publisher_id = 'migration:t9_ssh_approval_gateway'
AND lifecycle_status = 'active';

View File

@@ -1,166 +0,0 @@
-- T23: auto-repair executor read-only MCP Gateway seed
-- Purpose: route the read-only SSH diagnostic steps of YAML_RULE/PlayBook
-- through the AwoooP MCP Gateway.
-- Boundary: grants read scope only; write/admin SSH tools must still go
-- through approval_executor + Gate 5.
-- Sets app.project_id for the rest of the session (third argument FALSE means
-- the setting is not transaction-local).
-- NOTE(review): presumably read by row-level-security policies — confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Publishes the auto_repair_executor agent contract (v1.0) and points the
-- project's active-revision pointer at it. Safe to re-run: the insert is
-- DO NOTHING on conflict and the pointer is upserted.
WITH agent_body AS (
-- Contract body: read is the only allowed scope; write/admin are listed as
-- forbidden.
SELECT jsonb_build_object(
'schema_version', 'awooop_agent_contract_v1',
'agent_id', 'auto_repair_executor',
'display_name', 'Auto Repair Executor',
'project_id', 'awoooi',
'purpose', 'Read-only auto-repair diagnostics through AwoooP MCP Gateway',
'allowed_scopes', jsonb_build_array('read'),
'forbidden_scopes', jsonb_build_array('write', 'admin'),
'stage', 't23_auto_repair_diagnostic_gateway'
) AS body_json
),
-- Inserts revision 1.0; returns a row only when the revision did not exist yet.
inserted_revision AS (
INSERT INTO awooop_contract_revisions (
project_id,
contract_family,
contract_id,
version_major,
version_minor,
lifecycle_status,
body_json,
body_hash,
body_schema_version,
publisher_id,
published_at
)
SELECT
'awoooi',
'agent',
'auto_repair_executor',
1,
0,
'active',
body_json,
-- digest() is not a core PostgreSQL function; it comes from the pgcrypto
-- extension, which must be installed for this script to run.
encode(digest(body_json::text, 'sha256'), 'hex'),
'v1.0',
'migration:t23_auto_repair_executor_read_gateway',
NOW()
FROM agent_body
ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
DO NOTHING
RETURNING revision_id, project_id, contract_family, contract_id
),
-- Either the freshly inserted revision or the pre-existing *active* 1.0
-- revision. The table scan below runs on the statement's snapshot and cannot
-- see the row inserted above.
-- NOTE(review): if 1.0 exists but is not 'active' (the T23 rollback sets it to
-- 'retired'), this CTE is empty and no pointer is written — confirm intended.
chosen_revision AS (
SELECT revision_id, project_id, contract_family, contract_id
FROM inserted_revision
UNION ALL
SELECT revision_id, project_id, contract_family, contract_id
FROM awooop_contract_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id = 'auto_repair_executor'
AND version_major = 1
AND version_minor = 0
AND lifecycle_status = 'active'
),
-- Upserts the active-revision pointer; DISTINCT ON keeps one row per contract.
upsert_pointer AS (
INSERT INTO awooop_active_revisions (
project_id,
contract_family,
contract_id,
active_revision_id,
updated_at
)
SELECT DISTINCT ON (project_id, contract_family, contract_id)
project_id,
contract_family,
contract_id,
revision_id,
NOW()
FROM chosen_revision
ORDER BY project_id, contract_family, contract_id, revision_id
ON CONFLICT (project_id, contract_family, contract_id)
DO UPDATE SET
active_revision_id = EXCLUDED.active_revision_id,
updated_at = NOW()
RETURNING contract_id
)
-- Summary row: number of pointers written (0 or 1).
SELECT 'auto_repair_executor_active_contracts', count(*) FROM upsert_pointer;
-- Registers the ten read-only SSH diagnostic tools below and grants each of
-- them, with read scope, to auto_repair_executor.
WITH read_tools(tool_name, description) AS (
VALUES
('ssh_diagnose', 'SSH host/container diagnosis read'),
('ssh_get_top_processes', 'SSH top processes read'),
('ssh_get_disk_usage', 'SSH disk usage read'),
('ssh_get_memory_info', 'SSH memory info read'),
('ssh_get_container_logs', 'SSH container logs read'),
('ssh_get_container_status', 'SSH container status read'),
('ssh_get_service_status', 'SSH service status read'),
('ssh_check_port', 'SSH port check read'),
('ssh_get_nginx_error_log', 'SSH nginx error log read'),
('ssh_get_swap_info', 'SSH swap info read')
),
-- Upserts the registry rows. An existing row for the same (project_id,
-- tool_name) has its description, scopes and tags overwritten and is
-- re-activated.
upsert_tools AS (
INSERT INTO awooop_mcp_tool_registry (
project_id,
tool_name,
tool_type,
description,
allowed_scopes,
environment_tags,
is_active,
updated_at
)
SELECT
'awoooi',
tool_name,
'mcp_server',
description,
'["read"]'::jsonb,
'{"env": "prod"}'::jsonb,
TRUE,
NOW()
FROM read_tools
ON CONFLICT (project_id, tool_name)
DO UPDATE SET
description = EXCLUDED.description,
allowed_scopes = EXCLUDED.allowed_scopes,
environment_tags = EXCLUDED.environment_tags,
is_active = TRUE,
updated_at = NOW()
RETURNING tool_id, tool_name, allowed_scopes
),
-- Grants every upserted tool to auto_repair_executor. granted_scopes copies
-- the tool's allowed_scopes (["read"]), and expires_at is NULL. On conflict
-- the grant is taken over by this migration and any revocation is cleared.
upsert_grants AS (
INSERT INTO awooop_mcp_grants (
project_id,
agent_id,
tool_id,
granted_by,
granted_scopes,
expires_at,
is_revoked,
revoked_at,
revoked_by
)
SELECT
'awoooi',
'auto_repair_executor',
tool_id,
'migration:t23_auto_repair_executor_read_gateway',
allowed_scopes,
NULL,
FALSE,
NULL,
NULL
FROM upsert_tools
ON CONFLICT (project_id, agent_id, tool_id)
DO UPDATE SET
granted_by = EXCLUDED.granted_by,
granted_scopes = EXCLUDED.granted_scopes,
expires_at = NULL,
is_revoked = FALSE,
revoked_at = NULL,
revoked_by = NULL
RETURNING grant_id
)
-- Summary row: how many tool rows and grant rows were written.
SELECT
'auto_repair_executor_read_gateway',
(SELECT count(*) FROM upsert_tools) AS tool_rows,
(SELECT count(*) FROM upsert_grants) AS grant_rows;

View File

@@ -1,24 +0,0 @@
-- Rollback T23 auto-repair executor read-only MCP Gateway grant.
-- Revokes the grants issued by the T23 seed, removes the active-revision
-- pointer and retires the contract revision. The read tools themselves are
-- left active in awooop_mcp_tool_registry.
-- Sets app.project_id for the session (third argument FALSE = not transaction-local).
SELECT set_config('app.project_id', 'awoooi', FALSE);

-- Step 1: revoke only grants that are still live. The is_revoked guard keeps
-- the original revoked_at / revoked_by of grants that were already revoked
-- (manually or by an earlier run of this rollback) instead of overwriting
-- that audit trail, matching the T7 and T9 rollback scripts.
UPDATE awooop_mcp_grants
SET is_revoked = TRUE,
    revoked_at = NOW(),
    revoked_by = 'rollback:t23_auto_repair_executor_read_gateway'
WHERE project_id = 'awoooi'
  AND agent_id = 'auto_repair_executor'
  AND granted_by = 'migration:t23_auto_repair_executor_read_gateway'
  AND is_revoked = FALSE;

-- Step 2: drop the active-revision pointer for the auto_repair_executor contract.
DELETE FROM awooop_active_revisions
WHERE project_id = 'awoooi'
  AND contract_family = 'agent'
  AND contract_id = 'auto_repair_executor';

-- Step 3: take the revision published by the seed out of service; the row
-- itself is kept.
-- NOTE(review): the T7 and T9 rollback scripts set lifecycle_status to
-- 'revoked' where this one uses 'retired' — confirm both values are accepted
-- by awooop_contract_revisions and that the difference is intentional.
UPDATE awooop_contract_revisions
SET lifecycle_status = 'retired'
WHERE project_id = 'awoooi'
  AND contract_family = 'agent'
  AND contract_id = 'auto_repair_executor'
  AND publisher_id = 'migration:t23_auto_repair_executor_read_gateway'
  AND lifecycle_status = 'active';

View File

@@ -1,25 +0,0 @@
-- =============================================================================
-- AwoooP / AWOOOI MCP Gateway Shadow Onboarding
-- 2026-05-13 Codex + ogt
--
-- Background:
-- AWOOOI has completed the read-only MCP tool registry / grants seed, but the
-- project itself is still in legacy_awoooi_default and is therefore
-- (correctly) blocked by MCP Gateway Gate 1.
--
-- Boundary:
-- Only promotes the AWOOOI tenant to shadow so the existing Gate 1 takes effect.
-- write/admin tools remain ungranted; this migration does not open up
-- auto-remediation or destructive actions.
-- =============================================================================
BEGIN;
-- Sets app.project_id for the session (third argument FALSE = not transaction-local).
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Guarded on the current mode: a project that is in any other migration_mode
-- is left untouched, so re-running this script changes nothing.
UPDATE awooop_projects
SET
migration_mode = 'shadow',
updated_at = NOW()
WHERE project_id = 'awoooi'
AND migration_mode = 'legacy_awoooi_default';
COMMIT;

View File

@@ -1,20 +0,0 @@
-- =============================================================================
-- Rollback: AwoooP / AWOOOI MCP Gateway Shadow Onboarding
-- 2026-05-13 Codex + ogt
--
-- Only reverts AWOOOI while it is still in shadow; if it has already been
-- advanced to canary/active (manually or by a later migration) it is not
-- downgraded automatically.
-- =============================================================================
BEGIN;
-- Sets app.project_id for the session (third argument FALSE = not transaction-local).
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Guarded on migration_mode = 'shadow', which implements the "no automatic
-- downgrade" rule described in the header.
UPDATE awooop_projects
SET
migration_mode = 'legacy_awoooi_default',
updated_at = NOW()
WHERE project_id = 'awoooi'
AND migration_mode = 'shadow';
COMMIT;

View File

@@ -1,211 +0,0 @@
-- T7: awoooi read-only MCP Gateway seed
-- Purpose: let pre-decision sensing MCP calls pass AwoooP Gateway Gates 2/3
-- and produce first-class audit records.
-- Boundary: grants read scope only; no write/admin tools such as
-- restart/delete/scale/apply/rollback are granted.
-- Sets app.project_id for the rest of the session (third argument FALSE means
-- the setting is not transaction-local).
-- NOTE(review): presumably read by row-level-security policies — confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Publishes v1.0 agent contracts for the two read-only agents listed in
-- agent_seed and points each contract's active-revision pointer at it.
-- Safe to re-run: inserts are DO NOTHING on conflict, pointers are upserted.
WITH agent_seed(agent_id, display_name) AS (
VALUES
('pre_decision_investigator', 'Pre-decision Investigator'),
('post_execution_verifier', 'Post-execution Verifier')
),
-- One contract body per agent: read is the only allowed scope; write/admin
-- are listed as forbidden.
agent_body AS (
SELECT
agent_id,
jsonb_build_object(
'schema_version', 'awooop_agent_contract_v1',
'agent_id', agent_id,
'display_name', display_name,
'project_id', 'awoooi',
'purpose', 'Read-only MCP sensing through AwoooP Gateway',
'allowed_scopes', jsonb_build_array('read'),
'forbidden_scopes', jsonb_build_array('write', 'admin'),
'stage', 't7_mcp_gateway_read_sense'
) AS body_json
FROM agent_seed
),
-- Inserts revision 1.0 per agent; returns rows only for revisions that did
-- not exist yet.
inserted_revision AS (
INSERT INTO awooop_contract_revisions (
project_id,
contract_family,
contract_id,
version_major,
version_minor,
lifecycle_status,
body_json,
body_hash,
body_schema_version,
publisher_id,
published_at
)
SELECT
'awoooi',
'agent',
agent_id,
1,
0,
'active',
body_json,
-- digest() is not a core PostgreSQL function; it comes from the pgcrypto
-- extension, which must be installed for this script to run.
encode(digest(body_json::text, 'sha256'), 'hex'),
'v1.0',
'migration:t7_mcp_gateway_read_seed',
NOW()
FROM agent_body
ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
DO NOTHING
RETURNING revision_id, project_id, contract_family, contract_id
),
-- Freshly inserted revisions plus pre-existing *active* 1.0 revisions. The
-- table scan below runs on the statement's snapshot and cannot see the rows
-- inserted above.
-- NOTE(review): a 1.0 revision that exists but is not 'active' (the T7
-- rollback sets it to 'revoked') yields no row here, so its pointer is not
-- re-created on re-apply — confirm intended.
chosen_revision AS (
SELECT revision_id, project_id, contract_family, contract_id
FROM inserted_revision
UNION ALL
SELECT revision_id, project_id, contract_family, contract_id
FROM awooop_contract_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN (SELECT agent_id FROM agent_seed)
AND version_major = 1
AND version_minor = 0
AND lifecycle_status = 'active'
),
-- Upserts one active-revision pointer per contract; DISTINCT ON keeps one row
-- per contract.
upsert_pointer AS (
INSERT INTO awooop_active_revisions (
project_id,
contract_family,
contract_id,
active_revision_id,
updated_at
)
SELECT DISTINCT ON (project_id, contract_family, contract_id)
project_id,
contract_family,
contract_id,
revision_id,
NOW()
FROM chosen_revision
ORDER BY project_id, contract_family, contract_id, revision_id
ON CONFLICT (project_id, contract_family, contract_id)
DO UPDATE SET
active_revision_id = EXCLUDED.active_revision_id,
updated_at = NOW()
RETURNING contract_id
)
-- Summary row: number of pointers written.
SELECT 'active_agent_contracts', count(*) FROM upsert_pointer;
-- Registers the 42 read-only tools below and grants every one of them, with
-- read scope, to both agents in grant_agents.
WITH read_tools(tool_name, description) AS (
VALUES
('k8s_get_pod_logs', 'Kubernetes pod logs read'),
('k8s_get_events', 'Kubernetes events read'),
('k8s_describe_pod', 'Kubernetes pod describe read'),
('k8s_get_hpa_status', 'Kubernetes HPA status read'),
('k8s_get_node_conditions', 'Kubernetes node conditions read'),
('ssh_diagnose', 'SSH host diagnosis read'),
('ssh_get_top_processes', 'SSH top processes read'),
('ssh_get_disk_usage', 'SSH disk usage read'),
('ssh_get_memory_info', 'SSH memory info read'),
('ssh_get_container_logs', 'SSH container logs read'),
('ssh_get_container_status', 'SSH container status read'),
('ssh_get_service_status', 'SSH service status read'),
('ssh_check_port', 'SSH port check read'),
('ssh_get_nginx_error_log', 'SSH nginx error log read'),
('ssh_get_swap_info', 'SSH swap info read'),
('prometheus_query', 'Prometheus instant query read'),
('prometheus_query_range', 'Prometheus range query read'),
('prometheus_get_alert_history', 'Prometheus alert history read'),
('gold_metrics', 'SigNoz gold metrics read'),
('trace_url', 'SigNoz trace URL read'),
('system_metrics', 'SigNoz system metrics read'),
('query_logs', 'SigNoz logs read'),
('error_logs_summary', 'SigNoz error logs summary read'),
('list_approvals', 'Approval records read'),
('get_approval', 'Approval detail read'),
('list_incidents', 'Incident records read'),
('list_timeline', 'Timeline records read'),
('read_file', 'Filesystem allowlisted file read'),
('list_directory', 'Filesystem allowlisted directory read'),
('search_in_file', 'Filesystem allowlisted file search'),
('list_dashboards', 'Grafana dashboards read'),
('get_dashboard', 'Grafana dashboard read'),
('get_panel_data', 'Grafana panel data read'),
('generate_dashboard_url', 'Grafana dashboard URL read'),
('search_runbook', 'Runbook semantic search read'),
('get_index_stats', 'Runbook index stats read'),
('argocd_list_apps', 'ArgoCD apps read'),
('argocd_get_app_status', 'ArgoCD app status read'),
('argocd_get_sync_history', 'ArgoCD sync history read'),
('sentry_list_issues', 'Sentry issues read'),
('sentry_get_issue', 'Sentry issue detail read'),
('sentry_search_issues', 'Sentry issue search read')
),
-- Upserts the registry rows. An existing row for the same (project_id,
-- tool_name) has its description, scopes and tags overwritten and is
-- re-activated.
upsert_tools AS (
INSERT INTO awooop_mcp_tool_registry (
project_id,
tool_name,
tool_type,
description,
allowed_scopes,
environment_tags,
is_active,
updated_at
)
SELECT
'awoooi',
tool_name,
'mcp_server',
description,
'["read"]'::jsonb,
'{"env": "prod"}'::jsonb,
TRUE,
NOW()
FROM read_tools
ON CONFLICT (project_id, tool_name)
DO UPDATE SET
description = EXCLUDED.description,
allowed_scopes = EXCLUDED.allowed_scopes,
environment_tags = EXCLUDED.environment_tags,
is_active = TRUE,
updated_at = NOW()
RETURNING tool_id
),
-- Agents that receive the read grants.
grant_agents(agent_id) AS (
VALUES
('pre_decision_investigator'),
('post_execution_verifier')
),
-- One grant per (agent, tool) pair via the CROSS JOIN; expires_at is NULL.
-- On conflict the scopes are reset and any revocation is cleared, but
-- granted_by is NOT overwritten, so a pre-existing grant keeps its original
-- granted_by value.
-- NOTE(review): the T7 rollback filters on granted_by =
-- 'migration:t7_mcp_gateway_read_seed', so such pre-existing grants would be
-- re-enabled here yet not revoked by the rollback — confirm intended.
upsert_grants AS (
INSERT INTO awooop_mcp_grants (
project_id,
agent_id,
tool_id,
granted_by,
granted_scopes,
expires_at,
is_revoked,
revoked_at,
revoked_by
)
SELECT
'awoooi',
grant_agents.agent_id,
upsert_tools.tool_id,
'migration:t7_mcp_gateway_read_seed',
'["read"]'::jsonb,
NULL,
FALSE,
NULL,
NULL
FROM upsert_tools
CROSS JOIN grant_agents
ON CONFLICT (project_id, agent_id, tool_id)
DO UPDATE SET
granted_scopes = EXCLUDED.granted_scopes,
expires_at = NULL,
is_revoked = FALSE,
revoked_at = NULL,
revoked_by = NULL
RETURNING grant_id
)
-- Summary row: how many tool rows and grant rows were written.
SELECT
'awoooi_read_tools',
(SELECT count(*) FROM upsert_tools) AS tool_rows,
(SELECT count(*) FROM upsert_grants) AS grant_rows;

View File

@@ -1,77 +0,0 @@
-- Rollback for T7 awoooi read-only MCP Gateway seed.
-- Contract revisions are append-only; rollback revokes grants and deactivates the seeded read tools.
-- Sets app.project_id for the session (third argument FALSE = not transaction-local).
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Step 1: revoke only grants issued by the T7 seed that are still live, so
-- earlier revocations keep their original revoked_at / revoked_by.
UPDATE awooop_mcp_grants
SET
is_revoked = TRUE,
revoked_at = NOW(),
revoked_by = 'rollback:t7_mcp_gateway_read_seed'
WHERE project_id = 'awoooi'
AND agent_id IN ('pre_decision_investigator', 'post_execution_verifier')
AND granted_by = 'migration:t7_mcp_gateway_read_seed'
AND is_revoked = FALSE;
-- Step 2: deactivate every tool the T7 seed registered.
-- NOTE(review): the ssh_* tools in this list are also registered by the T23
-- seed for auto_repair_executor (and ssh_diagnose by the T9 seed), so
-- deactivating them here affects those agents as well — confirm before running.
UPDATE awooop_mcp_tool_registry
SET
is_active = FALSE,
updated_at = NOW()
WHERE project_id = 'awoooi'
AND tool_name IN (
'k8s_get_pod_logs',
'k8s_get_events',
'k8s_describe_pod',
'k8s_get_hpa_status',
'k8s_get_node_conditions',
'ssh_diagnose',
'ssh_get_top_processes',
'ssh_get_disk_usage',
'ssh_get_memory_info',
'ssh_get_container_logs',
'ssh_get_container_status',
'ssh_get_service_status',
'ssh_check_port',
'ssh_get_nginx_error_log',
'ssh_get_swap_info',
'prometheus_query',
'prometheus_query_range',
'prometheus_get_alert_history',
'gold_metrics',
'trace_url',
'system_metrics',
'query_logs',
'error_logs_summary',
'list_approvals',
'get_approval',
'list_incidents',
'list_timeline',
'read_file',
'list_directory',
'search_in_file',
'list_dashboards',
'get_dashboard',
'get_panel_data',
'generate_dashboard_url',
'search_runbook',
'get_index_stats',
'argocd_list_apps',
'argocd_get_app_status',
'argocd_get_sync_history',
'sentry_list_issues',
'sentry_get_issue',
'sentry_search_issues'
);
-- Step 3: drop the active-revision pointers for both agent contracts.
DELETE FROM awooop_active_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN ('pre_decision_investigator', 'post_execution_verifier');
-- Step 4: mark the revisions published by the T7 seed as revoked; the rows
-- themselves are kept (append-only).
UPDATE awooop_contract_revisions
SET lifecycle_status = 'revoked'
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN ('pre_decision_investigator', 'post_execution_verifier')
AND publisher_id = 'migration:t7_mcp_gateway_read_seed'
AND lifecycle_status = 'active';

View File

@@ -1,213 +0,0 @@
-- T7: awoooi read-only MCP Gateway seed
-- Purpose: let pre-decision sensing MCP calls pass AwoooP Gateway Gates 2/3
-- and produce first-class audit records.
-- Boundary: grants read scope only; no write/admin tools such as
-- restart/delete/scale/apply/rollback are granted.
-- Sets app.project_id for the rest of the session (third argument FALSE means
-- the setting is not transaction-local).
-- NOTE(review): presumably read by row-level-security policies — confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);
-- Publishes v1.0 agent contracts for the two read-only agents listed in
-- agent_seed and points each contract's active-revision pointer at it.
-- Safe to re-run: inserts are DO NOTHING on conflict, pointers are upserted.
WITH agent_seed(agent_id, display_name) AS (
VALUES
('pre_decision_investigator', 'Pre-decision Investigator'),
('post_execution_verifier', 'Post-execution Verifier')
),
-- One contract body per agent: read is the only allowed scope; write/admin
-- are listed as forbidden.
agent_body AS (
SELECT
agent_id,
jsonb_build_object(
'schema_version', 'awooop_agent_contract_v1',
'agent_id', agent_id,
'display_name', display_name,
'project_id', 'awoooi',
'purpose', 'Read-only MCP sensing through AwoooP Gateway',
'allowed_scopes', jsonb_build_array('read'),
'forbidden_scopes', jsonb_build_array('write', 'admin'),
'stage', 't7_mcp_gateway_read_sense'
) AS body_json
FROM agent_seed
),
-- Inserts revision 1.0 per agent; returns rows only for revisions that did
-- not exist yet.
inserted_revision AS (
INSERT INTO awooop_contract_revisions (
project_id,
contract_family,
contract_id,
version_major,
version_minor,
lifecycle_status,
body_json,
body_hash,
body_schema_version,
publisher_id,
published_at
)
SELECT
'awoooi',
'agent',
agent_id,
1,
0,
'active',
body_json,
-- digest() is not a core PostgreSQL function; it comes from the pgcrypto
-- extension, which must be installed for this script to run.
encode(digest(body_json::text, 'sha256'), 'hex'),
'v1.0',
'migration:t7_mcp_gateway_read_seed',
NOW()
FROM agent_body
ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
DO NOTHING
RETURNING revision_id, project_id, contract_family, contract_id
),
-- Freshly inserted revisions plus pre-existing *active* 1.0 revisions. The
-- table scan below runs on the statement's snapshot and cannot see the rows
-- inserted above.
-- NOTE(review): a 1.0 revision that exists but is not 'active' yields no row
-- here, so its pointer is not re-created on re-apply — confirm intended.
chosen_revision AS (
SELECT revision_id, project_id, contract_family, contract_id
FROM inserted_revision
UNION ALL
SELECT revision_id, project_id, contract_family, contract_id
FROM awooop_contract_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN (SELECT agent_id FROM agent_seed)
AND version_major = 1
AND version_minor = 0
AND lifecycle_status = 'active'
),
-- Upserts one active-revision pointer per contract; DISTINCT ON keeps one row
-- per contract.
upsert_pointer AS (
INSERT INTO awooop_active_revisions (
project_id,
contract_family,
contract_id,
active_revision_id,
updated_at
)
SELECT DISTINCT ON (project_id, contract_family, contract_id)
project_id,
contract_family,
contract_id,
revision_id,
NOW()
FROM chosen_revision
ORDER BY project_id, contract_family, contract_id, revision_id
ON CONFLICT (project_id, contract_family, contract_id)
DO UPDATE SET
active_revision_id = EXCLUDED.active_revision_id,
updated_at = NOW()
RETURNING contract_id
)
-- Summary row: number of pointers written.
SELECT 'active_agent_contracts', count(*) FROM upsert_pointer;
-- Registers the 42 read-only tools below and grants every one of them, with
-- read scope, to both agents in grant_agents.
WITH read_tools(tool_name, description) AS (
VALUES
('k8s_get_pod_logs', 'Kubernetes pod logs read'),
('k8s_get_events', 'Kubernetes events read'),
('k8s_describe_pod', 'Kubernetes pod describe read'),
('k8s_get_hpa_status', 'Kubernetes HPA status read'),
('k8s_get_node_conditions', 'Kubernetes node conditions read'),
('ssh_diagnose', 'SSH host diagnosis read'),
('ssh_get_top_processes', 'SSH top processes read'),
('ssh_get_disk_usage', 'SSH disk usage read'),
('ssh_get_memory_info', 'SSH memory info read'),
('ssh_get_container_logs', 'SSH container logs read'),
('ssh_get_container_status', 'SSH container status read'),
('ssh_get_service_status', 'SSH service status read'),
('ssh_check_port', 'SSH port check read'),
('ssh_get_nginx_error_log', 'SSH nginx error log read'),
('ssh_get_swap_info', 'SSH swap info read'),
('prometheus_query', 'Prometheus instant query read'),
('prometheus_query_range', 'Prometheus range query read'),
('prometheus_get_alert_history', 'Prometheus alert history read'),
('gold_metrics', 'SigNoz gold metrics read'),
('trace_url', 'SigNoz trace URL read'),
('system_metrics', 'SigNoz system metrics read'),
('query_logs', 'SigNoz logs read'),
('error_logs_summary', 'SigNoz error logs summary read'),
('list_approvals', 'Approval records read'),
('get_approval', 'Approval detail read'),
('list_incidents', 'Incident records read'),
('list_timeline', 'Timeline records read'),
('read_file', 'Filesystem allowlisted file read'),
('list_directory', 'Filesystem allowlisted directory read'),
('search_in_file', 'Filesystem allowlisted file search'),
('list_dashboards', 'Grafana dashboards read'),
('get_dashboard', 'Grafana dashboard read'),
('get_panel_data', 'Grafana panel data read'),
('generate_dashboard_url', 'Grafana dashboard URL read'),
('search_runbook', 'Runbook semantic search read'),
('get_index_stats', 'Runbook index stats read'),
('argocd_list_apps', 'ArgoCD apps read'),
('argocd_get_app_status', 'ArgoCD app status read'),
('argocd_get_sync_history', 'ArgoCD sync history read'),
('sentry_list_issues', 'Sentry issues read'),
('sentry_get_issue', 'Sentry issue detail read'),
('sentry_search_issues', 'Sentry issue search read')
),
-- Upserts the registry rows. An existing row for the same (project_id,
-- tool_name) has its description, scopes and tags overwritten and is
-- re-activated.
upsert_tools AS (
INSERT INTO awooop_mcp_tool_registry (
project_id,
tool_name,
tool_type,
description,
allowed_scopes,
environment_tags,
is_active,
updated_at
)
SELECT
'awoooi',
tool_name,
'mcp_server',
description,
'["read"]'::jsonb,
'{"env": "prod"}'::jsonb,
TRUE,
NOW()
FROM read_tools
ON CONFLICT (project_id, tool_name)
DO UPDATE SET
description = EXCLUDED.description,
allowed_scopes = EXCLUDED.allowed_scopes,
environment_tags = EXCLUDED.environment_tags,
is_active = TRUE,
updated_at = NOW()
RETURNING tool_id
),
-- Agents that receive the read grants.
grant_agents(agent_id) AS (
VALUES
('pre_decision_investigator'),
('post_execution_verifier')
),
-- One grant per (agent, tool) pair via the CROSS JOIN; expires_at is NULL.
-- On conflict the scopes are reset and any revocation is cleared, but
-- granted_by is NOT overwritten, so a pre-existing grant keeps its original
-- granted_by value.
upsert_grants AS (
INSERT INTO awooop_mcp_grants (
project_id,
agent_id,
tool_id,
granted_by,
granted_scopes,
expires_at,
is_revoked,
revoked_at,
revoked_by
)
SELECT
'awoooi',
grant_agents.agent_id,
upsert_tools.tool_id,
'migration:t7_mcp_gateway_read_seed',
'["read"]'::jsonb,
NULL,
FALSE,
NULL,
NULL
FROM upsert_tools
CROSS JOIN grant_agents
ON CONFLICT (project_id, agent_id, tool_id)
DO UPDATE SET
granted_scopes = EXCLUDED.granted_scopes,
expires_at = NULL,
is_revoked = FALSE,
revoked_at = NULL,
revoked_by = NULL
RETURNING grant_id
)
-- Summary row: how many tool rows and grant rows were written.
SELECT
'awoooi_read_tools',
(SELECT count(*) FROM upsert_tools) AS tool_rows,
(SELECT count(*) FROM upsert_grants) AS grant_rows;
-- v4 exists only to retrigger run-migration after Gitea skipped the v2->v3 rename-only push.

View File

@@ -1,79 +0,0 @@
-- Rollback for T7 awoooi read-only MCP Gateway seed.
-- Contract revisions are append-only; rollback revokes grants and deactivates the seeded read tools.
-- NOTE(review): despite the "append-only" statement above, the last two
-- statements of this script DELETE rows from awooop_active_revisions and UPDATE
-- awooop_contract_revisions.lifecycle_status in place -- confirm that this is the
-- intended meaning of "append-only" (no row removal from contract_revisions).

-- Set the project context for this session (third argument FALSE = not
-- transaction-local). Presumably consumed by row-level-security policies -- TODO confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);

-- Step 1: soft-revoke the grants created by the seed migration. Only rows that
-- are still active and were granted by the seed are touched, so re-running the
-- rollback does not overwrite an earlier revoked_at / revoked_by.
UPDATE awooop_mcp_grants
SET
is_revoked = TRUE,
revoked_at = NOW(),
revoked_by = 'rollback:t7_mcp_gateway_read_seed'
WHERE project_id = 'awoooi'
AND agent_id IN ('pre_decision_investigator', 'post_execution_verifier')
AND granted_by = 'migration:t7_mcp_gateway_read_seed'
AND is_revoked = FALSE;

-- Step 2: deactivate (not delete) the listed read-only tools in the registry.
-- NOTE(review): this list is expected to mirror the tool list of the forward seed;
-- keep both in sync when tools are added or removed.
UPDATE awooop_mcp_tool_registry
SET
is_active = FALSE,
updated_at = NOW()
WHERE project_id = 'awoooi'
AND tool_name IN (
'k8s_get_pod_logs',
'k8s_get_events',
'k8s_describe_pod',
'k8s_get_hpa_status',
'k8s_get_node_conditions',
'ssh_diagnose',
'ssh_get_top_processes',
'ssh_get_disk_usage',
'ssh_get_memory_info',
'ssh_get_container_logs',
'ssh_get_container_status',
'ssh_get_service_status',
'ssh_check_port',
'ssh_get_nginx_error_log',
'ssh_get_swap_info',
'prometheus_query',
'prometheus_query_range',
'prometheus_get_alert_history',
'gold_metrics',
'trace_url',
'system_metrics',
'query_logs',
'error_logs_summary',
'list_approvals',
'get_approval',
'list_incidents',
'list_timeline',
'read_file',
'list_directory',
'search_in_file',
'list_dashboards',
'get_dashboard',
'get_panel_data',
'generate_dashboard_url',
'search_runbook',
'get_index_stats',
'argocd_list_apps',
'argocd_get_app_status',
'argocd_get_sync_history',
'sentry_list_issues',
'sentry_get_issue',
'sentry_search_issues'
);

-- Step 3: remove the active-revision pointers for the two agent contracts.
DELETE FROM awooop_active_revisions
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN ('pre_decision_investigator', 'post_execution_verifier');

-- Step 4: mark the revisions published by the seed as revoked. Rows are kept;
-- only lifecycle_status changes, and only for revisions that are still 'active'.
UPDATE awooop_contract_revisions
SET lifecycle_status = 'revoked'
WHERE project_id = 'awoooi'
AND contract_family = 'agent'
AND contract_id IN ('pre_decision_investigator', 'post_execution_verifier')
AND publisher_id = 'migration:t7_mcp_gateway_read_seed'
AND lifecycle_status = 'active';
-- v4 rollback companion for the retrigger migration.

View File

@@ -1,77 +0,0 @@
-- T16 verifier gap: allow rollout status evidence through AwoooP MCP Gateway.
-- Boundary: read-only scope only; no restart/delete/scale grant is added here.
--
-- The whole seed is one statement built from data-modifying CTEs:
--   upsert_tool   -> registers (or re-activates) the k8s_watch_rollout tool
--   grant_agents  -> the two agents that receive the grant
--   upsert_grants -> one grant per (agent, tool), re-activated if it already exists
-- The final SELECT reports how many rows each upsert touched.

-- Set the project context for this session (third argument FALSE = not
-- transaction-local). Presumably consumed by row-level-security policies -- TODO confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);
WITH upsert_tool AS (
INSERT INTO awooop_mcp_tool_registry (
project_id,
tool_name,
tool_type,
description,
allowed_scopes,
environment_tags,
is_active,
updated_at
)
VALUES (
'awoooi',
'k8s_watch_rollout',
'mcp_server',
'Kubernetes deployment rollout status read',
'["read"]'::jsonb,
'{"env": "prod"}'::jsonb,
TRUE,
NOW()
)
-- Idempotent: an existing registry row is refreshed and forced back to active.
-- tool_type is not part of the update, so an existing value is left as-is.
ON CONFLICT (project_id, tool_name)
DO UPDATE SET
description = EXCLUDED.description,
allowed_scopes = EXCLUDED.allowed_scopes,
environment_tags = EXCLUDED.environment_tags,
is_active = TRUE,
updated_at = NOW()
-- tool_id is needed below to attach the grants to this exact registry row.
RETURNING tool_id
),
grant_agents(agent_id) AS (
VALUES
('pre_decision_investigator'),
('post_execution_verifier')
),
upsert_grants AS (
INSERT INTO awooop_mcp_grants (
project_id,
agent_id,
tool_id,
granted_by,
granted_scopes,
expires_at,
is_revoked,
revoked_at,
revoked_by
)
SELECT
'awoooi',
grant_agents.agent_id,
upsert_tool.tool_id,
'migration:t16_rollout_verifier_seed',
'["read"]'::jsonb,
NULL,
FALSE,
NULL,
NULL
-- One tool row x two agents = two grant rows.
FROM upsert_tool
CROSS JOIN grant_agents
-- Idempotent: an existing grant (including a previously revoked one) is reset to
-- an active, non-expiring read grant. granted_by is not updated on conflict, so a
-- pre-existing grant keeps its original granter.
ON CONFLICT (project_id, agent_id, tool_id)
DO UPDATE SET
granted_scopes = EXCLUDED.granted_scopes,
expires_at = NULL,
is_revoked = FALSE,
revoked_at = NULL,
revoked_by = NULL
RETURNING grant_id
)
-- Summary row for the migration log.
SELECT
'k8s_watch_rollout_read_grants' AS seed,
(SELECT count(*) FROM upsert_tool) AS tool_rows,
(SELECT count(*) FROM upsert_grants) AS grant_rows;

View File

@@ -1,24 +0,0 @@
-- Roll back T16 rollout verifier read grant seed.
-- Soft rollback: grants are revoked and the tool is deactivated; no rows are deleted.

-- Set the project context for this session (third argument FALSE = not
-- transaction-local). Presumably consumed by row-level-security policies -- TODO confirm.
SELECT set_config('app.project_id', 'awoooi', FALSE);

-- Step 1: revoke the k8s_watch_rollout grants of the two seeded agents.
-- Only grants that are still active are touched, so re-running this rollback
-- (or running it after a manual revoke) does not overwrite the original
-- revoked_at / revoked_by audit values. This matches the T7 rollback.
UPDATE awooop_mcp_grants
SET
is_revoked = TRUE,
revoked_at = NOW(),
revoked_by = 'migration:t16_rollout_verifier_seed_down'
WHERE project_id = 'awoooi'
AND agent_id IN ('pre_decision_investigator', 'post_execution_verifier')
AND is_revoked = FALSE
AND tool_id IN (
SELECT tool_id
FROM awooop_mcp_tool_registry
WHERE project_id = 'awoooi'
AND tool_name = 'k8s_watch_rollout'
);

-- Step 2: deactivate the tool in the registry (the row is kept).
UPDATE awooop_mcp_tool_registry
SET
is_active = FALSE,
updated_at = NOW()
WHERE project_id = 'awoooi'
AND tool_name = 'k8s_watch_rollout';

View File

@@ -1,14 +0,0 @@
-- AwoooP Phase 5b: MCP Gateway blocked-call audit coverage
-- Date: 2026-05-06
-- Maintainer: Codex
--
-- Blocked calls from Gate 1 / Gate 2 / unknown tools can happen before the tool
-- registry row has been obtained. These security decisions must still be written
-- to the audit log, so tool_id is allowed to be NULL, while tool_name remains
-- required as the tracking clue for unknown tools and early gate blocks.
BEGIN;
-- Relax only the NOT NULL constraint; the column itself is unchanged.
ALTER TABLE awooop_mcp_gateway_audit
ALTER COLUMN tool_id DROP NOT NULL;
COMMIT;

View File

@@ -1,21 +0,0 @@
-- AwoooP Phase 7 T15b: inbound event truth-chain columns
--
-- Purpose:
-- Telegram cards are only the notification surface. Operators need a
-- redacted replay envelope for inbound alerts so Alertmanager, Sentry, and
-- SignOz events can be correlated with incidents, approvals, logs, and
-- automation decisions without storing raw secrets or PII.
--
-- Re-runnable: every column is added with IF NOT EXISTS.
-- content_redacted is nullable; redaction_version and source_envelope are
-- NOT NULL with a default, so rows that already exist receive the default value.
ALTER TABLE awooop_conversation_event
ADD COLUMN IF NOT EXISTS content_redacted TEXT,
ADD COLUMN IF NOT EXISTS redaction_version VARCHAR(32) NOT NULL DEFAULT 'audit_sink_v1',
ADD COLUMN IF NOT EXISTS source_envelope JSONB NOT NULL DEFAULT '{}'::jsonb;
-- Catalog comments so the column intent is visible from the database itself.
COMMENT ON COLUMN awooop_conversation_event.content_redacted IS
'Full inbound event content after audit_sink redaction; raw unredacted payload text is not stored.';
COMMENT ON COLUMN awooop_conversation_event.redaction_version IS
'Redaction algorithm/version used for content_redacted and source_envelope.';
COMMENT ON COLUMN awooop_conversation_event.source_envelope IS
'Redacted source metadata for inbound replay/audit, including payload hash, provider, source refs, and log correlation hints.';

View File

@@ -1,6 +0,0 @@
-- Rollback for AwoooP Phase 7 T15b inbound truth-chain columns.
-- Safe only if no consumers depend on the redacted replay fields.
-- The three columns are dropped in the same order as before, expressed as one
-- multi-action ALTER TABLE; IF EXISTS keeps the rollback re-runnable.
ALTER TABLE awooop_conversation_event
    DROP COLUMN IF EXISTS source_envelope,
    DROP COLUMN IF EXISTS redaction_version,
    DROP COLUMN IF EXISTS content_redacted;

View File

@@ -1,21 +0,0 @@
-- AwoooP Phase 7 T1: outbound message truth-chain columns
--
-- Purpose:
-- Telegram must remain a summary channel, but the operator console needs a
-- complete redacted replay of the rendered card and the source envelope that
-- produced it. Store redacted content only; raw unredacted Telegram text stays
-- out of PostgreSQL.
--
-- Re-runnable: every column is added with IF NOT EXISTS.
-- content_redacted is nullable; redaction_version and source_envelope are
-- NOT NULL with a default, so rows that already exist receive the default value.
ALTER TABLE awooop_outbound_message
ADD COLUMN IF NOT EXISTS content_redacted TEXT,
ADD COLUMN IF NOT EXISTS redaction_version VARCHAR(32) NOT NULL DEFAULT 'audit_sink_v1',
ADD COLUMN IF NOT EXISTS source_envelope JSONB NOT NULL DEFAULT '{}'::jsonb;
-- Catalog comments so the column intent is visible from the database itself.
COMMENT ON COLUMN awooop_outbound_message.content_redacted IS
'Full rendered outbound content after audit_sink redaction; raw unredacted text is not stored.';
COMMENT ON COLUMN awooop_outbound_message.redaction_version IS
'Redaction algorithm/version used for content_redacted and source_envelope.';
COMMENT ON COLUMN awooop_outbound_message.source_envelope IS
'Redacted source metadata for replay/audit, including payload hash and adapter context.';

View File

@@ -1,6 +0,0 @@
-- Rollback for AwoooP Phase 7 T1 outbound truth-chain columns.
-- Safe only if no consumers depend on the redacted replay fields.
-- The three columns are dropped in the same order as before, expressed as one
-- multi-action ALTER TABLE; IF EXISTS keeps the rollback re-runnable.
ALTER TABLE awooop_outbound_message
    DROP COLUMN IF EXISTS source_envelope,
    DROP COLUMN IF EXISTS redaction_version,
    DROP COLUMN IF EXISTS content_redacted;

View File

@@ -10,8 +10,7 @@
-- 2. rag_chunks.embedding vector(768) → vector(1024)
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
--
-- 遷移策略:僅在欄位不是 vector(1024) 時清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 已經是 vector(1024) 的環境重跑本 migration 時,必須保留既有向量資料。
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
--
-- 執行前置條件:
@@ -24,38 +23,17 @@
BEGIN;
-- 1. knowledge_entries備份舊向量並清空變更欄位維度
DO $$
DECLARE
v_dim integer;
BEGIN
SELECT a.atttypmod INTO v_dim
FROM pg_attribute a
JOIN pg_class c ON a.attrelid = c.oid
WHERE c.relname = 'knowledge_entries'
AND a.attname = 'embedding';
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
SELECT
id,
embedding::text AS embedding_768,
NOW() AS backed_up_at
FROM knowledge_entries
WHERE embedding IS NOT NULL;
IF v_dim IS DISTINCT FROM 1024 THEN
EXECUTE $sql$
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
SELECT
id,
embedding::text AS embedding_768,
NOW() AS backed_up_at
FROM knowledge_entries
WHERE embedding IS NOT NULL
$sql$;
EXECUTE $sql$
ALTER TABLE knowledge_entries
ALTER COLUMN embedding TYPE vector(1024)
USING NULL
$sql$;
RAISE NOTICE 'knowledge_entries.embedding migrated from vector(%) to vector(1024); old embeddings were backed up and cleared', v_dim;
ELSE
RAISE NOTICE 'knowledge_entries.embedding already vector(1024); existing embeddings preserved';
END IF;
END $$;
ALTER TABLE knowledge_entries
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
COMMENT ON COLUMN knowledge_entries.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
@@ -63,29 +41,11 @@ COMMENT ON COLUMN knowledge_entries.embedding IS
-- 2. rag_chunks清空向量資料變更欄位維度
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
DO $$
DECLARE
v_dim integer;
BEGIN
SELECT a.atttypmod INTO v_dim
FROM pg_attribute a
JOIN pg_class c ON a.attrelid = c.oid
WHERE c.relname = 'rag_chunks'
AND a.attname = 'embedding';
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
IF v_dim IS DISTINCT FROM 1024 THEN
EXECUTE 'DROP INDEX IF EXISTS idx_rag_chunks_embedding';
EXECUTE $sql$
ALTER TABLE rag_chunks
ALTER COLUMN embedding TYPE vector(1024)
USING NULL
$sql$;
RAISE NOTICE 'rag_chunks.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
ELSE
RAISE NOTICE 'rag_chunks.embedding already vector(1024); existing embeddings preserved';
END IF;
END $$;
ALTER TABLE rag_chunks
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
-- 重建 ivfflat indexlists=100 適合 ~10k 筆以下資料)
CREATE INDEX IF NOT EXISTS idx_rag_chunks_embedding
@@ -98,29 +58,11 @@ COMMENT ON COLUMN rag_chunks.embedding IS
-- 3. playbook_embeddings清空向量資料變更欄位維度
DO $$
DECLARE
v_dim integer;
BEGIN
SELECT a.atttypmod INTO v_dim
FROM pg_attribute a
JOIN pg_class c ON a.attrelid = c.oid
WHERE c.relname = 'playbook_embeddings'
AND a.attname = 'embedding';
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
IF v_dim IS DISTINCT FROM 1024 THEN
EXECUTE 'DROP INDEX IF EXISTS ix_playbook_embeddings_vec';
EXECUTE $sql$
ALTER TABLE playbook_embeddings
ALTER COLUMN embedding TYPE vector(1024)
USING NULL
$sql$;
RAISE NOTICE 'playbook_embeddings.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
ELSE
RAISE NOTICE 'playbook_embeddings.embedding already vector(1024); existing embeddings preserved';
END IF;
END $$;
ALTER TABLE playbook_embeddings
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- 清空現有 768 維向量
CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec
ON playbook_embeddings

View File

@@ -25,7 +25,7 @@
"log_anomaly": "deepseek-r1:14b",
"nemoclaw": "deepseek-r1:14b",
"playbook_draft": "qwen3:14b",
"code_review": "qwen2.5-coder:7b",
"code_review": "qwen2.5-coder:32b",
"embedding": "bge-m3:latest",
"rag_generate": "qwen3:14b",
"image_analysis": "minicpm-v:latest",
@@ -175,7 +175,7 @@
},
"pr_code_review": {
"phase": 32,
"model": "qwen2.5-coder:7b",
"model": "qwen2.5-coder:32b",
"timeout_seconds": 120,
"purpose": "Gitea PR 自動審查"
},

View File

@@ -46,10 +46,6 @@ dependencies = [
# 2026-04-16 ogt + Claude Sonnet 4.6: SSH MCP sensor 修復 — asyncssh 缺失導致 sensors_succeeded=0
# 根因: ssh_provider.py 中 import asyncssh 在 try/except 外,所有 15 個 SSH tool 直接 ImportError
"asyncssh>=2.14.0",
# 2026-05-31 Codex: AwoooP truth-chain Ansible runtime gate 需要
# production API image 內真的存在 ansible-playbook否則只能顯示
# candidate audit無法進入 check-mode executor readiness。
"ansible-core>=2.16.0,<2.18.0",
]
# [tool.uv.sources]

View File

@@ -58,8 +58,3 @@ pytest>=7.4.0
pytest-asyncio>=0.23.0
ruff>=0.1.0
sentry-sdk[fastapi]>=2.0.0
# AwoooP Ansible runtime readiness
# 2026-05-31 Codex: production API image must include ansible-playbook before
# truth-chain can honestly mark check-mode executor readiness as available.
ansible-core>=2.16.0,<2.18.0

View File

@@ -9,7 +9,7 @@ AwoooP Phase 1 Batch 1 回填腳本
awooop_phase1_batch1_rls_2026-05-04.sql Step AADD COLUMN nullable已執行
執行方式:
從 secret manager / operator vault 設定 DATABASE_URL禁止在指令或檔案中寫入 URL。
export DATABASE_URL="postgresql+asyncpg://awoooi:<password>@192.168.0.188:5432/awoooi_prod"
cd apps/api && python scripts/awooop_phase1_batch1_backfill.py
2026-05-04 ogt + Claude Sonnet 4.6ADR-118 Batch 1 C-3 修正)

View File

@@ -37,7 +37,6 @@ logging = structlog.get_logger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://34.143.170.20:11434")
EMBEDDING_MODEL = "bge-m3:latest"
EXPECTED_DIM = 1024
PROJECT_ID = os.getenv("AWOOOP_PROJECT_ID", "awoooi")
async def embed_text(client: httpx.AsyncClient, text: str) -> list[float]:
@@ -163,7 +162,6 @@ async def main(dry_run: bool, batch_size: int) -> None:
conn = await asyncpg.connect(database_url)
try:
await conn.execute("SELECT set_config('app.project_id', $1, FALSE)", PROJECT_ID)
# 統計待嵌入筆數
rag_null = await conn.fetchval("SELECT COUNT(*) FROM rag_chunks WHERE embedding IS NULL")
pb_null = await conn.fetchval("SELECT COUNT(*) FROM playbook_embeddings WHERE embedding IS NULL")

View File

@@ -15,7 +15,7 @@ from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
# 2026-04-22 ogt: 移除硬碼 changeme改為讀取環境變數強制要求設定
# 執行前: 從 secret manager / operator vault 設定 DATABASE_URL禁止在指令或檔案中寫入 URL。
# 執行前: export DATABASE_URL="postgresql+asyncpg://awoooi:<password>@192.168.0.188:5432/awoooi_prod"
DATABASE_URL = os.environ["DATABASE_URL"]
MIGRATION_SQLS = [

View File

@@ -28,7 +28,7 @@ except ImportError:
# ============================================================================
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.110:11435")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434")
if not NVIDIA_API_KEY:
print("❌ 請設定 NVIDIA_API_KEY 環境變數")

View File

@@ -22,48 +22,17 @@ from datetime import datetime
from typing import Annotated
import structlog
from fastapi import APIRouter, HTTPException, Query
from fastapi import APIRouter, Query
from src.models.governance import (
GovernanceEventsResponse,
GovernanceQueueResponse,
GovernanceSummaryResponse,
KnowledgeReviewDraftArchiveRequest,
KnowledgeReviewDraftArchiveResponse,
KnowledgeReviewDraftDedupeResponse,
KnowledgeStaleCandidatesResponse,
KnowledgeStaleOwnerReviewBatchQueueRequest,
KnowledgeStaleOwnerReviewBatchQueueResponse,
KnowledgeStaleOwnerReviewBurnDownResponse,
KnowledgeStaleOwnerReviewCompleteRequest,
KnowledgeStaleOwnerReviewCompleteResponse,
KnowledgeStaleOwnerReviewCompletionBatchPreviewRequest,
KnowledgeStaleOwnerReviewCompletionBatchPreviewResponse,
KnowledgeStaleOwnerReviewCompletionQueueResponse,
KnowledgeStaleOwnerReviewInboxResponse,
KnowledgeStaleOwnerReviewRequest,
KnowledgeStaleOwnerReviewResponse,
)
from src.services.governance_km_review_service import (
KmReviewDraftArchiveError,
archive_km_review_draft_duplicates,
)
from src.services.governance_km_stale_review_service import (
KmStaleOwnerReviewError,
batch_queue_km_stale_owner_reviews,
complete_km_stale_owner_review,
preview_km_stale_owner_review_completion_batch,
query_km_stale_owner_review_burndown,
query_km_stale_owner_review_completion_queue,
query_km_stale_owner_review_inbox,
queue_km_stale_owner_review,
)
from src.services.governance_query_service import (
query_governance_events,
query_governance_queue,
query_governance_summary,
query_km_review_draft_dedupe,
query_km_stale_candidates,
)
logger = structlog.get_logger(__name__)
@@ -77,7 +46,6 @@ router = APIRouter()
@router.get("/ai/governance/events", response_model=GovernanceEventsResponse)
async def get_governance_events(
event_id: Annotated[list[str] | None, Query(alias="event_id")] = None,
event_type: Annotated[list[str] | None, Query(alias="event_type")] = None,
from_: Annotated[datetime | None, Query(alias="from")] = None,
to: Annotated[datetime | None, Query(alias="to")] = None,
@@ -90,7 +58,6 @@ async def get_governance_events(
查詢 AI 治理事件列表(分頁)。
- event_type: 多值過濾(可重複傳)
- event_id: 多值精準過濾(可重複傳),供 Telegram 詳情 / 歷史與 Work Items 錨點回看
- from / to: ISO 8601 時間範圍URL 傳 from 參數)
- status: resolved / unresolved
- severity: critical / warning / info由 event_type 映射決定)
@@ -99,7 +66,6 @@ async def get_governance_events(
"""
logger.debug(
"governance_events_request",
event_ids=event_id,
event_types=event_type,
from_=from_,
to=to,
@@ -109,7 +75,6 @@ async def get_governance_events(
size=size,
)
return await query_governance_events(
event_ids=event_id,
event_types=event_type,
from_dt=from_,
to_dt=to,
@@ -128,9 +93,8 @@ async def get_governance_events(
async def get_governance_queue(
dispatch_status: Annotated[
str,
Query(pattern="^(all|pending|dispatched|executing|succeeded|failed|skipped|cancelled)$"),
Query(pattern="^(pending|dispatched|succeeded|failed)$"),
] = "pending",
event_type: Annotated[list[str] | None, Query(alias="event_type")] = None,
page: Annotated[int, Query(ge=1)] = 1,
size: Annotated[int, Query(ge=10, le=100)] = 20,
) -> GovernanceQueueResponse:
@@ -140,360 +104,22 @@ async def get_governance_queue(
governance_remediation_dispatch 表由 Track D 建立,尚未完成時
本 endpoint 回傳 { table_pending: true, items: [], total: 0 },不拋 500。
- dispatch_status: pendingdefault/ dispatched / executing / succeeded / failed / skipped / cancelled / all
- event_type: 多值過濾(可重複傳)
- dispatch_status: pendingdefault/ dispatched / succeeded / failed
- page / size: 分頁
"""
logger.debug(
"governance_queue_request",
dispatch_status=dispatch_status,
event_type=event_type,
page=page,
size=size,
)
return await query_governance_queue(
dispatch_status=dispatch_status,
event_types=event_type,
page=page,
size=size,
)
# =============================================================================
# GET /api/v1/ai/governance/km-review-drafts/dedupe
# =============================================================================
@router.get(
    "/ai/governance/km-review-drafts/dedupe",
    response_model=KnowledgeReviewDraftDedupeResponse,
)
async def get_km_review_draft_dedupe(
    limit: Annotated[int, Query(ge=10, le=200)] = 100,
) -> KnowledgeReviewDraftDedupeResponse:
    """Return the de-duplication read model for Hermes KM healthcheck review drafts.

    Read-only owner-review surface: it reports canonical / duplicate /
    owner_action only and never archives, approves or publishes KM entries.

    Args:
        limit: Maximum number of items requested (10-200, default 100).
    """
    logger.debug("km_review_draft_dedupe_request", limit=limit)
    dedupe_view = await query_km_review_draft_dedupe(limit=limit)
    return dedupe_view
# =============================================================================
# POST /api/v1/ai/governance/km-review-drafts/dedupe/{event_id}/archive-duplicates
# =============================================================================
@router.post(
    "/ai/governance/km-review-drafts/dedupe/{governance_event_id}/archive-duplicates",
    response_model=KnowledgeReviewDraftArchiveResponse,
)
async def post_km_review_draft_archive_duplicates(
    governance_event_id: str,
    request: KnowledgeReviewDraftArchiveRequest,
) -> KnowledgeReviewDraftArchiveResponse:
    """Archive duplicate Hermes KM healthcheck review drafts after owner review.

    This is a write endpoint: the caller must send owner_approved=true
    explicitly, and the backend re-checks the request against the latest dedupe
    plan. Archiving sets KnowledgeEntry.status=archived; nothing is deleted.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmReviewDraftArchiveError raised by the archive service.
    """
    audit_fields = {
        "governance_event_id": governance_event_id,
        "canonical_entry_id": request.canonical_entry_id,
        "duplicate_count": len(request.duplicate_entry_ids),
        "owner": request.owner,
        "dry_run": request.dry_run,
        "owner_approved": request.owner_approved,
    }
    logger.info("km_review_draft_archive_request", **audit_fields)
    try:
        archive_result = await archive_km_review_draft_duplicates(
            governance_event_id=governance_event_id,
            request=request,
        )
    except KmReviewDraftArchiveError as err:
        # Surface the domain error with the HTTP status the service chose.
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return archive_result
# =============================================================================
# GET /api/v1/ai/governance/km-stale-candidates
# =============================================================================
@router.get(
    "/ai/governance/km-stale-candidates",
    response_model=KnowledgeStaleCandidatesResponse,
)
async def get_km_stale_candidates(
    project_id: Annotated[str, Query(min_length=1, max_length=64)] = "awoooi",
    limit: Annotated[int, Query(ge=5, le=100)] = 20,
) -> KnowledgeStaleCandidatesResponse:
    """Return the read-only priority list of stale KM entries.

    Hermes can use this read model to draft KM updates, while the owner console
    can see which entries carry Incident / Sentry / SigNoz / PlayBook context
    instead of only a total count.

    Args:
        project_id: Project to query (1-64 characters, default "awoooi").
        limit: Maximum number of candidates requested (5-100, default 20).
    """
    query_args = {"project_id": project_id, "limit": limit}
    logger.debug("km_stale_candidates_request", **query_args)
    candidates = await query_km_stale_candidates(**query_args)
    return candidates
# =============================================================================
# GET /api/v1/ai/governance/km-stale-owner-reviews
# =============================================================================
@router.get(
    "/ai/governance/km-stale-owner-reviews",
    response_model=KnowledgeStaleOwnerReviewInboxResponse,
)
async def get_km_stale_owner_reviews(
    project_id: Annotated[str, Query(min_length=1, max_length=64)] = "awoooi",
    dispatch_status: Annotated[
        str,
        Query(pattern="^(all|pending|dispatched|executing|succeeded|failed|skipped|cancelled)$"),
    ] = "pending",
    limit: Annotated[int, Query(ge=5, le=100)] = 20,
) -> KnowledgeStaleOwnerReviewInboxResponse:
    """Return the stale-KM owner-review inbox.

    Read-only inbox that merges the dispatch trail with KM priority context so
    an operator can complete items one by one by P0/P1 tier, score, batch
    origin and workflow stage.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the inbox query.
    """
    query_args = {
        "project_id": project_id,
        "dispatch_status": dispatch_status,
        "limit": limit,
    }
    logger.debug("km_stale_owner_reviews_request", **query_args)
    try:
        inbox = await query_km_stale_owner_review_inbox(**query_args)
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return inbox
# =============================================================================
# GET /api/v1/ai/governance/km-stale-owner-review-burndown
# =============================================================================
@router.get(
    "/ai/governance/km-stale-owner-review-burndown",
    response_model=KnowledgeStaleOwnerReviewBurnDownResponse,
)
async def get_km_stale_owner_review_burndown(
    project_id: Annotated[str, Query(min_length=1, max_length=64)] = "awoooi",
    limit: Annotated[int, Query(ge=1, le=100)] = 20,
) -> KnowledgeStaleOwnerReviewBurnDownResponse:
    """Return owner-review completion and stale-ratio burn-down status.

    Read-only dashboard that puts pending reviews, completion audit, the
    recheck snapshot and the remaining count to the governance threshold into
    a single front-end panel.

    Args:
        project_id: Project to query (1-64 characters, default "awoooi").
        limit: Maximum number of items requested (1-100, default 20).
    """
    query_args = {"project_id": project_id, "limit": limit}
    logger.debug("km_stale_owner_review_burndown_request", **query_args)
    burndown = await query_km_stale_owner_review_burndown(**query_args)
    return burndown
# =============================================================================
# GET /api/v1/ai/governance/km-stale-owner-review-completion-queue
# =============================================================================
@router.get(
    "/ai/governance/km-stale-owner-review-completion-queue",
    response_model=KnowledgeStaleOwnerReviewCompletionQueueResponse,
)
async def get_km_stale_owner_review_completion_queue(
    project_id: Annotated[str, Query(min_length=1, max_length=64)] = "awoooi",
    status_bucket: Annotated[
        str,
        Query(pattern="^(all|ready|blocked|completed|failed|pending)$"),
    ] = "all",
    priority_tier: Annotated[list[str] | None, Query(alias="priority_tier")] = None,
    recommended_completion_outcome: Annotated[
        str,
        Query(pattern="^(all|refresh_with_evidence|archive|supersede)$"),
    ] = "all",
    batch_governance_event_id: Annotated[str | None, Query(max_length=120)] = None,
    can_preview: bool | None = None,
    limit: Annotated[int, Query(ge=1, le=100)] = 20,
) -> KnowledgeStaleOwnerReviewCompletionQueueResponse:
    """Return the stale-KM owner-review completion triage queue.

    Read-only queue that splits active / completed / failed dispatches into
    ready, blocked, completed and failed buckets so the front end can show the
    next blocker. Opening the page does not write KM.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the queue query.
    """
    # The repeatable `priority_tier` query parameter is forwarded (and logged)
    # under the plural name `priority_tiers`.
    query_args = {
        "project_id": project_id,
        "status_bucket": status_bucket,
        "priority_tiers": priority_tier,
        "recommended_completion_outcome": recommended_completion_outcome,
        "batch_governance_event_id": batch_governance_event_id,
        "can_preview": can_preview,
        "limit": limit,
    }
    logger.debug("km_stale_owner_review_completion_queue_request", **query_args)
    try:
        queue = await query_km_stale_owner_review_completion_queue(**query_args)
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return queue
# =============================================================================
# POST /api/v1/ai/governance/km-stale-owner-review-completion-queue/batch-preview
# =============================================================================
@router.post(
    "/ai/governance/km-stale-owner-review-completion-queue/batch-preview",
    response_model=KnowledgeStaleOwnerReviewCompletionBatchPreviewResponse,
)
async def post_km_stale_owner_review_completion_batch_preview(
    request: KnowledgeStaleOwnerReviewCompletionBatchPreviewRequest,
) -> KnowledgeStaleOwnerReviewCompletionBatchPreviewResponse:
    """Preview a bounded set of owner-review completion candidates.

    Dry-run only by design: no KM write, no batch executor is enqueued and no
    governance audit rows are created. Every item still has to be completed
    through the single-item dry-run + owner-confirm endpoint.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the preview service.
    """
    audit_fields = {
        "project_id": request.project_id,
        "status_bucket": request.status_bucket,
        "priority_tiers": request.priority_tiers,
        "recommended_completion_outcome": request.recommended_completion_outcome,
        "batch_governance_event_id": request.batch_governance_event_id,
        "limit": request.limit,
        "owner": request.owner,
    }
    logger.info("km_stale_owner_review_completion_batch_preview_request", **audit_fields)
    try:
        preview = await preview_km_stale_owner_review_completion_batch(request=request)
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return preview
# =============================================================================
# POST /api/v1/ai/governance/km-stale-candidates/batch-queue-review
# =============================================================================
@router.post(
    "/ai/governance/km-stale-candidates/batch-queue-review",
    response_model=KnowledgeStaleOwnerReviewBatchQueueResponse,
)
async def post_km_stale_candidate_batch_queue_review(
    request: KnowledgeStaleOwnerReviewBatchQueueRequest,
) -> KnowledgeStaleOwnerReviewBatchQueueResponse:
    """Queue P0/P1 stale KM entries for owner review in one batch.

    Only a batch audit record and per-item owner-review dispatches are created;
    KM content is not rewritten. The actual refresh / archive / supersede still
    requires a single-item dry-run fingerprint plus owner approval.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the batch-queue service.
    """
    audit_fields = {
        "project_id": request.project_id,
        "priority_tiers": request.priority_tiers,
        "limit": request.limit,
        "owner": request.owner,
        "dry_run": request.dry_run,
    }
    logger.info("km_stale_candidate_batch_queue_review_request", **audit_fields)
    try:
        queued = await batch_queue_km_stale_owner_reviews(request=request)
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return queued
# =============================================================================
# POST /api/v1/ai/governance/km-stale-candidates/{entry_id}/queue-review
# =============================================================================
@router.post(
    "/ai/governance/km-stale-candidates/{entry_id}/queue-review",
    response_model=KnowledgeStaleOwnerReviewResponse,
)
async def post_km_stale_candidate_queue_review(
    entry_id: str,
    request: KnowledgeStaleOwnerReviewRequest,
) -> KnowledgeStaleOwnerReviewResponse:
    """Queue a single stale KM candidate for owner review.

    Only a governance event and a dispatch work item are created; KM content is
    not modified. The actual refresh / archive / supersede still needs owner
    confirmation in the follow-up flow.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the queue service.
    """
    logger.info(
        "km_stale_candidate_queue_review_request",
        entry_id=entry_id,
        owner=request.owner,
        dry_run=request.dry_run,
    )
    try:
        queued = await queue_km_stale_owner_review(entry_id=entry_id, request=request)
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return queued
# =============================================================================
# POST /api/v1/ai/governance/km-stale-candidates/{entry_id}/complete-review
# =============================================================================
@router.post(
    "/ai/governance/km-stale-candidates/{entry_id}/complete-review",
    response_model=KnowledgeStaleOwnerReviewCompleteResponse,
)
async def post_km_stale_candidate_complete_review(
    entry_id: str,
    request: KnowledgeStaleOwnerReviewCompleteRequest,
) -> KnowledgeStaleOwnerReviewCompleteResponse:
    """Complete the refresh / archive / supersede flow of a stale KM entry.

    A dry-run must be done first to obtain a fingerprint; the real write
    requires owner_approved=true. The backend then writes KM, the terminal
    audit dispatch and the stale-ratio recheck dispatch.

    Raises:
        HTTPException: Carries the status code and detail of a
            KmStaleOwnerReviewError raised by the completion service.
    """
    audit_fields = {
        "entry_id": entry_id,
        "dispatch_id": request.dispatch_id,
        "owner": request.owner,
        "review_outcome": request.review_outcome,
        "dry_run": request.dry_run,
        "owner_approved": request.owner_approved,
    }
    logger.info("km_stale_candidate_complete_review_request", **audit_fields)
    try:
        completion = await complete_km_stale_owner_review(
            entry_id=entry_id,
            request=request,
        )
    except KmStaleOwnerReviewError as err:
        raise HTTPException(status_code=err.status_code, detail=err.detail) from err
    return completion
# =============================================================================
# GET /api/v1/ai/governance/summary
# =============================================================================

View File

@@ -18,15 +18,8 @@ Endpoints:
from __future__ import annotations
import structlog
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel, Field
from fastapi import APIRouter, Query
from src.services.adr100_remediation_service import (
RemediationMode,
RemediationNotFoundError,
get_adr100_remediation_service,
)
from src.services.adr100_slo_status_service import get_adr100_slo_status_service
from src.services.ai_slo_calculator import AiSloCalculator
logger = structlog.get_logger(__name__)
@@ -34,20 +27,6 @@ logger = structlog.get_logger(__name__)
router = APIRouter()
class RemediationPreviewRequest(BaseModel):
    """JSON body of the ADR-100 remediation preview endpoint."""

    # Required, non-empty identifier of the queue item to preview.
    work_item_id: str = Field(..., min_length=1)
    # Remediation mode; "auto" when the client does not choose one.
    mode: RemediationMode = "auto"
class RemediationDryRunRequest(BaseModel):
    """JSON body of the ADR-100 remediation dry-run endpoint."""

    # Required, non-empty identifier of the queue item to dry-run.
    work_item_id: str = Field(..., min_length=1)
    # Remediation mode; "auto" when the client does not choose one.
    mode: RemediationMode = "auto"
@router.get("/ai/slo")
async def get_ai_slo(
force_refresh: bool = Query(False, description="忽略快取,強制重算"),
@@ -71,65 +50,9 @@ async def get_ai_slo(
if cached:
data = cached.to_dict()
data["cache_hit"] = True
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
return data
report = await calc.run()
data = report.to_dict()
data["cache_hit"] = False
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
return data
@router.get("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation(
    work_item_id: str = Query(..., min_length=1),
    mode: RemediationMode = Query("auto"),
) -> dict:
    """Preview the safe remediation plan for one ADR-100 queue item.

    Args:
        work_item_id: Non-empty identifier of the queue item.
        mode: Remediation mode, "auto" by default.

    Raises:
        HTTPException: 404 with detail "remediation_work_item_not_found" when
            the service raises RemediationNotFoundError.
    """
    try:
        plan = await get_adr100_remediation_service().preview(work_item_id, mode)
    except RemediationNotFoundError as err:
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from err
    return plan
@router.post("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation_post(request: RemediationPreviewRequest) -> dict:
    """Preview a remediation plan; POST variant for clients that prefer JSON bodies.

    Raises:
        HTTPException: 404 with detail "remediation_work_item_not_found" when
            the service raises RemediationNotFoundError.
    """
    try:
        plan = await get_adr100_remediation_service().preview(
            request.work_item_id,
            request.mode,
        )
    except RemediationNotFoundError as err:
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from err
    return plan
@router.post("/ai/slo/remediation/dry-run")
async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
    """Run a read-only ADR-100 remediation dry-run."""
    try:
        remediation_service = get_adr100_remediation_service()
        outcome = await remediation_service.dry_run(request.work_item_id, request.mode)
    except RemediationNotFoundError as exc:
        # A missing work item is surfaced to the client as HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="remediation_work_item_not_found",
        ) from exc
    return outcome
@router.get("/ai/slo/remediation/history")
async def list_ai_slo_remediation_history(
    limit: int = Query(50, ge=1, le=200),
    incident_id: str | None = Query(default=None, min_length=1),
    work_item_id: str | None = Query(default=None, min_length=1),
) -> dict:
    """List durable ADR-100 remediation dry-run history from alert_operation_log."""
    remediation_service = get_adr100_remediation_service()
    # Both filters are optional; None is forwarded to the service unchanged.
    history = await remediation_service.history(
        limit=limit,
        incident_id=incident_id,
        work_item_id=work_item_id,
    )
    return history

View File

@@ -20,7 +20,6 @@ from pydantic import BaseModel
from src.core.config import settings
from src.core.logging import get_logger
from src.core.sse import EventPublisher, EventType, SSEEvent, get_publisher
from src.services.dashboard_metrics_service import fetch_pending_approval_count
from src.services.host_aggregator import AggregatedStatus, HostAggregator
router = APIRouter()
@@ -142,14 +141,12 @@ async def dashboard_update_loop(publisher: EventPublisher) -> None:
try:
# Fetch aggregated status
status = await HostAggregator.fetch_all()
pending_approvals = await fetch_pending_approval_count()
# Publish to all connected clients
event = SSEEvent(
type=EventType.HOST_UPDATE,
data={
"overall_status": status.overall_status,
"pending_approvals": pending_approvals,
"hosts": [
{
"ip": h.ip,
@@ -209,9 +206,7 @@ async def get_dashboard() -> DashboardResponse:
logger.info("dashboard_fetch")
status = await HostAggregator.fetch_all()
response = aggregated_to_response(status)
response.pending_approvals = await fetch_pending_approval_count()
return response
return aggregated_to_response(status)
@router.get("/dashboard/stream")

View File

@@ -13,12 +13,10 @@ leWOOOgo 積木化原則:
建立者: Claude Code (Phase 25 P2)
"""
from typing import Literal
from fastapi import APIRouter, BackgroundTasks, HTTPException
from pydantic import BaseModel, Field
from src.core.csrf import CSRFToken # Phase 20: CSRF Protection
from src.models.drift import (
DriftListResponse,
DriftReport,
@@ -30,10 +28,6 @@ from src.repositories.drift_repository import get_drift_repository
from src.services.drift_adopt_service import get_drift_adopt_service
from src.services.drift_analyzer import get_drift_analyzer
from src.services.drift_detector import get_drift_detector
from src.services.drift_fingerprint_state_service import (
DriftFingerprintStateNotFoundError,
get_drift_fingerprint_state_service,
)
from src.services.drift_interpreter import get_drift_interpreter
from src.services.drift_remediator import get_drift_remediator
from src.utils.timezone import now_taipei
@@ -43,42 +37,6 @@ router = APIRouter(prefix="/drift", tags=["drift"])
# 2026-04-09 Claude Sonnet 4.6: B4 drift_reports 持久化 — 改用 DB repository
class DriftFingerprintHandoffRequest(BaseModel):
"""Record-only handoff request for a stable drift fingerprint."""
report_id: str | None = Field(default=None, min_length=1)
namespace: str | None = Field(default="awoooi-prod", min_length=1)
handoff_kind: Literal[
"open_pr_review",
"manual_investigation",
"zero_diff_pr_cleanup",
] = "open_pr_review"
pr_url: str | None = Field(default=None, min_length=1)
note: str | None = Field(default=None, max_length=500)
class DriftFingerprintRemediationRequest(BaseModel):
"""Record-only remediation request for a stable drift fingerprint."""
report_id: str | None = Field(default=None, min_length=1)
namespace: str | None = Field(default="awoooi-prod", min_length=1)
remediation_kind: Literal[
"live_env_rollback",
"git_adopted",
"git_rollback",
"zero_diff_pr_cleanup",
"manual_noop",
] = "live_env_rollback"
remediation_status: Literal[
"executed_unverified",
"verified_no_drift",
"verification_failed",
] | None = None
verification_report_id: str | None = Field(default=None, min_length=1)
note: str | None = Field(default=None, max_length=1000)
commands_summary: list[str] = Field(default_factory=list, max_length=12)
@router.post("/scan", response_model=DriftScanResponse, summary="觸發漂移掃描")
async def trigger_drift_scan(
request: DriftScanRequest,
@@ -141,72 +99,6 @@ async def list_drift_reports() -> DriftListResponse:
return DriftListResponse(items=items, total=len(items))
@router.get("/fingerprints/state", summary="查詢 Config Drift fingerprint 狀態")
async def get_drift_fingerprint_state(
report_id: str | None = None,
namespace: str | None = "awoooi-prod",
) -> dict:
"""
以 stable fingerprint 聚合漂移狀態。
此 endpoint 只建立 read model重複次數、PR 狀態、是否零 diff、
人工交接歷史與下一步。它不修改 drift / incident / auto-repair 狀態。
"""
svc = get_drift_fingerprint_state_service()
try:
return await svc.get_state(report_id=report_id, namespace=namespace)
except DriftFingerprintStateNotFoundError as exc:
raise HTTPException(status_code=404, detail="drift_report_not_found") from exc
@router.post("/fingerprints/handoff", summary="記錄 Config Drift fingerprint 交接")
async def record_drift_fingerprint_handoff(
request: DriftFingerprintHandoffRequest,
) -> dict:
"""
記錄 stable fingerprint 已轉人工 / PR review 的歷史證據。
安全邊界:只寫 alert_operation_log / timeline_events不修改 drift 狀態、
incident 狀態、自動修復結果,不建立外部 ticket也不 merge PR。
"""
svc = get_drift_fingerprint_state_service()
try:
return await svc.record_handoff(
report_id=request.report_id,
namespace=request.namespace,
handoff_kind=request.handoff_kind,
pr_url=request.pr_url,
note=request.note,
)
except DriftFingerprintStateNotFoundError as exc:
raise HTTPException(status_code=404, detail="drift_report_not_found") from exc
@router.post("/fingerprints/remediation", summary="記錄 Config Drift fingerprint 修復")
async def record_drift_fingerprint_remediation(
request: DriftFingerprintRemediationRequest,
) -> dict:
"""
記錄 stable fingerprint 已完成的修復 / 驗證證據。
安全邊界:只寫 alert_operation_log / timeline_events不修改 drift 狀態、
incident 狀態、自動修復結果,不建立外部 ticket也不執行 kubectl。
"""
svc = get_drift_fingerprint_state_service()
try:
return await svc.record_remediation(
report_id=request.report_id,
namespace=request.namespace,
remediation_kind=request.remediation_kind,
remediation_status=request.remediation_status,
verification_report_id=request.verification_report_id,
note=request.note,
commands_summary=request.commands_summary,
)
except DriftFingerprintStateNotFoundError as exc:
raise HTTPException(status_code=404, detail="drift_report_not_found") from exc
@router.post("/reports/{report_id}/rollback", summary="覆蓋回 Git 狀態")
async def rollback_drift(report_id: str, _csrf_token: CSRFToken) -> dict: # Phase 20: CSRF Protection (驗證用,不需要使用值)
"""

View File

@@ -418,9 +418,7 @@ async def _send_gitea_notification(
logger.debug("gitea_tg_skipped", reason="Bot token not configured")
return
from src.services.telegram_gateway import (
get_telegram_gateway, # type: ignore[import]
)
from src.services.telegram_gateway import get_telegram_gateway # type: ignore[import]
gateway = get_telegram_gateway()
await gateway.initialize()
await gateway.send_alert_notification(message)
@@ -504,22 +502,15 @@ async def handle_pull_request(
review_id = f"gitea-pr-{payload.repository.id}-{pr.number}-{uuid.uuid4().hex[:8]}"
# 背景執行審查 (委派給 Service)
if settings.MOCK_MODE:
logger.info(
"gitea_pr_review_background_skipped_mock_mode",
review_id=review_id,
repo=payload.repository.full_name,
)
else:
service = get_gitea_webhook_service()
background_tasks.add_task(
service.review_pull_request,
repo=payload.repository,
pr=pr,
sender=payload.sender,
review_id=review_id,
action=payload.action,
)
service = get_gitea_webhook_service()
background_tasks.add_task(
service.review_pull_request,
repo=payload.repository,
pr=pr,
sender=payload.sender,
review_id=review_id,
action=payload.action,
)
logger.info(
"gitea_pr_review_scheduled",
@@ -570,24 +561,17 @@ async def handle_push(
review_id = f"gitea-push-{payload.repository.id}-{payload.after[:8]}-{uuid.uuid4().hex[:8]}"
# 背景執行審查 (委派給 Service)
if settings.MOCK_MODE:
logger.info(
"gitea_push_review_background_skipped_mock_mode",
review_id=review_id,
repo=payload.repository.full_name,
)
else:
service = get_gitea_webhook_service()
background_tasks.add_task(
service.review_push,
repo=payload.repository,
commits=commits,
sender=payload.sender,
review_id=review_id,
ref=ref,
before_sha=payload.before,
after_sha=payload.after,
)
service = get_gitea_webhook_service()
background_tasks.add_task(
service.review_push,
repo=payload.repository,
commits=commits,
sender=payload.sender,
review_id=review_id,
ref=ref,
before_sha=payload.before,
after_sha=payload.after,
)
logger.info(
"gitea_push_review_scheduled",

View File

@@ -11,7 +11,7 @@ Endpoints:
Components Checked:
- PostgreSQL (192.168.0.188:5432)
- Redis (192.168.0.188:6380)
- Ollama ADR-110 provider pool (GCP-A -> GCP-B -> 111)
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
- OpenClaw (192.168.0.188:8089)
- SigNoz (192.168.0.188:3301)
"""
@@ -26,16 +26,9 @@ from pydantic import BaseModel
from src.core.config import settings
from src.core.logging import get_logger
from src.services.health_check_service import get_health_check_service
from src.services.ollama_endpoint_circuit_breaker import (
get_ollama_endpoint_cooldown_remaining_seconds,
record_ollama_endpoint_failure,
record_ollama_endpoint_success,
)
from src.services.ollama_endpoint_resolver import resolve_ollama_order
router = APIRouter()
logger = get_logger("awoooi.health")
CORE_COMPONENTS = ("api", "postgresql", "redis", "ollama", "openclaw", "signoz")
# =============================================================================
@@ -47,11 +40,6 @@ class ComponentHealth(BaseModel):
status: Literal["up", "down", "degraded"]
latency_ms: float | None = None
error: str | None = None
provider_name: str | None = None
diagnosis_code: str | None = None
retry_after_seconds: float | None = None
cooldown_remaining_seconds: float | None = None
is_cooldown: bool = False
class HealthResponse(BaseModel):
@@ -62,7 +50,6 @@ class HealthResponse(BaseModel):
mock_mode: bool
timestamp: datetime
components: dict[str, ComponentHealth]
ollama_route_order: list[str] = []
# =============================================================================
@@ -119,125 +106,8 @@ async def check_redis() -> ComponentHealth:
async def check_ollama() -> ComponentHealth:
"""Async aggregate Ollama health check via ADR-110 provider chain."""
aggregate, _details = await check_ollama_provider_chain()
return aggregate
async def check_ollama_provider_chain() -> tuple[ComponentHealth, dict[str, ComponentHealth]]:
    """
    Probe every configured Ollama endpoint and summarise route availability.

    Endpoints come from ``resolve_ollama_order("healthcheck")``; entries without
    a URL or named ``ollama_unconfigured`` are ignored. All remaining endpoints
    are probed concurrently.

    Returns:
        A ``(aggregate, details)`` tuple. ``details`` maps provider name to its
        individual probe result. ``aggregate`` is:
        - the first endpoint's own result when that endpoint is ``ollama_gcp_a``
          and it is up;
        - ``degraded`` when any endpoint in the chain is up otherwise (the
          latency reported is that of the first endpoint found up);
        - ``down`` when no endpoint is configured or none is up.
    """
    candidates = tuple(
        candidate
        for candidate in resolve_ollama_order("healthcheck")
        if candidate.url and candidate.provider_name != "ollama_unconfigured"
    )
    if not candidates:
        return ComponentHealth(status="down", error="no Ollama endpoints configured"), {}

    probes = [
        _ollama_endpoint_health_check(candidate.provider_name, candidate.url)
        for candidate in candidates
    ]
    outcomes = await asyncio.gather(*probes)

    per_provider: dict[str, ComponentHealth] = {}
    for candidate, outcome in zip(candidates, outcomes, strict=False):
        per_provider[candidate.provider_name] = outcome

    # Healthy primary: only the GCP-A provider at the head of the chain counts.
    head = candidates[0]
    head_health = per_provider[head.provider_name]
    if head.provider_name == "ollama_gcp_a" and head_health.status == "up":
        return head_health, per_provider

    reachable = next(
        (
            candidate
            for candidate in candidates
            if per_provider[candidate.provider_name].status == "up"
        ),
        None,
    )
    if not reachable:
        # Nothing answered: fold every provider's error (or status) into one message.
        failure_summary = ", ".join(
            f"{provider}={health.error or health.status}"
            for provider, health in per_provider.items()
        )
        return (
            ComponentHealth(
                status="down",
                error=f"all Ollama endpoints unavailable: {failure_summary}",
            ),
            per_provider,
        )

    standby = per_provider[reachable.provider_name]
    return (
        ComponentHealth(
            status="degraded",
            latency_ms=standby.latency_ms,
            error=f"primary unavailable; fallback active: {reachable.provider_name}",
        ),
        per_provider,
    )
async def _ollama_endpoint_health_check(name: str, url: str) -> ComponentHealth:
    """
    Probe one Ollama endpoint, honouring the endpoint circuit breaker.

    While the breaker reports a positive cooldown for ``url`` no HTTP request is
    made and a ``down`` result flagged ``is_cooldown`` is returned. Otherwise
    ``/api/tags`` is probed, the result is tagged with the provider name and a
    diagnosis code, and the outcome is recorded with the circuit breaker.
    """
    remaining = get_ollama_endpoint_cooldown_remaining_seconds(url)
    if remaining > 0:
        rounded_remaining = round(remaining, 1)
        return ComponentHealth(
            status="down",
            error=f"recent endpoint failure cooldown: {remaining:.0f}s",
            provider_name=name,
            diagnosis_code="endpoint_cooldown",
            retry_after_seconds=rounded_remaining,
            cooldown_remaining_seconds=rounded_remaining,
            is_cooldown=True,
        )

    probe = await _http_health_check(name, url, "/api/tags")
    probe.provider_name = name
    if probe.status != "up":
        probe.diagnosis_code = _classify_ollama_endpoint_failure(name, probe.error)
        record_ollama_endpoint_failure(url)
        return probe

    probe.diagnosis_code = "endpoint_reachable"
    record_ollama_endpoint_success(url)
    return probe
def _classify_ollama_endpoint_failure(
provider_name: str,
error: str | None,
) -> str:
"""Return a stable diagnosis code for UI/alert rendering."""
normalized_error = (error or "").lower()
if "cooldown" in normalized_error:
return "endpoint_cooldown"
if "502" in normalized_error or "bad gateway" in normalized_error:
return (
"local_proxy_upstream_unreachable"
if provider_name == "ollama_local"
else "proxy_upstream_unreachable"
)
if "timeout" in normalized_error:
return "endpoint_timeout"
if "connection refused" in normalized_error:
return "endpoint_connection_refused"
if "no route to host" in normalized_error or "network is unreachable" in normalized_error:
return "endpoint_network_unreachable"
return "endpoint_unreachable"
"""Async Ollama health check via /api/tags"""
return await _http_health_check("ollama", settings.OLLAMA_URL, "/api/tags")
async def check_openclaw() -> ComponentHealth:
@@ -250,30 +120,6 @@ async def check_signoz() -> ComponentHealth:
return await _http_health_check("signoz", settings.SIGNOZ_URL, "/api/v1/health")
def _determine_overall_status(
    components: dict[str, ComponentHealth],
) -> Literal["healthy", "degraded", "unhealthy"]:
    """
    Derive the overall health verdict from the core components only.

    Only names listed in ``CORE_COMPONENTS`` are counted; any other entries in
    ``components`` are ignored. PostgreSQL and Redis are critical: either being
    down (or absent from ``components``) makes the result ``unhealthy``, as do
    three or more core components being down. Otherwise any core component
    that is down or degraded yields ``degraded``.
    """

    def _critical_is_down(name: str) -> bool:
        # A critical component missing from the mapping counts as down.
        component = components.get(name)
        return component is None or component.status == "down"

    core_statuses = [
        components[name].status for name in CORE_COMPONENTS if name in components
    ]
    down_total = sum(1 for state in core_statuses if state == "down")
    degraded_total = sum(1 for state in core_statuses if state == "degraded")

    if _critical_is_down("postgresql") or _critical_is_down("redis") or down_total >= 3:
        return "unhealthy"
    if down_total == 0 and degraded_total == 0:
        return "healthy"
    return "degraded"
# =============================================================================
# Endpoints
# =============================================================================
@@ -296,28 +142,34 @@ async def get_health() -> HealthResponse:
results = await asyncio.gather(
check_postgresql(),
check_redis(),
check_ollama_provider_chain(),
check_ollama(),
check_openclaw(),
check_signoz(),
)
ollama_aggregate, ollama_details = results[2]
components = {
"api": ComponentHealth(status="up", latency_ms=0.0),
"postgresql": results[0],
"redis": results[1],
"ollama": ollama_aggregate,
"ollama": results[2],
"openclaw": results[3],
"signoz": results[4],
}
components.update(ollama_details)
overall_status = _determine_overall_status(components)
ollama_route_order = [
selection.provider_name
for selection in resolve_ollama_order("healthcheck")
if selection.url and selection.provider_name != "ollama_unconfigured"
]
# Determine overall status
statuses = [c.status for c in components.values()]
down_count = statuses.count("down")
degraded_count = statuses.count("degraded")
# Critical services: postgresql, redis
critical_down = components["postgresql"].status == "down" or components["redis"].status == "down"
if critical_down or down_count >= 3:
overall_status: Literal["healthy", "degraded", "unhealthy"] = "unhealthy"
elif down_count >= 1 or degraded_count > 0:
overall_status = "degraded"
else:
overall_status = "healthy"
logger.info(
"health_check_complete",
@@ -333,7 +185,6 @@ async def get_health() -> HealthResponse:
mock_mode=settings.MOCK_MODE,
timestamp=datetime.now(UTC),
components=components,
ollama_route_order=ollama_route_order,
)

View File

@@ -17,10 +17,9 @@ Phase 6.4 核心功能:
- Proposal 必須關聯到 Incident
"""
from datetime import UTC, datetime, timedelta
from typing import Any
from fastapi import APIRouter, HTTPException, Query, status
from fastapi import APIRouter, HTTPException, status
from pydantic import BaseModel, Field
from src.core.logging import get_logger
@@ -134,7 +133,6 @@ class IncidentTimelineResponse(BaseModel):
timeline: list[IncidentTimelineStage] = Field(default_factory=list)
events: list[IncidentTimelineEvent] = Field(default_factory=list)
ascii_timeline: str
reconciliation: dict[str, Any] = Field(default_factory=dict)
# =============================================================================
@@ -150,26 +148,18 @@ class IncidentTimelineResponse(BaseModel):
Phase 6.5 升級:
- 每個事件自動附帶 decision_token
- 預設只讀取已存在的 decision_token
- 需要新決策時改由明確的 proposal / operator run 入口觸發
- 確保 UI 永遠有決策可操作
- 雙軌引擎: LLM (主) + Expert System (備)
""",
)
async def list_incidents(
generate_missing_decisions: bool = Query(
False,
description=(
"預設 false列表查詢只讀既有 decision token"
"true 僅供明確維運操作使用,會背景產生缺少的決策。"
),
),
) -> IncidentListResponse:
async def list_incidents() -> IncidentListResponse:
"""
取得活躍事件清單
Phase 6.5: 附帶既有決策令牌
- 列表查詢必須是低成本純讀路徑
- 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw
- 需要新決策時,呼叫 POST /api/v1/incidents/{incident_id}/proposal
Phase 6.5: 自動為每個事件生成決策令牌
- P0/P1 事件優先處理
- 30 秒內保證有決策
- LLM 失敗時 Expert System 保底
Returns:
IncidentListResponse: 事件清單與計數 (含決策令牌)
@@ -184,6 +174,8 @@ async def list_incidents(
# 按時間排序 (最新優先)
# 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題
from datetime import UTC
def safe_created_at(i: Incident) -> float:
"""安全取得 timestamp處理 timezone 混合問題"""
dt = i.created_at
@@ -197,24 +189,15 @@ async def list_incidents(
# 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
# 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸
# 修復: 只取已存在的決策 token若無則背景觸發生成前端 poll 單筆 GET 取得結果
#
# 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。
# 根因: 多個前端頁面會輪詢 GET /incidents若列表查詢偷偷 create_task
# 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽,甚至 fallback 到 Gemini。
# 新規則: GET list 是純讀;生成新修復建議必須走明確 proposal/operator-run 入口。
if generate_missing_decisions:
import asyncio
import asyncio
responses = []
background_tasks = []
existing_tokens = await decision_manager._find_existing_tokens_for_incidents(
[incident.incident_id for incident in incidents]
)
for incident in incidents:
try:
# 只查已快取的決策 (不等待 AI立即返回)
existing = existing_tokens.get(incident.incident_id)
existing = await decision_manager._find_existing_token(incident.incident_id)
if existing:
decision_info = DecisionInfo(
token=existing.token,
@@ -224,20 +207,17 @@ async def list_incidents(
)
responses.append(IncidentResponse.from_incident(incident, decision_info))
else:
# 無快取 → 本次返回 None。列表查詢預設不觸發 AI
# 前端若需要修復建議,必須呼叫明確的 proposal 入口。
# 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll
responses.append(IncidentResponse.from_incident(incident, None))
if not generate_missing_decisions:
continue
# 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析
# 舊 incident token 每小時過期,若不限制會反覆重新分析歷史事件 → Telegram 洪水
from datetime import datetime, timezone, timedelta
_created = getattr(incident, "created_at", None)
_too_old = False
if _created:
if _created.tzinfo is None:
_created = _created.replace(tzinfo=UTC)
_too_old = (_created < datetime.now(UTC) - timedelta(hours=48))
_created = _created.replace(tzinfo=timezone.utc)
_too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48))
if not _too_old:
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
background_tasks.append(
@@ -260,7 +240,6 @@ async def list_incidents(
"incidents_listed",
count=len(incidents),
with_decisions=sum(1 for r in responses if r.decision is not None),
generate_missing_decisions=generate_missing_decisions,
)
return IncidentListResponse(

View File

@@ -9,21 +9,14 @@ ADR-106/ADR-107/ADR-114/ADR-115/ADR-116
from fastapi import APIRouter
from src.api.v1.platform.contracts import router as contracts_router
from src.api.v1.platform.events import router as events_router
from src.api.v1.platform.operator_runs import router as operator_runs_router
from src.api.v1.platform.runs import router as runs_router
from src.api.v1.platform.tenants import router as tenants_router
from src.api.v1.platform.truth_chain import router as truth_chain_router
router = APIRouter()
router.include_router(events_router)
router.include_router(truth_chain_router)
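# 2026-05-06 Codex: FastAPI matches routes in registration order. The Operator
# Console's `/runs/list` must be registered before `/runs/{run_id}`; otherwise
# `list` is treated as a run_id, and the frontend Run monitoring page gets HTTP 422.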
router.include_router(operator_runs_router)
router.include_router(runs_router)
router.include_router(tenants_router)
router.include_router(contracts_router)
router.include_router(operator_runs_router)
__all__ = ["router"]

View File

@@ -1,586 +0,0 @@
"""
AwoooP Operator Console — Channel Events API
============================================
提供 Operator Console 讀取 Communication Hub / legacy mirror 的事件摘要。
"""
from __future__ import annotations
from datetime import UTC, datetime
from typing import Annotated, Any, Literal
from uuid import UUID
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, Field
from src.core.awooop_operator_auth import (
AwoooPOperatorPrincipal,
verify_awooop_operator,
)
from src.services.channel_event_dossier_service import (
RecurrenceWorkItemHandoffKind,
RecurrenceWorkItemMode,
RecurrenceWorkItemNotFoundError,
SourceCorrelationReviewDecision,
fetch_channel_event_dossier,
fetch_channel_event_dossier_coverage,
fetch_channel_event_dossier_recurrence,
fetch_recurrence_work_item_dry_run,
fetch_recurrence_work_item_handoff,
fetch_recurrence_work_item_preview,
fetch_source_correlation_apply,
fetch_source_correlation_review_decision,
)
from src.services.channel_hub import record_external_alert_event
from src.services.platform_operator_service import list_recent_channel_events
router = APIRouter()
class ChannelEventItem(BaseModel):
event_id: UUID
project_id: str
channel_type: str
provider_event_id: str
channel_chat_id: str | None
content_preview: str | None
is_duplicate: bool
received_at: datetime
class RecentEventsResponse(BaseModel):
events: list[ChannelEventItem]
total: int
limit: int
class ChannelEventDossierItem(BaseModel):
event_id: UUID
project_id: str
channel_type: str
provider: str | None
stage: str
provider_event_id: str
content_preview: str | None
content_redacted: str | None
has_redacted_content: bool
redaction_version: str | None
source_url: str | None
content_sha256: str | None
content_length: int | None
source_refs: dict[str, Any]
source_ref_count: int
log_correlation: dict[str, Any]
alertname: str | None
severity: str | None
namespace: str | None
target_resource: str | None
fingerprint: str | None
is_duplicate: bool
provider_ts: datetime | None
received_at: datetime
class ChannelEventDossierSummary(BaseModel):
source_count: int
duplicate_total: int
redacted_total: int
source_ref_total: int
class ChannelEventDossierResponse(BaseModel):
events: list[ChannelEventDossierItem]
total: int
limit: int
summary: ChannelEventDossierSummary
class ChannelEventProviderCoverage(BaseModel):
provider: str
total: int
duplicate_total: int
redacted_total: int
source_ref_total: int
missing_source_refs_total: int
sentry_ref_total: int
signoz_ref_total: int
alert_ref_total: int
latest_received_at: datetime | None
class ChannelEventDossierCoverageSummary(BaseModel):
source_count: int
source_envelope_total: int
missing_source_envelope_total: int
with_source_refs_total: int
missing_source_refs_total: int
duplicate_total: int
redacted_total: int
source_ref_total: int
sentry_ref_total: int
signoz_ref_total: int
alert_ref_total: int
latest_received_at: datetime | None
class ChannelEventDossierCoverageResponse(BaseModel):
project_id: str
limit: int
summary: ChannelEventDossierCoverageSummary
providers: list[ChannelEventProviderCoverage]
SourceProviderName = Literal["sentry", "signoz"]
class SourceProviderHeartbeatRequest(BaseModel):
"""Low-noise freshness heartbeat for external source-provider mirrors."""
project_id: str = Field(default="awoooi", min_length=1, max_length=64)
providers: list[SourceProviderName] = Field(
default_factory=lambda: ["sentry", "signoz"],
min_length=1,
max_length=2,
)
reason: str = Field(
default="scheduled_provider_freshness_smoke",
min_length=1,
max_length=120,
)
run_ref: str | None = Field(default=None, max_length=120)
class SourceProviderHeartbeatItem(BaseModel):
provider: SourceProviderName
event_id: str
conversation_event_id: UUID
class SourceProviderHeartbeatResponse(BaseModel):
status: str
project_id: str
items: list[SourceProviderHeartbeatItem]
class ChannelEventRecurrenceSummary(BaseModel):
source_event_total: int
recurrence_group_total: int
recurrent_group_total: int
duplicate_event_total: int
linked_run_total: int
unlinked_event_total: int
auto_repair_linked_total: int = 0
verified_repair_group_total: int = 0
open_work_item_group_total: int = 0
manual_gate_group_total: int = 0
automation_gap_group_total: int = 0
failed_repair_group_total: int = 0
source_correlation_review_group_total: int = 0
source_correlation_decision_recorded_group_total: int = 0
source_correlation_applied_group_total: int = 0
latest_received_at: datetime | None
class ChannelEventRecurrenceItem(BaseModel):
recurrence_key: str
provider: str | None
alertname: str | None
severity: str | None
namespace: str | None
target_resource: str | None
fingerprint: str | None
latest_stage: str | None = None
latest_event_id: UUID | None
latest_provider_event_id: str | None
latest_content_preview: str | None
latest_run_id: UUID | None
latest_run_state: str | None
latest_agent_id: str | None
latest_incident_id: str | None = None
incident_ids: list[str] = Field(default_factory=list)
repair_summary: dict[str, Any] | None = None
work_item: dict[str, Any] | None = None
source_correlation_review: dict[str, Any] | None = None
source_correlation_apply: dict[str, Any] | None = None
occurrence_total: int
duplicate_total: int
linked_run_total: int
source_ref_total: int
missing_source_refs_total: int
sentry_ref_total: int
signoz_ref_total: int
alert_ref_total: int
stage_counts: dict[str, int] = Field(default_factory=dict)
run_state_counts: dict[str, int]
first_received_at: datetime | None
latest_received_at: datetime | None
class ChannelEventRecurrenceResponse(BaseModel):
project_id: str
limit: int
summary: ChannelEventRecurrenceSummary
items: list[ChannelEventRecurrenceItem]
class RecurrenceWorkItemDryRunRequest(BaseModel):
"""AwoooP recurrence work item dry-run request."""
project_id: str | None = Field(default=None, min_length=1)
work_item_id: str = Field(min_length=1)
mode: RecurrenceWorkItemMode = "auto"
provider: str | None = Field(default=None, min_length=1)
limit: int = Field(default=300, ge=1, le=300)
class RecurrenceWorkItemHandoffRequest(BaseModel):
"""AwoooP recurrence work item handoff request."""
project_id: str | None = Field(default=None, min_length=1)
work_item_id: str = Field(min_length=1)
mode: RecurrenceWorkItemMode = "auto"
handoff_kind: RecurrenceWorkItemHandoffKind = "ticket_proposal"
provider: str | None = Field(default=None, min_length=1)
limit: int = Field(default=300, ge=1, le=300)
class SourceCorrelationReviewDecisionRequest(BaseModel):
"""Record-only source evidence review decision."""
project_id: str | None = Field(default=None, min_length=1)
work_item_id: str = Field(min_length=1)
decision: SourceCorrelationReviewDecision
target_incident_id: str | None = Field(default=None, min_length=1, max_length=30)
reviewer_id: str = Field(default="operator_console", min_length=1, max_length=100)
operator_note: str | None = Field(default=None, max_length=500)
provider: str | None = Field(default=None, min_length=1)
limit: int = Field(default=300, ge=1, le=300)
class SourceCorrelationApplyRequest(BaseModel):
"""Append-only source evidence link apply request."""
project_id: str | None = Field(default=None, min_length=1)
work_item_id: str = Field(min_length=1)
reviewer_id: str = Field(default="operator_console", min_length=1, max_length=100)
operator_note: str | None = Field(default=None, max_length=500)
provider: str | None = Field(default=None, min_length=1)
limit: int = Field(default=300, ge=1, le=300)
@router.get(
    "/events/dossier",
    response_model=ChannelEventDossierResponse,
    summary="查詢 Channel Event 來源卷宗",
    description=(
        "返回 redacted inbound source envelope供 AwoooP Run Detail 顯示"
        "告警來源、source refs、Sentry / SignOz / Alertmanager 關聯與去重狀態。"
    ),
)
async def get_event_dossier(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    run_id: UUID | None = Query(None, description="Run ID可選"),
    provider_event_id: str | None = Query(
        None, description="provider_event_id可選"
    ),
    limit: int = Query(20, ge=1, le=50, description="最多返回筆數"),
) -> dict[str, Any]:
    """Return the channel-event dossier from ``fetch_channel_event_dossier``.

    All filters are optional and are forwarded unchanged; ``limit`` is bounded
    to 1..50 by the query validation.
    """
    dossier = await fetch_channel_event_dossier(
        project_id=project_id,
        run_id=run_id,
        provider_event_id=provider_event_id,
        limit=limit,
    )
    return dossier
@router.get(
    "/events/dossier/coverage",
    response_model=ChannelEventDossierCoverageResponse,
    summary="查詢 Channel Event 來源卷宗覆蓋率",
    description=(
        "返回近期 inbound event 的 source_envelope / source_refs / 去重 / "
        "Sentry / SignOz 關聯覆蓋率,供 AwoooP Run List 顯示告警是否已入庫。"
    ),
)
async def get_event_dossier_coverage(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    provider: str | None = Query(
        None, description="provider可選如 sentry / signoz"
    ),
    limit: int = Query(100, ge=1, le=200, description="最多納入統計筆數"),
) -> dict[str, Any]:
    """Return dossier coverage from ``fetch_channel_event_dossier_coverage``.

    ``project_id`` and ``provider`` are optional filters; ``limit`` is bounded
    to 1..200 by the query validation.
    """
    coverage = await fetch_channel_event_dossier_coverage(
        project_id=project_id,
        provider=provider,
        limit=limit,
    )
    return coverage
@router.post(
    "/events/dossier/provider-heartbeat",
    response_model=SourceProviderHeartbeatResponse,
    summary="寫入 Sentry / SignOz 來源卷宗 freshness heartbeat",
    description=(
        "受 AwoooP operator key 保護的低噪音 smoke。只寫入來源卷宗與"
        "completed shadow run不建立 Incident、不送 Telegram、不宣稱真實上游告警。"
    ),
)
async def create_source_provider_heartbeat(
    payload: SourceProviderHeartbeatRequest,
    operator: Annotated[
        AwoooPOperatorPrincipal,
        Depends(verify_awooop_operator),
    ],
) -> dict[str, Any]:
    """Record one synthetic heartbeat event per requested provider.

    Each provider in ``payload.providers`` is written through
    ``record_external_alert_event`` with stage ``heartbeat``. All providers in
    one request share the same ``heartbeat-<UTC timestamp>`` event id, which
    has one-second resolution.

    Raises:
        HTTPException: 500 when ``record_external_alert_event`` returns ``None``
            for a provider. Providers handled earlier in the same request have
            already been recorded at that point.
    """
    stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    heartbeat_event_id = f"heartbeat-{stamp}"

    recorded: list[dict[str, Any]] = []
    for provider in payload.providers:
        # The dicts are rebuilt per provider so each call gets its own objects.
        conversation_event_id = await record_external_alert_event(
            project_id=payload.project_id,
            provider=provider,
            event_id=heartbeat_event_id,
            stage="heartbeat",
            title="SourceProviderHeartbeat",
            severity="info",
            namespace="awoooi-prod",
            target_resource="source-provider-ingestion",
            fingerprint=f"source-provider-heartbeat:{provider}",
            labels={
                "provider": provider,
                "synthetic": "true",
                "alert_category": "alertchain_provider_freshness",
                "telegram": "not_sent",
                "incident": "not_created",
            },
            annotations={
                "summary": (
                    "Low-noise provider freshness smoke; verifies AwoooP "
                    "source dossier ingestion without creating an incident."
                ),
                "reason": payload.reason,
            },
            payload={
                "reason": payload.reason,
                "run_ref": payload.run_ref,
                "operator_id": operator.operator_id,
                "auth_method": operator.auth_method,
                "synthetic": True,
                "side_effects": {
                    "incident_created": False,
                    "telegram_sent": False,
                    "approval_created": False,
                },
            },
        )
        if conversation_event_id is None:
            raise HTTPException(
                status_code=500,
                detail=f"{provider} provider heartbeat was not recorded",
            )
        recorded.append(
            {
                "provider": provider,
                "event_id": heartbeat_event_id,
                "conversation_event_id": conversation_event_id,
            }
        )

    return {
        "status": "recorded",
        "project_id": payload.project_id,
        "items": recorded,
    }
@router.get(
    "/events/dossier/recurrence",
    response_model=ChannelEventRecurrenceResponse,
    summary="查詢 Channel Event 重複發生與關聯 Run 狀態",
    description=(
        "將近期 inbound source events 依 fingerprint / alertname / namespace / target 分組,"
        "顯示重複發生次數、去重數、source refs 與最新 linked run 狀態。"
    ),
)
async def get_event_dossier_recurrence(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    provider: str | None = Query(
        None, description="provider可選如 alertmanager / sentry / signoz"
    ),
    limit: int = Query(100, ge=1, le=300, description="最多納入統計筆數"),
) -> dict[str, Any]:
    """Return recurrence data from ``fetch_channel_event_dossier_recurrence``.

    ``project_id`` and ``provider`` are optional filters; ``limit`` is bounded
    to 1..300 by the query validation.
    """
    recurrence = await fetch_channel_event_dossier_recurrence(
        project_id=project_id,
        provider=provider,
        limit=limit,
    )
    return recurrence
@router.get(
    "/events/dossier/recurrence/work-item/preview",
    summary="預覽重複告警工作項的安全處理計畫",
    description=(
        "依 recurrence read model 找出指定 work_item返回下一步、pre-flight checks "
        "與 read-only / no-write 保證;不修改 incident、auto-repair 或 ticket 狀態。"
    ),
)
async def preview_event_recurrence_work_item(
    work_item_id: str = Query(..., min_length=1, description="recurrence work_item_id"),
    project_id: str | None = Query(None, description="租戶 ID可選"),
    provider: str | None = Query(
        None, description="provider可選如 alertmanager / sentry / signoz"
    ),
    mode: RecurrenceWorkItemMode = Query("auto", description="預覽模式"),
    limit: int = Query(300, ge=1, le=300, description="最多納入統計筆數"),
) -> dict[str, Any]:
    """Return the preview for one recurrence work item.

    Delegates to ``fetch_recurrence_work_item_preview``.

    Raises:
        HTTPException: 404 with detail ``recurrence_work_item_not_found`` when
            the service raises ``RecurrenceWorkItemNotFoundError``.
    """
    try:
        preview = await fetch_recurrence_work_item_preview(
            project_id=project_id,
            work_item_id=work_item_id,
            mode=mode,
            provider=provider,
            limit=limit,
        )
    except RecurrenceWorkItemNotFoundError as exc:
        # Translate the service-level "not found" into an HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="recurrence_work_item_not_found",
        ) from exc
    else:
        return preview
@router.post(
    "/events/dossier/recurrence/work-item/dry-run",
    summary="乾跑重複告警工作項的安全處理流程",
    description=(
        "依 recurrence read model 產生 dry-run 結果並寫入 pre-flight history"
        "但不修改 incident、auto-repair 或 ticket 狀態。"
    ),
)
async def dry_run_event_recurrence_work_item(
    request: RecurrenceWorkItemDryRunRequest,
) -> dict[str, Any]:
    """Run the dry-run flow for one recurrence work item.

    Unpacks the request body and delegates to
    ``fetch_recurrence_work_item_dry_run``.

    Raises:
        HTTPException: 404 with detail ``recurrence_work_item_not_found`` when
            the service raises ``RecurrenceWorkItemNotFoundError``.
    """
    try:
        outcome = await fetch_recurrence_work_item_dry_run(
            project_id=request.project_id,
            work_item_id=request.work_item_id,
            mode=request.mode,
            provider=request.provider,
            limit=request.limit,
        )
    except RecurrenceWorkItemNotFoundError as exc:
        # Translate the service-level "not found" into an HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="recurrence_work_item_not_found",
        ) from exc
    else:
        return outcome
@router.post(
    "/events/dossier/recurrence/work-item/handoff",
    summary="記錄重複告警工作項的交接提案",
    description=(
        "依 recurrence read model 與 dry-run 結果記錄 ticket proposal / 人工接手歷史,"
        "但不修改 incident、auto-repair 或外部 ticket 狀態。"
    ),
)
async def handoff_event_recurrence_work_item(
    request: RecurrenceWorkItemHandoffRequest,
) -> dict[str, Any]:
    """Record a handoff proposal for one recurrence work item.

    Unpacks the request body and delegates to
    ``fetch_recurrence_work_item_handoff``.

    Raises:
        HTTPException: 404 with detail ``recurrence_work_item_not_found`` when
            the service raises ``RecurrenceWorkItemNotFoundError``.
    """
    try:
        outcome = await fetch_recurrence_work_item_handoff(
            project_id=request.project_id,
            work_item_id=request.work_item_id,
            mode=request.mode,
            handoff_kind=request.handoff_kind,
            provider=request.provider,
            limit=request.limit,
        )
    except RecurrenceWorkItemNotFoundError as exc:
        # Translate the service-level "not found" into an HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="recurrence_work_item_not_found",
        ) from exc
    else:
        return outcome
@router.post(
    "/events/dossier/recurrence/source-correlation/review",
    summary="記錄來源證據與 Incident 配對審核結果",
    description=(
        "針對 source_correlation_review work item 記錄 operator 審核決定。"
        "本 API 僅寫入 alert_operation_log / 可選 timeline_events"
        "不修改 Incident 狀態、不回寫 source event、不建立外部 ticket。"
    ),
)
async def review_source_correlation_work_item(
    request: SourceCorrelationReviewDecisionRequest,
) -> dict[str, Any]:
    """Record a review decision for a source-correlation work item.

    Unpacks the request body and delegates to
    ``fetch_source_correlation_review_decision``.

    Raises:
        HTTPException: 404 with detail ``recurrence_work_item_not_found`` when
            the service raises ``RecurrenceWorkItemNotFoundError``.
    """
    # NOTE(review): reviewer_id is taken from the request body and no operator
    # dependency is declared on this handler; confirm authentication is
    # enforced at the router or gateway level.
    try:
        decision_result = await fetch_source_correlation_review_decision(
            project_id=request.project_id,
            work_item_id=request.work_item_id,
            decision=request.decision,
            target_incident_id=request.target_incident_id,
            reviewer_id=request.reviewer_id,
            operator_note=request.operator_note,
            provider=request.provider,
            limit=request.limit,
        )
    except RecurrenceWorkItemNotFoundError as exc:
        # Translate the service-level "not found" into an HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="recurrence_work_item_not_found",
        ) from exc
    else:
        return decision_result
@router.post(
    "/events/dossier/recurrence/source-correlation/apply",
    summary="套用已確認的來源證據與 Incident 配對",
    description=(
        "只接受已寫入 accepted review 的 source_correlation_review work item。"
        "成功時以 append-only 方式新增 source_correlation_linked 來源事件,"
        "並寫入 alert_operation_log / timeline_events。"
        "不修改 Incident 狀態、不修改 auto-repair 結果、不建立外部 ticket。"
    ),
)
async def apply_source_correlation_work_item(
    request: SourceCorrelationApplyRequest,
) -> dict[str, Any]:
    """Apply a source-correlation pairing for one work item.

    Unpacks the request body and delegates to
    ``fetch_source_correlation_apply``.

    Raises:
        HTTPException: 404 with detail ``recurrence_work_item_not_found`` when
            the service raises ``RecurrenceWorkItemNotFoundError``.
    """
    # NOTE(review): reviewer_id is taken from the request body and no operator
    # dependency is declared on this handler; confirm authentication is
    # enforced at the router or gateway level.
    try:
        apply_result = await fetch_source_correlation_apply(
            project_id=request.project_id,
            work_item_id=request.work_item_id,
            reviewer_id=request.reviewer_id,
            operator_note=request.operator_note,
            provider=request.provider,
            limit=request.limit,
        )
    except RecurrenceWorkItemNotFoundError as exc:
        # Translate the service-level "not found" into an HTTP 404.
        raise HTTPException(
            status_code=404,
            detail="recurrence_work_item_not_found",
        ) from exc
    else:
        return apply_result
@router.get(
    "/events/recent",
    response_model=RecentEventsResponse,
    summary="列出最近 Channel Events",
    description=(
        "返回 awooop_conversation_event 最近事件。"
        "可用 channel_type / provider_prefix 過濾,例如 alert-group 收斂事件。"
    ),
)
async def list_recent_events(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    channel_type: str | None = Query(None, description="通道類型(可選)"),
    provider_prefix: str | None = Query(
        None, description="provider_event_id 前綴(可選)"
    ),
    limit: int = Query(20, ge=1, le=100, description="最多返回筆數"),
) -> dict[str, Any]:
    """List recent channel events with optional filters.

    Pure HTTP pass-through: every query parameter is forwarded unchanged to
    ``list_recent_channel_events``.
    """
    recent = await list_recent_channel_events(
        project_id=project_id,
        channel_type=channel_type,
        provider_prefix=provider_prefix,
        limit=limit,
    )
    return recent

View File

@@ -15,35 +15,12 @@ from decimal import Decimal
from typing import Any, Literal
from uuid import UUID
from fastapi import APIRouter, Depends, Query
from fastapi import APIRouter, Query
from pydantic import BaseModel, Field
from src.core.awooop_operator_auth import (
AwoooPOperatorPrincipal,
verify_awooop_operator,
)
from src.services.platform_operator_service import (
decide_approval as decide_approval_svc,
)
from src.services.platform_operator_service import (
get_ai_route_status as get_ai_route_status_svc,
)
from src.services.platform_operator_service import (
get_awooop_status_chain as get_awooop_status_chain_svc,
)
from src.services.platform_operator_service import (
get_run_detail as get_run_detail_svc,
)
from src.services.platform_operator_service import (
list_cicd_events as list_cicd_events_svc,
)
from src.services.platform_operator_service import (
list_approvals as list_approvals_svc,
)
from src.services.platform_operator_service import (
list_callback_replies as list_callback_replies_svc,
)
from src.services.platform_operator_service import (
list_runs as list_runs_svc,
)
@@ -63,8 +40,6 @@ class RunItem(BaseModel):
step_count: int
created_at: datetime
timeout_at: datetime | None
remediation_summary: dict[str, Any] | None = None
callback_reply_summary: dict[str, Any] | None = None
class ListRunsResponse(BaseModel):
@@ -74,151 +49,12 @@ class ListRunsResponse(BaseModel):
per_page: int
# Element type of ListCallbackRepliesResponse.items: one callback-reply
# message together with the run it belongs to.
class CallbackReplyItem(BaseModel):
    # Identity of the message and its owning run / tenant.
    message_id: UUID
    run_id: UUID
    project_id: str
    status: str
    needs_human: bool
    action: str | None = None
    incident_id: str | None = None
    event_at: datetime | None = None
    # Delivery details of the message.
    channel_type: str
    message_type: str
    send_status: str
    send_error: str | None = None
    provider_message_id: str | None = None
    triggered_by_state: str | None = None
    content_preview: str | None = None
    # Fields describing the linked run.
    run_state: str | None = None
    agent_id: str | None = None
    run_created_at: datetime | None = None
    # Free-form evidence dictionaries; their schema is not defined here.
    callback_reply: dict[str, Any]
    awooop_status_chain: dict[str, Any] | None = None
    persisted_awooop_status_chain: dict[str, Any] | None = None
    km_stale_completion_summary: dict[str, Any] | None = None
    persisted_km_stale_completion_summary: dict[str, Any] | None = None
    evidence_capture_status: dict[str, Any] | None = None
    run_detail_href: str | None = None
# Per-prefix counter row used by the "*_top_prefixes" lists of
# CallbackReplyAuditSummary.
class OutboundReplyMarkupGapPrefix(BaseModel):
    prefix: str
    total: int
    recent_24h_total: int = 0
    first_sent_at: datetime | None = None
    last_sent_at: datetime | None = None
# Aggregate counters returned as ListCallbackRepliesResponse.summary.
# Fields without a default are required; the reply-markup gap counters
# default to zero / "clean" / "none" / "not_needed".
class CallbackReplyAuditSummary(BaseModel):
    schema_version: str
    project_id: str
    # Outbound message counters.
    outbound_total: int
    outbound_source_envelope_total: int
    outbound_source_refs_total: int
    outbound_trace_ref_total: int = 0
    outbound_incident_ref_total: int
    outbound_reply_markup_total: int = 0
    # Reply-markup messages missing an incident reference.
    outbound_reply_markup_missing_incident_ref_total: int = 0
    outbound_reply_markup_missing_incident_ref_recent_1h_total: int = 0
    outbound_reply_markup_missing_incident_ref_recent_24h_total: int = 0
    outbound_reply_markup_missing_incident_ref_latest_sent_at: datetime | None = None
    # Reply-markup messages missing a trace reference.
    outbound_reply_markup_missing_trace_ref_total: int = 0
    outbound_reply_markup_missing_trace_ref_recent_1h_total: int = 0
    outbound_reply_markup_missing_trace_ref_recent_24h_total: int = 0
    outbound_reply_markup_missing_trace_ref_latest_sent_at: datetime | None = None
    outbound_reply_markup_trace_ref_gap_status: str = "clean"
    outbound_reply_markup_trace_ref_gap_next_action: str = "none"
    outbound_reply_markup_trace_ref_after_gap_total: int = 0
    outbound_reply_markup_trace_ref_after_gap_first_sent_at: datetime | None = None
    outbound_reply_markup_trace_ref_after_gap_latest_sent_at: datetime | None = None
    outbound_reply_markup_trace_ref_gap_recovery_status: str = "not_needed"
    # default_factory=list gives every instance its own empty list.
    outbound_reply_markup_missing_incident_ref_top_prefixes: list[
        OutboundReplyMarkupGapPrefix
    ] = Field(default_factory=list)
    outbound_reply_markup_missing_trace_ref_top_prefixes: list[
        OutboundReplyMarkupGapPrefix
    ] = Field(default_factory=list)
    outbound_failed_total: int
    # Callback reply counters.
    callback_total: int
    callback_sent_total: int
    callback_fallback_total: int
    callback_rescue_total: int
    callback_failed_total: int
    callback_detail_total: int
    callback_history_total: int
    callback_snapshot_captured_total: int
    callback_snapshot_partial_total: int
    callback_snapshot_missing_total: int
    callback_incident_total: int
    snapshot_status: str
    next_action: str
    latest_outbound_at: datetime | None = None
    latest_callback_at: datetime | None = None
# Response model of GET /runs/callback-replies: one page of items plus an
# optional audit summary.
class ListCallbackRepliesResponse(BaseModel):
    items: list[CallbackReplyItem]
    total: int
    page: int
    per_page: int
    summary: CallbackReplyAuditSummary | None = None
# Element type of ListCicdEventsResponse.items: one CI/CD evidence event.
# Only id, project_id, alertname and created_at are required.
class CicdEventItem(BaseModel):
    id: str
    project_id: str
    alertname: str
    stage: str | None = None
    status: str | None = None
    severity: str | None = None
    commit_sha: str | None = None
    triggered_by: str | None = None
    duration_seconds: int = 0
    summary: str | None = None
    description: str | None = None
    workflow_url: str | None = None
    alert_id: str | None = None
    source: str | None = None
    action_detail: str | None = None
    needs_attention: bool = False
    created_at: datetime
# Response model of GET /cicd/events.
class ListCicdEventsResponse(BaseModel):
    items: list[CicdEventItem]
    total: int
    limit: int
# Response model of GET /ai-route-status. The dict-typed fields are
# free-form; their schema is not defined in this model.
class AiRouteStatusResponse(BaseModel):
    schema_version: str
    workload_type: str
    policy_order: list[dict[str, Any]]
    # Currently selected route; all three are optional.
    selected_provider: str | None = None
    selected_url: str | None = None
    selected_model: str | None = None
    fallback_chain: list[dict[str, Any]]
    route_reason: str
    route_source: str
    route_error: str | None = None
    health: dict[str, dict[str, Any]]
    lane_mode: str | None = None
    active_lane: dict[str, Any] | None = None
    # default_factory=list gives every instance its own empty list.
    skipped_lanes: list[dict[str, Any]] = Field(default_factory=list)
    operator_action: dict[str, Any] | None = None
    repair_evidence: dict[str, Any] | None = None
    checked_at: datetime
# One approval entry keyed by run.
# NOTE(review): presumably the element type of ListApprovalsResponse; that
# model's fields are not visible here, so confirm.
class ApprovalItem(BaseModel):
    run_id: UUID
    project_id: str
    agent_id: str
    created_at: datetime
    # Optional type but no default, so the key is still required.
    timeout_at: datetime | None
    remediation_summary: dict[str, Any] | None = None
    awooop_status_chain: dict[str, Any] | None = None
class ListApprovalsResponse(BaseModel):
@@ -229,10 +65,7 @@ class ListApprovalsResponse(BaseModel):
class DecideApprovalRequest(BaseModel):
project_id: str = Field(..., description="租戶 ID")
decision: Literal["approve", "reject"] = Field(..., description="核准或拒絕")
approver_id: str | None = Field(
default=None,
description="Deprecated. Ignored; approver comes from trusted operator headers.",
)
approver_id: str = Field(..., description="審核人 IDplatform_subject_id 或 operator email")
reason: str | None = Field(None, description="決策原因(可選)")
@@ -248,8 +81,7 @@ class DecideApprovalResponse(BaseModel):
response_model=ListRunsResponse,
summary="列出 Runs",
description=(
"返回 awooop_run_state 記錄,支援 project_id / state / remediation_status / "
"callback_reply_status / incident_id filter 與分頁。\n\n"
"返回 awooop_run_state 記錄,支援 project_id / state filter 與分頁。\n\n"
"- 按 created_at DESC 排序\n"
"- 注意:此路徑為 /runs/list 以避免與 runs.py 的 /runs/{run_id} 衝突"
),
@@ -257,133 +89,11 @@ class DecideApprovalResponse(BaseModel):
async def list_runs(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    state: str | None = Query(None, description="Run 狀態 filter可選"),
    remediation_status: str | None = Query(
        None,
        description="AI 證據狀態 filterno_evidence/mcp_observed/read_only_dry_run/write_observed/blocked/observed",
    ),
    callback_reply_status: str | None = Query(
        None,
        description="Telegram callback reply 狀態 filterno_callback/sent/fallback_sent/rescue_sent/failed/observed",
    ),
    incident_id: str | None = Query(None, description="關聯 Incident ID filter可選"),
    page: int = Query(1, ge=1, description="頁碼,從 1 開始"),
    per_page: int = Query(_DEFAULT_PER_PAGE, ge=1, le=_MAX_PER_PAGE, description="每頁筆數"),
) -> dict[str, Any]:
    """Return one page of runs, optionally filtered.

    Pure HTTP pass-through: every query parameter is forwarded unchanged to
    ``list_runs_svc``.
    """
    run_page = await list_runs_svc(
        project_id=project_id,
        state=state,
        remediation_status=remediation_status,
        callback_reply_status=callback_reply_status,
        incident_id=incident_id,
        page=page,
        per_page=per_page,
    )
    return run_page
@router.get(
    "/runs/callback-replies",
    response_model=ListCallbackRepliesResponse,
    summary="列出 Telegram Callback Reply Evidence",
    description=(
        "從 AwoooP outbound mirror 查詢 Telegram 詳情 / 歷史 callback reply 的"
        "送達、fallback、救援與失敗證據只讀不修改 incident、run 或 Telegram 狀態。"
    ),
)
async def list_callback_replies(
    project_id: str | None = Query(None, description="租戶 ID可選"),
    callback_reply_status: str | None = Query(
        None,
        description="Telegram callback reply 狀態 filtersent/fallback_sent/rescue_sent/failed/observed/no_callback",
    ),
    action: str | None = Query(None, description="Callback action filter例如 detail/history"),
    incident_id: str | None = Query(None, description="關聯 Incident ID filter可選"),
    page: int = Query(1, ge=1, description="頁碼,從 1 開始"),
    per_page: int = Query(20, ge=1, le=_MAX_PER_PAGE, description="每頁筆數"),
) -> dict[str, Any]:
    """Return one page of callback-reply evidence, optionally filtered.

    Pure HTTP pass-through: every query parameter is forwarded unchanged to
    ``list_callback_replies_svc``.
    """
    reply_page = await list_callback_replies_svc(
        project_id=project_id,
        callback_reply_status=callback_reply_status,
        action=action,
        incident_id=incident_id,
        page=page,
        per_page=per_page,
    )
    return reply_page
@router.get(
    "/cicd/events",
    response_model=ListCicdEventsResponse,
    summary="列出 CI/CD evidence events",
    description=(
        "從 alert_operation_log 讀取 CI/CD notification evidence供 AwoooP "
        "Deployments / Run Console 顯示 rollout-risk、success、failed 等階段狀態。"
    ),
)
async def list_cicd_events(
    project_id: str | None = Query(None, description="租戶 ID目前支援 awoooi"),
    stage: str | None = Query(None, description="CI/CD stage filter可選"),
    status: str | None = Query(None, description="CI/CD status filterrunning/success/failed/pending"),
    limit: int = Query(12, ge=1, le=50, description="最多返回筆數"),
) -> dict[str, Any]:
    """Return recent CI/CD evidence events, optionally filtered.

    Delegates to ``list_cicd_events_svc``. The ``status`` query parameter is
    handed to the service under the keyword ``status_filter``.
    """
    events = await list_cicd_events_svc(
        project_id=project_id,
        stage=stage,
        status_filter=status,
        limit=limit,
    )
    return events
@router.get(
    "/ai-route-status",
    response_model=AiRouteStatusResponse,
    summary="查詢 AI Provider 路由狀態",
    description=(
        "回傳目前 Ollama/Gemini 路由策略、即時 primary、fallback chain 與健康狀態;"
        "只讀,不觸發推理或自動修復。"
    ),
)
async def get_ai_route_status(
    workload_type: str | None = Query(
        "deep_rca",
        description="工作負載類型,例如 deep_rca/hermes/interactive/embedding/rag/code_review/image_analysis",
    ),
) -> dict[str, Any]:
    """Return the AI provider routing status for a workload type.

    Delegates to ``get_ai_route_status_svc``; ``workload_type`` defaults to
    ``"deep_rca"`` when the query parameter is omitted.
    """
    route_status = await get_ai_route_status_svc(workload_type=workload_type)
    return route_status
@router.get(
    "/runs/{run_id}/detail",
    summary="查詢 Run 詳細時間線",
    description=(
        "返回單一 Run 的主狀態、Step Journal、MCP Gateway audit、"
        "入站 Channel Event 與出站訊息,供 Operator Console 顯示完整處置脈絡。"
    ),
)
async def get_run_detail(
    run_id: str,
    project_id: str | None = Query(None, description="租戶 ID可選"),
) -> dict[str, Any]:
    """Return the detail view of a single run.

    Delegates to ``get_run_detail_svc`` with the path ``run_id`` and the
    optional ``project_id`` query parameter.
    """
    detail = await get_run_detail_svc(run_id=run_id, project_id=project_id)
    return detail
@router.get(
"/status-chain",
summary="查詢 AwoooP 狀態鏈",
description=(
"依 incident_id 查詢 truth-chain + ADR-100 history 合併後的只讀狀態鏈,"
"供 Work Items、Approvals、Monitoring 等操作頁面共用。"
),
)
async def get_awooop_status_chain(
project_id: str | None = Query(None, description="租戶 ID可選"),
incident_id: list[str] | None = Query(
None,
description="Incident ID可重複傳入以合併同一工作項的多個事件",
),
) -> dict[str, Any]:
return await get_awooop_status_chain_svc(
project_id=project_id,
incident_ids=incident_id or [],
project_id=project_id, state=state, page=page, per_page=per_page
)
@@ -399,16 +109,8 @@ async def get_awooop_status_chain(
async def list_approvals(
project_id: str | None = Query(None, description="租戶 ID可選"),
run_id: str | None = Query(None, description="Run ID可選M8 詳情頁查單筆)"),
remediation_status: str | None = Query(
None,
description="AI 證據狀態 filterno_evidence/mcp_observed/read_only_dry_run/write_observed/blocked/observed",
),
) -> dict[str, Any]:
return await list_approvals_svc(
project_id=project_id,
run_id=run_id,
remediation_status=remediation_status,
)
return await list_approvals_svc(project_id=project_id, run_id=run_id)
@router.post(
@@ -425,12 +127,11 @@ async def list_approvals(
async def decide_approval(
run_id: str,
body: DecideApprovalRequest,
operator: AwoooPOperatorPrincipal = Depends(verify_awooop_operator),
) -> dict[str, Any]:
return await decide_approval_svc(
run_id=run_id,
project_id=body.project_id,
decision=body.decision,
approver_id=operator.operator_id,
approver_id=body.approver_id,
reason=body.reason,
)

View File

@@ -1,64 +0,0 @@
"""AwoooP Operator Console — truth-chain read API."""
from __future__ import annotations
from typing import Any
from fastapi import APIRouter, Depends, Query
from src.core.awooop_operator_auth import (
AwoooPOperatorPrincipal,
verify_awooop_operator,
)
from src.services.awooop_truth_chain_service import (
fetch_automation_quality_summary,
fetch_truth_chain,
)
router = APIRouter()
@router.get(
    "/truth-chain/quality/summary",
    summary="查詢 AI 自動化品質總覽",
    description=(
        "T12c read-only aggregate endpoint. 聚合最近 incident 的 automation quality gate"
        "讓 Operator 不必逐張 Telegram 卡片判斷是否真正完成 AI 自動修復。"
        "此總覽不回傳逐筆 examplessource-level truth-chain 詳情仍需 operator auth。"
    ),
)
async def get_automation_quality_summary(
    project_id: str = Query("awoooi", description="租戶 ID"),
    hours: int = Query(24, ge=1, le=168, description="回看小時數"),
    limit: int = Query(200, ge=1, le=500, description="最多評估 incident 數"),
) -> dict[str, Any]:
    """Return the aggregate automation-quality summary with examples removed.

    Fetches the summary from ``fetch_automation_quality_summary``, then
    overwrites ``examples`` with an empty list and sets ``visibility_note``
    on that same object before returning it. This handler declares no
    operator dependency.
    """
    note = (
        "Aggregate only. Use /truth-chain/{source_id} with operator auth for source-level details."
    )
    aggregate = await fetch_automation_quality_summary(
        project_id=project_id,
        hours=hours,
        limit=limit,
    )
    # Blank the per-incident examples on the returned object.
    aggregate["examples"] = []
    aggregate["visibility_note"] = note
    return aggregate
@router.get(
    "/truth-chain/{source_id}",
    summary="查詢 Telegram / Incident / Drift 真相鏈",
    description=(
        "T0 read-only endpoint. 聚合 incident、approval、evidence、MCP、"
        "automation_operation_log、drift repeat state 與 outbound mirror"
        "讓 Operator Console 能判斷 Telegram 卡片目前卡在哪個流程節點。"
    ),
)
async def get_truth_chain(
    source_id: str,
    project_id: str = Query("awoooi", description="租戶 ID"),
    operator: AwoooPOperatorPrincipal = Depends(verify_awooop_operator),
) -> dict[str, Any]:
    """Return the truth chain for one source id.

    The ``operator`` dependency exists only so ``verify_awooop_operator``
    runs before the handler; the resolved principal is not passed to
    ``fetch_truth_chain``.
    """
    # Bind the principal to a throwaway name: it gates access but is unused.
    _ = operator
    chain = await fetch_truth_chain(source_id=source_id, project_id=project_id)
    return chain

View File

@@ -8,10 +8,9 @@ leWOOOgo 原則: Router 只做 HTTP 轉發,業務邏輯在 KnowledgeRAGService
建立者: Claude Code (Phase 33 ADR-067)
"""
from fastapi import APIRouter, BackgroundTasks
from fastapi import APIRouter, BackgroundTasks, HTTPException
from pydantic import BaseModel
from src.core.config import get_settings
from src.services.knowledge_rag_service import get_knowledge_rag_service
router = APIRouter(prefix="/rag", tags=["RAG Knowledge Base"])
@@ -44,10 +43,9 @@ async def trigger_index(background_tasks: BackgroundTasks) -> RagIndexResponse:
- .agents/skills/*.md
"""
background_tasks.add_task(_run_index)
model = get_settings().OLLAMA_EMBEDDING_MODEL
return RagIndexResponse(
status="accepted",
message=f"索引已排程,背景執行中({model} @ Ollama GCP-A/GCP-B/111",
message="索引已排程,背景執行中(nomic-embed-text @ Ollama 111",
)
@@ -64,7 +62,6 @@ async def rag_debug() -> dict:
"""診斷用:確認容器內 docs 路徑 + Ollama 連線"""
import os
from pathlib import Path
import httpx
paths_check = {}
@@ -79,27 +76,15 @@ async def rag_debug() -> dict:
try:
async with httpx.AsyncClient(timeout=10.0) as c:
from src.core.config import get_settings as _gs
from src.services.ollama_endpoint_resolver import resolve_ollama_order
settings = _gs()
statuses: list[str] = []
for endpoint in resolve_ollama_order("embedding"):
if not endpoint.url:
continue
r = await c.post(
f"{endpoint.url}/api/embeddings",
json={"model": settings.OLLAMA_EMBEDDING_MODEL, "prompt": "test"},
)
if r.status_code == 200:
ollama_ok = True
break
statuses.append(f"{endpoint.provider_name}=http_{r.status_code}")
if ollama_ok is not True:
ollama_ok = ", ".join(statuses) or "no_endpoint"
r = await c.post(
f"{_gs().OLLAMA_URL}/api/embeddings",
json={"model": "nomic-embed-text", "prompt": "test"},
)
ollama_ok = r.status_code == 200 if r.status_code == 200 else f"http_{r.status_code}"
except Exception as e:
ollama_ok = f"error: {type(e).__name__}: {e}"
return {"cwd": os.getcwd(), "paths": paths_check, "ollama_embedding": ollama_ok}
return {"cwd": os.getcwd(), "paths": paths_check, "ollama_111_embed": ollama_ok}
@router.get("/stats", summary="索引統計")

View File

@@ -14,15 +14,12 @@ AWOOOI API - Sentry Webhook Handler
🔴 HARD RULE: 時間顯示使用 Asia/Taipei (UTC+8)
"""
import json
import uuid
from typing import Any
import structlog
from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
from pydantic import BaseModel
from src.core.awooop_operator_auth import authenticate_awooop_operator_headers
from src.core.circuit_breaker import get_openclaw_guard
from src.core.metrics import (
record_alert_chain_failure,
@@ -38,10 +35,8 @@ from src.models.approval import (
)
from src.services.anomaly_counter import get_anomaly_counter
from src.services.approval_db import get_approval_service
from src.services.channel_hub import record_external_alert_event
from src.services.openclaw_http_service import get_openclaw_http_service
from src.services.sentry_service import get_sentry_service
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:補 SentryWebhookService 簽章驗證
from src.services.sentry_webhook_service import (
SentrySignatureError,
@@ -92,114 +87,6 @@ async def sentry_webhook_health() -> dict:
return {"status": "ok", "webhook": "sentry"}
def _sentry_event_tag(event_data: dict[str, Any], key: str) -> str | None:
tags = event_data.get("tags") or []
for tag in tags:
if isinstance(tag, list | tuple) and len(tag) >= 2 and str(tag[0]) == key:
return str(tag[1])
if isinstance(tag, dict) and str(tag.get("key")) == key:
value = tag.get("value")
return str(value) if value is not None else None
return None
def _is_sentry_upstream_canary(payload: dict[str, Any]) -> bool:
data = payload.get("data") if isinstance(payload, dict) else None
if not isinstance(data, dict) or payload.get("action") != "triggered":
return False
issue_data = data.get("issue") if isinstance(data.get("issue"), dict) else {}
event_data = data.get("event") if isinstance(data.get("event"), dict) else {}
issue_id = str(issue_data.get("id") or "")
short_id = str(issue_data.get("shortId") or "")
title = str(issue_data.get("title") or "")
return (
issue_id.startswith("awoooi-canary-")
or short_id.upper().startswith("AWOOOI-CANARY")
or title == "AwoooPSourceProviderCanary"
or (_sentry_event_tag(event_data, "awoooi_canary") or "").lower() == "true"
)
async def _record_sentry_upstream_canary(
    payload: dict[str, Any],
    request: Request,
) -> dict[str, Any]:
    """Record a Sentry canary payload as an ``upstream_canary`` event.

    The operator headers are checked with
    ``authenticate_awooop_operator_headers`` before anything else. The event
    id is the first truthy value among the issue ``id``, the issue
    ``shortId``, the event's ``run_ref`` tag, and the literal
    ``awoooi-canary-unknown``.

    Returns:
        A dict with status ``canary_recorded``, the event id, the stringified
        id returned by ``record_external_alert_event`` and an all-False
        ``side_effects`` map.

    Raises:
        HTTPException: 500 when ``record_external_alert_event`` returns
            ``None``.
    """
    headers = request.headers
    operator = authenticate_awooop_operator_headers(
        headers.get("x-awooop-operator-id"),
        headers.get("x-awooop-operator-key"),
    )

    # Any non-dict level of the payload is treated as empty.
    data = payload.get("data")
    if not isinstance(data, dict):
        data = {}
    issue = data.get("issue")
    if not isinstance(issue, dict):
        issue = {}
    event = data.get("event")
    if not isinstance(event, dict):
        event = {}

    canary_id = str(
        issue.get("id")
        or issue.get("shortId")
        or _sentry_event_tag(event, "run_ref")
        or "awoooi-canary-unknown"
    )
    link = issue.get("permalink") or issue.get("web_url") or issue.get("url")

    # Template only; each use below gets its own copy.
    no_side_effects = {
        "incident_created": False,
        "approval_created": False,
        "telegram_sent": False,
        "openclaw_called": False,
    }

    stored_id = await record_external_alert_event(
        project_id="awoooi",
        provider="sentry",
        event_id=canary_id,
        stage="upstream_canary",
        title=str(issue.get("title") or "AwoooPSourceProviderCanary"),
        severity=str(issue.get("level") or "info"),
        namespace="awoooi-prod",
        target_resource=str(issue.get("culprit") or "source-provider-ingestion"),
        fingerprint=f"source-provider-canary:sentry:{canary_id}",
        source_url=link,
        labels={
            "project": issue.get("project", {}),
            "level": issue.get("level", "info"),
            "awoooi_canary": "true",
            "operator_id": operator.operator_id,
            "telegram": "not_sent",
            "incident": "not_created",
            "approval": "not_created",
        },
        annotations={
            "message": event.get("message"),
            "summary": (
                "Operator-signed Sentry webhook canary; records upstream "
                "source evidence without creating incident, approval, or Telegram."
            ),
        },
        payload={
            "raw_canary": payload,
            "operator_id": operator.operator_id,
            "auth_method": operator.auth_method,
            "side_effects": dict(no_side_effects),
        },
    )
    if stored_id is None:
        raise HTTPException(
            status_code=500,
            detail="sentry upstream canary was not recorded",
        )

    return {
        "status": "canary_recorded",
        "provider": "sentry",
        "event_id": canary_id,
        "conversation_event_id": str(stored_id),
        "side_effects": dict(no_side_effects),
    }
@router.post("/error")
async def handle_sentry_error(
request: Request,
@@ -221,14 +108,6 @@ async def handle_sentry_error(
try:
# 2026-04-27 P3.1-T2 by Claude — Tier-2 三服務感知強化:接入 SentryWebhookService 簽章驗證
body = await request.body()
try:
payload_from_body = json.loads(body.decode("utf-8") or "{}")
except json.JSONDecodeError:
payload_from_body = {}
if isinstance(payload_from_body, dict) and _is_sentry_upstream_canary(payload_from_body):
return await _record_sentry_upstream_canary(payload_from_body, request)
sig_header = request.headers.get("sentry-hook-signature", "")
try:
verify_sentry_signature(body, sig_header)
@@ -245,60 +124,16 @@ async def handle_sentry_error(
# 提取錯誤資訊
issue_data = payload.get("data", {}).get("issue", {})
event_data = payload.get("data", {}).get("event", {})
issue_id = issue_data.get("id")
source_url = (
issue_data.get("permalink")
or issue_data.get("web_url")
or issue_data.get("url")
)
background_tasks.add_task(
record_external_alert_event,
project_id="awoooi",
provider="sentry",
event_id=str(issue_id or issue_data.get("shortId") or "unknown"),
stage="received",
title=str(issue_data.get("title") or "Sentry issue"),
severity=str(issue_data.get("level") or "error"),
namespace="sentry",
target_resource=str(issue_data.get("culprit") or issue_data.get("project", {}).get("slug") or "unknown"),
fingerprint=f"sentry-{issue_id or issue_data.get('shortId') or 'unknown'}",
source_url=source_url,
labels={
"project": issue_data.get("project", {}),
"level": issue_data.get("level"),
"culprit": issue_data.get("culprit"),
},
annotations={"message": event_data.get("message")},
payload=payload,
)
# Phase 10.2.1: 去重檢查 (10 分鐘內不重複發送)
issue_id = issue_data.get("id")
sentry_service = get_sentry_service()
if not await sentry_service.check_dedup(issue_id, ttl=SENTRY_DEDUP_TTL):
background_tasks.add_task(
record_external_alert_event,
project_id="awoooi",
provider="sentry",
event_id=str(issue_id or issue_data.get("shortId") or "unknown"),
stage="deduplicated",
title=str(issue_data.get("title") or "Sentry issue"),
severity=str(issue_data.get("level") or "error"),
namespace="sentry",
target_resource=str(issue_data.get("culprit") or issue_data.get("project", {}).get("slug") or "unknown"),
fingerprint=f"sentry-{issue_id or issue_data.get('shortId') or 'unknown'}",
source_url=source_url,
labels={"project": issue_data.get("project", {}), "level": issue_data.get("level")},
annotations={"message": event_data.get("message")},
payload={"dedup_ttl": SENTRY_DEDUP_TTL},
is_duplicate=True,
)
return {"status": "deduplicated", "issue_id": issue_id, "ttl": SENTRY_DEDUP_TTL}
event_data = payload.get("data", {}).get("event", {})
error_context = {
"issue_id": issue_data.get("id"),
"source_url": source_url,
"title": issue_data.get("title"),
"culprit": issue_data.get("culprit"),
"level": issue_data.get("level"),
@@ -334,8 +169,6 @@ async def handle_sentry_error(
"message": "Analysis scheduled"
}
except HTTPException:
raise
except Exception as e:
logger.exception("Sentry webhook processing failed")
raise HTTPException(status_code=500, detail=str(e)) from e
@@ -423,29 +256,6 @@ async def analyze_and_comment(
analysis=analysis,
anomaly_frequency=frequency_dict,
)
await record_external_alert_event(
project_id="awoooi",
provider="sentry",
event_id=str(issue_id or error_context.get("issue_id") or "unknown"),
stage="approval_linked",
title=str(error_context.get("title") or "Sentry issue"),
severity=str(error_context.get("level") or "error"),
namespace="sentry",
target_resource=str(error_context.get("culprit") or error_context.get("project") or "unknown"),
fingerprint=f"sentry-{issue_id or error_context.get('issue_id') or 'unknown'}",
approval_id=approval_id,
source_url=error_context.get("source_url"),
labels={
"project": error_context.get("project"),
"level": error_context.get("level"),
},
annotations={"message": error_context.get("message")},
payload={
"anomaly_frequency": frequency_dict,
"ai_analyzed": analysis is not None,
"ai_provider": analysis.analyzed_by if analysis else None,
},
)
# 4. 發送 Telegram 告警 (含頻率資訊)
await send_sentry_telegram_alert(

View File

@@ -1,3 +1,7 @@
from __future__ import annotations
import asyncio
"""
AWOOOI API - SignOz Webhook Handler
====================================
@@ -13,17 +17,12 @@ AWOOOI API - SignOz Webhook Handler
🔴 HARD RULE: 時間顯示使用 Asia/Taipei (UTC+8)
"""
from __future__ import annotations
import asyncio
import uuid
from typing import TYPE_CHECKING
import structlog
from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
from pydantic import BaseModel
from src.core.awooop_operator_auth import authenticate_awooop_operator_headers
from src.core.metrics import (
record_alert_chain_failure,
record_alert_chain_success,
@@ -38,14 +37,10 @@ from src.models.approval import (
)
from src.services.anomaly_counter import get_anomaly_counter
from src.services.approval_db import get_approval_service
from src.services.channel_hub import record_external_alert_event
from src.services.incident_service import get_incident_service
from src.services.telegram_gateway import get_telegram_gateway
from src.utils.timezone import now_taipei_iso
if TYPE_CHECKING:
from src.services.openclaw import LLMAnalysisResult
logger = structlog.get_logger(__name__)
router = APIRouter(prefix="/webhooks/signoz", tags=["SignOz Webhook"])
@@ -72,101 +67,6 @@ class SignOzAlertPayload(BaseModel):
generatorURL: str | None = None
def _is_signoz_upstream_canary(alert: dict) -> bool:
labels = alert.get("labels", {}) if isinstance(alert.get("labels"), dict) else {}
annotations = (
alert.get("annotations", {})
if isinstance(alert.get("annotations"), dict)
else {}
)
alert_name = str(alert.get("alertname") or labels.get("alertname") or "")
return (
str(labels.get("awoooi_canary", "")).lower() == "true"
or alert_name == "AwoooPSourceProviderCanary"
or str(annotations.get("awooop_canary", "")).lower() == "true"
)
async def _record_signoz_upstream_canary(
    alert: dict,
    request: Request,
) -> dict:
    """Record a SignOz canary alert as an ``upstream_canary`` event.

    The operator headers are checked with
    ``authenticate_awooop_operator_headers`` before anything else. The event
    id is ``awooop-canary-<run_ref>``, where ``run_ref`` is the ``run_ref``
    label, else the ``fingerprint`` label, else ``unknown``. The alert's own
    labels and annotations are forwarded, with the canary marker fields
    layered on top.

    Returns:
        A dict with status ``canary_recorded``, the event id, the alert name,
        the stringified id returned by ``record_external_alert_event`` and an
        all-False ``side_effects`` map.

    Raises:
        HTTPException: 500 when ``record_external_alert_event`` returns
            ``None``.
    """
    headers = request.headers
    operator = authenticate_awooop_operator_headers(
        headers.get("x-awooop-operator-id"),
        headers.get("x-awooop-operator-key"),
    )

    # Non-dict labels / annotations are treated as empty.
    raw_labels = alert.get("labels")
    labels = raw_labels if isinstance(raw_labels, dict) else {}
    raw_annotations = alert.get("annotations")
    annotations = raw_annotations if isinstance(raw_annotations, dict) else {}

    alert_name = str(
        alert.get("alertname")
        or labels.get("alertname")
        or "AwoooPSourceProviderCanary"
    )
    run_ref = str(labels.get("run_ref") or labels.get("fingerprint") or "unknown")
    event_id = f"awooop-canary-{run_ref}"

    # Template only; each use below gets its own copy.
    no_side_effects = {
        "incident_created": False,
        "approval_created": False,
        "telegram_sent": False,
        "openclaw_called": False,
    }
    default_summary = (
        "Operator-signed SignOz webhook canary; records upstream "
        "source evidence without creating incident, approval, or Telegram."
    )

    stored_id = await record_external_alert_event(
        project_id="awoooi",
        provider="signoz",
        event_id=event_id,
        stage="upstream_canary",
        title=alert_name,
        severity=str(labels.get("severity") or "info"),
        namespace=str(labels.get("namespace") or "awoooi-prod"),
        target_resource=str(
            labels.get("service_name")
            or labels.get("service")
            or "source-provider-ingestion"
        ),
        fingerprint=str(
            labels.get("fingerprint") or f"source-provider-canary:signoz:{run_ref}"
        ),
        source_url=alert.get("generatorURL"),
        labels={
            **labels,
            "awoooi_canary": "true",
            "operator_id": operator.operator_id,
            "telegram": "not_sent",
            "incident": "not_created",
            "approval": "not_created",
        },
        annotations={
            **annotations,
            # Keep the alert's own summary when it has a truthy one.
            "summary": annotations.get("summary") or default_summary,
        },
        payload={
            "raw_canary": alert,
            "operator_id": operator.operator_id,
            "auth_method": operator.auth_method,
            "side_effects": dict(no_side_effects),
        },
    )
    if stored_id is None:
        raise HTTPException(
            status_code=500,
            detail="signoz upstream canary was not recorded",
        )

    return {
        "status": "canary_recorded",
        "provider": "signoz",
        "event_id": event_id,
        "alert_name": alert_name,
        "conversation_event_id": str(stored_id),
        "side_effects": dict(no_side_effects),
    }
@router.post("/alert")
async def handle_signoz_alert(
request: Request,
@@ -199,35 +99,11 @@ async def handle_signoz_alert(
results.append({"status": "ignored", "reason": "not firing"})
continue
if _is_signoz_upstream_canary(alert):
results.append(await _record_signoz_upstream_canary(alert, request))
continue
# 提取告警資訊
alert_name = alert.get("alertname", alert.get("labels", {}).get("alertname", "unknown"))
labels = alert.get("labels", {})
annotations = alert.get("annotations", {})
severity = labels.get("severity", "warning")
source_url = alert.get("generatorURL")
service_name = labels.get("service_name", labels.get("service", "unknown"))
fingerprint = labels.get("fingerprint") or f"signoz-{alert_name}-{service_name}"
background_tasks.add_task(
record_external_alert_event,
project_id="awoooi",
provider="signoz",
event_id=str(fingerprint),
stage="received",
title=str(alert_name),
severity=str(severity),
namespace=str(labels.get("namespace", "signoz")),
target_resource=str(service_name),
fingerprint=str(fingerprint),
source_url=source_url,
labels=labels,
annotations=annotations,
payload=alert,
)
# 背景處理
background_tasks.add_task(
@@ -237,8 +113,6 @@ async def handle_signoz_alert(
annotations=annotations,
severity=severity,
starts_at=alert.get("startsAt"),
source_url=source_url,
raw_payload=alert,
)
results.append({
@@ -248,8 +122,6 @@ async def handle_signoz_alert(
return {"status": "ok", "processed": len(results), "results": results}
except HTTPException:
raise
except Exception as e:
logger.exception("signoz_webhook_error", error=str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
@@ -261,8 +133,6 @@ async def process_signoz_alert(
annotations: dict,
severity: str,
starts_at: str | None,
source_url: str | None = None,
raw_payload: dict | None = None,
):
"""
背景處理 SignOz 告警
@@ -320,7 +190,6 @@ async def process_signoz_alert(
"annotations": annotations,
"fingerprint": f"signoz-{alert_name}-{labels.get('service_name', 'unknown')}",
}
fingerprint = signal_data["fingerprint"]
# ADR-037: 傳遞頻率統計到 Incident
incident = await incident_service.create_incident_from_signal(
signal_data, frequency_stats=anomaly_frequency
@@ -360,30 +229,6 @@ async def process_signoz_alert(
anomaly_frequency=anomaly_frequency,
analysis_result=analysis_result, # 帶入 AI 結果
)
await record_external_alert_event(
project_id="awoooi",
provider="signoz",
event_id=str(fingerprint),
stage="incident_linked",
title=str(alert_name),
severity=str(severity),
namespace=str(labels.get("namespace", "signoz")),
target_resource=str(labels.get("service_name", labels.get("service", "unknown"))),
fingerprint=str(fingerprint),
incident_id=str(incident.incident_id),
approval_id=str(approval_id),
source_url=source_url or trace_url,
labels=labels,
annotations=annotations,
payload={
"raw_alert": raw_payload or {},
"trace_url": trace_url,
"has_signoz_metrics": bool(signoz_metrics),
"ai_provider": ai_provider,
"tokens": tokens,
"cost": cost,
},
)
# =================================================================
# Step 5: 發送 Telegram 告警
@@ -437,7 +282,7 @@ async def create_signoz_approval(
severity: str,
incident_id: str,
anomaly_frequency: dict | None = None,
analysis_result: LLMAnalysisResult | None = None,
analysis_result: "LLMAnalysisResult" | None = None,
) -> str:
"""
為 SignOz 告警建立 Approval 記錄
@@ -534,7 +379,7 @@ async def send_signoz_telegram(
annotations: dict,
severity: str,
anomaly_frequency: dict | None = None,
analysis_result: LLMAnalysisResult | None = None,
analysis_result: "LLMAnalysisResult" | None = None,
ai_provider: str = "none",
):
"""
@@ -597,7 +442,6 @@ async def _send_log_summary_notification(
帶 5s 軟超時:超時後摘要繼續生成並存 Redis不阻塞告警主流程
"""
import html as _html
from src.services.log_summary_service import get_log_summary_service
from src.services.telegram_gateway import get_telegram_gateway

View File

@@ -19,7 +19,6 @@ Endpoints:
- 每個 Nonce 只能使用一次
"""
import asyncio
from uuid import UUID
from fastapi import APIRouter, HTTPException, status
@@ -28,8 +27,6 @@ from pydantic import BaseModel
from src.core.config import settings
from src.core.logging import get_logger
from src.services.approval_db import get_approval_service
from src.services.approval_execution import get_execution_service
from src.services.incident_approval_service import get_incident_approval_service
from src.services.security_interceptor import (
NonceReplayError,
UserNotWhitelistedError,
@@ -67,80 +64,6 @@ class TestPushRequest(BaseModel):
incident_id: str = ""
async def _run_telegram_approved_execution(approval) -> None:
    """Execute the approved action that originated from a Telegram callback.

    Failures are logged and swallowed: any ``Exception`` raised while
    executing is reported as ``telegram_approval_execution_failed`` and not
    re-raised.
    """
    log_context = {
        "approval_id": str(getattr(approval, "id", "")),
        "incident_id": getattr(approval, "incident_id", None),
    }
    try:
        executor = get_execution_service()
        outcome = await executor.execute_approved_action(approval)
        logger.info(
            "telegram_approval_execution_completed",
            **log_context,
            success=bool(outcome),
        )
    except Exception as exc:
        logger.error(
            "telegram_approval_execution_failed",
            **log_context,
            error=str(exc),
        )
# Strong references to in-flight execution tasks. The event loop only keeps
# weak references to tasks, so a task nobody else references may be
# garbage-collected before it finishes.
_TELEGRAM_EXECUTION_TASKS: set = set()


def _schedule_telegram_approved_execution(approval) -> bool:
    """Schedule execution after a Telegram approval reaches required signatures.

    The execution runs as a background ``asyncio`` task. A reference to the
    task is held in ``_TELEGRAM_EXECUTION_TASKS`` until it completes so it
    cannot disappear mid-execution.

    Args:
        approval: The approval object to execute; ``id`` and ``incident_id``
            are read (if present) for logging only.

    Returns:
        True when the task was scheduled, False when scheduling failed
        (for example when there is no running event loop).
    """
    approval_id = str(getattr(approval, "id", ""))
    incident_id = getattr(approval, "incident_id", None)

    coro = _run_telegram_approved_execution(approval)
    task = None
    try:
        task = asyncio.create_task(coro)
        _TELEGRAM_EXECUTION_TASKS.add(task)
        # Drop the strong reference as soon as the task is done.
        task.add_done_callback(_TELEGRAM_EXECUTION_TASKS.discard)
        logger.info(
            "telegram_approval_execution_scheduled",
            approval_id=approval_id,
            incident_id=incident_id,
        )
        return True
    except Exception as exc:
        if task is None:
            # create_task never took ownership of the coroutine; close it so
            # it does not trigger a "coroutine was never awaited" warning.
            coro.close()
        logger.error(
            "telegram_approval_execution_schedule_failed",
            approval_id=approval_id,
            incident_id=incident_id,
            error=str(exc),
        )
        return False
async def _finalize_telegram_approval(approval, execution_triggered: bool) -> bool:
"""Complete the execution handoff for Telegram approvals.
ApprovalDBService only records the signature/status transition. The actual
executor scheduling lives in API callers, so Telegram must mirror the REST
approval endpoint instead of stopping at a visual approval stamp.
"""
if not execution_triggered:
return False
return _schedule_telegram_approved_execution(approval)
async def _sync_telegram_rejection(approval_id: str) -> bool:
    """Keep Incident state aligned when an approval is rejected from Telegram.

    Notifies the incident-approval service that ``approval_id`` moved to
    ``"rejected"``.

    Returns:
        True when the notification completed, False when it raised (the error
        is logged, not propagated).
    """
    try:
        incident_approvals = get_incident_approval_service()
        await incident_approvals.on_approval_status_change(
            approval_id=approval_id,
            new_status="rejected",
        )
        logger.info(
            "telegram_rejection_incident_synced",
            approval_id=approval_id,
        )
    except Exception as exc:
        logger.error(
            "telegram_rejection_incident_sync_failed",
            approval_id=approval_id,
            error=str(exc),
        )
        return False
    return True
# =============================================================================
# Endpoints
# =============================================================================
@@ -275,50 +198,21 @@ async def telegram_webhook(
)
if approval:
status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
if (
"Cannot sign" in msg
or "already signed" in msg
or "Concurrent modification" in msg
):
logger.info(
"telegram_approval_ignored_already_processed",
approval_id=approval_id,
user_id=user_id,
status=status_value,
message=msg,
)
await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None))
return {
"ok": True,
"message": "Already processed",
"approval_id": approval_id,
"status": status_value,
"execution_triggered": False,
"execution_scheduled": False,
}
execution_scheduled = await _finalize_telegram_approval(
approval=approval,
execution_triggered=execution_triggered,
)
logger.info(
"telegram_approval_signed",
approval_id=approval_id,
user_id=user_id,
status=status_value,
status=approval.status.value,
execution_triggered=execution_triggered,
execution_scheduled=execution_scheduled,
)
await _log_user_action("approve", True, getattr(approval, "incident_id", None))
return {
"ok": True,
"message": "Approved" if execution_triggered else "Signed",
"message": "Approved",
"approval_id": approval_id,
"status": status_value,
"status": approval.status.value,
"execution_triggered": execution_triggered,
"execution_scheduled": execution_scheduled,
}
elif action == "reject":
@@ -330,12 +224,10 @@ async def telegram_webhook(
)
if approval:
incident_synced = await _sync_telegram_rejection(approval_id)
logger.info(
"telegram_approval_rejected",
approval_id=approval_id,
user_id=user_id,
incident_synced=incident_synced,
)
await _log_user_action("reject", False, getattr(approval, "incident_id", None))
@@ -344,7 +236,6 @@ async def telegram_webhook(
"message": "Rejected",
"approval_id": approval_id,
"status": approval.status.value,
"incident_synced": incident_synced,
}
return {"ok": False, "message": "Unknown action"}

View File

@@ -33,8 +33,14 @@ from pydantic import BaseModel, Field
from src.core.config import settings
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
from src.services.alert_rule_engine import get_incident_type, match_rule
from src.services.action_parser import is_safe_kubectl_action
from src.services.security_interceptor import check_webhook_nonce # P0-06: nonce dedup via Service 層
from src.core.logging import get_logger
from src.core.metrics import record_alert_chain_success
# Phase 15.2: Trace Context (moved to SignalProducerService)
# get_trace_context 已移至 Service 層
from src.models.approval import (
ApprovalRequestCreate,
BlastRadius,
@@ -42,43 +48,31 @@ from src.models.approval import (
DryRunCheck,
RiskLevel,
)
# R4 #129 (2026-04-01 ogt): AlertPayload/AlertResponse 移至 models 層AlertAnalyzer 移至 services 層
# ogt 更新 v1.1 2026-04-01 台北時間: generate_alert_fingerprint 移至 alert_analyzer_service (ADR-024)
# [首席架構師] 移除 generate_alert_fingerprint 直接 import改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
from src.models.webhook import AlertPayload, AlertResponse
from src.services.action_parser import is_safe_kubectl_action
from src.services.alert_analyzer_service import AlertAnalyzer
from src.services.alert_approval_guard import guard_alert_approval_action
from src.services.alert_grouping_service import get_alert_grouping_service
from src.services.alert_rule_engine import get_incident_type, match_rule
from src.services.alertmanager_llm_guard import (
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
try_acquire_alertmanager_llm_lock,
)
from src.services.approval_db import get_approval_service
from src.services.auto_approve import get_auto_approve_policy
from src.services.auto_repair_service import AutoRepairService
from src.services.channel_hub import (
record_alertmanager_event,
record_grouped_alert_event,
)
# Phase 15.2: Trace Context (moved to SignalProducerService)
# get_trace_context 已移至 Service 層
# R4 #129 (2026-04-01 ogt): AlertPayload/AlertResponse 移至 models 層AlertAnalyzer 移至 services 層
# ogt 更新 v1.1 2026-04-01 台北時間: generate_alert_fingerprint 移至 alert_analyzer_service (ADR-024)
# [首席架構師] 移除 generate_alert_fingerprint 直接 import改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
# C2 修正 (首席架構師審查 2026-04-10): create_incident_for_approval + extract_affected_services 已移入 Service 層
from src.services.incident_service import (
classify_alert_early,
create_incident_for_approval,
extract_affected_services,
get_incident_service,
)
from src.services.auto_approve import get_auto_approve_policy
from src.services.auto_repair_service import AutoRepairService
# Phase 5: OpenClaw AI Engine
from src.services.openclaw import get_openclaw
from src.services.playbook_match_resolver import resolve_playbook_id_for_alert
from src.services.security_interceptor import check_webhook_nonce # P0-06: nonce dedup via Service 層
from src.services.signal_producer import SignalData, get_signal_producer
# Phase 5: Telegram Gateway (行動戰情室)
@@ -87,6 +81,9 @@ from src.services.telegram_gateway import TelegramGatewayError, get_telegram_gat
# Phase 18.1.7: K8s 資源名稱正規化 已移至 alert_analyzer_service (R4 #129)
from src.utils.timezone import now_taipei
# ADR-076: 告警聚合引擎 (2026-04-14 Claude Haiku 4.5 Asia/Taipei)
from src.services.alert_grouping_service import get_alert_grouping_service
router = APIRouter(prefix="/webhooks", tags=["Webhooks"])
logger = get_logger("awoooi.webhooks")
@@ -139,38 +136,6 @@ def _should_use_alertmanager_rule_first(
)
async def _analyze_alertmanager_with_timeout(
    openclaw,
    alert_context: dict,
    *,
    alert_id: str,
    alertname: str,
) -> tuple:
    """Run Alertmanager AI analysis without letting it block the workflow forever.

    Awaits ``openclaw.analyze_alert(alert_context)`` bounded by
    ``ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS``.

    Args:
        openclaw: Engine object exposing an awaitable ``analyze_alert``.
        alert_context: Context dict handed to the engine unchanged.
        alert_id: Used for logging only.
        alertname: Used for logging only.

    Returns:
        On success, whatever ``analyze_alert`` returns. On timeout or any
        other ``Exception``, a 7-item fallback tuple
        ``(None, "fallback_timeout" | "fallback_error", "", None, "", 0, 0.0)``.
        Callers unpack it as (analysis_result, ai_provider, raw_response,
        signoz_metrics, signoz_trace_url, ai_tokens, ai_cost).
    """
    try:
        return await asyncio.wait_for(
            openclaw.analyze_alert(alert_context),
            timeout=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
        )
    # asyncio.TimeoutError is the builtin TimeoutError on Python 3.11+, but a
    # distinct class before that; naming it explicitly keeps timeouts from
    # falling through to the generic handler below on older interpreters.
    except asyncio.TimeoutError:
        logger.warning(
            "alertmanager_openclaw_timeout_fallback",
            alert_id=alert_id,
            alertname=alertname,
            timeout_sec=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
        )
        return None, "fallback_timeout", "", None, "", 0, 0.0
    except Exception as exc:
        logger.warning(
            "alertmanager_openclaw_failed_fallback",
            alert_id=alert_id,
            alertname=alertname,
            error=str(exc),
        )
        return None, "fallback_error", "", None, "", 0, 0.0
async def _escalate_auto_repair_unavailable(
*,
incident_id: str,
@@ -198,19 +163,6 @@ async def _escalate_auto_repair_unavailable(
)
def _auto_repair_action_label(result, fallback_target: str) -> str:
"""Build a verifier label that includes the actual playbook steps."""
playbook_id = getattr(result, "playbook_id", None) or "unknown"
steps = getattr(result, "executed_steps", None) or []
step_text = " | ".join(str(step) for step in steps).strip()
if not step_text:
step_text = fallback_target
step_text = " ".join(step_text.split())
if len(step_text) > 240:
step_text = f"{step_text[:237]}..."
return f"auto_repair_playbook:{playbook_id} {step_text}".strip()
async def _try_auto_repair_background(
incident_id: str,
approval_id: str,
@@ -300,46 +252,6 @@ async def _try_auto_repair_background(
},
)
_pre_execution_snapshot = None
try:
from src.core.feature_flags import aiops_flags
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_PRE_DECISION_INVESTIGATOR"):
from src.services.evidence_snapshot import get_latest_snapshot
from src.services.post_execution_verifier import get_post_execution_verifier
_pre_execution_snapshot = await get_latest_snapshot(incident_id)
if _pre_execution_snapshot is None:
from src.services.pre_decision_investigator import (
get_pre_decision_investigator,
)
_pre_execution_snapshot = await asyncio.wait_for(
get_pre_decision_investigator().investigate(incident),
timeout=60.0,
)
if _pre_execution_snapshot is not None:
await asyncio.wait_for(
get_post_execution_verifier().capture_pre_execution_state(
incident,
_pre_execution_snapshot,
),
timeout=30.0,
)
except asyncio.TimeoutError:
logger.warning(
"auto_repair_pre_state_capture_timeout",
incident_id=incident_id,
approval_id=approval_id,
)
except Exception as _pre_state_err:
logger.warning(
"auto_repair_pre_state_capture_failed",
incident_id=incident_id,
approval_id=approval_id,
error=str(_pre_state_err),
)
# 執行自動修復
logger.info(
"auto_repair_executing",
@@ -351,7 +263,6 @@ async def _try_auto_repair_background(
playbook=decision.playbook,
is_cold_start=decision.is_cold_start,
similarity_score=decision.similarity_score,
run_post_verification=False,
)
logger.info(
@@ -362,20 +273,6 @@ async def _try_auto_repair_background(
# 記錄執行結果
if result:
try:
await get_approval_service().update_execution_status(
approval_id=approval_id,
success=result.success,
error_message=result.error,
)
except Exception as _approval_status_err:
logger.warning(
"auto_repair_approval_status_update_failed",
approval_id=approval_id,
incident_id=incident_id,
error=str(_approval_status_err),
)
await op_log.append(
"EXECUTION_COMPLETED",
incident_id=incident_id,
@@ -439,10 +336,11 @@ async def _try_auto_repair_background(
from src.services.evidence_snapshot import get_latest_snapshot
from src.services.learning_service import get_learning_service
_snapshot = _pre_execution_snapshot or await get_latest_snapshot(incident_id)
_action_label = _auto_repair_action_label(
result,
fallback_target=f"{target_resource}:{namespace}",
_snapshot = await get_latest_snapshot(incident_id)
_action_label = (
f"{target_resource}:{namespace}"
if not result.success
else f"auto_repair_playbook:{result.playbook_id}"
)
_verifier = get_post_execution_verifier()
_verify_result = await asyncio.wait_for(
@@ -894,7 +792,6 @@ async def verify_webhook_signature(
# 戰略 B: 滑動時間窗 (ADR-073: 5 分鐘改 30 分鐘,防同一問題反覆重建 Incident2026-04-12 ogt)
DEBOUNCE_WINDOW_MINUTES = 30
ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0
# =============================================================================
@@ -1208,12 +1105,7 @@ async def receive_alert(
# 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
openclaw = get_openclaw()
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
openclaw,
alert_context,
alert_id=alert_id,
alertname=alert.alert_type,
)
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
if analysis_result:
# LLM 分析成功
@@ -1255,33 +1147,15 @@ async def receive_alert(
data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE)
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg讓 extra_metadata 可觀測
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
_alertname_cs1 = str((alert.labels or {}).get("alertname") or alert.alert_type or "")
_guarded_action_cs1 = await guard_alert_approval_action(
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
alert_namespace=alert.namespace,
alertname=_alertname_cs1,
alert_category=get_incident_type(_alertname_cs1),
)
_matched_playbook_id_cs1 = await resolve_playbook_id_for_alert(
alertname=_alertname_cs1,
affected_services=analysis_result.affected_services
or ([alert.target_resource] if alert.target_resource else []),
severity=risk_level.value,
)
if _guarded_action_cs1.blocked:
risk_level = RiskLevel.LOW
_cmd_cs1 = ""
_approval_metadata_cs1 = {
"source": ai_provider,
"confidence_score": analysis_result.confidence,
"is_rule_based": False,
"playbook_id": _matched_playbook_id_cs1,
**_guarded_action_cs1.metadata,
"playbook_id": None,
}
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
approval_create = ApprovalRequestCreate(
action=_guarded_action_cs1.action,
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
risk_level=risk_level,
blast_radius=BlastRadius(
@@ -1298,7 +1172,6 @@ async def receive_alert(
],
requested_by=f"OpenClaw ({ai_provider})",
metadata=_approval_metadata_cs1,
matched_playbook_id=_matched_playbook_id_cs1,
)
suggested_action = analysis_result.kubectl_command
else:
@@ -1345,7 +1218,7 @@ async def receive_alert(
# 設計confidence ≥ 0.85 + 非 CRITICAL + 非破壞性 + 有 kubectl 指令 → 直接執行
# 安全防線CRITICAL / destructive patterns / NO_ACTION/INVESTIGATE/OBSERVE / 空 kubectl → 降級 PENDING
if analysis_result:
_cs1_kubectl = _cmd_cs1
_cs1_kubectl = analysis_result.kubectl_command.strip() if analysis_result.kubectl_command else ""
_cs1_can_auto = (
bool(_cs1_kubectl)
and analysis_result.confidence >= 0.85
@@ -1366,7 +1239,7 @@ async def receive_alert(
required_signatures=0,
status=ApprovalStatus.APPROVED,
risk_level=risk_level.value,
matched_playbook_id=_matched_playbook_id_cs1,
matched_playbook_id=None,
metadata={
**_approval_metadata_cs1,
"is_high_confidence": True,
@@ -1547,39 +1420,6 @@ class AlertmanagerPayload(BaseModel):
alerts: list[AlertmanagerAlert]
_CICD_JOB_STATUSES = frozenset({"running", "success", "failed", "pending"})
def _cicd_job_status_from_alert(alert: AlertmanagerAlert) -> str:
"""將 CI/CD Alertmanager label 轉成 TelegramGateway 支援的狀態。
2026-05-12 Codex: Gitea workflow 先送進 AWOOI API不能只靠
severity=info 推 success否則 failed/pending 事件進 AwoooP 後語義會失真。
"""
labels = alert.labels or {}
for key in ("status", "job_status", "ci_status"):
value = str(labels.get(key) or "").strip().lower()
if value in _CICD_JOB_STATUSES:
return value
severity = str(labels.get("severity") or "").strip().lower()
if severity == "info":
return "success"
if severity in {"critical", "error"}:
return "failed"
return "running"
def _cicd_duration_seconds_from_alert(alert: AlertmanagerAlert) -> int:
labels = alert.labels or {}
raw = labels.get("duration_seconds") or labels.get("duration") or 0
try:
value = int(str(raw).strip())
except (TypeError, ValueError):
return 0
return max(value, 0)
def is_internal_ip(client_ip: str) -> bool:
"""檢查是否為內網 IP"""
import ipaddress
@@ -1616,11 +1456,6 @@ async def _process_new_alert_background(
try:
service = get_approval_service()
openclaw = get_openclaw()
traced_alert_labels = {
**(alert_labels or {}),
"fingerprint": fingerprint,
"alert_id": alert_id,
}
rule_response = match_rule(alert_context)
should_bypass_llm = _should_use_alertmanager_rule_first(rule_response, alert_category)
@@ -1654,6 +1489,7 @@ async def _process_new_alert_background(
str(blast.get("data_impact", "NONE")).upper(),
DataImpact.NONE,
)
rule_action_title = str(rule_response.get("action_title", "人工排查主機告警"))
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
rule_description = str(rule_response.get("description", message))
rule_action = (
@@ -1661,31 +1497,13 @@ async def _process_new_alert_background(
if rule_kubectl else
f"NO_ACTION - {rule_description[:120]}"
)
_matched_playbook_id_cs2 = await resolve_playbook_id_for_alert(
rule_id=str(rule_response.get("rule_id", "")),
alertname=alertname,
affected_services=[target_resource] if target_resource else [],
severity=rule_risk.value,
)
_guarded_action_cs2 = await guard_alert_approval_action(
action=rule_action,
alert_namespace=namespace,
alertname=alertname,
alert_category=alert_category,
)
if _guarded_action_cs2.blocked:
rule_action = _guarded_action_cs2.action
rule_kubectl = ""
rule_risk = RiskLevel.LOW
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg讓 extra_metadata 可觀測
_approval_metadata_cs2 = {
"source": "rule_engine",
"confidence_score": float(rule_response.get("confidence", 0.0) or 0.0),
"is_rule_based": True,
"rule_id": str(rule_response.get("rule_id", "")) or None,
"playbook_id": _matched_playbook_id_cs2,
**_guarded_action_cs2.metadata,
"playbook_id": str(rule_response.get("rule_id", "")) or None,
}
approval_create = ApprovalRequestCreate(
action=rule_action,
@@ -1716,7 +1534,6 @@ async def _process_new_alert_background(
],
requested_by="OpenClaw (rule-engine)",
metadata=_approval_metadata_cs2,
matched_playbook_id=_matched_playbook_id_cs2,
)
approval = await service.create_approval_with_fingerprint(
@@ -1748,10 +1565,6 @@ async def _process_new_alert_background(
# 2026-04-27 ogt + Claude Sonnet 4.6: CS2 規則引擎自動執行
# 設計is_rule_based=True 確定性高,滿足條件直接執行,不等人工審核
# 安全防線CRITICAL / destructive patterns / NO_ACTION / 空 kubectl → 全部降級 PENDING
_cs2_auto_approval = None
_cs2_executor = None
_cs2_exec_success: bool | None = None
_cs2_exec_error: str | None = None
try:
from src.models.approval import ApprovalRequest, ApprovalStatus
from src.services.approval_execution import ApprovalExecutionService
@@ -1771,11 +1584,10 @@ async def _process_new_alert_background(
required_signatures=0,
status=ApprovalStatus.APPROVED,
risk_level=rule_risk.value,
matched_playbook_id=_matched_playbook_id_cs2,
matched_playbook_id=_approval_metadata_cs2.get("playbook_id"),
)
# 使用 DB 中剛建立的 approval.id 讓 executor 可回寫
_auto_approval.id = approval.id
_cs2_auto_approval = _auto_approval
_cs2_executor = ApprovalExecutionService()
_cs2_exec_success = await _cs2_executor.execute_approved_action(_auto_approval)
@@ -1798,8 +1610,6 @@ async def _process_new_alert_background(
exec_success=_cs2_exec_success,
)
except Exception as _auto_err:
_cs2_exec_success = False if _cs2_auto_approval is not None else None
_cs2_exec_error = str(_auto_err)
logger.warning(
"cs2_auto_execute_failed_degraded_to_pending",
approval_id=str(approval.id),
@@ -1815,7 +1625,7 @@ async def _process_new_alert_background(
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=traced_alert_labels,
alert_labels=alert_labels,
notification_type=notification_type,
alert_category=alert_category,
)
@@ -1831,41 +1641,6 @@ async def _process_new_alert_background(
error=str(_meta_err),
)
await record_alertmanager_event(
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="incident_linked",
notification_type=notification_type,
alert_category=alert_category,
incident_id=incident_id,
approval_id=str(approval.id),
repeat_count=1,
labels=traced_alert_labels,
annotations=alert_context.get("annotations", {}),
)
if _cs2_auto_approval is not None and _cs2_exec_success is not None:
try:
_cs2_auto_approval.incident_id = incident_id
_cs2_executor = _cs2_executor or ApprovalExecutionService()
await _cs2_executor.finalize_auto_approved_execution(
_cs2_auto_approval,
success=_cs2_exec_success,
error_message=_cs2_exec_error,
)
except Exception as _cs2_finalize_err:
logger.warning(
"cs2_auto_execute_finalize_failed",
approval_id=str(approval.id),
incident_id=incident_id,
error=str(_cs2_finalize_err),
)
_is_heartbeat = is_heartbeat_alertname(alertname)
if can_auto_repair and not _is_heartbeat:
await _try_auto_repair_background(
@@ -1919,12 +1694,7 @@ async def _process_new_alert_background(
record_alert_chain_success("alertmanager")
return
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
openclaw,
alert_context,
alert_id=alert_id,
alertname=alertname,
)
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
if analysis_result:
risk_mapping = {
@@ -1954,34 +1724,15 @@ async def _process_new_alert_background(
data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg讓 extra_metadata 可觀測
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
_guarded_action_cs3 = await guard_alert_approval_action(
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
alert_namespace=namespace,
alertname=alertname,
alert_category=alert_category,
)
_matched_playbook_id_cs3 = await resolve_playbook_id_for_alert(
rule_id=str(rule_response.get("rule_id", "")),
alertname=alertname,
affected_services=analysis_result.affected_services
or ([target_resource] if target_resource else []),
severity=risk_level.value,
)
if _guarded_action_cs3.blocked:
risk_level = RiskLevel.LOW
_cmd_cs3 = ""
_approval_metadata_cs3 = {
"source": ai_provider,
"confidence_score": analysis_result.confidence,
"is_rule_based": False,
"rule_id": str(rule_response.get("rule_id", "")) or None,
"playbook_id": _matched_playbook_id_cs3,
**_guarded_action_cs3.metadata,
"playbook_id": None,
}
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
approval_create = ApprovalRequestCreate(
action=_guarded_action_cs3.action,
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
risk_level=risk_level,
blast_radius=BlastRadius(
@@ -1996,7 +1747,6 @@ async def _process_new_alert_background(
],
requested_by=f"OpenClaw ({ai_provider})",
metadata=_approval_metadata_cs3,
matched_playbook_id=_matched_playbook_id_cs3,
)
approval = await service.create_approval_with_fingerprint(
@@ -2010,7 +1760,7 @@ async def _process_new_alert_background(
"risk_level": risk_level.value,
"confidence": analysis_result.confidence,
"action": approval_create.action,
"kubectl_command": _cmd_cs3,
"kubectl_command": analysis_result.kubectl_command,
"is_rule_based": False,
"source": ai_provider,
}
@@ -2026,7 +1776,7 @@ async def _process_new_alert_background(
logger.warning("shadow_auto_approve_failed", error=str(_shadow_err_cs3))
# 2026-04-27 Claude Sonnet 4.6: CS3 LLM 高信心自動執行修法3擴展
_cs3_kubectl = _cmd_cs3
_cs3_kubectl = (analysis_result.kubectl_command or "").strip()
_cs3_can_auto = (
bool(_cs3_kubectl)
and analysis_result.confidence >= 0.85
@@ -2034,15 +1784,8 @@ async def _process_new_alert_background(
and "NO_ACTION" not in (analysis_result.action_title or "")
and is_safe_kubectl_action(_cs3_kubectl)
)
_cs3_auto_approval = None
_cs3_executor = None
_cs3_exec_success: bool | None = None
_cs3_exec_error: str | None = None
if _cs3_can_auto:
try:
from src.models.approval import ApprovalRequest, ApprovalStatus
from src.services.approval_execution import ApprovalExecutionService
_cs3_auto_approval = ApprovalRequest(
action=approval_create.action,
description=approval_create.description,
@@ -2050,7 +1793,7 @@ async def _process_new_alert_background(
required_signatures=0,
status=ApprovalStatus.APPROVED,
risk_level=risk_level.value,
matched_playbook_id=_matched_playbook_id_cs3,
matched_playbook_id=None,
metadata={
**_approval_metadata_cs3,
"is_high_confidence": True,
@@ -2059,17 +1802,8 @@ async def _process_new_alert_background(
else "cs3_auto_confident_execution",
},
)
_cs3_auto_approval.id = approval.id
_cs3_executor = ApprovalExecutionService()
_cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval)
try:
await service.update_execution_status(approval.id, _cs3_exec_success)
except Exception as _cs3_upd_err:
logger.warning(
"cs3_auto_execute_status_update_failed",
approval_id=str(approval.id),
error=str(_cs3_upd_err),
)
logger.info(
"cs3_llm_auto_executed",
approval_id=str(approval.id),
@@ -2085,8 +1819,6 @@ async def _process_new_alert_background(
),
)
except Exception as _cs3_exec_err:
_cs3_exec_success = False if _cs3_auto_approval is not None else None
_cs3_exec_error = str(_cs3_exec_err)
logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err))
incident_id = await create_incident_for_approval(
@@ -2098,7 +1830,7 @@ async def _process_new_alert_background(
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=traced_alert_labels,
alert_labels=alert_labels,
notification_type=notification_type,
alert_category=alert_category,
)
@@ -2114,41 +1846,6 @@ async def _process_new_alert_background(
error=str(_meta_err),
)
await record_alertmanager_event(
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="incident_linked",
notification_type=notification_type,
alert_category=alert_category,
incident_id=incident_id,
approval_id=str(approval.id),
repeat_count=1,
labels=traced_alert_labels,
annotations=alert_context.get("annotations", {}),
)
if _cs3_auto_approval is not None and _cs3_exec_success is not None:
try:
_cs3_auto_approval.incident_id = incident_id
_cs3_executor = _cs3_executor or ApprovalExecutionService()
await _cs3_executor.finalize_auto_approved_execution(
_cs3_auto_approval,
success=_cs3_exec_success,
error_message=_cs3_exec_error,
)
except Exception as _cs3_finalize_err:
logger.warning(
"cs3_auto_execute_finalize_failed",
approval_id=str(approval.id),
incident_id=incident_id,
error=str(_cs3_finalize_err),
)
root_cause = analysis_result.description or message
estimated_downtime = blast.estimated_downtime if blast else "~30s"
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
@@ -2198,7 +1895,7 @@ async def _process_new_alert_background(
risk_level=risk_level.value,
resource_name=target_resource,
root_cause=root_cause,
suggested_action=approval_create.action,
suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value,
estimated_downtime=estimated_downtime,
hit_count=1,
primary_responsibility=primary_responsibility,
@@ -2224,17 +1921,11 @@ async def _process_new_alert_background(
else:
# LLM 失敗 - 使用預設值
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg讓 extra_metadata 可觀測
_matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
rule_id=str(rule_response.get("rule_id", "")),
alertname=alertname,
affected_services=[target_resource] if target_resource else [],
severity="medium",
)
_approval_metadata_cs4 = {
"source": "fallback",
"confidence_score": None,
"is_rule_based": False,
"playbook_id": _matched_playbook_id_cs4,
"playbook_id": None,
}
fallback_create = ApprovalRequestCreate(
action="OBSERVE",
@@ -2249,7 +1940,6 @@ async def _process_new_alert_background(
dry_run_checks=[],
requested_by="OpenClaw (fallback)",
metadata=_approval_metadata_cs4,
matched_playbook_id=_matched_playbook_id_cs4,
)
approval = await service.create_approval_with_fingerprint(
@@ -2287,7 +1977,7 @@ async def _process_new_alert_background(
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=traced_alert_labels,
alert_labels=alert_labels,
notification_type=notification_type,
alert_category=alert_category,
)
@@ -2303,55 +1993,6 @@ async def _process_new_alert_background(
error=str(_meta_err),
)
await record_alertmanager_event(
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="incident_linked",
notification_type=notification_type,
alert_category=alert_category,
incident_id=fallback_incident_id,
approval_id=str(approval.id),
repeat_count=1,
labels=traced_alert_labels,
annotations=alert_context.get("annotations", {}),
)
_is_heartbeat = is_heartbeat_alertname(alertname)
if can_auto_repair and not _is_heartbeat:
await _try_auto_repair_background(
incident_id=fallback_incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
elif not can_auto_repair and not _is_heartbeat:
from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
_op_log_fallback = get_alert_operation_log_repository()
await _op_log_fallback.append(
"GUARDRAIL_BLOCKED",
incident_id=fallback_incident_id,
approval_id=str(approval.id),
actor="prometheus-rule",
action_detail=f"Prometheus rule 設定 auto_repair=falsefallback 轉人工: {alertname}",
success=False,
context={"alertname": alertname, "auto_repair_flag": False},
)
await _escalate_auto_repair_unavailable(
incident_id=fallback_incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason="Prometheus rule auto_repair=falsefallback 未進入自動修復評估",
attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
)
await _push_to_telegram_background(
approval_id=str(approval.id),
risk_level="medium",
@@ -2441,7 +2082,6 @@ async def alertmanager_webhook(
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062 Q9)
# ==========================================================================
_alert_labels = alert.labels or {}
_alert_annotations = alert.annotations or {}
_alertname_for_log = _alert_labels.get("alertname", "UnknownAlert")
# Q9: auto_repair flag — Rule=false 強制 HITL不觸發自動修復背景任務
_can_auto_repair_by_rule = _alert_labels.get("auto_repair", "true").lower() == "true"
@@ -2457,7 +2097,6 @@ async def alertmanager_webhook(
"alert_id": alert_id,
"alertname": _alertname_for_log,
"labels": _alert_labels,
"annotations": _alert_annotations,
"auto_repair_flag": _can_auto_repair_by_rule,
},
)
@@ -2486,12 +2125,11 @@ async def alertmanager_webhook(
telegram = get_telegram_gateway()
# 解析 CI/CD 狀態
stage = alert.labels.get("stage", "")
job_status = _cicd_job_status_from_alert(alert)
job_status = "success" if alert.labels.get("severity") == "info" else "running"
commit_sha = alert.labels.get("commit", "")
triggered_by = alert.labels.get("triggered_by", "CI")
workflow_url = alert.annotations.get("workflow_url", "")
summary = alert.annotations.get("summary", alertname)
detail_message = alert.annotations.get("description", "")
await telegram.send_cicd_progress(
job_name=summary,
@@ -2499,8 +2137,6 @@ async def alertmanager_webhook(
stage=stage,
commit_sha=commit_sha,
triggered_by=triggered_by,
duration_seconds=_cicd_duration_seconds_from_alert(alert),
message=detail_message,
workflow_url=workflow_url,
)
@@ -2600,22 +2236,6 @@ async def alertmanager_webhook(
target=target_resource,
fingerprint=fingerprint,
)
background_tasks.add_task(
record_alertmanager_event,
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="received",
notification_type=notification_type,
alert_category=alert_category,
source_url=alert.generatorURL,
labels=dict(alert.labels) if alert.labels else {},
annotations=dict(alert.annotations) if alert.annotations else {},
)
# ==========================================================================
# ADR-076: 告警聚合引擎 — 5 分鐘滑動視窗,防止告警風暴
@@ -2646,19 +2266,6 @@ async def alertmanager_webhook(
parent_fingerprint=grouping_result.parent_fingerprint,
reason="Alert storm suppressed — child alert within 5-min window",
)
background_tasks.add_task(
record_grouped_alert_event,
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
group_key=grouping_result.group_key,
count=grouping_result.count,
parent_fingerprint=grouping_result.parent_fingerprint,
fingerprint=fingerprint,
)
return AlertResponse(
success=True,
message=(
@@ -2698,26 +2305,6 @@ async def alertmanager_webhook(
hit_count=updated_approval.hit_count,
reason="Converged alert - Telegram already sent for this fingerprint",
)
background_tasks.add_task(
record_alertmanager_event,
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="converged",
notification_type=notification_type,
alert_category=alert_category,
incident_id=getattr(updated_approval, "incident_id", None),
approval_id=str(updated_approval.id),
repeat_count=updated_approval.hit_count,
is_duplicate=True,
source_url=alert.generatorURL,
labels=dict(alert.labels) if alert.labels else {},
annotations=dict(alert.annotations) if alert.annotations else {},
)
return AlertResponse(
success=True,
@@ -2745,27 +2332,10 @@ async def alertmanager_webhook(
message=message,
source="alertmanager",
alertname=alertname,
alert_labels={**alert.labels, "fingerprint": fingerprint, "alert_id": alert_id},
alert_labels=alert.labels,
notification_type="TYPE-1",
alert_category=alert_category,
)
background_tasks.add_task(
record_alertmanager_event,
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="incident_linked",
notification_type="TYPE-1",
alert_category=alert_category,
incident_id=_info_incident_id,
source_url=alert.generatorURL,
labels={**alert.labels, "fingerprint": fingerprint, "alert_id": alert_id},
annotations=dict(alert.annotations) if alert.annotations else {},
)
# 2026-04-15 ogt: TYPE-1 純資訊告警建立後立即關閉
# 設計原則: backup/heartbeat/info 告警無需追蹤狀態,通知即完成
# 防止 incidents 表無限累積 INVESTIGATING 記錄ADR-073 漏洞修補)
@@ -2785,7 +2355,7 @@ async def alertmanager_webhook(
record_alert_chain_success("alertmanager")
return AlertResponse(
success=True,
message="✅ TYPE-1 純資訊告警已通知 (no LLM)",
message=f"✅ TYPE-1 純資訊告警已通知 (no LLM)",
alert_id=alert_id,
approval_created=False,
)
@@ -2797,23 +2367,6 @@ async def alertmanager_webhook(
fingerprint=fingerprint,
ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
)
background_tasks.add_task(
record_alertmanager_event,
project_id="awoooi",
alert_id=alert_id,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
fingerprint=fingerprint,
stage="llm_inflight_suppressed",
notification_type=notification_type,
alert_category=alert_category,
is_duplicate=True,
source_url=alert.generatorURL,
labels=dict(alert.labels) if alert.labels else {},
annotations=dict(alert.annotations) if alert.annotations else {},
)
return AlertResponse(
success=True,
message="🛡️ 告警已由同指紋背景 AI 分析處理中,跳過重複 LLM 呼叫",

View File

@@ -1,126 +0,0 @@
"""
AwoooP Operator authentication boundary.
ADR-116 Gate 5 approval decisions must not trust browser-supplied identities.
This module accepts a short-lived operator identity only when it is paired with
the server-side AwoooP operator key.
"""
from __future__ import annotations
import re
import secrets
from dataclasses import dataclass
from typing import Annotated
import structlog
from fastapi import Header, HTTPException, status
from src.core.config import settings
logger = structlog.get_logger(__name__)
_OPERATOR_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.:@-]{1,127}$")
_PROD_ENVS = {"prod", "production"}
@dataclass(frozen=True, slots=True)
class AwoooPOperatorPrincipal:
"""Authenticated AwoooP operator principal."""
operator_id: str
auth_method: str
def _auth_error(detail: str = "Operator authentication required") -> HTTPException:
    """Build (but do not raise) the HTTP 401 error used for authentication failures.

    Args:
        detail: Message placed in the response body.

    Returns:
        An ``HTTPException`` carrying status 401; the caller decides when to raise it.
    """
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail=detail,
    )
    return unauthorized
def _clean_operator_id(operator_id: str | None) -> str:
    """Normalise and validate the operator identity header value.

    Args:
        operator_id: Raw header value, or ``None`` when the header is absent.

    Returns:
        The identity with surrounding whitespace removed.

    Raises:
        HTTPException: 401 when the value is absent, 422 when the trimmed
            value does not fully match ``_OPERATOR_ID_RE``.
    """
    if operator_id is not None:
        candidate = operator_id.strip()
        if _OPERATOR_ID_RE.fullmatch(candidate) is not None:
            return candidate
        # Present but malformed: reported as a validation error, not as 401.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
            detail="Invalid operator identity",
        )
    # Header missing entirely: treated as unauthenticated.
    raise _auth_error()
def authenticate_awooop_operator_headers(
    operator_id: str | None,
    operator_key: str | None,
    *,
    configured_key: str | None = None,
    environment: str | None = None,
) -> AwoooPOperatorPrincipal:
    """Validate trusted AwoooP operator headers.

    The operator identity is validated first. The supplied key is then compared
    against the server-side key. When no server-side key is configured, the
    request is rejected in production environments and accepted (with a warning
    log and ``auth_method="dev_header"``) everywhere else.

    Args:
        operator_id: Value from ``X-AwoooP-Operator-Id``.
        operator_key: Value from ``X-AwoooP-Operator-Key``.
        configured_key: Server-side shared key. Defaults to settings.
        environment: Runtime environment. Defaults to settings.

    Returns:
        Authenticated operator principal.

    Raises:
        HTTPException: 401 when authentication is missing/invalid, or 422 for
            malformed operator identity.
    """
    cleaned_operator_id = _clean_operator_id(operator_id)
    expected_key = (
        settings.AWOOOP_OPERATOR_API_KEY
        if configured_key is None
        else configured_key
    )
    # Strip as well as lowercase so a value such as "production " (trailing
    # whitespace from an env file) cannot slip into the dev-only bypass below.
    runtime_env = (environment or settings.ENVIRONMENT or "").strip().lower()

    if not expected_key:
        if runtime_env in _PROD_ENVS:
            # Fail closed: production must never accept unauthenticated operators.
            logger.critical(
                "awooop_operator_key_missing_in_production",
                environment=runtime_env,
            )
            raise _auth_error()
        logger.warning(
            "awooop_operator_key_skipped_dev_only",
            environment=runtime_env,
            operator_id=cleaned_operator_id,
        )
        return AwoooPOperatorPrincipal(
            operator_id=cleaned_operator_id,
            auth_method="dev_header",
        )

    if not operator_key:
        logger.warning("awooop_operator_key_missing", operator_id=cleaned_operator_id)
        raise _auth_error()

    # compare_digest() accepts str operands only when they are pure ASCII and
    # raises TypeError otherwise; a client-supplied non-ASCII header would then
    # surface as an unhandled 500 instead of a 401. Comparing encoded bytes
    # keeps the constant-time comparison and accepts any text.
    keys_match = secrets.compare_digest(
        operator_key.encode("utf-8", "surrogatepass"),
        expected_key.encode("utf-8", "surrogatepass"),
    )
    if not keys_match:
        logger.warning("awooop_operator_key_invalid", operator_id=cleaned_operator_id)
        raise _auth_error()

    return AwoooPOperatorPrincipal(
        operator_id=cleaned_operator_id,
        auth_method="operator_api_key",
    )
async def verify_awooop_operator(
    x_awooop_operator_id: Annotated[
        str | None,
        Header(alias="X-AwoooP-Operator-Id"),
    ] = None,
    x_awooop_operator_key: Annotated[
        str | None,
        Header(alias="X-AwoooP-Operator-Key"),
    ] = None,
) -> AwoooPOperatorPrincipal:
    """FastAPI dependency guarding operator mutation endpoints.

    Reads the ``X-AwoooP-Operator-Id`` and ``X-AwoooP-Operator-Key`` request
    headers and delegates validation to ``authenticate_awooop_operator_headers``
    using the configured settings.

    Returns:
        The authenticated operator principal.
    """
    principal = authenticate_awooop_operator_headers(
        x_awooop_operator_id,
        x_awooop_operator_key,
    )
    return principal

View File

@@ -215,8 +215,8 @@ class Settings(BaseSettings):
description="Phase 25 P0: DIAGNOSE NIM timeout (秒),實測 2.2-27.3s avg 10.6s60s 含 buffer",
)
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
default=300,
description="Ollama diagnose timeout (秒)。GCP qwen3:14b CPU-only can exceed the old 120s proxy limit.",
default=200,
description="Phase 25 P0: Ollama timeout (秒),實測 CPU-only 238s保留欄位但 DIAGNOSE 不再走 Ollama",
)
# ==========================================================================
@@ -370,16 +370,11 @@ class Settings(BaseSettings):
)
return v
# 2026-05-05 Codex: health inference must stay on alert-fast model; qwen2.5
# keeps reloading a 7B model on CPU-only GCP and slows incident fallback.
# 2026-04-25 Claude Engineer-C (P1.1): Ollama 健康檢測推理測試模型
OLLAMA_HEALTH_CHECK_MODEL: str = Field(
default="gemma3:4b",
default="qwen2.5:7b-instruct",
description="OllamaHealthMonitor 推理測試使用模型P1.1",
)
OLLAMA_EMBEDDING_MODEL: str = Field(
default="bge-m3:latest",
description="Ollama embedding model. ADR-110 migrated embeddings from nomic-embed-text to bge-m3.",
)
# 2026-04-12 ogt: 心跳必須確認載入的 Ollama 模型清單
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 升級更新必要模型清單nomic→bge-m3 + 新增 qwen3:14b + hermes3
OLLAMA_REQUIRED_MODELS: list[str] = Field(
@@ -505,7 +500,7 @@ class Settings(BaseSettings):
default=False,
description=(
"Allow LocalCodeReviewService to fall back to Gemini when the "
"local Ollama code-review lane fails. Default false to avoid "
"GCP-B/Ollama code-review lane fails. Default false to avoid "
"unexpected cloud spend from Gitea push/PR alerts."
),
)
@@ -525,22 +520,6 @@ class Settings(BaseSettings):
"then local 111 before cloud backup providers such as Gemini."
),
)
ALERT_OLLAMA_MODEL: str = Field(
default="qwen3:14b",
description=(
"Ollama model used for incident/alert deep diagnosis. Alert cards "
"may wait for this model; Gemini remains a backup after GCP-A, "
"GCP-B, and 111 fail."
),
)
INCIDENT_LLM_TIMEOUT_SECONDS: int = Field(
default=360,
description=(
"Outer timeout for incident OpenClaw proposal generation. This must "
"be long enough for the GCP-A/GCP-B/111 Ollama lane to complete "
"before Gemini backup is considered useful."
),
)
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
NVIDIA_API_KEY: str = Field(
default="",
@@ -602,77 +581,6 @@ class Settings(BaseSettings):
default="",
description="API Key for K8s admin endpoints (X-K8s-Api-Key header)",
)
AWOOOP_OPERATOR_API_KEY: str = Field(
default="",
description=(
"API key for AwoooP operator mutation endpoints "
"(X-AwoooP-Operator-Key header)"
),
)
ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER: bool = Field(
default=False,
description=(
"True=consume ansible_candidate_matched AOL rows and run "
"ansible-playbook --check --diff only. Apply remains disabled."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS: int = Field(
default=300,
ge=60,
description="AwoooP Ansible check-mode worker polling interval.",
)
AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT: int = Field(
default=1,
ge=1,
le=5,
description="Maximum Ansible check-mode candidates claimed per worker tick.",
)
AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS: int = Field(
default=180,
ge=30,
le=600,
description="Timeout for one ansible-playbook --check --diff execution.",
)
AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS: int = Field(
default=120,
ge=0,
le=900,
description="Delay before the check-mode worker first tick after API startup.",
)
AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_PROFILE: str = Field(
default="ssh_mcp",
description=(
"SSH transport profile used by Ansible check-mode. Production uses "
"the existing ssh-mcp key so repair-bot forced-command remains reserved "
"for whitelist repairs."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_SSH_KEY_PATH: str = Field(
default="/run/secrets/ssh_mcp_key",
description="Private key path for Ansible check-mode SSH transport.",
)
AWOOOP_ANSIBLE_CHECK_MODE_KNOWN_HOSTS_PATH: str = Field(
default="/etc/ssh-mcp/known_hosts",
description="known_hosts path for Ansible check-mode SSH transport.",
)
AWOOOP_ANSIBLE_CHECK_MODE_CANDIDATE_MAX_AGE_HOURS: int = Field(
default=24,
ge=1,
le=168,
description=(
"Only recent Ansible candidate audit rows are eligible for automatic "
"check-mode claims; older backlog remains visible but is not drained as noise."
),
)
AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS: int = Field(
default=21_600,
ge=300,
le=86_400,
description=(
"Cooldown after transport-level check-mode blockers such as "
"forced-command repair SSH denial."
),
)
# ==========================================================================
# 統帥鐵律:禁止 SQLite (AWOOOI 憲法)

View File

@@ -37,8 +37,8 @@ REDIS_KEY_DECISION = "decision:"
APPROVAL_TO_INCIDENT_STATUS = {
"pending": "investigating",
"approved": "resolved",
"rejected": "escalated",
"expired": "escalated",
"rejected": "rejected",
"expired": "expired",
}
# Incident 狀態 → 是否活躍

View File

@@ -10,13 +10,51 @@
"""
from __future__ import annotations
from contextvars import ContextVar
from contextvars import ContextVar, Token
# 追蹤當前非同步任務的 project_id
# default="awoooi" 確保未設時也能正常查詢RLS fail-open 保護)
PROJECT_ID: ContextVar[str] = ContextVar("project_id", default="awoooi")
# Fail-Closed: 移除 default="awoooi",進 DB 路徑需要明確租戶標籤
PROJECT_ID: ContextVar[str | None] = ContextVar("project_id")
PROJECT_ID_SOURCE: ContextVar[str | None] = ContextVar("project_id_source")
PROJECT_ID_REQUEST_ID: ContextVar[str | None] = ContextVar("project_id_request_id")
def get_current_project_id() -> str:
def set_project_context(
    project_id: str | None,
    source: str = "runtime",
    request_id: str | None = None,
) -> tuple[Token[str | None], Token[str | None], Token[str | None]]:
    """Bind the project context for the current request/task.

    Args:
        project_id: Tenant/project identifier to bind.
        source: Label recording where the project_id came from.
        request_id: Optional request identifier stored alongside it.

    Returns:
        The ContextVar tokens, in the order (project_id, source, request_id),
        to be handed back later to restore the previous state.
    """
    project_token = PROJECT_ID.set(project_id)
    source_token = PROJECT_ID_SOURCE.set(source)
    request_token = PROJECT_ID_REQUEST_ID.set(request_id)
    return project_token, source_token, request_token
def clear_project_context(tokens: tuple[Token[str | None], Token[str | None], Token[str | None]]) -> None:
    """Undo a previous context binding and restore the prior ContextVar state.

    Args:
        tokens: Tokens ordered as (project_id, source, request_id).
    """
    # Reset in the reverse of the order in which the variables were set.
    restore_plan = (
        (PROJECT_ID_REQUEST_ID, 2),
        (PROJECT_ID_SOURCE, 1),
        (PROJECT_ID, 0),
    )
    for context_var, position in restore_plan:
        context_var.reset(tokens[position])
def get_project_context() -> dict[str, str | None]:
    """Return a snapshot of the current project context (suitable for audit logs).

    Each entry is ``None`` when the corresponding ContextVar has not been set.
    """
    snapshot: dict[str, str | None] = {}
    snapshot["project_id"] = PROJECT_ID.get(None)
    snapshot["source"] = PROJECT_ID_SOURCE.get(None)
    snapshot["request_id"] = PROJECT_ID_REQUEST_ID.get(None)
    return snapshot
def get_current_project_id() -> str | None:
    """Return the project_id bound to the current task (for service-layer use).

    Returns:
        The bound project_id, or ``None`` when no project context has been set.
    """
    # Pass an explicit default: a bare ``PROJECT_ID.get()`` raises LookupError
    # when the variable is unset, which contradicts the ``str | None`` return
    # type and prevents callers from handling the missing-context case.
    return PROJECT_ID.get(None)
def get_current_project_context() -> dict[str, str | None]:
    """Return the traceable project context; an alias of ``get_project_context``
    kept for API naming compatibility."""
    context_snapshot = get_project_context()
    return context_snapshot

View File

@@ -11,7 +11,6 @@ Features:
"""
import logging
import re
import sys
from typing import Any
@@ -20,28 +19,6 @@ from structlog.types import Processor
from src.core.config import settings
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
def _redact_sensitive_log_text(text: str) -> str:
"""遮蔽可能出現在第三方 logger 訊息中的敏感 URL。"""
return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
class SensitiveURLRedactionFilter(logging.Filter):
"""標準 logging filter避免 httpx 等第三方 logger 把 token URL 打進 log。"""
def filter(self, record: logging.LogRecord) -> bool:
record.msg = _redact_sensitive_log_text(str(record.msg))
if isinstance(record.args, tuple):
record.args = tuple(_redact_sensitive_log_text(str(arg)) for arg in record.args)
elif isinstance(record.args, dict):
record.args = {
key: _redact_sensitive_log_text(str(value))
for key, value in record.args.items()
}
return True
def setup_logging() -> None:
"""Configure structlog for the application"""
@@ -91,15 +68,6 @@ def setup_logging() -> None:
stream=sys.stdout,
level=logging.getLevelName(settings.LOG_LEVEL),
)
redaction_filter = SensitiveURLRedactionFilter()
root_logger = logging.getLogger()
root_logger.addFilter(redaction_filter)
for handler in root_logger.handlers:
handler.addFilter(redaction_filter)
# httpx INFO 會輸出完整 request URLTelegram Bot API URL 內含 token。
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
def get_logger(name: str | None = None, **initial_context: Any) -> structlog.BoundLogger:

View File

@@ -17,7 +17,6 @@ PostgreSQL 事務管理器,確保多表操作原子性。
from typing import Any
import structlog
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
logger = structlog.get_logger(__name__)
@@ -50,20 +49,14 @@ class UnitOfWork:
- Redis 操作失敗時必須手動呼叫 rollback()
"""
def __init__(
self,
session_factory: async_sessionmaker[AsyncSession],
project_id: str | None = None,
):
def __init__(self, session_factory: async_sessionmaker[AsyncSession]):
"""
初始化 UnitOfWork
Args:
session_factory: SQLAlchemy async session factory
project_id: RLS project context. None means contextvar/default awoooi.
"""
self._session_factory = session_factory
self._project_id = project_id
self._session: AsyncSession | None = None
self._committed = False
@@ -81,18 +74,9 @@ class UnitOfWork:
async def __aenter__(self) -> "UnitOfWork":
"""進入事務"""
from src.core.context import get_current_project_id
self._session = self._session_factory()
effective_pid = (
self._project_id if self._project_id is not None else get_current_project_id()
)
await self._session.execute(
text("SELECT set_config('app.project_id', :pid, TRUE)"),
{"pid": effective_pid},
)
self._committed = False
logger.debug("uow_started", project_id=effective_pid)
logger.debug("uow_started")
return self
async def __aexit__(

View File

@@ -10,7 +10,7 @@ from __future__ import annotations
from datetime import datetime
from decimal import Decimal
from typing import Any
from uuid import UUID
from uuid import UUID, uuid4
from sqlalchemy import (
Boolean,
@@ -577,8 +577,8 @@ class AwoooPMcpGatewayAudit(Base):
run_id: Mapped[UUID | None] = mapped_column(nullable=True)
trace_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
agent_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
tool_id: Mapped[UUID | None] = mapped_column(
ForeignKey("awooop_mcp_tool_registry.tool_id"), nullable=True
tool_id: Mapped[UUID] = mapped_column(
ForeignKey("awooop_mcp_tool_registry.tool_id"), nullable=False
)
tool_name: Mapped[str] = mapped_column(String(128), nullable=False)
credential_ref: Mapped[str | None] = mapped_column(String(256), nullable=True)
@@ -635,13 +635,6 @@ class AwoooPConversationEvent(Base):
content_type: Mapped[str] = mapped_column(String(32), nullable=False, default="text")
content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True)
content_redacted: Mapped[str | None] = mapped_column(Text, nullable=True)
redaction_version: Mapped[str] = mapped_column(
String(32), nullable=False, server_default=text("'audit_sink_v1'")
)
source_envelope: Mapped[dict[str, Any]] = mapped_column(
JSONB, nullable=False, server_default=text("'{}'::jsonb")
)
attachment_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
is_duplicate: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
provider_ts: Mapped[datetime | None] = mapped_column(nullable=True)
@@ -687,13 +680,6 @@ class AwoooPOutboundMessage(Base):
message_type: Mapped[str] = mapped_column(String(32), nullable=False)
content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True)
content_redacted: Mapped[str | None] = mapped_column(Text, nullable=True)
redaction_version: Mapped[str] = mapped_column(
String(32), nullable=False, server_default=text("'audit_sink_v1'")
)
source_envelope: Mapped[dict[str, Any]] = mapped_column(
JSONB, nullable=False, server_default=text("'{}'::jsonb")
)
provider_message_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
send_status: Mapped[str] = mapped_column(String(16), nullable=False, default="pending")
send_error: Mapped[str | None] = mapped_column(Text, nullable=True)

View File

@@ -16,6 +16,7 @@ Features:
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from fastapi import HTTPException
from sqlalchemy import text
from sqlalchemy.ext.asyncio import (
AsyncEngine,
@@ -26,6 +27,8 @@ from sqlalchemy.ext.asyncio import (
from sqlalchemy.orm import DeclarativeBase
from src.core.config import settings
from src.core.context import get_current_project_context
from src.core.logging import get_logger
# =============================================================================
# Base Model
@@ -42,6 +45,19 @@ class Base(DeclarativeBase):
_engine: AsyncEngine | None = None
_session_factory: async_sessionmaker[AsyncSession] | None = None
logger = get_logger("awoooi.db")
def _raise_unauthorized_db_context(msg: str) -> None:
    """Log the missing tenant context and abort with HTTP 401.

    Args:
        msg: Reason recorded in the ``db_context_missing`` error log.

    Raises:
        HTTPException: Always, with status 401.
    """
    snapshot = get_current_project_context()
    audit_fields = {
        "reason": msg,
        "project_id": snapshot.get("project_id"),
        "project_id_source": snapshot.get("source"),
        "request_id": snapshot.get("request_id"),
    }
    logger.error("db_context_missing", **audit_fields)
    raise HTTPException(status_code=401, detail="Missing tenant context: project_id is required")
def get_engine() -> AsyncEngine:
@@ -103,16 +119,21 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
async def get_items(db: AsyncSession = Depends(get_db)):
...
"""
from src.core.context import get_current_project_id
factory = get_session_factory()
async with factory() as session:
try:
from src.core.context import get_current_project_id
# AwoooP Phase 2.3 (2026-05-04 ogt): SET LOCAL app.project_id 讓 RLS Policy 生效
# 預設 'awoooi',多租戶路由將透過 contextvar 注入實際 project_id
# Fail-Closed RLS: 遇到未授權情境拋出錯誤而非回退到 "awoooi"
pid = get_current_project_id()
if not pid:
_raise_unauthorized_db_context(
"Unauthorized: project_id is missing in context (Fail-Closed RLS)"
)
await session.execute(
text("SELECT set_config('app.project_id', :pid, TRUE)"),
{"pid": get_current_project_id()},
{"pid": pid},
)
yield session
await session.commit()
@@ -126,19 +147,22 @@ async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncS
"""
Context manager for database session (non-FastAPI usage)
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar > "awoooi"
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar(缺失則 fail-closed
- Phase 2.3: 啟用 RLS tenant isolationSET LOCAL app.project_id
- Phase 2.4: 從 asyncio contextvar 讀取 background loop 的 project_id
Usage:
async with get_db_context() as db: # 繼承 contextvar 或預設 awoooi
async with get_db_context() as db: # 繼承 contextvar(缺失將 fail-closed
...
async with get_db_context("other-tenant") as db: # 明確指定 tenant
...
"""
"""
from src.core.context import get_current_project_id
effective_pid = project_id if project_id is not None else get_current_project_id()
if not effective_pid:
_raise_unauthorized_db_context("Unauthorized: project_id is missing in context (Fail-Closed RLS)")
factory = get_session_factory()
async with factory() as session:
try:
@@ -157,9 +181,6 @@ async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncS
# Initialization
# =============================================================================
_DB_BOOTSTRAP_LOCK_NAME = "awoooi:init_db:ddl"
async def init_db() -> None:
"""
Initialize database tables
@@ -168,28 +189,6 @@ async def init_db() -> None:
"""
engine = get_engine()
async with engine.connect() as lock_conn:
# 2026-05-24 ogt + Codex: 兩個 API replica 同時啟動時PostgreSQL 會在
# ALTER TABLE ... IF NOT EXISTS 上互相等待並 deadlock。整段 bootstrap
# DDL 必須序列化,避免 rollout 因一個 pod CrashLoop 變成 1/2 ready。
await lock_conn.execute(
text("SELECT pg_advisory_lock(hashtext(:lock_name))"),
{"lock_name": _DB_BOOTSTRAP_LOCK_NAME},
)
try:
await _run_init_db_ddl(engine)
finally:
await lock_conn.execute(
text("SELECT pg_advisory_unlock(hashtext(:lock_name))"),
{"lock_name": _DB_BOOTSTRAP_LOCK_NAME},
)
async def _run_init_db_ddl(engine: AsyncEngine) -> None:
"""
Run idempotent DB bootstrap DDL while caller holds the bootstrap advisory lock.
"""
# 2026-04-15 ogt: 多 replica 並行啟動競爭修復
# 問題:單一大 transaction 裡兩個 pod 同時建 table → 其中一個 CREATE INDEX 失敗
# PostgreSQL 中 transaction 內任何錯誤導致整個 transaction ROLLBACK

View File

@@ -633,8 +633,6 @@ class AlertOperationLog(Base):
"RESOLVED", "SILENCED", "ESCALATED", "GUARDRAIL_BLOCKED",
"PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED", "BACKUP_TRIGGERED",
"BACKUP_COMPLETED", "BACKUP_FAILED", "APPROVAL_ESCALATED", "CHANGE_APPLIED",
"NOTIFICATION_CLASSIFIED", "MANUAL_FIX_RECORDED", "KM_CONVERTED",
"PLAYBOOK_DRAFT_CREATED", "STATE_GUARD_BLOCKED",
name="alert_event_type", create_type=False,
),
nullable=False, index=True,

View File

@@ -9,7 +9,6 @@ Layer 1 意圖路由(關鍵字正則)→ Ollama 本地模型111→ Tel
debugger/vuln → deepseek-r1:14b推理; code agents → qwen2.5-coder:7b; 其他 → qwen2.5:7b-instruct
"""
from __future__ import annotations
import asyncio
import re
import time
@@ -18,12 +17,12 @@ import httpx
import structlog
from sqlalchemy import text
from src.core.config import settings
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.hermes.agent_loader import get_agent_system_prompt
from src.hermes.display_names import DEFAULT_AGENT, format_response_header
from src.hermes.safety_hooks import is_dangerous_input, is_mutate_intent
from src.services.ollama_endpoint_resolver import resolve_ollama_order
logger = structlog.get_logger(__name__)
@@ -262,48 +261,42 @@ async def process_nl_message(
t0 = time.monotonic()
# 呼叫 Ollama 模型(GCP-A → GCP-B → 111零費用按 agent 選模型)
# 呼叫 Ollama 本地模型111零費用按 agent 選模型)
model = _pick_model(agent_name)
success = False
error_type: str | None = None
result_text = ""
async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc:
for endpoint in resolve_ollama_order("hermes"):
if not endpoint.url:
continue
try:
resp = await _hc.post(
f"{endpoint.url}/api/chat",
json={
"model": model,
# Keep Hermes responses in message.content across Ollama 0.24+.
"think": False,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt_with_ctx},
],
"stream": False,
"options": {"num_predict": 1500, "temperature": 0.3},
},
)
resp.raise_for_status()
result_text = resp.json().get("message", {}).get("content", "")
result_text = _strip_think_tags(result_text)
if not result_text:
result_text = "_Agent 回應為空請稍後再試。_"
success = True
break
except Exception as exc:
error_type = type(exc).__name__
logger.error(
"hermes_nl_ollama_error",
error=str(exc),
agent=agent_name,
model=model,
provider=endpoint.provider_name,
exc_type=error_type,
)
if not success:
try:
ollama_base = getattr(settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary
async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc:
resp = await _hc.post(
f"{ollama_base}/api/chat",
json={
"model": model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt_with_ctx},
],
"stream": False,
"options": {"num_predict": 1500, "temperature": 0.3},
},
)
resp.raise_for_status()
result_text = resp.json().get("message", {}).get("content", "")
result_text = _strip_think_tags(result_text)
if not result_text:
result_text = "_Agent 回應為空請稍後再試。_"
success = True
except Exception as exc:
error_type = type(exc).__name__
logger.error(
"hermes_nl_ollama_error",
error=str(exc),
agent=agent_name,
model=model,
exc_type=error_type,
)
result_text = f"_Hermes 暫時無法連線({error_type}請稍後再試。_"
latency_ms = int((time.monotonic() - t0) * 1000)

View File

@@ -46,7 +46,6 @@ _DEDUP_TTL_SEC = 3600 # 同一告警 1 小時內不重複發送
_TG_SILENCE_THRESHOLD = 2 # PENDING telegram_message_id IS NULL 告警門檻
_FLYWHEEL_SUCCESS_MIN = 0.30 # 執行成功率下限
_STUCK_ANALYSIS_THRESHOLD = 3 # Agent Debate 失敗導致卡住的告警門檻
_TRUST_DRIFT_META_MIN_RATIO = 0.20 # 低於此比例只記治理事件,不升 Meta System
# 2026-05-03 ogt + Claude Opus 4.7 — feedback_silencing_alerts_recurring_violation
# 啟動寬限期30 分鐘內可 skip「資料還沒到」噪音超過寬限期仍空 = 真資料管線斷,必須告警
@@ -211,8 +210,7 @@ async def _check_once() -> None:
from src.services.governance_agent import get_governance_agent
trust_result = await get_governance_agent().check_trust_drift(emit_alert=False)
drifted = trust_result.get("drifted", 0)
drift_ratio = float(trust_result.get("drift_ratio") or 0.0)
if drifted > 0 and drift_ratio >= _TRUST_DRIFT_META_MIN_RATIO:
if drifted > 0:
auto_deprecated = trust_result.get("auto_deprecated", 0)
kept = trust_result.get("kept", 0)
violations.append(
@@ -221,13 +219,6 @@ async def _check_once() -> None:
)
# 2026-05-05 ogt W6 修復:移除動態 low_count避免 count 微變繞過 dedup
violation_codes.append("W6:trust_drift")
elif drifted > 0:
logger.info(
"watchdog_w6_trust_drift_below_meta_threshold",
drifted=drifted,
drift_ratio=round(drift_ratio, 3),
threshold=_TRUST_DRIFT_META_MIN_RATIO,
)
except Exception as e:
logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e))

View File

@@ -1,44 +0,0 @@
"""AwoooP Ansible check-mode worker loop.
Runs only when explicitly enabled by settings. The worker consumes pending
``ansible_candidate_matched`` rows and records check-mode evidence; it never
executes Ansible apply.
"""
from __future__ import annotations
import asyncio
import structlog
from src.core.config import settings
from src.services.awooop_ansible_check_mode_service import run_pending_check_modes_once
logger = structlog.get_logger(__name__)
async def run_awooop_ansible_check_mode_loop() -> None:
    """Drive the Ansible check-mode worker in an endless polling loop.

    Returns immediately (after logging) when
    ``settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER`` is falsy. Otherwise
    it sleeps for the configured startup delay and then repeatedly awaits
    ``run_pending_check_modes_once``, sleeping for the configured interval
    after every round. An exception raised by a round is logged as a warning
    and does not end the loop.
    """
    if not settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER:
        logger.info("awooop_ansible_check_mode_worker_disabled")
        return

    startup_fields = {
        "interval_seconds": settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS,
        "batch_limit": settings.AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT,
        "timeout_seconds": settings.AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS,
    }
    logger.info("awooop_ansible_check_mode_worker_started", **startup_fields)
    await asyncio.sleep(settings.AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS)

    while True:
        try:
            # Settings are re-read on every round, exactly as before.
            outcome = await run_pending_check_modes_once(
                limit=settings.AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT,
                timeout_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS,
            )
            # Only rounds that claimed work or reported blockers are logged.
            had_activity = outcome.get("claimed") or outcome.get("blockers")
            if had_activity:
                logger.info("awooop_ansible_check_mode_worker_tick", **outcome)
        except Exception as error:
            logger.warning("awooop_ansible_check_mode_worker_failed", error=str(error))
        await asyncio.sleep(settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS)

View File

@@ -1,308 +0,0 @@
"""
Hermes KB Growth Worker
=======================
消費 governance_remediation_dispatch 中的 hermes_kb_growth_healthcheck work item
把 knowledge_degradation 告警推進成可審核的 KM 草稿。
邊界:
- 可以建立 REVIEW 狀態的 auto_runbook 草稿,讓 owner 在前端審核。
- 不可以直接把 KM 標成 APPROVED / PUBLISHED。
- 不修改 immutable ai_governance_events流程進度寫回 dispatch.decision_context。
2026-05-19 ogt + Codex: T90 Hermes KB growth healthcheck worker。
"""
from __future__ import annotations
import asyncio
from copy import deepcopy
from typing import Any
import structlog
from src.db.base import get_db_context
from src.db.models import GovernanceRemediationDispatch
from src.models.knowledge import (
EntrySource,
EntryStatus,
EntryType,
KnowledgeEntry,
KnowledgeEntryCreate,
)
from src.repositories.governance_remediation_dispatch_repo import (
InvalidStatusTransition,
list_pending_by_executor,
transition_status,
update_decision_context,
)
from src.repositories.knowledge_repository import KnowledgeDBRepository
logger = structlog.get_logger(__name__)
# Executor type this worker consumes; passed to list_pending_by_executor.
EXECUTOR_TYPE = "hermes_kb_growth_healthcheck"
# Default pause, in seconds, between rounds of run_hermes_kb_growth_loop.
DEFAULT_INTERVAL_SECONDS = 300
# Default maximum number of pending dispatches fetched per round.
DEFAULT_LIMIT = 20
async def run_hermes_kb_growth_once(limit: int = DEFAULT_LIMIT) -> dict[str, int]:
    """Run one round of the Hermes KB growth healthcheck.

    Fetches up to ``limit`` pending dispatches for ``EXECUTOR_TYPE`` and
    processes them one by one. A dispatch whose status transition is rejected
    is counted as skipped; any other failure is counted as failed and the
    dispatch is pushed towards the ``failed`` status.

    Returns:
        Counters keyed ``scanned``, ``processed``, ``skipped`` and ``failed``,
        intended for logs and smoke tests.
    """
    pending = await list_pending_by_executor(EXECUTOR_TYPE, limit=limit)

    processed = 0
    skipped = 0
    failed = 0
    for dispatch in pending:
        try:
            await _process_dispatch(dispatch)
        except InvalidStatusTransition as transition_error:
            skipped += 1
            logger.info(
                "hermes_kb_growth_dispatch_skipped",
                dispatch_id=dispatch.id,
                event_id=dispatch.governance_event_id,
                reason=str(transition_error),
            )
        except Exception as error:
            failed += 1
            logger.exception(
                "hermes_kb_growth_dispatch_failed",
                dispatch_id=dispatch.id,
                event_id=dispatch.governance_event_id,
                error=str(error),
            )
            await _mark_failed_if_started(dispatch.id, str(error))
        else:
            processed += 1

    summary = {
        "scanned": len(pending),
        "processed": processed,
        "skipped": skipped,
        "failed": failed,
    }
    # Stay silent when the round found nothing at all.
    if any(summary.values()):
        logger.info("hermes_kb_growth_once_completed", **summary)
    return summary
async def run_hermes_kb_growth_loop(
    interval_seconds: int = DEFAULT_INTERVAL_SECONDS,
    limit: int = DEFAULT_LIMIT,
) -> None:
    """Background loop that periodically consumes Hermes KB growth dispatches.

    Args:
        interval_seconds: Seconds to sleep after every round.
        limit: Maximum number of dispatches handed to each round.

    ``asyncio.CancelledError`` is re-raised; any other exception raised by a
    round is logged with its traceback and the loop carries on.
    """

    async def _run_round_safely() -> None:
        # One round; only cancellation is allowed to escape.
        try:
            await run_hermes_kb_growth_once(limit=limit)
        except asyncio.CancelledError:
            raise
        except Exception as error:
            logger.exception("hermes_kb_growth_loop_error", error=str(error))

    logger.info(
        "hermes_kb_growth_loop_started",
        interval_seconds=interval_seconds,
        limit=limit,
    )
    while True:
        await _run_round_safely()
        await asyncio.sleep(interval_seconds)
async def _process_dispatch(row: GovernanceRemediationDispatch) -> None:
    """Process one pending dispatch and leave it waiting for owner review.

    Moves the dispatch ``pending`` -> ``dispatched`` -> ``executing``, creates
    (or reuses) the KM review draft, writes the review context back to the
    dispatch, and finally moves it to ``succeeded``. Exceptions from any step
    propagate to the caller.
    """
    claimed = await transition_status(row.id, "pending", "dispatched")
    running = await transition_status(claimed.id, "dispatched", "executing")

    draft = await _create_or_get_km_review_draft(running)

    review_context = _build_review_context(
        running.decision_context or {},
        dispatch_id=running.id,
        governance_event_id=running.governance_event_id,
        km_entry_id=draft.id,
    )
    await update_decision_context(running.id, review_context)
    await transition_status(running.id, "executing", "succeeded")

    logger.info(
        "hermes_kb_growth_review_draft_ready",
        dispatch_id=running.id,
        event_id=running.governance_event_id,
        km_entry_id=draft.id,
        workflow_stage="waiting_owner_review",
    )
async def _create_or_get_km_review_draft(
    dispatch: GovernanceRemediationDispatch,
) -> KnowledgeEntry:
    """Create the REVIEW-status KM draft, or return the one that already exists.

    Idempotency is tag based: an entry tagged with this governance event is
    preferred, then one tagged with this dispatch; a new entry is created only
    when neither lookup finds anything.
    """
    dispatch_tag = f"dispatch:{dispatch.id}"
    event_tag = f"governance_event:{dispatch.governance_event_id}"
    # Built before the DB context is opened, so a payload error costs no session.
    payload = _build_km_review_entry_payload(dispatch)

    async with get_db_context() as db:
        repo = KnowledgeDBRepository(db)
        # Lookup order matters: the governance event tag wins over the dispatch tag.
        for tag in (event_tag, dispatch_tag):
            matches, _ = await repo.list_entries(tags=[tag], limit=1)
            if matches:
                return matches[0]
        return await repo.create(payload)
def _build_km_review_entry_payload(
    dispatch: GovernanceRemediationDispatch,
) -> KnowledgeEntryCreate:
    """Turn a governance dispatch into the payload of a to-be-reviewed KM draft.

    Impact figures are read from ``decision_context["workflow"]["impact"]``
    with a fallback to ``decision_context["extra"]`` (or, for the ratio and
    threshold, to the context itself). Missing values render as ``unknown``.
    The entry is always created with ``EntryStatus.REVIEW``.
    """

    def _dict_at(mapping: dict[str, Any], name: str) -> dict[str, Any]:
        # Value stored under ``name`` when it is a dict, otherwise an empty dict.
        candidate = mapping.get(name)
        return candidate if isinstance(candidate, dict) else {}

    context = dispatch.decision_context or {}
    workflow = _dict_at(context, "workflow")
    impact = _dict_at(workflow, "impact")
    extra = _dict_at(context, "extra")
    # Ownership on the context wins; ``extra`` is only consulted when it is empty.
    ownership = _dict_at(context, "ownership") or _dict_at(extra, "ownership")

    stale_count = _pick_first(impact, extra, key="stale_count")
    total_count = _pick_first(impact, extra, key="total_count")
    stale_ratio = _pick_first(impact, context, key="stale_ratio")
    threshold = _pick_first(impact, context, key="threshold")
    stale_days = _pick_first(impact, extra, key="stale_days")

    lead_agent = ownership.get("lead_agent") or "Hermes"
    human_owner = ownership.get("human_owner") or "KM owner / SRE owner"

    event_id = dispatch.governance_event_id
    dispatch_id = dispatch.id

    # Each inner list is one markdown section; sections are separated by a blank line.
    sections = [
        ["# KM 健康檢查草稿"],
        [
            "## 來源",
            f"- governance_event_id: {event_id}",
            f"- dispatch_id: {dispatch_id}",
            f"- executor_type: {dispatch.executor_type}",
        ],
        [
            "## 影響摘要",
            f"- stale_count: {_format_unknown(stale_count)}",
            f"- total_count: {_format_unknown(total_count)}",
            f"- stale_ratio: {_format_ratio(stale_ratio)}",
            f"- threshold: {_format_ratio(threshold)}",
            f"- stale_days: {_format_unknown(stale_days)}",
        ],
        [
            "## AI 已完成",
            "- Hermes 已接手 knowledge_degradation dispatch。",
            "- 已產生 KM 更新草稿與 owner review work item。",
            "- 尚未把任何條目標成 approved / published。",
        ],
        [
            "## Owner 審核重點",
            "- 優先反查最近被 Incident、Sentry、SigNoz、PlayBook 引用的 KM。",
            "- 確認草稿內容沒有把過期處置方式寫回正式知識庫。",
            "- 審核通過後再進入 km_writeback_after_approval。",
        ],
        [
            "## 安全邊界",
            "- writes_km_without_approval=false",
            f"- lead_agent={lead_agent}",
            f"- human_owner={human_owner}",
        ],
    ]
    content = "\n\n".join("\n".join(section) for section in sections)

    tags = [
        "governance:knowledge_degradation",
        "workflow:kb_growth_healthcheck",
        "stage:waiting_owner_review",
        "agent:Hermes",
        "needs_owner_review",
        f"dispatch:{dispatch_id}",
        f"governance_event:{event_id}",
    ]
    return KnowledgeEntryCreate(
        title=f"KM healthcheck review draft - {event_id[:8]}",
        content=content,
        entry_type=EntryType.AUTO_RUNBOOK,
        category="AI治理",
        tags=tags,
        source=EntrySource.AI_EXTRACTED,
        status=EntryStatus.REVIEW,
        path_type="hermes_kb_growth_healthcheck",
        created_by="hermes_kb_growth_worker",
    )
def _build_review_context(
    context: dict[str, Any],
    *,
    dispatch_id: str,
    governance_event_id: str,
    km_entry_id: str,
) -> dict[str, Any]:
    """Return a copy of the dispatch context marked as waiting for owner review.

    The input is deep-copied and never mutated. A ``workflow`` entry (and its
    ``stage_by_dispatch_status`` mapping) that is missing or not a dict is
    replaced by a fresh dict before the review fields are written, so Work
    Items / Telegram can show that the dispatch is parked at owner review.
    """
    review_stage = "waiting_owner_review"
    next_action = "owner_review_km_draft"

    updated = deepcopy(context)

    workflow = updated.get("workflow")
    if not isinstance(workflow, dict):
        workflow = {}
    updated["workflow"] = workflow

    stages = workflow.get("stage_by_dispatch_status")
    if not isinstance(stages, dict):
        stages = {}
    workflow["stage_by_dispatch_status"] = stages

    stages["executing"] = "draft_km_updates"
    stages["succeeded"] = review_stage
    stages["failed"] = "needs_manual_km_triage"

    workflow.update(
        {
            "current_stage": review_stage,
            "next_action": next_action,
            "needs_human_review": True,
            "writes_km_without_approval": False,
            "kb_draft_entry_id": km_entry_id,
        }
    )

    updated.update(
        {
            "next_action": next_action,
            "decision_path": "draft_created_waiting_owner_review",
            "proposed_action": "Hermes 已建立 KM 更新草稿,等待 owner 審核",
            "worker_result": {
                "worker": "Hermes",
                "executor_type": EXECUTOR_TYPE,
                "dispatch_id": dispatch_id,
                "governance_event_id": governance_event_id,
                "km_draft_entry_id": km_entry_id,
                "stage": review_stage,
                "status": "draft_created",
                "writes_km_without_approval": False,
            },
        }
    )
    return updated
async def _mark_failed_if_started(dispatch_id: str, error: str) -> None:
    """Move a dispatch this worker already claimed to ``failed``, keeping the error.

    Tries the transition from ``executing`` first and then from ``dispatched``.
    A rejected transition falls through to the next candidate status; any other
    exception is logged as a warning and ends the attempt. At most the first
    500 characters of ``error`` are stored.
    """
    candidate_statuses = ("executing", "dispatched")
    for current_status in candidate_statuses:
        try:
            await transition_status(
                dispatch_id,
                current_status,
                "failed",
                last_error=error[:500],
            )
        except InvalidStatusTransition:
            # The dispatch is not in this status; try the next one.
            continue
        except Exception as transition_error:
            logger.warning(
                "hermes_kb_growth_mark_failed_failed",
                dispatch_id=dispatch_id,
                from_status=current_status,
                error=str(transition_error),
            )
            return
        else:
            return
def _pick_first(*sources: dict[str, Any], key: str) -> Any:
for source in sources:
if key in source:
return source[key]
return None
def _format_unknown(value: Any) -> str:
return "unknown" if value is None else str(value)
def _format_ratio(value: Any) -> str:
try:
return f"{float(value) * 100:.1f}%"
except (TypeError, ValueError):
return "unknown"

View File

@@ -1,289 +0,0 @@
"""
Incident Lifecycle Reconciler
=============================
把已有強證據的舊 stuck incident 收斂回 RESOLVED。
範圍刻意保守:
- auto_repair_executions.success = true
- approval_records.status = EXECUTION_SUCCESS
- approval_records.status = EXPIRED
不處理單純 APPROVED / NO_ACTION / manual_required避免把仍需人工的事件
誤當作自動修復完成。
"""
from __future__ import annotations
import asyncio
from dataclasses import dataclass
import httpx
import structlog
from sqlalchemy import text
from src.core.config import settings
from src.db.base import get_db_context
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# Default maximum number of incidents handled by one reconcile_stuck_incidents call.
BATCH_LIMIT = 100
# Seconds slept between rounds of run_incident_lifecycle_reconciler_loop (30 minutes).
INTERVAL_SECONDS = 1800
# Timeout, in seconds, of the httpx client that queries Prometheus.
_PROMETHEUS_TIMEOUT_SECONDS = 5.0
@dataclass(frozen=True)
class LifecycleCandidate:
    """A stuck incident selected for reconciliation, plus how to resolve it."""

    # Identifier of the incident to resolve.
    incident_id: str
    # Forwarded as ``resolution_type`` to IncidentService.resolve_incident.
    resolution_type: str
    # Machine-readable reason the incident was selected; only used in logs here.
    reason: str
    # True: resolve through _resolve_db_only instead of IncidentService.
    direct_db_only: bool = False
async def run_incident_lifecycle_reconciler_loop() -> None:
    """Reconcile a small batch of stuck incidents every ``INTERVAL_SECONDS``.

    Each round awaits ``reconcile_stuck_incidents`` and logs a summary only
    when something was resolved or failed. An exception raised by a round is
    logged as a warning and the loop continues.
    """
    while True:
        try:
            resolved, errors = await reconcile_stuck_incidents()
            something_happened = any(count > 0 for count in (resolved, errors))
            if something_happened:
                logger.info(
                    "incident_lifecycle_reconciler_done",
                    resolved=resolved,
                    errors=errors,
                    batch_limit=BATCH_LIMIT,
                )
        except Exception as error:
            logger.warning("incident_lifecycle_reconciler_loop_failed", error=str(error))
        await asyncio.sleep(INTERVAL_SECONDS)
async def reconcile_stuck_incidents(limit: int = BATCH_LIMIT) -> tuple[int, int]:
    """Resolve incidents that are finished but still stuck in INVESTIGATING.

    Evidence-backed candidates are fetched first. Any remaining room in the
    batch is filled with incidents whose alert is no longer firing (or that are
    stale duplicates), but only when the firing-alert lookup succeeded.

    Args:
        limit: Maximum number of candidates handled in this call.

    Returns:
        ``(resolved_count, error_count)``.
    """
    candidates = await _fetch_candidates(limit)

    spare_slots = max(0, limit - len(candidates))
    if spare_slots > 0:
        firing_alertnames = await _fetch_active_alertnames()
        # None means the lookup failed: fail closed and add nothing.
        if firing_alertnames is not None:
            already_selected = {candidate.incident_id for candidate in candidates}
            extra_candidates = await _fetch_inactive_or_duplicate_alert_candidates(
                limit=spare_slots,
                active_alertnames=firing_alertnames,
                exclude_incident_ids=already_selected,
            )
            candidates.extend(extra_candidates)

    if not candidates:
        return 0, 0

    from src.services.incident_service import get_incident_service

    incident_service = get_incident_service()

    async def _resolve(candidate: LifecycleCandidate):
        # Dispatch to the DB-only path or to the IncidentService path.
        if candidate.direct_db_only:
            return await _resolve_db_only(candidate.incident_id)
        return await incident_service.resolve_incident(
            candidate.incident_id,
            resolution_type=candidate.resolution_type,
            emit_postmortem=False,
        )

    resolved_count = 0
    error_count = 0
    for candidate in candidates:
        try:
            # A falsy result counts as neither resolved nor failed.
            if await _resolve(candidate):
                resolved_count += 1
                logger.info(
                    "incident_lifecycle_reconciled",
                    incident_id=candidate.incident_id,
                    reason=candidate.reason,
                    resolution_type=candidate.resolution_type,
                    direct_db_only=candidate.direct_db_only,
                )
        except Exception as error:
            error_count += 1
            logger.warning(
                "incident_lifecycle_reconcile_failed",
                incident_id=candidate.incident_id,
                reason=candidate.reason,
                error=str(error),
            )
    return resolved_count, error_count
async def _fetch_active_alertnames() -> set[str] | None:
    """Read the alertnames that are currently firing from Prometheus.

    Runs the instant query ``ALERTS{alertstate="firing"}`` against
    ``settings.PROMETHEUS_URL``.

    Returns:
        The set of firing alertnames (possibly empty), or ``None`` when the
        answer cannot be trusted. ``None`` means fail-closed: the caller must
        not treat any alert as inactive. It is returned on transport/HTTP/JSON
        errors and also when the decoded body is not a well-formed query
        response, so a malformed payload is never mistaken for "nothing is
        firing" and never aborts the whole reconciliation round.
    """
    try:
        async with httpx.AsyncClient(timeout=_PROMETHEUS_TIMEOUT_SECONDS) as client:
            response = await client.get(
                f"{settings.PROMETHEUS_URL.rstrip('/')}/api/v1/query",
                params={"query": 'ALERTS{alertstate="firing"}'},
            )
            response.raise_for_status()
            payload = response.json()
    except Exception as exc:
        logger.warning("incident_lifecycle_active_alerts_fetch_failed", error=str(exc))
        return None

    # Validate the envelope before trusting it: {"status": ..., "data": {"result": [...]}}.
    data = payload.get("data") if isinstance(payload, dict) else None
    series = data.get("result") if isinstance(data, dict) else None
    reported_error = isinstance(payload, dict) and payload.get("status") == "error"
    if reported_error or not isinstance(series, list):
        logger.warning(
            "incident_lifecycle_active_alerts_payload_invalid",
            payload_type=type(payload).__name__,
        )
        return None

    active_alertnames: set[str] = set()
    for item in series:
        metric = item.get("metric") if isinstance(item, dict) else None
        alertname = metric.get("alertname") if isinstance(metric, dict) else None
        if alertname:
            active_alertnames.add(alertname)

    logger.info(
        "incident_lifecycle_active_alerts_loaded",
        active_alert_count=len(active_alertnames),
    )
    return active_alertnames
async def _resolve_db_only(incident_id: str) -> bool:
    """Mark an incident ``resolved`` directly through the incident repository.

    ``updated_at`` and ``resolved_at`` are both set to the same
    ``now_taipei()`` timestamp. Returns whatever the repository's
    ``update_status`` call returns.
    """
    from src.repositories.incident_repository import get_incident_repository

    timestamp = now_taipei()
    repository = get_incident_repository()
    outcome = await repository.update_status(
        incident_id=incident_id,
        status="resolved",
        updated_at=timestamp,
        resolved_at=timestamp,
    )
    return outcome
async def _fetch_candidates(limit: int) -> list[LifecycleCandidate]:
    """Fetch stuck incidents that already have strong completion evidence.

    The query selects incidents whose status is ``INVESTIGATING`` and that
    were created at least 24 hours ago, keeping those with a successful
    ``auto_repair_executions`` row, or an ``approval_records`` row whose
    status is ``EXECUTION_SUCCESS`` or ``EXPIRED``. The first two kinds of
    evidence yield ``resolution_type='auto_repair'``; an expired approval
    yields ``'timeout'``. Newest incidents come first, capped at ``limit``.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        One ``LifecycleCandidate`` per row, with ``direct_db_only`` left at
        its default of ``False``.
    """
    async with get_db_context() as db:
        result = await db.execute(
            text(
                """
WITH stale AS (
SELECT
i.incident_id,
i.created_at,
EXISTS (
SELECT 1
FROM auto_repair_executions are
WHERE are.incident_id = i.incident_id
AND are.success IS TRUE
) AS has_success_auto_repair,
EXISTS (
SELECT 1
FROM approval_records ar
WHERE ar.incident_id = i.incident_id
AND ar.status::text = 'EXECUTION_SUCCESS'
) AS has_execution_success,
EXISTS (
SELECT 1
FROM approval_records ar
WHERE ar.incident_id = i.incident_id
AND ar.status::text = 'EXPIRED'
) AS has_expired_approval
FROM incidents i
WHERE i.status = 'INVESTIGATING'
AND i.created_at <= now() - interval '24 hours'
)
SELECT
incident_id,
CASE
WHEN has_success_auto_repair THEN 'auto_repair'
WHEN has_execution_success THEN 'auto_repair'
ELSE 'timeout'
END AS resolution_type,
CASE
WHEN has_success_auto_repair THEN 'auto_repair_execution_success'
WHEN has_execution_success THEN 'approval_execution_success'
ELSE 'approval_expired'
END AS reason
FROM stale
WHERE has_success_auto_repair
OR has_execution_success
OR has_expired_approval
ORDER BY created_at DESC
LIMIT :limit
"""
            ),
            {
                "limit": limit,
            },
        )
        rows = result.mappings().all()
    # Values are coerced to str so the dataclass fields match their annotations.
    return [
        LifecycleCandidate(
            incident_id=str(row["incident_id"]),
            resolution_type=str(row["resolution_type"]),
            reason=str(row["reason"]),
        )
        for row in rows
    ]
async def _fetch_inactive_or_duplicate_alert_candidates(
    *,
    limit: int,
    active_alertnames: set[str],
    exclude_incident_ids: set[str],
) -> list[LifecycleCandidate]:
    """
    Select old incidents whose alert is no longer firing, plus older duplicates
    of an alertname that is still firing.

    Only incidents in ``INVESTIGATING`` that are at least 24 hours old and not
    in ``exclude_incident_ids`` are considered. Within each alertname the
    newest incident has ``rn = 1``; that newest incident is kept out of the
    result while its alertname is still active. Oldest incidents come first,
    capped at ``limit``.

    When the active alertnames cannot be read from Prometheus/Alertmanager,
    the caller fails closed and does not call this function.

    Returns:
        Candidates with ``resolution_type="timeout"`` and
        ``direct_db_only=True``.
    """
    # An empty set is replaced by a single placeholder value; presumably this
    # keeps the ANY(:param) array binds non-empty — confirm against the driver.
    active_list = list(active_alertnames) or ["__no_active_alertnames__"]
    exclude_list = list(exclude_incident_ids) or ["__no_excluded_incidents__"]
    async with get_db_context() as db:
        result = await db.execute(
            text(
                """
WITH ranked AS (
SELECT
i.incident_id,
i.alertname,
i.created_at,
row_number() OVER (
PARTITION BY i.alertname
ORDER BY i.created_at DESC, i.incident_id DESC
) AS rn
FROM incidents i
WHERE i.status = 'INVESTIGATING'
AND i.created_at <= now() - interval '24 hours'
AND NOT (i.incident_id = ANY(:exclude_incident_ids))
)
SELECT
incident_id,
CASE
WHEN alertname = ANY(:active_alertnames)
THEN 'active_duplicate_stale'
ELSE 'inactive_alert_stale'
END AS reason
FROM ranked
WHERE NOT (alertname = ANY(:active_alertnames) AND rn = 1)
ORDER BY created_at ASC
LIMIT :limit
"""
            ),
            {
                "active_alertnames": active_list,
                "exclude_incident_ids": exclude_list,
                "limit": limit,
            },
        )
        rows = result.mappings().all()
    return [
        LifecycleCandidate(
            incident_id=str(row["incident_id"]),
            resolution_type="timeout",
            reason=str(row["reason"]),
            direct_db_only=True,
        )
        for row in rows
    ]

View File

@@ -28,7 +28,7 @@ from datetime import timedelta
import structlog
from sqlalchemy import select, update
from src.db.base import get_db_context
from src.db.base import get_session_factory
from src.db.models import AiGovernanceEvent, KnowledgeEntryRecord
from src.utils.timezone import now_taipei
@@ -129,7 +129,7 @@ class KbRotCleaner:
rot_reasons: dict[str, list[str]] = {}
total = 0
async with get_db_context() as session:
async with get_session_factory()() as session:
# 只掃 active 狀態(非 archived
q = await session.execute(
select(KnowledgeEntryRecord).where(
@@ -193,7 +193,7 @@ class KbRotCleaner:
if not result.stale_ids:
return
async with get_db_context() as session:
async with get_session_factory()() as session:
# 逐條更新(避免 bulk update 覆蓋 tags JSONB
q = await session.execute(
select(KnowledgeEntryRecord).where(
@@ -220,7 +220,7 @@ class KbRotCleaner:
async def _save_event(self, result: RotScanResult) -> None:
"""寫 kb_stale 事件到 ai_governance_events。"""
try:
async with get_db_context() as session:
async with get_session_factory()() as session:
event = AiGovernanceEvent(
event_type="kb_stale",
details=result.to_dict(),

View File

@@ -25,9 +25,7 @@ Feature Flag
from __future__ import annotations
import asyncio
import json
import structlog
from src.core.config import settings

View File

@@ -33,7 +33,7 @@ from datetime import timedelta
import structlog
from sqlalchemy import and_, select, update
from src.db.base import get_db_context
from src.db.base import get_session_factory
from src.db.models import KnowledgeEntryRecord
from src.models.knowledge import EntryStatus
from src.utils.timezone import now_taipei
@@ -112,7 +112,8 @@ class KnowledgeDecayJob:
cutoff = now_taipei() - timedelta(days=DECAY_AGE_DAYS)
decayable_statuses = [EntryStatus.DRAFT.value, EntryStatus.REVIEW.value]
async with get_db_context() as db:
session_factory = get_session_factory()
async with session_factory() as db:
# 查30 天未引用view_count=0且 updated_at < cutoff 的 draft/review 條目
stmt = select(KnowledgeEntryRecord).where(
and_(

View File

@@ -29,7 +29,7 @@ from datetime import timedelta
import structlog
from sqlalchemy import and_, select
from src.db.base import get_db_context
from src.db.base import get_session_factory
from src.db.models import AgentSession, AiGovernanceEvent, AutoRepairExecution, IncidentEvidence
from src.utils.timezone import now_taipei
@@ -109,7 +109,9 @@ class OfflineReplayService:
async def _run_replay(self) -> OfflineReplayReport:
cutoff = now_taipei() - timedelta(days=REPLAY_LOOKBACK_DAYS)
async with get_db_context() as db:
session_factory = get_session_factory()
async with session_factory() as db:
# 1. 取最近 N 個有 AgentSession(coordinator) 的 Incident
stmt = (
select(AgentSession.incident_id)
@@ -135,7 +137,7 @@ class OfflineReplayService:
)
results: list[IncidentReplayResult] = []
async with get_db_context() as db:
async with session_factory() as db:
for incident_id in incident_ids:
r = await self._replay_one(db, incident_id)
results.append(r)

View File

@@ -20,6 +20,7 @@ Date: 2026-03-20
import asyncio
import os
from uuid import uuid4
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
@@ -31,26 +32,20 @@ from fastapi.responses import JSONResponse, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
from src.api.v1 import agents as agents_v1 # Phase 9.5: Agent Teams API
from src.api.v1 import ai as ai_v1
from src.api.v1 import (
ai_governance as ai_governance_v1, # 2026-05-02: /governance 頁面 3 endpoints
)
from src.api.v1 import ai_slo as ai_slo_v1 # Phase 6 ADR-087: AI SLO 自我治理
from src.api.v1 import aider_events as aider_events_v1 # aider-watch v2 ADR-091
from src.api.v1 import ai_governance as ai_governance_v1 # 2026-05-02: /governance 頁面 3 endpoints
from src.api.v1 import ai_slo as ai_slo_v1 # Phase 6 ADR-087: AI SLO 自我治理
from src.api.v1 import aiops_kpi as aiops_kpi_v1 # ADR-090 § Phase 7 KPI Dashboard
from src.api.v1 import (
aiops_timeline as aiops_timeline_v1, # 2026-04-27 Wave8-X3 B4 timeline endpoint
)
from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
from src.api.v1 import aiops_timeline as aiops_timeline_v1 # 2026-04-27 Wave8-X3 B4 timeline endpoint
from src.api.v1 import approvals as approvals_v1
from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
from src.api.v1 import audit_logs as audit_logs_v1
from src.api.v1 import auto_repair as auto_repair_v1 # #8: 自動升級決策
from src.api.v1 import csrf as csrf_v1 # Phase 20: CSRF Protection
from src.api.v1 import dashboard as dashboard_v1
from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection
from src.api.v1 import errors as errors_v1 # #40: Sentry 錯誤 BFF API
from src.api.v1 import (
gitea_webhook as gitea_webhook_v1, # ADR-059: Gitea → OpenClaw (GitHub → Gitea 遷移)
@@ -62,20 +57,19 @@ from src.api.v1 import incidents as incidents_v1 # Phase 6.4: Decision Proposal
from src.api.v1 import knowledge as knowledge_v1 # KB Phase 1: Knowledge Base
from src.api.v1 import learning as learning_v1 # Phase D-G P0: Learning API
from src.api.v1 import metrics as metrics_v1 # Phase 7: Gold Metrics (真實血脈)
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
from src.api.v1 import notifications as notifications_v1 # 2026-04-10: 通知頻道狀態
from src.api.v1 import (
platform as platform_v1, # AwoooP Phase 4: Platform ShellShadow Mode
)
from src.api.v1 import playbooks as playbooks_v1 # #7: Playbook 萃取
from src.api.v1 import proposals as proposals_v1 # Phase 6.4h: Proposals CRUD API
from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫
from src.api.v1 import (
sentry_webhook as sentry_webhook_v1, # Phase 10.2.1: Sentry → Telegram
)
from src.api.v1 import (
signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037)
)
from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection
from src.api.v1 import platform as platform_v1 # AwoooP Phase 4: Platform ShellShadow Mode
from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
from src.api.v1 import notifications as notifications_v1 # 2026-04-10: 通知頻道狀態
from src.api.v1 import stats as stats_v1 # Phase 6.5: Statistics Analytics
from src.api.v1 import telegram as telegram_v1 # Phase 5.4: Telegram Gateway
from src.api.v1 import telegram_webhook as telegram_webhook_v1 # ADR-094: Webhook入口
@@ -83,13 +77,11 @@ from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
from src.api.v1 import timeline as timeline_v1
from src.api.v1 import webhooks as webhooks_v1
from src.core.config import settings
from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 feature flags 啟動驗證
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import (
close_redis_pool,
close_worker_redis_pool,
init_redis_pool,
)
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.services.flywheel_stats_service import get_flywheel_stats_service
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry
@@ -101,10 +93,7 @@ from src.routers import proposals as proposals_router
# Legacy route imports (to be migrated)
from src.routes import agent, notifications, pipelines, plugins
from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service
from src.services.alert_chain_metrics_service import get_alert_chain_metrics_service
from src.services.executor import close_executor
from src.services.flywheel_stats_service import get_flywheel_stats_service
# Phase 5: OpenClaw AI Engine
from src.services.openclaw import close_openclaw
@@ -199,10 +188,9 @@ else:
@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
"""Application lifespan events"""
# AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context
# asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi
from src.core.context import PROJECT_ID
PROJECT_ID.set("awoooi")
# AwoooP Phase 2.4 (2026-05-04 ogt):
# 改為不再在 lifespan 初始化預設 project_id context
# 後續請求皆需透過 middleware/runtime 攜帶 project_id 注入,否則拒絕查詢。
# Startup
logger.info(
@@ -279,21 +267,16 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# 2026-04-05 ogt: 重開機後 Redis 清空,從 DB restore 未解決的 incidents
# 統帥批准: 數據必須長久記錄,重開機後自動恢復 Working Memory
try:
from sqlalchemy import select
from src.services.incident_service import get_incident_service
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import IncidentStatus
from src.services.incident_service import get_incident_service
from sqlalchemy import select
incident_service = get_incident_service()
async with get_db_context() as db:
result = await db.execute(
select(IncidentRecord).where(
IncidentRecord.status.in_([
IncidentStatus.INVESTIGATING,
IncidentStatus.MITIGATING,
])
IncidentRecord.status.in_(["investigating", "mitigating"])
)
)
records = result.scalars().all()
@@ -301,16 +284,31 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
restored = 0
for record in records:
try:
incident = incident_service._record_to_incident(record)
from src.models.incident import Incident
incident = Incident(
incident_id=record.incident_id,
status=record.status,
severity=record.severity,
signals=record.signals or [],
affected_services=record.affected_services or [],
decision_chain=record.decision_chain,
proposal_ids=record.proposal_ids or [],
outcome=record.outcome,
created_at=record.created_at,
updated_at=record.updated_at,
resolved_at=record.resolved_at,
closed_at=record.closed_at,
ttl_days=record.ttl_days,
vectorized=record.vectorized,
# ADR-073: 分類欄位必須還原,否則 KM 寫入時全為 "unknown"
notification_type=record.notification_type,
alert_category=record.alert_category,
)
if await incident_service.save_to_working_memory(incident):
restored += 1
except Exception as record_error:
except Exception:
# 舊資料 source 值不合法node-exporter 等)→ 跳過
logger.warning(
"working_memory_warmup_record_skipped",
incident_id=getattr(record, "incident_id", None),
error=str(record_error),
)
pass
logger.info("working_memory_warmed_up", restored=restored, total=len(records))
except Exception as e:
@@ -353,9 +351,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
logger.warning("playbook_pg_backfill_schedule_failed", error=str(e))
try:
from src.services.playbook_embedding_service import (
ensure_playbook_embeddings_indexed,
)
from src.services.playbook_embedding_service import ensure_playbook_embeddings_indexed
asyncio.create_task(ensure_playbook_embeddings_indexed())
logger.info("playbook_embedding_indexing_scheduled")
except Exception as e:
@@ -503,40 +499,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
except Exception as e:
logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
# T73: 已有完成證據但仍卡在 INVESTIGATING 的舊 incident 小批次收斂。
# 僅處理 auto-repair success / approval EXECUTION_SUCCESS / approval EXPIRED
# 不自動關閉 manual_required 或單純 APPROVED 事件。
try:
from src.jobs.incident_lifecycle_reconciler import (
INTERVAL_SECONDS as INCIDENT_LIFECYCLE_RECONCILER_INTERVAL,
)
from src.jobs.incident_lifecycle_reconciler import (
run_incident_lifecycle_reconciler_loop,
)
asyncio.create_task(run_incident_lifecycle_reconciler_loop())
logger.info(
"incident_lifecycle_reconciler_scheduled",
interval_sec=INCIDENT_LIFECYCLE_RECONCILER_INTERVAL,
)
except Exception as e:
logger.warning("incident_lifecycle_reconciler_schedule_failed", error=str(e))
# AwoooP Ansible check-mode worker.
# 只執行 ansible-playbook --check --diff 並回寫 automation_operation_log
# apply 仍必須走 approval gate本 worker 不寫 auto_repair_executions。
try:
from src.jobs.awooop_ansible_check_mode_job import (
run_awooop_ansible_check_mode_loop,
)
asyncio.create_task(run_awooop_ansible_check_mode_loop())
logger.info(
"awooop_ansible_check_mode_worker_scheduled",
enabled=settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER,
interval_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS,
)
except Exception as e:
logger.warning("awooop_ansible_check_mode_worker_schedule_failed", error=str(e))
# ADR-083 Phase 3: Evolver Agent每日— Playbook 自動合併 + 低信任封存
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
try:
@@ -548,9 +510,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# ADR-104 T2: LLM Playbook DRAFT governance每小時
try:
from src.jobs.playbook_generation_governance_job import (
run_playbook_generation_governance_loop,
)
from src.jobs.playbook_generation_governance_job import run_playbook_generation_governance_loop
asyncio.create_task(run_playbook_generation_governance_loop())
logger.info(
"playbook_generation_governance_loop_scheduled",
@@ -594,11 +554,11 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# 2026-04-27 P3.1-T3 by Claude
try:
from src.utils.timezone import now_taipei
from datetime import datetime as _dt
async def _run_kb_rot_cleaner_loop() -> None:
import asyncio as _asyncio
from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner
import asyncio as _asyncio
while True:
try:
now = now_taipei()
@@ -689,24 +649,14 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
except Exception as e:
logger.warning("governance_dispatcher_schedule_failed", error=str(e))
# T90 2026-05-19 ogt + Codex: Hermes KB growth worker(每 5 分鐘)
# 消費 knowledge_degradation 的 hermes_kb_growth_healthcheck dispatch,
# 只產生 REVIEW 草稿並停在 owner review,不直接批准或發布 KM。
try:
from src.jobs.hermes_kb_growth_worker import run_hermes_kb_growth_loop
asyncio.create_task(run_hermes_kb_growth_loop())
logger.info("hermes_kb_growth_worker_scheduled", interval_sec=300)
except Exception as e:
logger.warning("hermes_kb_growth_worker_schedule_failed", error=str(e))
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
# OllamaFailoverManager + OllamaAutoRecoveryService 飛輪接線:
# failover 切換時 → recovery_callback → set_current_primary → Redis 持久化
# recovery service 每 30s 檢查 → 111 連續 3 次 HEALTHY → 自動切回 → clear_cache
# 順序:先取 singleton → wire callback → 啟動 recovery service(才能接收 callback)
try:
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
from src.services.ollama_failover_manager import get_ollama_failover_manager
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
_failover_mgr = get_ollama_failover_manager()
_recovery_svc = get_ollama_auto_recovery_service()
@@ -719,8 +669,8 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# alerter 還沒注入 Redis → dedup fail-open,告警會送出且無 dedup 保護(重複告警風險)
# 修法:configure_alerter() 提前到 start() 之前,Redis pool 在 lifespan 早期已就緒
try:
from src.core.redis_client import get_redis
from src.services.failover_alerter import configure_alerter
from src.core.redis_client import get_redis
configure_alerter(get_redis())
logger.info("failover_alerter_configured")
except Exception as _alerter_err:
@@ -804,7 +754,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# Phase 6.1: 關閉 Signal Worker (先關閉 Consumer)
await close_signal_worker()
await close_worker_redis_pool()
await publisher.stop()
await close_executor()
await close_openclaw()
@@ -857,8 +806,11 @@ else:
# Middleware
# =============================================================================
# 2026-04-03 ogt: Nginx 反向代理修正 — 讓 FastAPI 信任 X-Forwarded-Proto
# 避免 /api/v1/knowledge 等 redirect 在 HTTPS 反向代理後產生 http:// Location
# 2026-04-03 ogt: Nginx 反向代理修正 — 讓 FastAPI 信任 X-Forwarded-Proto
# 解決問題: /api/v1/knowledge (無結尾斜線) 307 redirect 產生 http:// Location
# 原因: FastAPI 不知道自己在 HTTPS 後面,redirect 回 http://
# 效果: 有了此中間件,307 Location 會是 https://
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
# CORS - Strict Whitelist (Iron Law #2)
@@ -868,7 +820,7 @@ app.add_middleware(
allow_origins=settings.CORS_ORIGINS,
allow_credentials=True,
allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
allow_headers=["Authorization", "Content-Type", "X-Request-ID", "X-Project-ID", "X-Tenant-ID"],
expose_headers=["X-Request-ID"],
)
@@ -886,27 +838,53 @@ async def request_logging_middleware(request: Request, call_next):
"""
import time
request_id = request.headers.get("X-Request-ID", "-")
from src.core.context import clear_project_context, get_current_project_context, set_project_context
request_id = request.headers.get("X-Request-ID") or str(uuid4())
project_id = (
request.headers.get("X-Project-ID")
or request.headers.get("X-Tenant-ID")
or request.query_params.get("project_id")
)
project_id = project_id.strip() if project_id else None
source = "request.project_id.missing"
if project_id:
source = "request.header_or_query"
context_tokens = set_project_context(
project_id=project_id,
source=source,
request_id=request_id,
)
start_time = time.perf_counter()
# Bind request context for all logs in this request
structlog.contextvars.clear_contextvars()
current_context = get_current_project_context()
structlog.contextvars.bind_contextvars(
request_id=request_id,
method=request.method,
path=request.url.path,
project_id=current_context["project_id"],
project_context_source=current_context["source"],
)
log = get_logger("awoooi.http")
log.debug("request_start")
response = await call_next(request)
try:
response = await call_next(request)
finally:
clear_project_context(context_tokens)
duration_ms = (time.perf_counter() - start_time) * 1000
log.info(
"request_complete",
status_code=response.status_code,
duration_ms=round(duration_ms, 2),
project_id=current_context["project_id"],
project_context_source=current_context["source"],
has_project_context=bool(current_context["project_id"]),
)
# Add request ID to response headers
@@ -914,6 +892,26 @@ async def request_logging_middleware(request: Request, call_next):
return response
@app.get("/api/v1/security/db-context-guard")
async def db_context_guard() -> dict:
    """
    Context guard endpoint (P1-1 runtime evidence).

    - When no project context is supplied (X-Project-ID / X-Tenant-ID header,
      or the ``project_id`` query parameter), the request is expected to be
      answered with 401, which indicates that RLS is fail-closed.
    - When a context is supplied, a snapshot of that context is returned so
      that it can be audited.
    """
    # Imported locally, so these modules are only loaded when the endpoint runs.
    from src.core.context import get_current_project_context
    from src.db.base import get_db_context
    # NOTE(review): nothing in this body raises the 401 itself; presumably
    # entering get_db_context() rejects a missing project context — confirm.
    async with get_db_context():
        return {
            "status": "ok",
            "project_context": get_current_project_context(),
            "source": "runtime_guard",
        }
# =============================================================================
# Exception Handlers
# =============================================================================
@@ -1054,15 +1052,6 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
"""Prometheus metrics endpoint for alerting"""
# 2026-05-19 Codex — T85 Alert Chain DB evidence refresh.
# record_alert_chain_success() 是 process-local gauge部署後第一個 scrape
# 可能尚未收到新 webhook導致 smoke test 誤判 metric 不存在。
# 先用 AwoooP inbound / alert_operation_log 的 durable evidence 回填 last_success。
try:
await get_alert_chain_metrics_service().refresh_last_success_gauge()
except Exception as exc:
logger.warning("prometheus_metrics_alert_chain_evidence_error", error=str(exc))
content = generate_latest().decode("utf-8")
# 2026-05-07 ogt + Claude Sonnet 4.6 — INC-20260507-99ADF2 修復
# 飛輪指標awoooi_flywheel_*)原本只在 /api/v1/stats/flywheel/metrics 暴露,
@@ -1073,13 +1062,6 @@ async def prometheus_metrics() -> Response:
content += flywheel_metrics.to_prometheus_lines()
except Exception:
logger.warning("prometheus_metrics_flywheel_error")
# 2026-05-14 Codex — T18 ADR-100 SLO emitter
# GovernanceAgent 讀 Prometheus recording rules若 /metrics 不吐底層 DB totals
# sli:* rules 會全空並每小時重複發 governance_slo_data_gap。
try:
content += await get_adr100_slo_metrics_service().to_prometheus_lines()
except Exception as exc:
logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc))
return Response(content=content, media_type=CONTENT_TYPE_LATEST)

View File

@@ -167,8 +167,6 @@ class ApprovalRequest(ApprovalRequestBase):
fingerprint: str | None = Field(default=None, description="告警指紋 Hash")
hit_count: int = Field(default=1, description="聚合觸發次數")
last_seen_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc), description="最後觸發時間")
telegram_message_id: int | None = Field(default=None, description="Telegram approval card message ID")
telegram_chat_id: int | None = Field(default=None, description="Telegram chat ID for the approval card")
# 2026-04-14 Claude Sonnet 4.6: incident_id 已移至 Base避免 ApprovalRequestCreate 缺欄位)
@property
@@ -218,10 +216,6 @@ class ApprovalRequestResponse(BaseModel):
hit_count: int = 1
last_seen_at: datetime | None = None
# Phase 6.5: Incident 關聯 (用於簽核後更新 Incident 狀態)
incident_id: str | None = None
matched_playbook_id: str | None = None
telegram_message_id: int | None = None
telegram_chat_id: int | None = None
metadata: dict | None = None
@classmethod
@@ -247,10 +241,6 @@ class ApprovalRequestResponse(BaseModel):
hit_count=approval.hit_count,
last_seen_at=approval.last_seen_at,
# Phase 6.5
incident_id=approval.incident_id,
matched_playbook_id=approval.matched_playbook_id,
telegram_message_id=approval.telegram_message_id,
telegram_chat_id=approval.telegram_chat_id,
metadata=approval.metadata,
)

View File

@@ -87,27 +87,13 @@ class DispatchItem(BaseModel):
governance_event_id: str
event_type: str
dispatch_status: str
executor_type: str | None = None
proposed_action: str = Field(description="≤120 字動作摘要")
playbook_id: str | None = None
playbook_trust: float | None = Field(default=None, ge=0.0, le=1.0)
created_at: datetime
dispatched_at: datetime | None = None
started_at: datetime | None = None
completed_at: datetime | None = None
operator_note: str | None = None
decision_path: str | None = None
workflow_stage: str | None = None
workflow_steps: list[str] = Field(default_factory=list)
next_action: str | None = None
lead_agent: str | None = None
support_agents: list[str] = Field(default_factory=list)
human_owner: str | None = None
kb_draft_entry_id: str | None = None
worker_status: str | None = None
dry_run_plan_fingerprint: str | None = None
archived_count: int | None = None
stale_ratio_snapshot: dict | None = None
class GovernanceQueueResponse(BaseModel):
@@ -121,475 +107,6 @@ class GovernanceQueueResponse(BaseModel):
)
# =============================================================================
# Endpoint 2B: KM review draft dedupe
# =============================================================================
class KnowledgeReviewDraftDedupeGroup(BaseModel):
    # Presumably one group of KM review drafts attached to the same governance
    # event: a canonical entry plus the entries considered its duplicates.
    governance_event_id: str
    canonical_entry_id: str
    canonical_title: str
    canonical_updated_at: datetime | None = None
    # Restricted to the two literal values below.
    preferred_source: Literal["dispatch_context", "latest_review_draft"]
    # Defaults to a fresh empty list per instance.
    duplicate_entry_ids: list[str] = Field(default_factory=list)
    duplicate_count: int
    total_entries: int
    suggested_action: str
    owner_action: str
    # Both flags default to False.
    writes_on_read: bool = False
    can_archive_without_owner_approval: bool = False
    # DispatchItem rows; defaults to an empty list.
    archive_history: list[DispatchItem] = Field(default_factory=list)
class KnowledgeReviewDraftDedupeResponse(BaseModel):
    # Envelope around a list of KnowledgeReviewDraftDedupeGroup plus totals.
    # schema_version defaults to "km_review_draft_dedupe_v1".
    schema_version: str = "km_review_draft_dedupe_v1"
    total_review_drafts: int
    event_group_total: int
    duplicate_draft_total: int
    groups: list[KnowledgeReviewDraftDedupeGroup]
    generated_at: datetime
class KnowledgeReviewDraftArchiveRequest(BaseModel):
    # Request body naming a canonical entry and the duplicate entries to act on.
    # Required, 1-120 characters.
    canonical_entry_id: str = Field(min_length=1, max_length=120)
    # Required, between 1 and 100 ids.
    duplicate_entry_ids: list[str] = Field(min_length=1, max_length=100)
    # Defaults to "operator_console"; 1-100 characters.
    owner: str = Field(default="operator_console", min_length=1, max_length=100)
    # Both flags default to False.
    owner_approved: bool = False
    dry_run: bool = False
    # Optional, at most 80 characters; see the field description.
    dry_run_plan_fingerprint: str | None = Field(
        default=None,
        max_length=80,
        description="Dry-run response fingerprint that must be echoed before a write.",
    )
class KnowledgeReviewDraftStaleRatioSnapshot(BaseModel):
    # Stale-ratio figures captured together; all five fields are required.
    # NOTE(review): presumably stale_ratio == stale_count / total_count — this
    # model does not enforce it; confirm against the producer.
    stale_count: int
    total_count: int
    stale_ratio: float
    threshold: float
    stale_days: int
class KnowledgeReviewDraftArchiveResponse(BaseModel):
    # Result payload paired with KnowledgeReviewDraftArchiveRequest.
    # schema_version defaults to "km_review_draft_archive_v1".
    schema_version: str = "km_review_draft_archive_v1"
    governance_event_id: str
    canonical_entry_id: str
    requested_duplicate_entry_ids: list[str]
    # The three id lists below each default to an empty list.
    archived_entry_ids: list[str] = Field(default_factory=list)
    skipped_entry_ids: list[str] = Field(default_factory=list)
    would_archive_entry_ids: list[str] = Field(default_factory=list)
    # Restricted to the three literal outcomes.
    status: Literal["dry_run", "archived", "noop_already_archived"]
    owner: str
    owner_approved: bool
    dry_run: bool
    writes_km: bool
    writes_governance_audit: bool
    audit_dispatch_id: str | None = None
    stale_ratio_snapshot: KnowledgeReviewDraftStaleRatioSnapshot | None = None
    # Defaults to "not_requested".
    stale_ratio_recheck_status: Literal[
        "dry_run",
        "completed",
        "already_active",
        "not_requested",
    ] = "not_requested"
    stale_ratio_recheck_dispatch_id: str | None = None
    dry_run_plan_fingerprint: str | None = None
    # Defaults to "stale_ratio_recheck".
    next_action: str = "stale_ratio_recheck"
    generated_at: datetime
# =============================================================================
# Endpoint 2C: KM stale candidates
# =============================================================================
class KnowledgeStaleCandidate(BaseModel):
    # One KM entry reported as a stale candidate, with its priority and the
    # action recommended for it.
    entry_id: str
    project_id: str
    title: str
    entry_type: str
    category: str | None = None
    status: str
    source: str | None = None
    updated_at: datetime | None = None
    stale_days: int
    view_count: int
    priority_score: int
    # Tier is one of "P0", "P1", "P2".
    priority_tier: Literal["P0", "P1", "P2"]
    recommended_action: Literal[
        "refresh_with_evidence",
        "owner_review",
        "archive_or_supersede",
    ]
    # List fields default to empty lists.
    reasons: list[str] = Field(default_factory=list)
    correlation_sources: list[str] = Field(default_factory=list)
    # Optional cross-references; all default to None.
    related_incident_id: str | None = None
    related_playbook_id: str | None = None
    related_approval_id: str | None = None
    tags: list[str] = Field(default_factory=list)
    # Optional owner-review fields; all default to None.
    owner_review_dispatch_id: str | None = None
    owner_review_status: str | None = None
    owner_review_stage: str | None = None
    owner_review_next_action: str | None = None
class KnowledgeStaleCandidatesResponse(BaseModel):
    # Envelope around a list of KnowledgeStaleCandidate.
    # schema_version defaults to "km_stale_candidates_v1".
    schema_version: str = "km_stale_candidates_v1"
    project_id: str
    total_stale: int
    returned: int
    threshold_days: int
    # Defaults: writes_on_read=False, manual_review_required=True.
    writes_on_read: bool = False
    manual_review_required: bool = True
    items: list[KnowledgeStaleCandidate]
    generated_at: datetime
class KnowledgeStaleOwnerReviewRequest(BaseModel):
    # Request body for an owner review; every field has a default.
    # owner: 1-100 characters, defaults to "operator_console".
    owner: str = Field(default="operator_console", min_length=1, max_length=100)
    # owner_note: optional, at most 240 characters.
    owner_note: str | None = Field(default=None, max_length=240)
    dry_run: bool = False
class KnowledgeStaleOwnerReviewResponse(BaseModel):
    # Result payload paired with KnowledgeStaleOwnerReviewRequest.
    # schema_version defaults to "km_stale_owner_review_v1".
    schema_version: str = "km_stale_owner_review_v1"
    entry_id: str
    project_id: str
    # Restricted to the three literal outcomes.
    status: Literal["dry_run", "queued", "already_queued"]
    governance_event_id: str | None = None
    dispatch_id: str | None = None
    workflow_stage: str
    recommended_action: Literal[
        "refresh_with_evidence",
        "owner_review",
        "archive_or_supersede",
    ]
    owner: str
    owner_note: str | None = None
    # writes_km defaults to False; writes_governance_audit is required.
    writes_km: bool = False
    writes_governance_audit: bool
    # Defaults to "owner_review_stale_km_candidate".
    next_action: str = "owner_review_stale_km_candidate"
    generated_at: datetime
class KnowledgeStaleOwnerReviewBatchQueueRequest(BaseModel):
    # Request body for queueing owner reviews in a batch; every field has a default.
    # project_id: 1-64 characters, defaults to "awoooi".
    project_id: str = Field(default="awoooi", min_length=1, max_length=64)
    # Defaults to ["P0", "P1"]; must contain between 1 and 3 tiers.
    priority_tiers: list[Literal["P0", "P1", "P2"]] = Field(
        default_factory=lambda: ["P0", "P1"],
        min_length=1,
        max_length=3,
    )
    # Defaults to 10; accepted range is 1..50 inclusive.
    limit: int = Field(default=10, ge=1, le=50)
    owner: str = Field(default="operator_console", min_length=1, max_length=100)
    owner_note: str | None = Field(default=None, max_length=240)
    dry_run: bool = False
    # Optional, at most 80 characters; see the field description.
    dry_run_plan_fingerprint: str | None = Field(
        default=None,
        max_length=80,
        description="Dry-run response fingerprint that must be echoed before queueing a batch.",
    )
class KnowledgeStaleOwnerReviewBatchItem(BaseModel):
    # Per-entry row inside a batch queue response.
    entry_id: str
    title: str
    priority_tier: Literal["P0", "P1", "P2"]
    recommended_action: Literal[
        "refresh_with_evidence",
        "owner_review",
        "archive_or_supersede",
    ]
    # Per-item outcome; restricted to the four literals.
    status: Literal["would_queue", "queued", "already_queued", "skipped"]
    reason: str | None = None
    governance_event_id: str | None = None
    dispatch_id: str | None = None
    workflow_stage: str
class KnowledgeStaleOwnerReviewBatchQueueResponse(BaseModel):
    # Result payload paired with KnowledgeStaleOwnerReviewBatchQueueRequest.
    # schema_version defaults to "km_stale_owner_review_batch_v1".
    schema_version: str = "km_stale_owner_review_batch_v1"
    project_id: str
    # Batch-level outcome; restricted to the three literals.
    status: Literal["dry_run", "queued", "noop_already_queued"]
    owner: str
    owner_note: str | None = None
    dry_run: bool
    # Plain strings here, unlike the Literal-typed tiers on the request model.
    priority_tiers: list[str]
    requested_limit: int
    candidate_count: int
    queued_count: int
    already_queued_count: int
    skipped_count: int
    batch_governance_event_id: str | None = None
    batch_dispatch_id: str | None = None
    workflow_stage: str
    # writes_km defaults to False; writes_governance_audit is required.
    writes_km: bool = False
    writes_governance_audit: bool
    stale_ratio_snapshot: KnowledgeReviewDraftStaleRatioSnapshot | None = None
    dry_run_plan_fingerprint: str | None = None
    items: list[KnowledgeStaleOwnerReviewBatchItem] = Field(default_factory=list)
    # Defaults to "owner_review_stale_km_batch".
    next_action: str = "owner_review_stale_km_batch"
    generated_at: datetime
class KnowledgeStaleOwnerReviewInboxItem(BaseModel):
    # One owner-review dispatch as listed in the inbox response.
    dispatch_id: str
    governance_event_id: str
    entry_id: str
    project_id: str
    title: str
    dispatch_status: str
    workflow_stage: str
    next_action: str | None = None
    owner: str | None = None
    owner_note: str | None = None
    # Optional batch references; both default to None.
    batch_governance_event_id: str | None = None
    batch_dispatch_id: str | None = None
    priority_tier: Literal["P0", "P1", "P2"]
    priority_score: int
    recommended_action: Literal[
        "refresh_with_evidence",
        "owner_review",
        "archive_or_supersede",
    ]
    stale_days: int
    view_count: int
    # List fields default to empty lists.
    correlation_sources: list[str] = Field(default_factory=list)
    reasons: list[str] = Field(default_factory=list)
    related_incident_id: str | None = None
    related_playbook_id: str | None = None
    related_approval_id: str | None = None
    dry_run_plan_fingerprint: str | None = None
    # Optional timestamps; all default to None.
    queued_at: datetime | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None
class KnowledgeStaleOwnerReviewInboxResponse(BaseModel):
    # Envelope around a list of KnowledgeStaleOwnerReviewInboxItem.
    # schema_version defaults to "km_stale_owner_review_inbox_v1".
    schema_version: str = "km_stale_owner_review_inbox_v1"
    project_id: str
    dispatch_status: str
    total: int
    returned: int
    # Defaults: writes_on_read=False, manual_review_required=True.
    writes_on_read: bool = False
    manual_review_required: bool = True
    items: list[KnowledgeStaleOwnerReviewInboxItem] = Field(default_factory=list)
    generated_at: datetime
class KnowledgeStaleOwnerReviewBurnDownItem(BaseModel):
    # One completion record in the burn-down response, with optional
    # stale-ratio figures attached.
    completion_dispatch_id: str
    governance_event_id: str
    source_dispatch_id: str | None = None
    recheck_dispatch_id: str | None = None
    entry_id: str | None = None
    project_id: str
    dispatch_status: str
    workflow_stage: str
    # One of the three literal outcomes, or None (the default).
    review_outcome: Literal[
        "refresh_with_evidence",
        "archive",
        "supersede",
    ] | None = None
    owner: str | None = None
    completed_at: datetime | None = None
    stale_ratio_snapshot: KnowledgeReviewDraftStaleRatioSnapshot | None = None
    # NOTE(review): the baseline these deltas are measured against is not
    # visible in this model — confirm with the producer.
    stale_count_delta: int | None = None
    stale_ratio_delta: float | None = None
    above_threshold: bool | None = None
class KnowledgeStaleOwnerReviewBurnDownResponse(BaseModel):
    # Envelope around a list of KnowledgeStaleOwnerReviewBurnDownItem plus
    # aggregate counters.
    # schema_version defaults to "km_stale_owner_review_burndown_v1".
    schema_version: str = "km_stale_owner_review_burndown_v1"
    project_id: str
    # Restricted to the three literal states.
    burn_down_status: Literal["above_threshold", "at_or_below_threshold", "no_data"]
    current_snapshot: KnowledgeReviewDraftStaleRatioSnapshot | None = None
    entries_to_threshold: int
    pending_owner_reviews: int
    completed_owner_reviews: int
    completion_audit_total: int
    stale_ratio_recheck_total: int
    latest_stale_count_delta: int | None = None
    latest_stale_ratio_delta: float | None = None
    # Defaults: writes_on_read=False, manual_review_required=True.
    writes_on_read: bool = False
    manual_review_required: bool = True
    returned: int
    items: list[KnowledgeStaleOwnerReviewBurnDownItem] = Field(default_factory=list)
    generated_at: datetime
class KnowledgeStaleOwnerReviewCompletionQueueItem(BaseModel):
    # One owner-review dispatch as listed in the completion queue, including
    # its readiness and what is still blocking completion.
    dispatch_id: str
    governance_event_id: str
    entry_id: str
    project_id: str
    title: str
    dispatch_status: str
    workflow_stage: str
    # Restricted to the four literal readiness states.
    readiness: Literal["ready", "blocked", "completed", "failed"]
    recommended_completion_outcome: Literal[
        "refresh_with_evidence",
        "archive",
        "supersede",
    ]
    next_action: str
    # Both lists default to empty.
    blockers: list[str] = Field(default_factory=list)
    required_owner_fields: list[str] = Field(default_factory=list)
    # Required booleans (no defaults).
    can_preview: bool
    can_confirm_after_preview: bool
    writes_km_on_confirm: bool
    owner: str | None = None
    owner_note: str | None = None
    batch_governance_event_id: str | None = None
    batch_dispatch_id: str | None = None
    priority_tier: Literal["P0", "P1", "P2"]
    priority_score: int
    recommended_action: Literal[
        "refresh_with_evidence",
        "owner_review",
        "archive_or_supersede",
    ]
    stale_days: int
    view_count: int
    correlation_sources: list[str] = Field(default_factory=list)
    reasons: list[str] = Field(default_factory=list)
    related_incident_id: str | None = None
    related_playbook_id: str | None = None
    related_approval_id: str | None = None
    dry_run_plan_fingerprint: str | None = None
    # Optional timestamps; all default to None.
    queued_at: datetime | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None
class KnowledgeStaleOwnerReviewCompletionQueueResponse(BaseModel):
    # Envelope around a list of KnowledgeStaleOwnerReviewCompletionQueueItem.
    # NOTE(review): status_bucket .. can_preview look like an echo of the
    # filters applied to the listing — confirm against the endpoint.
    # schema_version defaults to "km_stale_owner_review_completion_queue_v1".
    schema_version: str = "km_stale_owner_review_completion_queue_v1"
    project_id: str
    status_bucket: Literal["all", "ready", "blocked", "completed", "failed", "pending"]
    priority_tiers: list[str] = Field(default_factory=list)
    # Defaults to "all".
    recommended_completion_outcome: Literal[
        "all",
        "refresh_with_evidence",
        "archive",
        "supersede",
    ] = "all"
    batch_governance_event_id: str | None = None
    can_preview: bool | None = None
    total: int
    returned: int
    # Per-bucket counters; all required.
    pending_count: int
    ready_count: int
    blocked_count: int
    completed_count: int
    failed_count: int
    # Defaults: writes_on_read=False, manual_review_required=True,
    # batch_writes_allowed=False.
    writes_on_read: bool = False
    manual_review_required: bool = True
    batch_writes_allowed: bool = False
    items: list[KnowledgeStaleOwnerReviewCompletionQueueItem] = Field(default_factory=list)
    generated_at: datetime
class KnowledgeStaleOwnerReviewCompletionBatchPreviewRequest(BaseModel):
    # Request body for a batch preview; every field has a default.
    project_id: str = Field(default="awoooi", min_length=1, max_length=64)
    # Defaults to "ready".
    status_bucket: Literal["all", "ready", "blocked", "completed", "failed", "pending"] = "ready"
    # Defaults to all three tiers; must contain between 1 and 3 tiers.
    priority_tiers: list[Literal["P0", "P1", "P2"]] = Field(
        default_factory=lambda: ["P0", "P1", "P2"],
        min_length=1,
        max_length=3,
    )
    # Defaults to "all".
    recommended_completion_outcome: Literal[
        "all",
        "refresh_with_evidence",
        "archive",
        "supersede",
    ] = "all"
    batch_governance_event_id: str | None = Field(default=None, max_length=120)
    # Defaults to 10; accepted range is 1..30 inclusive.
    limit: int = Field(default=10, ge=1, le=30)
    owner: str = Field(default="operator_console", min_length=1, max_length=100)
    owner_note: str | None = Field(default=None, max_length=240)
class KnowledgeStaleOwnerReviewCompletionBatchPreviewResponse(BaseModel):
    # Result payload paired with the batch preview request.
    # schema_version defaults to "km_stale_owner_review_completion_batch_preview_v1".
    schema_version: str = "km_stale_owner_review_completion_batch_preview_v1"
    project_id: str
    # Only "dry_run" is a valid value (and the default).
    status: Literal["dry_run"] = "dry_run"
    owner: str
    owner_note: str | None = None
    status_bucket: Literal["all", "ready", "blocked", "completed", "failed", "pending"]
    priority_tiers: list[str]
    recommended_completion_outcome: Literal[
        "all",
        "refresh_with_evidence",
        "archive",
        "supersede",
    ]
    batch_governance_event_id: str | None = None
    requested_limit: int
    candidate_count: int
    previewable_count: int
    blocked_count: int
    completed_count: int
    failed_count: int
    # All three write flags default to False; manual review defaults to True.
    writes_km: bool = False
    writes_governance_audit: bool = False
    batch_writes_allowed: bool = False
    manual_review_required: bool = True
    # Required here, unlike the optional fingerprint on the other responses.
    dry_run_plan_fingerprint: str
    next_action: str = "preview_each_ready_item_then_confirm_single_item"
    items: list[KnowledgeStaleOwnerReviewCompletionQueueItem] = Field(default_factory=list)
    generated_at: datetime
class KnowledgeStaleOwnerReviewCompleteRequest(BaseModel):
    # Request body for completing an owner review; review_outcome is the only
    # field without a default.
    # Optional, at most 120 characters; see the field description.
    dispatch_id: str | None = Field(
        default=None,
        max_length=120,
        description="Owner-review dispatch id. Optional when the backend can resolve the active item by entry id.",
    )
    owner: str = Field(default="operator_console", min_length=1, max_length=100)
    # Both flags default to False.
    owner_approved: bool = False
    dry_run: bool = False
    # Required; restricted to the three literal outcomes.
    review_outcome: Literal[
        "refresh_with_evidence",
        "archive",
        "supersede",
    ]
    owner_note: str | None = Field(default=None, max_length=500)
    # Optional; when given they must be non-empty (title at most 255 characters).
    updated_title: str | None = Field(default=None, min_length=1, max_length=255)
    updated_content: str | None = Field(default=None, min_length=1)
    superseded_by_entry_id: str | None = Field(default=None, max_length=120)
    # Optional, at most 80 characters; see the field description.
    dry_run_plan_fingerprint: str | None = Field(
        default=None,
        max_length=80,
        description="Dry-run response fingerprint that must be echoed before a write.",
    )
class KnowledgeStaleOwnerReviewCompleteResponse(BaseModel):
    # Result payload paired with KnowledgeStaleOwnerReviewCompleteRequest.
    # schema_version defaults to "km_stale_owner_review_complete_v1".
    schema_version: str = "km_stale_owner_review_complete_v1"
    entry_id: str
    project_id: str
    # Restricted to the three literal outcomes.
    status: Literal["dry_run", "completed", "already_completed"]
    review_outcome: Literal[
        "refresh_with_evidence",
        "archive",
        "supersede",
    ]
    # Both ids are required on this response.
    governance_event_id: str
    dispatch_id: str
    audit_dispatch_id: str | None = None
    stale_ratio_recheck_dispatch_id: str | None = None
    workflow_stage: str
    owner: str
    owner_approved: bool
    dry_run: bool
    # Both write flags are required (no defaults).
    writes_km: bool
    writes_governance_audit: bool
    stale_ratio_snapshot: KnowledgeReviewDraftStaleRatioSnapshot | None = None
    dry_run_plan_fingerprint: str | None = None
    # Defaults to "stale_ratio_recheck".
    next_action: str = "stale_ratio_recheck"
    generated_at: datetime
# =============================================================================
# Endpoint 3: summary
# =============================================================================

View File

@@ -39,15 +39,14 @@ import hashlib
import json
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from datetime import datetime, timezone
from typing import Any
from uuid import UUID
import structlog
from sqlalchemy import select
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
from src.core.redis_client import get_redis
from src.db.awooop_models import (
AwoooPActiveRevision,
AwoooPMcpGatewayAudit,
@@ -278,7 +277,7 @@ class McpGateway:
self, ctx: GatewayContext, gate_result: GateCheckResult
) -> tuple[AwoooPMcpToolRegistry, AwoooPMcpGrant]:
"""Gate 3tool 在白名單 + grant 有效(未到期、未撤銷)"""
now = datetime.now(UTC)
now = datetime.now(timezone.utc)
# 查 tool registry
tool_result = await self._db.execute(
@@ -360,9 +359,14 @@ class McpGateway:
raise GateApprovalError("write/admin 操作需要 run_idapproval 追蹤用)")
try:
redis = get_redis()
import aioredis
from src.core.config import settings
redis = aioredis.from_url(settings.REDIS_URL)
approval_key = f"mcp_approval:{ctx.project_id}:{ctx.agent_id}:{ctx.tool_name}:{ctx.run_id}"
approved = await redis.get(approval_key)
await redis.aclose()
except Exception as exc:
logger.warning(
"mcp_gate5_redis_error",
@@ -388,7 +392,10 @@ class McpGateway:
parameters: dict[str, Any],
) -> MCPToolResult:
"""呼叫底層 MCP provider 執行工具"""
provider = await self._resolve_provider(ctx, tool_row)
registry = get_provider_registry()
provider = registry.get(ctx.tool_name) or registry.get(
tool_row.tool_name if tool_row else ctx.tool_name
)
# 找不到 provider → 回傳 shadow no-op
if provider is None:
@@ -404,57 +411,14 @@ class McpGateway:
)
audit_params = dict(parameters)
existing_audit = (
parameters.get("_mcp_audit")
if isinstance(parameters, dict) and isinstance(parameters.get("_mcp_audit"), dict)
else {}
)
audit_params["_mcp_audit"] = {
"project_id": ctx.project_id,
"agent_id": ctx.agent_id,
"run_id": str(ctx.run_id) if ctx.run_id else None,
"trace_id": ctx.trace_id,
"incident_id": existing_audit.get("incident_id") or ctx.trace_id,
"session_id": existing_audit.get("session_id"),
"flywheel_node": existing_audit.get("flywheel_node"),
"agent_role": existing_audit.get("agent_role") or ctx.agent_id,
"gateway_path": "awooop_mcp_gateway",
}
return await provider.execute(ctx.tool_name, audit_params)
async def _resolve_provider(
    self,
    ctx: GatewayContext,
    tool_row: AwoooPMcpToolRegistry | None,
):
    """Find the provider that owns ctx.tool_name.

    ProviderRegistry is keyed by provider name (`kubernetes`, `ssh_host`, ...),
    while GatewayContext intentionally uses the governed tool name
    (`kubectl_get`, `ssh_diagnose`, ...). Scan provider tool manifests as the
    compatibility bridge until registry exposes a first-class tool index.

    Args:
        ctx: Gateway context; only ``ctx.tool_name`` is read here.
        tool_row: Registry row for the tool, or None. When present, its
            ``tool_name`` is the name matched against provider manifests.

    Returns:
        The provider registered directly under ``ctx.tool_name``, otherwise
        the first provider whose ``list_tools()`` contains the lookup name,
        otherwise None.
    """
    registry = get_provider_registry()
    # Fast path: a provider registered under exactly the tool name.
    direct = registry.get(ctx.tool_name)
    if direct is not None:
        return direct
    lookup_name = tool_row.tool_name if tool_row else ctx.tool_name
    for provider in registry.all():
        try:
            tools = await provider.list_tools()
        except Exception as exc:
            # A provider whose manifest cannot be listed is skipped rather
            # than failing the whole lookup; logged at debug level only.
            logger.debug(
                "mcp_gateway_provider_manifest_skipped",
                provider=getattr(provider, "name", None),
                tool_name=lookup_name,
                error=str(exc),
            )
            continue
        if any(tool.name == lookup_name for tool in tools):
            return provider
    return None
# ── Audit log ─────────────────────────────────────────────────────────────
async def _write_audit(
@@ -482,15 +446,6 @@ class McpGateway:
json.dumps(result.output, sort_keys=True, default=str).encode()
).hexdigest()
gate_payload = {
**gate_result.as_dict(),
"schema_version": "awooop_mcp_gateway_audit_v1",
"gateway_path": "awooop_mcp_gateway",
"policy_enforced": True,
"is_shadow": ctx.is_shadow,
"required_scope": ctx.required_scope,
}
audit = AwoooPMcpGatewayAudit(
project_id=ctx.project_id,
run_id=ctx.run_id,
@@ -500,15 +455,16 @@ class McpGateway:
tool_name=ctx.tool_name,
input_hash=input_hash,
output_hash=output_hash,
gate_result=gate_payload,
gate_result=gate_result.as_dict(),
result_status=result_status,
block_gate=block_gate,
block_reason=block_reason,
latency_ms=latency_ms,
)
self._db.add(audit)
await self._db.flush()
if tool_row is not None:
self._db.add(audit)
await self._db.flush()
except Exception as exc:
logger.warning(
"mcp_gateway_audit_write_failed",

View File

@@ -14,7 +14,6 @@ from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from uuid import uuid4
from src.utils.timezone import now_taipei
@@ -30,9 +29,7 @@ class MCPTool:
name: str
description: str
input_schema: dict[str, Any]
# 2026-05-06 Codex: 部分舊 provider 的 list_tools() 尚未傳 server_name
# 先給 DTO 預設值registry 會以 provider.name 補正,避免啟動登記直接 crash。
server_name: str = ""
server_name: str
@dataclass
@@ -44,21 +41,12 @@ class MCPToolResult:
"""
success: bool
execution_id: str = ""
execution_id: str
output: Any | None = None
# 2026-05-06 Codex: 舊 provider 曾使用 data=... 作為成功輸出欄位。
# 保留 alias避免 provider 成功路徑因 dataclass 參數不相容而 crash。
data: Any | None = None
error: str | None = None
duration: float = 0.0
timestamp: datetime = field(default_factory=now_taipei)
def __post_init__(self) -> None:
    # Fill in defaults after dataclass construction.
    # An empty execution_id is replaced with a generated "mcp-<uuid4>" id.
    if not self.execution_id:
        self.execution_id = f"mcp-{uuid4()}"
    # Mirror the `data` field into `output` when only `data` was supplied.
    if self.output is None and self.data is not None:
        self.output = self.data
def to_dict(self) -> dict:
return {
"success": self.success,

View File

@@ -24,7 +24,6 @@ from typing import Any
import httpx
from src.core.config import settings # P0-13: K8s namespace 由 settings.AWOOOI_K8S_NAMESPACE 提供
from src.services.mcp_audit_context import with_mcp_audit_context
from src.utils.timezone import now_taipei
logger = logging.getLogger(__name__)
@@ -519,13 +518,6 @@ class MCPBridge:
raise ValueError(f"Unknown MCP Server: {server_name}")
server = self._servers[server_name]
parameters = with_mcp_audit_context(
parameters,
session_id=f"mcp_bridge:{execution_id}",
flywheel_node="govern",
agent_role="mcp_bridge",
gateway_path="legacy_mcp_bridge",
)
result = await self._execute_tool(server, tool_name, parameters)
# ========================================

View File

@@ -41,7 +41,6 @@ SSH 連線:
@see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2a
"""
import logging
import re
import uuid
from datetime import UTC, datetime
@@ -52,7 +51,6 @@ import structlog
from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult
logger = structlog.get_logger(__name__)
_asyncssh_logger_configured = False
# =============================================================================
# 安全常數
@@ -60,17 +58,10 @@ _asyncssh_logger_configured = False
SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
SSH_USER = "wooo"
SSH_PORT = 22
DEFAULT_HOST_USERS = {
# AI/Web host is operated by the ollama account in the current topology.
"192.168.0.188": "ollama",
}
SHORT_HOST_MAP = {
"110": "192.168.0.110",
"120": "192.168.0.120",
"121": "192.168.0.121",
"188": "192.168.0.188",
}
DIAG_TIMEOUT = 10 # 診斷類超時(秒)
OP_TIMEOUT = 60 # 操作類超時(秒)
@@ -113,47 +104,6 @@ def _validate_param(key: str, value: str) -> str:
# tail / port / lines 由呼叫方 int() 轉換,不需字串白名單
return value
def _normalize_ssh_host(value: str) -> str:
"""
Normalize host labels before they enter asyncssh.
Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the
exporter port, not SSH. The SSH provider must connect to the host on the
platform SSH port, otherwise asyncssh can receive a stringly port from
config/labels and fail with ``%d format`` before the tool even runs.
"""
host = (value or "").strip()
if host.startswith("ssh://"):
host = host.removeprefix("ssh://")
if "@" in host:
host = host.rsplit("@", 1)[1]
if host.startswith("[") and "]" in host:
return host[1:host.index("]")]
if host.count(":") == 1:
maybe_host, maybe_port = host.rsplit(":", 1)
if maybe_port.isdigit():
host = maybe_host
if host in SHORT_HOST_MAP:
return SHORT_HOST_MAP[host]
return host
def _quiet_asyncssh_info_logs() -> None:
    """Keep third-party asyncssh INFO logs from breaking stdlib %-format logging.

    Some target SSH servers send exit status as a string. AsyncSSH then emits an
    INFO log with ``%d`` and that string argument before our code sees the
    result, which produces noisy ``TypeError: %d format`` tracebacks. The tool
    result itself is still available, so production should keep asyncssh at
    WARNING and rely on our structured MCP audit logs.

    The level is set once per process; later calls return immediately.
    """
    global _asyncssh_logger_configured
    if _asyncssh_logger_configured:
        return
    logging.getLogger("asyncssh").setLevel(logging.WARNING)
    _asyncssh_logger_configured = True
# 群組 A只讀
GROUP_A_TOOLS = {
"ssh_diagnose",
@@ -248,10 +198,6 @@ class SSHProvider(MCPToolProvider):
),
input_schema={"type": "object", "properties": {
"host": {"type": "string", "description": "Target host IP"},
"container_name": {
"type": "string",
"description": "Optional Docker container name for container-focused diagnostics",
},
}, "required": ["host"]},
server_name=self.name,
),
@@ -429,7 +375,7 @@ class SSHProvider(MCPToolProvider):
error=f"Unknown tool: {tool_name}",
)
host = _normalize_ssh_host(str(parameters.get("host", "")))
host = parameters.get("host", "")
# 守衛 2: 允許的 host
if host not in self._allowed_hosts():
@@ -554,23 +500,12 @@ class SSHProvider(MCPToolProvider):
# 所有接受用戶字串的工具,必須先通過 _validate_param() 白名單驗證
if tool_name == "ssh_diagnose":
# 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀,不修改任何狀態
command = (
return (
"echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && "
"echo '=== MEMORY ===' && free -h && "
"echo '=== DISK ===' && df -h && "
"echo '=== LOAD ===' && uptime"
)
container_name = params.get("container_name")
if container_name:
name = _validate_param("container_name", str(container_name))
command = (
f"{command} && "
f"echo '=== DOCKER STATS {name} ===' && "
f"docker stats --no-stream {name} 2>&1 && "
f"echo '=== DOCKER INSPECT {name} ===' && "
f"docker inspect {name} 2>&1 | head -80"
)
return command
if tool_name == "ssh_get_top_processes":
return "ps aux --sort=-%cpu | head -15"
@@ -669,9 +604,7 @@ class SSHProvider(MCPToolProvider):
raise RuntimeError(
"asyncssh is not installed. "
"Add 'asyncssh' to pyproject.toml dependencies."
) from None
_quiet_asyncssh_info_logs()
)
import os
if not os.path.exists(SSH_KEY_PATH):
@@ -692,13 +625,11 @@ class SSHProvider(MCPToolProvider):
async with asyncssh.connect(
host,
port=SSH_PORT,
username=username or SSH_USER,
client_keys=[SSH_KEY_PATH],
known_hosts=known_hosts_path, # None = 跳過驗證(內網),或指定文件路徑
config=None, # 禁止讀取使用者 ssh config避免 Port 字串污染 asyncssh
connect_timeout=float(timeout),
connect_timeout=timeout,
) as conn:
# Bug 根因asyncssh 模組沒有頂層 run();應呼叫 conn.run()2026-04-24 Claude Sonnet 4.6
result = await conn.run(cmd, timeout=float(timeout), check=False)
result = await conn.run(cmd, timeout=timeout, check=False)
return (result.stdout or ""), (result.stderr or "")

View File

@@ -106,8 +106,6 @@ def _record_to_request(record: ApprovalRecord) -> ApprovalRequest:
# B4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要)
incident_id=getattr(record, "incident_id", None),
matched_playbook_id=getattr(record, "matched_playbook_id", None),
telegram_message_id=getattr(record, "telegram_message_id", None),
telegram_chat_id=getattr(record, "telegram_chat_id", None),
)

View File

@@ -18,14 +18,7 @@ import structlog
from sqlalchemy import text
from src.db.base import get_db_context
from src.models.drift import (
DriftIntent,
DriftInterpretation,
DriftItem,
DriftLevel,
DriftReport,
DriftStatus,
)
from src.models.drift import DriftInterpretation, DriftIntent, DriftItem, DriftLevel, DriftReport, DriftStatus
logger = structlog.get_logger(__name__)
@@ -174,40 +167,6 @@ class DriftReportRepository:
{"report_id": report_id, "narrative": narrative},
)
async def get_repeat_state(
    self,
    report: DriftReport,
    *,
    include_values: bool = True,
) -> dict:
    """Return stable fingerprint repeat state for a drift report.

    Loads up to 200 drift reports for ``report.namespace`` created within the
    last 24 hours (newest ``scanned_at`` first) and delegates the actual
    comparison to ``build_drift_repeat_state``.

    Args:
        report: The drift report whose repeat state is wanted.
        include_values: Forwarded unchanged to ``build_drift_repeat_state``.

    Returns:
        Whatever ``build_drift_repeat_state`` produces for the report and the
        recent rows.
    """
    # Imported lazily, as in the original block.
    from src.services.drift_repeat_state import build_drift_repeat_state

    async with get_db_context() as db:
        result = await db.execute(
            text("""
                SELECT
                    report_id,
                    namespace,
                    status,
                    scanned_at,
                    created_at,
                    items
                FROM drift_reports
                WHERE namespace = :namespace
                  AND created_at > now() - interval '24 hours'
                ORDER BY scanned_at DESC
                LIMIT 200
            """),
            {"namespace": report.namespace},
        )
        rows = [dict(row) for row in result.mappings().all()]

    return build_drift_repeat_state(
        report,
        rows,
        include_values=include_values,
    )
_drift_repo: DriftReportRepository | None = None

View File

@@ -356,75 +356,6 @@ async def list_pending(
return list(result.scalars().all())
async def list_pending_by_executor(
    executor_type: str,
    *,
    limit: int = 50,
) -> list[GovernanceRemediationDispatch]:
    """List pending dispatches for one executor, oldest ``dispatched_at`` first.

    Used by Hermes and other workers to consume their own work items. The query
    lives in the repository layer so jobs do not scatter table names and status
    conditions across the codebase.

    Args:
        executor_type: Value of ``dispatch.executor_type``, for example
            ``hermes_kb_growth_healthcheck``.
        limit: Maximum number of rows fetched in this round, so a backlog
            cannot overwhelm the worker in a single pass.

    Returns:
        Pending dispatches ordered oldest first.
    """
    async with get_db_context() as db:
        result = await db.execute(
            select(GovernanceRemediationDispatch)
            .where(GovernanceRemediationDispatch.dispatch_status == "pending")
            .where(GovernanceRemediationDispatch.executor_type == executor_type)
            .order_by(GovernanceRemediationDispatch.dispatched_at.asc())
            .limit(limit)
        )
        return list(result.scalars().all())
async def update_decision_context(
    dispatch_id: str,
    decision_context: dict[str, Any],
) -> GovernanceRemediationDispatch:
    """Replace a dispatch's ``decision_context`` on the existing row.

    The same row is updated in place so its audit trail is kept. This only
    changes the read-model context of the dispatch work item: it does not
    modify the immutable ``ai_governance_events`` and does not imply that the
    governance event has been resolved.

    Args:
        dispatch_id: ``governance_remediation_dispatch.id``.
        decision_context: The new JSONB context.

    Returns:
        The refreshed ``GovernanceRemediationDispatch`` ORM object.

    Raises:
        DispatchNotFound: No row exists for ``dispatch_id``.
    """
    async with get_db_context() as db:
        result = await db.execute(
            select(GovernanceRemediationDispatch)
            .where(GovernanceRemediationDispatch.id == dispatch_id)
        )
        row = result.scalar_one_or_none()
        if row is None:
            raise DispatchNotFound(f"dispatch_id={dispatch_id!r} 不存在")

        row.decision_context = decision_context
        await db.flush()
        await db.refresh(row)

        logger.info(
            "dispatch_decision_context_updated",
            dispatch_id=dispatch_id,
            event_id=row.governance_event_id,
            executor_type=row.executor_type,
        )
        return row
async def list_by_event(
event_id: str,
) -> list[GovernanceRemediationDispatch]:

View File

@@ -19,12 +19,7 @@ from sqlalchemy import select
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import (
Incident,
IncidentFrequencyStats,
IncidentStatus,
Severity,
)
from src.models.incident import Incident, IncidentFrequencyStats, IncidentStatus, Severity
from src.repositories.interfaces import IIncidentRepository
logger = structlog.get_logger(__name__)
@@ -46,8 +41,8 @@ def _record_to_incident(record: IncidentRecord) -> Incident:
return Incident(
incident_id=record.incident_id,
status=IncidentStatus(_normalize_status(record.status)),
severity=Severity(_normalize_severity(record.severity)),
status=IncidentStatus(record.status),
severity=Severity(record.severity),
signals=record.signals or [],
affected_services=record.affected_services or [],
proposal_ids=record.proposal_ids or [],
@@ -98,36 +93,6 @@ def _incident_to_record_data(incident: Incident) -> dict[str, Any]:
}
def _normalize_status(value: str | IncidentStatus) -> str:
    """Map a stored status onto an ``IncidentStatus`` value string.

    Accepts an enum member, an enum member *name*, or a free-form string.
    Free-form strings are trimmed and lower-cased; the legacy value ``"open"``
    is mapped to ``IncidentStatus.INVESTIGATING``. Anything else is returned
    normalized but unvalidated, so the caller's enum constructor decides
    whether it is acceptable.
    """
    if isinstance(value, IncidentStatus):
        return value.value

    raw = str(value)
    # Rows may hold the member name rather than its value.
    if raw in IncidentStatus.__members__:
        return IncidentStatus[raw].value

    normalized = raw.strip().lower()
    if normalized == "open":
        return IncidentStatus.INVESTIGATING.value
    return normalized
def _normalize_severity(value: str | Severity) -> str:
    """Map a stored severity onto a ``Severity`` value string.

    Accepts an enum member, an enum member *name*, or a legacy word such as
    ``critical`` / ``warning`` / ``info`` (matched after trimming and
    lower-casing). Unknown input is returned unchanged, so the caller's enum
    constructor decides whether it is acceptable.
    """
    if isinstance(value, Severity):
        return value.value

    raw = str(value)
    # Rows may hold the member name rather than its value.
    if raw in Severity.__members__:
        return Severity[raw].value

    legacy_map = {
        "critical": Severity.P0.value,
        "high": Severity.P1.value,
        "warning": Severity.P2.value,
        "medium": Severity.P2.value,
        "info": Severity.P3.value,
        "low": Severity.P3.value,
        "none": Severity.P3.value,
    }
    return legacy_map.get(raw.strip().lower(), raw)
# =============================================================================
# IncidentDBRepository
# =============================================================================
@@ -171,8 +136,8 @@ class IncidentDBRepository(IIncidentRepository):
async def get_active(self) -> list[Incident]:
"""取得所有活躍的 Incident"""
active_statuses = [
IncidentStatus.INVESTIGATING,
IncidentStatus.MITIGATING,
IncidentStatus.INVESTIGATING.value,
IncidentStatus.MITIGATING.value,
]
async with get_db_context() as db:
result = await db.execute(

View File

@@ -190,7 +190,7 @@ class KnowledgeDBRepository:
count_query = count_query.where(KnowledgeEntryRecord.status == status)
if tags:
for tag in tags:
tag_filter = _json_string_array_has_tag(tag)
tag_filter = KnowledgeEntryRecord.tags.op('@>')(f'["{tag}"]')
query = query.where(tag_filter)
count_query = count_query.where(tag_filter)
if q:
@@ -347,18 +347,3 @@ class KnowledgeDBRepository:
created_at=record.created_at,
updated_at=record.updated_at,
)
def _json_string_array_has_tag(tag: str):
    """Build a tag filter that works for both JSON and JSONB columns.

    In production ``knowledge_entries.tags`` is currently a JSON column, which
    does not support ``json @> text``. The column is therefore cast to a string
    and matched against the tag wrapped in double quotes, so a fragment of a
    tag is not mistaken for a complete tag.

    LIKE wildcards in ``tag`` are escaped with a backslash. The comparison uses
    ``ilike``, so it is case-insensitive.
    """
    # The backslash must be escaped first, otherwise the escapes added for
    # "%" and "_" would themselves be doubled.
    escaped = (
        tag
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    return KnowledgeEntryRecord.tags.cast(String).ilike(f'%"{escaped}"%', escape="\\")

View File

@@ -19,11 +19,10 @@ router = APIRouter()
logger = logging.getLogger(__name__)
# ==================== Ollama Config ====================
# 2026-05-19 Codex: agent thinking stream follows GCP-A → GCP-B → 111.
def _get_ollama_endpoints():
    """Return the Ollama endpoints the resolver yields for the "interactive" profile."""
    # Imported lazily, as in the original block.
    from src.services.ollama_endpoint_resolver import resolve_ollama_order

    return resolve_ollama_order("interactive")
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111
def _get_ollama_base_url() -> str:
    """Return the Ollama base URL from application settings (``OLLAMA_URL``)."""
    # Imported lazily, as in the original block.
    from src.core.config import get_settings

    return get_settings().OLLAMA_URL
OLLAMA_MODEL = "llama3.2:latest" # 可根據實際部署調整
OLLAMA_TIMEOUT = 120.0 # 串流超時
@@ -113,82 +112,66 @@ async def get_agent_thinking(
# 1. 開始思考
yield f"data: {json.dumps({'type': 'thinking', 'content': '正在連接 AI 模型...'}, ensure_ascii=False)}\n\n"
last_error = ""
async with httpx.AsyncClient(timeout=OLLAMA_TIMEOUT) as client:
# 2. 發送請求到 Ollama
yield f"data: {json.dumps({'type': 'thinking', 'content': f'模型: {model}'}, ensure_ascii=False)}\n\n"
try:
async with httpx.AsyncClient(timeout=OLLAMA_TIMEOUT) as client:
# 2. 發送請求到 Ollama
yield f"data: {json.dumps({'type': 'thinking', 'content': f'模型: {model}'}, ensure_ascii=False)}\n\n"
for endpoint in _get_ollama_endpoints():
if not endpoint.url:
continue
try:
async with client.stream(
"POST",
f"{endpoint.url}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": True,
},
) as response:
if response.status_code != 200:
last_error = f"HTTP {response.status_code}"
logger.warning(
"agent_thinking_ollama_http_error",
provider=endpoint.provider_name,
status=response.status_code,
)
async with client.stream(
"POST",
f"{_get_ollama_base_url()}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": True,
},
) as response:
if response.status_code != 200:
yield f"data: {json.dumps({'type': 'error', 'content': f'Ollama 錯誤: HTTP {response.status_code}'}, ensure_ascii=False)}\n\n"
yield "data: [DONE]\n\n"
return
yield f"data: {json.dumps({'type': 'thinking', 'content': '開始接收 AI 回應...'}, ensure_ascii=False)}\n\n"
# 3. 串流讀取 Ollama 回應
buffer = ""
async for line in response.aiter_lines():
if not line:
continue
yield f"data: {json.dumps({'type': 'thinking', 'content': '開始接收 AI 回應...'}, ensure_ascii=False)}\n\n"
try:
chunk = json.loads(line)
token = chunk.get("response", "")
done = chunk.get("done", False)
# 3. 串流讀取 Ollama 回應
buffer = ""
async for line in response.aiter_lines():
if not line:
continue
if token:
# 累積 token每 10 字符或遇到標點符號時發送
buffer += token
if len(buffer) >= 10 or any(p in buffer for p in "。!?,、\n"):
yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n"
buffer = ""
try:
chunk = json.loads(line)
token = chunk.get("response", "")
done = chunk.get("done", False)
if done:
# 發送剩餘 buffer
if buffer:
yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n"
# 發送完成訊息
yield f"data: {json.dumps({'type': 'result', 'content': '分析完成'}, ensure_ascii=False)}\n\n"
break
if token:
# 累積 token每 10 字符或遇到標點符號時發送
buffer += token
if len(buffer) >= 10 or any(p in buffer for p in "。!?,、\n"):
yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n"
buffer = ""
except json.JSONDecodeError as e:
logger.warning(f"JSON 解析失敗: {line[:100]}... - {e}")
continue
if done:
# 發送剩餘 buffer
if buffer:
yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n"
# 發送完成訊息
yield f"data: {json.dumps({'type': 'result', 'content': '分析完成'}, ensure_ascii=False)}\n\n"
yield "data: [DONE]\n\n"
return
except json.JSONDecodeError as e:
logger.warning(f"JSON 解析失敗: {line[:100]}... - {e}")
continue
except (httpx.ConnectError, httpx.TimeoutException) as e:
last_error = type(e).__name__
logger.error(
"agent_thinking_ollama_endpoint_failed",
provider=endpoint.provider_name,
error=str(e),
)
except Exception as e:
last_error = str(e)
logger.error(
"agent_thinking_unknown_error",
provider=endpoint.provider_name,
error=str(e),
)
error_content = f"Ollama 全端點不可用: {last_error or 'unknown'}"
yield f"data: {json.dumps({'type': 'error', 'content': error_content}, ensure_ascii=False)}\n\n"
except httpx.ConnectError as e:
logger.error(f"無法連接 Ollama: {e}")
yield f"data: {json.dumps({'type': 'error', 'content': f'無法連接 Ollama ({_get_ollama_base_url()})'}, ensure_ascii=False)}\n\n"
except httpx.TimeoutException as e:
logger.error(f"Ollama 超時: {e}")
yield f"data: {json.dumps({'type': 'error', 'content': '請求超時'}, ensure_ascii=False)}\n\n"
except Exception as e:
logger.error(f"未知錯誤: {e}")
yield f"data: {json.dumps({'type': 'error', 'content': f'未知錯誤: {str(e)}'}, ensure_ascii=False)}\n\n"
# 4. 結束標記
yield "data: [DONE]\n\n"

View File

@@ -1,606 +0,0 @@
"""
ADR-100 Remediation Service
===========================
Safe operator entrypoints for verification remediation work items.
T25: remediation queue items are now actionable without mutating incident state:
- preview: show the selected guardrail path
- dry-run: collect read-only current state and validate supported executor routing
"""
from __future__ import annotations
import asyncio
from typing import Any, Literal, Protocol
import structlog
from src.models.incident import Incident
from src.repositories.incident_repository import IncidentDBRepository
from src.services.adr100_slo_status_service import (
Adr100SloStatusService,
get_adr100_slo_status_service,
)
from src.services.auto_repair_service import AutoRepairService
from src.services.post_execution_verifier import (
PostExecutionVerifier,
_assess_recovery,
_build_prometheus_query,
get_post_execution_verifier,
)
logger = structlog.get_logger(__name__)
RemediationMode = Literal["auto", "reverify", "replay"]
_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
class RemediationNotFoundError(LookupError):
    """Requested ADR-100 remediation work item is not in the current read model."""
class _IncidentRepository(Protocol):
    """Structural type for the single incident lookup this module relies on."""

    async def get_by_id(self, incident_id: str) -> Incident | None:
        """Return the incident for ``incident_id``, or ``None`` when absent."""
        ...
class Adr100RemediationService:
    """Read-only remediation preview and dry-run service.

    Neither entry point updates incident or auto-repair rows. ``dry_run`` does,
    however, append an audit trail (an alert-operation-log record and a
    timeline event) unless the service was built with ``record_history=False``.
    """

    def __init__(
        self,
        *,
        slo_service: Adr100SloStatusService | None = None,
        incident_repository: _IncidentRepository | None = None,
        auto_repair_service: AutoRepairService | None = None,
        verifier: PostExecutionVerifier | None = None,
        timeline_service: Any | None = None,
        alert_operation_log_repository: Any | None = None,
        record_history: bool = True,
    ) -> None:
        """Wire collaborators, falling back to the default implementations.

        ``timeline_service`` and ``alert_operation_log_repository`` are left as
        ``None`` when not injected and are resolved lazily on first use.
        """
        self._slo_service = slo_service or get_adr100_slo_status_service()
        self._incident_repository = incident_repository or IncidentDBRepository()
        self._auto_repair_service = auto_repair_service or AutoRepairService()
        self._verifier = verifier or get_post_execution_verifier()
        self._timeline_service = timeline_service
        self._alert_operation_log_repository = alert_operation_log_repository
        self._record_history_enabled = record_history

    async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
        """Return the safe execution plan for a remediation queue item.

        Raises:
            RemediationNotFoundError: ``work_item_id`` is not in the queue.
        """
        item = await self._find_work_item(work_item_id)
        selected_mode = _select_mode(item, mode)
        checks = _base_checks(item)
        allowed = all(check["passed"] for check in checks)
        return {
            "schema_version": "adr100_remediation_preview_v1",
            "work_item_id": item.get("work_item_id"),
            "incident_id": item.get("incident_id"),
            "auto_repair_id": item.get("auto_repair_id"),
            "mode": selected_mode,
            "allowed": allowed,
            "safety_level": "read_only",
            "writes_incident_state": False,
            "writes_auto_repair_result": False,
            "checks": checks,
            "plan": _plan_for_item(item, selected_mode),
            "source": "adr100.verification_coverage.remediation_queue",
        }

    async def dry_run(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
        """Run a safe, read-only remediation dry-run for one queue item.

        When the incident cannot be loaded or any guardrail check fails, a
        "blocked" payload is returned (and recorded) without collecting state.

        Raises:
            RemediationNotFoundError: ``work_item_id`` is not in the queue.
        """
        item = await self._find_work_item(work_item_id)
        selected_mode = _select_mode(item, mode)
        checks = _base_checks(item)
        incident = await self._load_incident(item)
        checks.append({
            "name": "incident_loaded",
            "passed": incident is not None,
            "detail": item.get("incident_id") or "missing incident_id",
        })

        if incident is None or not all(check["passed"] for check in checks):
            payload = _dry_run_blocked_payload(item, selected_mode, checks)
            payload["history"] = await self._record_dry_run_history(item, payload)
            return payload

        if selected_mode == "replay":
            return await self._dry_run_replay(item, incident, checks)
        return await self._dry_run_reverify(item, incident, checks)

    async def history(
        self,
        *,
        limit: int = 50,
        incident_id: str | None = None,
        work_item_id: str | None = None,
    ) -> dict[str, Any]:
        """Return durable dry-run history written by this remediation service.

        Args:
            limit: Maximum number of items returned; clamped to 1..200.
            incident_id: Forwarded to the repository as a filter.
            work_item_id: Applied in memory against each record's context.
        """
        safe_limit = max(1, min(limit, 200))
        # Over-fetch because rows from other writers (different schema_version)
        # and other work items are filtered out afterwards.
        fetch_limit = min(max(safe_limit * 4, 50), 200)
        rows: list[Any] = []
        repo = self._resolve_alert_operation_log_repository()

        for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"):
            try:
                batch, _total = await repo.list_recent(
                    limit=fetch_limit,
                    event_type=event_type,
                    incident_id=incident_id,
                )
                rows.extend(batch)
            except Exception as exc:
                # Best effort: one failing event type must not hide the other.
                logger.warning(
                    "adr100_remediation_history_fetch_failed",
                    event_type=event_type,
                    incident_id=incident_id,
                    error=str(exc),
                )

        rows.sort(key=_record_created_at, reverse=True)

        items: list[dict[str, Any]] = []
        for row in rows:
            context = getattr(row, "context", None) or {}
            if context.get("schema_version") != "adr100_remediation_dry_run_history_v1":
                continue
            if work_item_id and context.get("work_item_id") != work_item_id:
                continue
            items.append(_history_item(row, context))
            if len(items) >= safe_limit:
                break

        return {
            "schema_version": "adr100_remediation_history_v1",
            "total": len(items),
            "limit": safe_limit,
            "filters": {
                "incident_id": incident_id,
                "work_item_id": work_item_id,
            },
            "items": items,
            "by_work_item": _summarize_history_by_work_item(items),
        }

    def _resolve_alert_operation_log_repository(self) -> Any:
        """Return the injected alert-operation-log repository or the default one."""
        repo = self._alert_operation_log_repository
        if repo is None:
            # Imported lazily, as in the original block.
            from src.repositories.alert_operation_log_repository import (
                get_alert_operation_log_repository,
            )

            repo = get_alert_operation_log_repository()
        return repo

    async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
        """Look up a queue item in the SLO report; return a shallow copy.

        Raises:
            RemediationNotFoundError: No queue item has ``work_item_id``.
        """
        report = await self._slo_service.fetch_report()
        coverage = report.get("verification_coverage") or {}
        queue = coverage.get("remediation_queue") or {}
        for item in queue.get("items") or []:
            if item.get("work_item_id") == work_item_id:
                return dict(item)
        raise RemediationNotFoundError(work_item_id)

    async def _load_incident(self, item: dict[str, Any]) -> Incident | None:
        """Load the incident referenced by ``item``; ``None`` when it has no id."""
        incident_id = str(item.get("incident_id") or "")
        if not incident_id:
            return None
        return await self._incident_repository.get_by_id(incident_id)

    async def _dry_run_reverify(
        self,
        item: dict[str, Any],
        incident: Incident,
        checks: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Collect current state, assess recovery, and record the result."""
        post_state = await self._collect_current_state(incident)
        action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}"
        # No pre-state exists for a dry-run, hence the explicit None.
        result = _assess_recovery(None, post_state, action_taken)
        payload = _dry_run_result_payload(
            item=item,
            mode="reverify",
            checks=checks,
            post_state=post_state,
            verification_result_preview=result,
            extra={
                "promql": _promql_for_incident(incident),
                "mcp_route": {
                    "agent_id": "post_execution_verifier",
                    "required_scope": "read",
                    "is_shadow": True,
                    "flywheel_node": "verify",
                },
            },
        )
        payload["history"] = await self._record_dry_run_history(item, payload)
        return payload

    async def _dry_run_replay(
        self,
        item: dict[str, Any],
        incident: Incident,
        checks: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Validate the executor route, collect current state, and record it.

        A missing route only marks the ``supported_executor_route`` check as
        failed; state is still collected so the payload stays informative.
        """
        diagnostic_command = _diagnostic_command_for_incident(incident)
        route = self._auto_repair_service.preview_read_only_ssh_mcp_route(
            incident,
            diagnostic_command,
        )
        checks.append({
            "name": "supported_executor_route",
            "passed": route is not None,
            "detail": "mcp:ssh_diagnose" if route else "missing host/container route",
        })

        post_state = await self._collect_current_state(incident)
        action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}"
        result = _assess_recovery(None, post_state, action_taken)
        payload = _dry_run_result_payload(
            item=item,
            mode="replay",
            checks=checks,
            post_state=post_state,
            verification_result_preview=result,
            extra={
                "diagnostic_command_preview": diagnostic_command,
                "mcp_route": route,
                "promql": _promql_for_incident(incident),
            },
        )
        payload["history"] = await self._record_dry_run_history(item, payload)
        return payload

    async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
        """Collect post-execution state with a 12 s cap; ``{}`` on any failure."""
        try:
            return await asyncio.wait_for(
                self._verifier._collect_post_state(incident),
                timeout=12.0,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "adr100_remediation_dry_run_timeout",
                incident_id=incident.incident_id,
            )
            return {}
        except Exception as exc:
            # Best effort: a dry-run must not fail because collection failed.
            logger.warning(
                "adr100_remediation_dry_run_collect_failed",
                incident_id=incident.incident_id,
                error=str(exc),
            )
            return {}

    async def _record_dry_run_history(
        self,
        item: dict[str, Any],
        payload: dict[str, Any],
    ) -> dict[str, Any]:
        """Persist the dry-run outcome to the operation log and the timeline.

        Both writes are independent and best-effort: a failure in one is
        logged and does not prevent the other. ``recorded`` is true when at
        least one of them produced an id.
        """
        if not self._record_history_enabled:
            return {"recorded": False, "reason": "disabled"}

        incident_id = str(item.get("incident_id") or "")
        if not incident_id:
            return {"recorded": False, "reason": "missing_incident_id"}

        history: dict[str, Any] = {
            "recorded": False,
            "alert_operation_id": None,
            "timeline_event_id": None,
        }
        context = _history_context(item, payload)
        allowed = bool(payload.get("allowed"))

        try:
            # Resolved inside the try so an import/lookup failure is also
            # treated as a non-fatal history failure.
            repo = self._resolve_alert_operation_log_repository()
            record = await repo.append(
                "PRE_FLIGHT_PASSED" if allowed else "PRE_FLIGHT_FAILED",
                incident_id=incident_id,
                auto_repair_id=str(item.get("auto_repair_id") or "") or None,
                actor="adr100_remediation_service",
                action_detail=f"adr100_remediation_dry_run:{payload.get('mode')}"[:200],
                success=allowed,
                context=context,
            )
            if record is not None:
                history["alert_operation_id"] = getattr(record, "id", None)
        except Exception as exc:
            logger.warning(
                "adr100_remediation_alert_operation_history_failed",
                incident_id=incident_id,
                error=str(exc),
            )

        try:
            timeline = self._timeline_service
            if timeline is None:
                from src.services.approval_db import get_timeline_service

                timeline = get_timeline_service()
            event = await timeline.add_event(
                event_type="verifier",
                status=_timeline_status(payload),
                title="ADR-100 remediation dry-run",
                description=_history_description(context),
                actor="adr100_remediation_service",
                actor_role=str(payload.get("mode") or "dry_run"),
                incident_id=incident_id,
            )
            if event:
                history["timeline_event_id"] = event.get("id")
        except Exception as exc:
            logger.warning(
                "adr100_remediation_timeline_history_failed",
                incident_id=incident_id,
                error=str(exc),
            )

        history["recorded"] = bool(
            history.get("alert_operation_id") or history.get("timeline_event_id")
        )
        return history
def _select_mode(
    item: dict[str, Any],
    requested: RemediationMode,
) -> Literal["reverify", "replay"]:
    """Resolve the effective dry-run mode for a queue item.

    An explicit ``"reverify"`` / ``"replay"`` request wins. For ``"auto"`` the
    item decides: it is re-verified when its status is ``ready_for_reverify``
    or its action is ``reverify_with_promql_template``; otherwise it is
    replayed.
    """
    if requested in ("reverify", "replay"):
        return requested
    if item.get("remediation_status") == "ready_for_reverify":
        return "reverify"
    if item.get("remediation_action") == "reverify_with_promql_template":
        return "reverify"
    return "replay"
def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
    """Build the guardrail checks every preview and dry-run starts from.

    Returns three checks, each a dict with ``name``, ``passed`` and ``detail``:
    the queue item is in a ready status, its action is one of the two
    supported read-only actions, and a constant "no state mutation" marker
    that always passes.
    """
    status = str(item.get("remediation_status") or "unknown")
    action = str(item.get("remediation_action") or "unknown")
    return [
        {
            "name": "queue_item_ready",
            "passed": status in _READY_STATUSES,
            "detail": status,
        },
        {
            "name": "read_only_guardrail",
            "passed": action in {
                "replay_with_supported_executor",
                "reverify_with_promql_template",
            },
            "detail": action,
        },
        {
            "name": "no_state_mutation",
            "passed": True,
            "detail": "dry_run_does_not_update_incident_or_auto_repair_rows",
        },
    ]
def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
    """Describe the read-only plan that a dry-run in ``mode`` would follow.

    ``"reverify"`` yields the verifier plan; any other mode yields the replay
    plan, which additionally echoes the item's ``remediation_action``. Both
    plans declare an empty ``writes`` list.
    """
    if mode == "reverify":
        return {
            "step": "collect_current_state_and_assess",
            "agent_id": "post_execution_verifier",
            "required_scope": "read",
            "writes": [],
        }
    return {
        "step": "validate_supported_executor_route_then_collect_current_state",
        "agent_id": "auto_repair_executor",
        "required_scope": "read",
        "writes": [],
        "target_action": item.get("remediation_action"),
    }
def _dry_run_blocked_payload(
    item: dict[str, Any],
    mode: str,
    checks: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build the dry-run response for a request stopped by its guardrails.

    The payload is always ``allowed=False`` / ``executed=False`` with a
    ``"blocked"`` verification preview and an empty post-state summary; the
    ``checks`` list is passed through so the caller can see what failed.
    """
    return {
        "schema_version": "adr100_remediation_dry_run_v1",
        "work_item_id": item.get("work_item_id"),
        "incident_id": item.get("incident_id"),
        "auto_repair_id": item.get("auto_repair_id"),
        "mode": mode,
        "allowed": False,
        "executed": False,
        "safety_level": "read_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
        "checks": checks,
        "verification_result_preview": "blocked",
        "post_state_summary": {},
    }
def _dry_run_result_payload(
    *,
    item: dict[str, Any],
    mode: str,
    checks: list[dict[str, Any]],
    post_state: dict[str, Any],
    verification_result_preview: str,
    extra: dict[str, Any],
) -> dict[str, Any]:
    """Build the dry-run response for a request that actually ran.

    ``allowed`` is true only when every check passed. ``post_state`` is
    reduced to a summary rather than embedded. ``extra`` is merged last, so
    its keys take precedence over the base fields on a name clash.
    """
    return {
        "schema_version": "adr100_remediation_dry_run_v1",
        "work_item_id": item.get("work_item_id"),
        "incident_id": item.get("incident_id"),
        "auto_repair_id": item.get("auto_repair_id"),
        "mode": mode,
        "allowed": all(check["passed"] for check in checks),
        "executed": True,
        "safety_level": "read_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
        "checks": checks,
        "verification_result_preview": verification_result_preview,
        "post_state_summary": _summarize_post_state(post_state),
        **extra,
    }
def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
    """Reduce a collected post-state mapping to a small, loggable summary.

    Returns the number of keys, the first eight keys in sorted order, and
    whether the mapping was non-empty. Values are deliberately not included.
    """
    keys = sorted(post_state.keys())
    return {
        "tool_count": len(keys),
        "tools": keys[:8],
        "has_state": bool(post_state),
    }
def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, Any]:
    """Build the context blob stored alongside a dry-run history record.

    Identity fields come from the queue ``item``; outcome fields come from the
    dry-run ``payload``. Missing keys become ``None``. The ``schema_version``
    marker is what ``history()`` later uses to recognise its own records.
    """
    return {
        "schema_version": "adr100_remediation_dry_run_history_v1",
        "work_item_id": item.get("work_item_id"),
        "auto_repair_id": item.get("auto_repair_id"),
        "playbook_id": item.get("playbook_id"),
        "alertname": item.get("alertname"),
        "mode": payload.get("mode"),
        "allowed": payload.get("allowed"),
        "executed": payload.get("executed"),
        "safety_level": payload.get("safety_level"),
        "writes_incident_state": payload.get("writes_incident_state"),
        "writes_auto_repair_result": payload.get("writes_auto_repair_result"),
        "verification_result_preview": payload.get("verification_result_preview"),
        "post_state_summary": payload.get("post_state_summary"),
        "mcp_route": payload.get("mcp_route"),
        "checks": payload.get("checks"),
    }
def _timeline_status(payload: dict[str, Any]) -> str:
    """Map a dry-run payload to a timeline status.

    ``"success"`` requires both a truthy ``allowed`` and a verification
    preview of exactly ``"success"``; every other combination is
    ``"warning"``.
    """
    succeeded = (
        bool(payload.get("allowed"))
        and payload.get("verification_result_preview") == "success"
    )
    return "success" if succeeded else "warning"
def _history_description(context: dict[str, Any]) -> str:
    """Render a one-line, human-readable summary of a history context.

    Missing route details fall back to ``unknown_agent`` / ``current_state``
    and a missing post-state summary counts as zero tools. The result is
    truncated to 500 characters.
    """
    tool_count = (context.get("post_state_summary") or {}).get("tool_count", 0)
    route = context.get("mcp_route") or {}
    agent = route.get("agent_id") or "unknown_agent"
    tool = route.get("tool_name") or "current_state"
    description = (
        f"mode={context.get('mode')} "
        f"preview={context.get('verification_result_preview')} "
        f"tools={tool_count} route={agent}/{tool} "
        f"writes_incident={context.get('writes_incident_state')} "
        f"writes_auto_repair={context.get('writes_auto_repair_result')}"
    )
    return description[:500]
def _record_created_at(record: Any) -> str:
    """Return ``record.created_at`` as a string usable as a sort key.

    Values exposing ``isoformat()`` are rendered through it; anything else is
    stringified, with a missing or falsy value becoming ``""``.
    """
    value = getattr(record, "created_at", None)
    if hasattr(value, "isoformat"):
        return value.isoformat()
    return str(value or "")
def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
    """Flatten one history record plus its context into an API item.

    Row-level fields are read from ``record`` with ``getattr`` so a partially
    populated object still works; everything else comes from the stored
    ``context``. ``auto_repair_id`` prefers the row and falls back to the
    context.
    """
    route = context.get("mcp_route") or {}
    post_state = context.get("post_state_summary") or {}
    return {
        "id": str(getattr(record, "id", "")),
        "incident_id": getattr(record, "incident_id", None),
        "auto_repair_id": getattr(record, "auto_repair_id", None)
        or context.get("auto_repair_id"),
        "event_type": str(getattr(record, "event_type", "")),
        "actor": getattr(record, "actor", None),
        "success": getattr(record, "success", None),
        "created_at": _record_created_at(record),
        "work_item_id": context.get("work_item_id"),
        "playbook_id": context.get("playbook_id"),
        "alertname": context.get("alertname"),
        "mode": context.get("mode"),
        "allowed": context.get("allowed"),
        "executed": context.get("executed"),
        "safety_level": context.get("safety_level"),
        "verification_result_preview": context.get("verification_result_preview"),
        "tool_count": post_state.get("tool_count", 0),
        "tools": post_state.get("tools") or [],
        "agent_id": route.get("agent_id"),
        "tool_name": route.get("tool_name") or "current_state",
        "required_scope": route.get("required_scope"),
        "writes_incident_state": context.get("writes_incident_state"),
        "writes_auto_repair_result": context.get("writes_auto_repair_result"),
        "checks": context.get("checks") or [],
    }
def _summarize_history_by_work_item(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Group history items per work item and count them.

    Items are keyed by ``work_item_id``, falling back to ``incident_id`` and
    then ``id``. The ``latest_*`` fields are taken from the first item seen
    for each key, so the input is expected to be ordered newest first. Groups
    are returned in first-seen order.
    """
    summary: dict[str, dict[str, Any]] = {}
    for item in items:
        key = str(item.get("work_item_id") or item.get("incident_id") or item.get("id"))
        if key not in summary:
            summary[key] = {
                "work_item_id": item.get("work_item_id"),
                "incident_id": item.get("incident_id"),
                "count": 0,
                "latest_at": item.get("created_at"),
                "latest_event_type": item.get("event_type"),
                "latest_success": item.get("success"),
                "latest_preview": item.get("verification_result_preview"),
                "latest_mode": item.get("mode"),
                "latest_agent_id": item.get("agent_id"),
                "latest_tool_name": item.get("tool_name"),
                "required_scope": item.get("required_scope"),
            }
        summary[key]["count"] += 1
    return list(summary.values())
def _diagnostic_command_for_incident(incident: Incident) -> str:
    """Build the diagnostic command preview string for an incident.

    The host comes from the ``host`` label, then ``instance``, and falls back
    to the literal placeholder ``{host}``. When a ``container_name`` or
    ``container`` label is present the ``docker stats`` call is scoped to it.

    NOTE(review): label values are interpolated unquoted; this looks like a
    preview/routing string rather than something executed verbatim — confirm
    against ``preview_read_only_ssh_mcp_route``.
    """
    labels = _labels_for_incident(incident)
    host = str(labels.get("host") or labels.get("instance") or "{host}")
    container = str(labels.get("container_name") or labels.get("container") or "")
    if container:
        return f"ssh {host} 'uptime; docker stats --no-stream {container}'"
    return f"ssh {host} 'uptime; docker stats --no-stream'"
def _promql_for_incident(incident: Incident) -> str:
    """Build the PromQL query used to re-verify an incident.

    The alert name is taken from the first signal's ``alertname`` label,
    falling back to that signal's ``alert_name`` attribute; it stays empty
    when the incident has no signals. Query construction itself is delegated
    to ``_build_prometheus_query``.
    """
    labels = _labels_for_incident(incident)
    alertname = ""
    if incident.signals:
        signal = incident.signals[0]
        alertname = labels.get("alertname") or getattr(signal, "alert_name", "")
    return _build_prometheus_query(alertname, labels)
def _labels_for_incident(incident: Incident) -> dict[str, Any]:
    """Return the labels of the incident's first signal, or ``{}``.

    An incident without signals, or a first signal whose ``labels`` is falsy,
    yields an empty dict.
    """
    if incident.signals:
        return incident.signals[0].labels or {}
    return {}
_service: Adr100RemediationService | None = None
def get_adr100_remediation_service() -> Adr100RemediationService:
    """Return the module-level ADR-100 remediation service, creating it on first use."""
    global _service
    if _service is None:
        _service = Adr100RemediationService()
    return _service
def set_adr100_remediation_service(service: Adr100RemediationService | None) -> None:
    """Inject ADR-100 remediation service for tests.

    Passing ``None`` clears the module-level instance, so the next
    ``get_adr100_remediation_service()`` call builds a fresh default one.
    """
    global _service
    _service = service

View File

@@ -1,354 +0,0 @@
"""
ADR-100 SLO metrics emitter.
Prometheus recording rules for the AI flywheel SLOs expect a small set of
counter-like metrics. The source of truth already lives in PostgreSQL, so this
read-side emitter exposes DB totals on /metrics without changing runtime write
paths or introducing another state store.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from time import time
from sqlalchemy import text
from src.db.base import get_db_context
@dataclass(frozen=True)
class AutomationOperationSample:
    """One aggregated automation-operation row: a count per (outcome, operation_type)."""

    # Outcome label emitted on the Prometheus series.
    outcome: str
    # Operation type label emitted on the Prometheus series.
    operation_type: str
    # Number of rows in the group.
    count: int
@dataclass(frozen=True)
class VerificationSample:
    """One aggregated post-execution verification row: a count per outcome."""

    # Verification outcome label emitted on the Prometheus series.
    outcome: str
    # Number of rows in the group.
    count: int
@dataclass(frozen=True)
class Adr100SloMetricsSnapshot:
    """Immutable bundle of the DB-derived values rendered as ADR-100 SLO metrics.

    Every field has a default, so an empty snapshot can be built with no
    arguments.
    """

    # Automation operation counts over all rows.
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
    # Automation operation counts restricted to the last 24 hours.
    automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
    # Verification result counts over all rows.
    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
    # Verification result counts restricted to the last 24 hours.
    post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
    # Knowledge entry totals (all time / created in the last 24 hours).
    knowledge_entries_total: int = 0
    knowledge_entries_created_24h: int = 0
    # High-confidence approval totals (all / with a successful verification).
    high_confidence_total: int = 0
    high_confidence_success_total: int = 0
    # time.time() value captured when the snapshot object is constructed.
    emitted_at: float = field(default_factory=time)
class Adr100SloMetricsService:
    """Build ADR-100 Prometheus samples from production DB state."""

    async def to_prometheus_lines(self) -> str:
        """Fetch a fresh snapshot and return it rendered as exposition text."""
        return render_adr100_slo_metrics(await self.fetch_snapshot())

    async def fetch_snapshot(self) -> Adr100SloMetricsSnapshot:
        """Run the SLO queries on one DB context and collect the results.

        Seven statements run sequentially: four grouped-count queries, two
        scalar knowledge-entry counts and the high-confidence approval
        query. NULL counts are coerced to 0.

        Returns:
            A snapshot holding the converted rows. ``emitted_at`` takes the
            dataclass default.
        """
        async with get_db_context() as db:

            async def rows_for(sql: str):
                # Execute and fully materialise the result rows.
                return (await db.execute(text(sql))).fetchall()

            async def count_for(sql: str) -> int:
                # Execute a single-value query; NULL/None becomes 0.
                return int((await db.execute(text(sql))).scalar() or 0)

            automation_rows = await rows_for(_AUTOMATION_OPERATION_SQL)
            automation_24h_rows = await rows_for(_AUTOMATION_OPERATION_24H_SQL)
            verification_rows = await rows_for(_POST_EXECUTION_VERIFICATION_SQL)
            verification_24h_rows = await rows_for(_POST_EXECUTION_VERIFICATION_24H_SQL)
            knowledge_total = await count_for("SELECT count(*) FROM knowledge_entries")
            knowledge_created_24h = await count_for(
                """
                SELECT count(*)
                FROM knowledge_entries
                WHERE created_at >= NOW() - INTERVAL '24 hours'
                """
            )
            confidence_row = (
                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
            ).one()

        def as_operation_samples(rows) -> list[AutomationOperationSample]:
            return [
                AutomationOperationSample(
                    outcome=str(row.outcome),
                    operation_type=str(row.operation_type),
                    count=int(row.count or 0),
                )
                for row in rows
            ]

        def as_verification_samples(rows) -> list[VerificationSample]:
            return [
                VerificationSample(
                    outcome=str(row.outcome),
                    count=int(row.count or 0),
                )
                for row in rows
            ]

        return Adr100SloMetricsSnapshot(
            automation_operations=as_operation_samples(automation_rows),
            automation_operations_24h=as_operation_samples(automation_24h_rows),
            post_execution_verifications=as_verification_samples(verification_rows),
            post_execution_verifications_24h=as_verification_samples(
                verification_24h_rows
            ),
            knowledge_entries_total=knowledge_total,
            knowledge_entries_created_24h=knowledge_created_24h,
            high_confidence_total=int(confidence_row.high_confidence_total or 0),
            high_confidence_success_total=int(
                confidence_row.high_confidence_success_total or 0
            ),
        )
def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
    """Render ADR-100 SLO metrics in Prometheus text exposition format.

    Each metric family is emitted as HELP and TYPE lines followed by its
    samples. Labelled families with no samples emit one zero-valued
    placeholder series labelled ``"none"``, so the metric name is always
    present. The output starts and ends with a newline.

    Args:
        snapshot: Values to render.

    Returns:
        The exposition text as a single newline-joined string.
    """

    def operation_series(metric_name: str, samples) -> list[str]:
        # One series per (outcome, operation_type) pair.
        rendered = [
            f"{metric_name}"
            f'{{outcome="{_escape_label(sample.outcome)}",'
            f'operation_type="{_escape_label(sample.operation_type)}"}} '
            f"{sample.count}"
            for sample in (samples or ())
        ]
        return rendered or [f'{metric_name}{{outcome="none",operation_type="none"}} 0']

    def verification_series(metric_name: str, samples) -> list[str]:
        # One series per verification outcome.
        rendered = [
            f"{metric_name}"
            f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
            for sample in (samples or ())
        ]
        return rendered or [f'{metric_name}{{outcome="none"}} 0']

    output: list[str] = [
        "",
        "# HELP automation_operation_log_total DB-derived AI automation operation count for ADR-100 SLOs",
        "# TYPE automation_operation_log_total counter",
        *operation_series(
            "automation_operation_log_total", snapshot.automation_operations
        ),
        "# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
        "# TYPE automation_operation_created_24h gauge",
        *operation_series(
            "automation_operation_created_24h", snapshot.automation_operations_24h
        ),
        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
        "# TYPE post_execution_verification_total counter",
        *verification_series(
            "post_execution_verification_total", snapshot.post_execution_verifications
        ),
        "# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
        "# TYPE post_execution_verification_created_24h gauge",
        *verification_series(
            "post_execution_verification_created_24h",
            snapshot.post_execution_verifications_24h,
        ),
        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
        "# TYPE knowledge_entries_total counter",
        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
        "# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
        "# TYPE knowledge_entries_created_24h gauge",
        f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_total counter",
        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
        "# HELP approval_records_high_confidence_success_total DB-derived high confidence approval decisions with successful verification for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_success_total counter",
        (
            "approval_records_high_confidence_success_total "
            f"{snapshot.high_confidence_success_total}"
        ),
        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
        "",
    ]
    return "\n".join(output)
def _escape_label(value: str) -> str:
    """Escape backslash, newline and double quote for a Prometheus label value."""
    # A single translate pass matches the chained replaces: none of the
    # replacement texts contains a character that a later step would rewrite.
    return value.translate(str.maketrans({"\\": "\\\\", "\n": "\\n", '"': '\\"'}))
# All-time automation counts grouped by (outcome, operation_type).
# Source 1: automation_operation_log, restricted to the five listed operation
#   types. outcome is the raw status when it is not 'success'; successful
#   rows become 'human_required' when actor is 'approval_execution' and
#   input->>'requested_by' does not start with "auto" (case-insensitive),
#   otherwise 'auto_executed'.
# Source 2: auto_repair_executions, reported as operation_type
#   'auto_repair_executed' with outcome 'auto_executed' or 'failed'.
# NOTE(review): the doubled percent in 'auto%%' presumably guards against
# driver-level %-formatting; as a LIKE pattern it matches the same rows as
# 'auto%' — confirm against the DB driver in use.
_AUTOMATION_OPERATION_SQL = """
WITH automation_scope AS (
SELECT
CASE
WHEN status <> 'success' THEN status
WHEN actor = 'approval_execution'
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
THEN 'human_required'
ELSE 'auto_executed'
END AS outcome,
operation_type
FROM automation_operation_log
WHERE operation_type IN (
'playbook_executed',
'remediation_executed',
'remediation_verified',
'remediation_rolled_back',
'self_correction_attempted'
)
UNION ALL
SELECT
CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
'auto_repair_executed' AS operation_type
FROM auto_repair_executions
)
SELECT
outcome,
operation_type,
count(*) AS count
FROM automation_scope
GROUP BY outcome, operation_type
ORDER BY outcome, operation_type
"""
# Automation counts grouped by (outcome, operation_type), limited in both
# source tables to rows whose created_at falls within the last 24 hours.
# The outcome classification is the same CASE expression as the all-time
# query: raw non-success status, 'human_required' for non-auto approval
# executions, otherwise 'auto_executed'; auto_repair_executions rows map to
# 'auto_executed' or 'failed'.
_AUTOMATION_OPERATION_24H_SQL = """
WITH automation_scope AS (
SELECT
CASE
WHEN status <> 'success' THEN status
WHEN actor = 'approval_execution'
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
THEN 'human_required'
ELSE 'auto_executed'
END AS outcome,
operation_type
FROM automation_operation_log
WHERE operation_type IN (
'playbook_executed',
'remediation_executed',
'remediation_verified',
'remediation_rolled_back',
'self_correction_attempted'
)
AND created_at >= NOW() - INTERVAL '24 hours'
UNION ALL
SELECT
CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
'auto_repair_executed' AS operation_type
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
)
SELECT
outcome,
operation_type,
count(*) AS count
FROM automation_scope
GROUP BY outcome, operation_type
ORDER BY outcome, operation_type
"""
# All-time count of incident_evidence rows per non-NULL verification_result.
_POST_EXECUTION_VERIFICATION_SQL = """
SELECT verification_result AS outcome, count(*) AS count
FROM incident_evidence
WHERE verification_result IS NOT NULL
GROUP BY verification_result
ORDER BY verification_result
"""
# Count of incident_evidence rows per non-NULL verification_result, limited
# to evidence whose collected_at falls within the last 24 hours.
_POST_EXECUTION_VERIFICATION_24H_SQL = """
SELECT verification_result AS outcome, count(*) AS count
FROM incident_evidence
WHERE verification_result IS NOT NULL
AND collected_at >= NOW() - INTERVAL '24 hours'
GROUP BY verification_result
ORDER BY verification_result
"""
# High-confidence approval counts from approval_records.
# Per-row confidence is the first non-NULL of:
#   extra_metadata->>'confidence_score' (only if it looks like a decimal),
#   extra_metadata->>'confidence' (same check), composite_score, then 0.
# high_confidence_total counts rows with confidence >= 0.8.
# high_confidence_success_total additionally requires at least one
# incident_evidence row for the same incident with
# verification_result = 'success'.
_HIGH_CONFIDENCE_APPROVAL_SQL = """
WITH approval_confidence AS (
SELECT
id,
incident_id,
COALESCE(
CASE
WHEN extra_metadata->>'confidence_score' ~ '^[0-9]+(\\.[0-9]+)?$'
THEN (extra_metadata->>'confidence_score')::numeric
ELSE NULL
END,
CASE
WHEN extra_metadata->>'confidence' ~ '^[0-9]+(\\.[0-9]+)?$'
THEN (extra_metadata->>'confidence')::numeric
ELSE NULL
END,
composite_score,
0
) AS confidence
FROM approval_records
)
SELECT
count(*) FILTER (WHERE confidence >= 0.8) AS high_confidence_total,
count(*) FILTER (
WHERE confidence >= 0.8
AND EXISTS (
SELECT 1
FROM incident_evidence ev
WHERE ev.incident_id = approval_confidence.incident_id
AND ev.verification_result = 'success'
)
) AS high_confidence_success_total
FROM approval_confidence
"""
# Lazily created module-level instance shared by every caller.
_adr100_slo_metrics_service: Adr100SloMetricsService | None = None


def get_adr100_slo_metrics_service() -> Adr100SloMetricsService:
    """Return the shared ADR-100 SLO metrics service, creating it on first use."""
    global _adr100_slo_metrics_service
    if _adr100_slo_metrics_service is not None:
        return _adr100_slo_metrics_service
    _adr100_slo_metrics_service = Adr100SloMetricsService()
    return _adr100_slo_metrics_service

View File

@@ -1,743 +0,0 @@
"""
Read-only ADR-100 SLO status snapshot.
GovernanceAgent.check_slo_compliance() can emit governance alerts when an SLO is
violated. This service is intentionally read-only so dashboards can show the
same Prometheus-backed state without producing Telegram/DB side effects.
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Any
import httpx
import structlog
from sqlalchemy import text
from src.core.config import settings
from src.db.base import get_db_context
from src.utils.timezone import now_taipei_iso
logger = structlog.get_logger(__name__)
@dataclass(frozen=True)
class Adr100SloDefinition:
    """Static description of one ADR-100 SLO and how to evaluate it."""

    # Identifier reported in the status payload.
    name: str
    # PromQL expression whose first result value is the SLI.
    query: str
    # Threshold separating "ok" from "warning".
    target: float
    # Threshold separating "warning" from "violated".
    hard_red_line: float
    # "above" means higher is better; any other value is treated as lower-is-better.
    direction: str
    # Unit and window are reported as-is in the payload.
    unit: str
    window: str
    # Optional PromQL rate query used as a low-volume guard.
    denominator_query: str | None = None
    # Multiplied by the denominator rate to estimate an event count.
    denominator_window_seconds: int = 0
    # Estimated event count below which the SLO is skipped.
    minimum_events: float = 1.0
# The SLOs evaluated by the status report, in reporting order. All four use
# direction="above", so a value below target is a warning and a value below
# hard_red_line is a violation.
ADR100_SLO_DEFINITIONS: tuple[Adr100SloDefinition, ...] = (
    # Ratio SLO guarded by the overall automation operation rate (5m window).
    Adr100SloDefinition(
        name="autonomy_rate",
        query="sli:autonomy_rate:5m",
        target=0.80,
        hard_red_line=0.70,
        direction="above",
        unit="percent",
        window="5m",
        denominator_query="sum(rate(automation_operation_log_total[5m]))",
        denominator_window_seconds=300,
    ),
    # Ratio SLO guarded by the rate of auto-executed operations only.
    Adr100SloDefinition(
        name="decision_accuracy",
        query="sli:decision_accuracy:5m",
        target=0.90,
        hard_red_line=0.85,
        direction="above",
        unit="percent",
        window="5m",
        denominator_query='sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))',
        denominator_window_seconds=300,
    ),
    # Ratio SLO guarded by the high-confidence approval rate (1h window).
    Adr100SloDefinition(
        name="confidence_calibration",
        query="sli:confidence_calibration:1h",
        target=0.80,
        hard_red_line=0.70,
        direction="above",
        unit="percent",
        window="1h",
        denominator_query="sum(rate(approval_records_high_confidence_total[1h]))",
        denominator_window_seconds=3600,
    ),
    # Absolute count SLO: no denominator guard; the query falls back to the
    # sli:km_growth_rate:24h series when the 24h gauge is absent.
    Adr100SloDefinition(
        name="km_growth_rate",
        query="max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
        target=20.0,
        hard_red_line=5.0,
        direction="above",
        unit="count",
        window="24h",
    ),
)
class Adr100SloStatusService:
    """Fetch ADR-100 SLO status from Prometheus without writing governance events."""

    async def fetch_report(self) -> dict[str, Any]:
        """Evaluate every ADR-100 SLO and return the combined status report.

        Each definition in ``ADR100_SLO_DEFINITIONS`` is queried in order
        over one shared HTTP client (5 second timeout). Verification
        coverage is then read from PostgreSQL and folded into the overall
        status.

        Returns:
            A dict with schema_version ``adr100_slo_status_v1``.
            ``overall_compliance`` is the share of evaluable metrics whose
            status is ``ok``, or ``None`` when nothing is evaluable.
        """
        prom_url = getattr(
            settings,
            "PROMETHEUS_URL",
            "http://prometheus.observability.svc:9090",
        ).rstrip("/")
        metrics: list[dict[str, Any]] = []
        async with httpx.AsyncClient(timeout=5.0) as client:
            for definition in ADR100_SLO_DEFINITIONS:
                metrics.append(await self._fetch_metric(client, prom_url, definition))
        evaluable = [metric for metric in metrics if metric.get("evaluable")]
        ok_count = sum(1 for metric in evaluable if metric.get("status") == "ok")
        overall_compliance = (ok_count / len(evaluable)) if evaluable else None
        verification_coverage = await self._fetch_verification_coverage()
        overall_status = _overall_status(metrics, evaluable, verification_coverage)
        return {
            "schema_version": "adr100_slo_status_v1",
            "source": "prometheus+postgresql",
            "evaluated_at": now_taipei_iso(),
            "overall_status": overall_status,
            "overall_compliance": overall_compliance,
            "evaluable_count": len(evaluable),
            "metric_count": len(metrics),
            "metrics": metrics,
            "verification_coverage": verification_coverage,
        }

    async def _fetch_metric(
        self,
        client: httpx.AsyncClient,
        prom_url: str,
        definition: Adr100SloDefinition,
    ) -> dict[str, Any]:
        """Query one SLO and classify it.

        When the definition has a denominator query it is evaluated first:
        a failed query yields ``no_data``, and an estimated event count
        (rate times window seconds) below ``minimum_events`` yields
        ``skipped_low_volume`` without querying the SLI itself.

        Returns:
            The metric payload built by ``_metric_payload``.
        """
        denominator_value: float | None = None
        sample_count: float | None = None
        if definition.denominator_query:
            denominator_result = await _query_prometheus_value(
                client,
                prom_url,
                definition.denominator_query,
            )
            if denominator_result["status"] != "ok":
                return _metric_payload(
                    definition,
                    value=None,
                    status="no_data",
                    reason=denominator_result["reason"],
                    denominator_value=None,
                    sample_count=None,
                )
            denominator_value = float(denominator_result["value"])
            # The rate is per second, so scale by the window for a count.
            sample_count = denominator_value * definition.denominator_window_seconds
            if sample_count < definition.minimum_events:
                return _metric_payload(
                    definition,
                    value=None,
                    status="skipped_low_volume",
                    reason="denominator_below_minimum_events",
                    denominator_value=denominator_value,
                    sample_count=sample_count,
                )
        value_result = await _query_prometheus_value(client, prom_url, definition.query)
        if value_result["status"] != "ok":
            # A NaN/Inf SLI is reported as low volume; anything else is no_data.
            status = (
                "skipped_low_volume"
                if value_result["reason"] == "prometheus_nan_or_inf"
                else "no_data"
            )
            return _metric_payload(
                definition,
                value=None,
                status=status,
                reason=value_result["reason"],
                denominator_value=denominator_value,
                sample_count=sample_count,
            )
        value = float(value_result["value"])
        status = _classify_status(value, definition)
        return _metric_payload(
            definition,
            value=value,
            status=status,
            reason=None,
            denominator_value=denominator_value,
            # Without a denominator, the SLI value doubles as the sample count.
            sample_count=sample_count if sample_count is not None else value,
        )

    async def _fetch_verification_coverage(self) -> dict[str, Any]:
        """Summarize whether recent auto-repair executions have verifier evidence.

        Any exception raised while querying is logged and converted into an
        ``error`` payload, so the SLO report can still be returned. The
        error payload carries the same keys as the success payload,
        including ``non_success_breakdown.by_remediation_status``.
        """
        try:
            async with get_db_context() as db:
                summary_row = (
                    await db.execute(text(_VERIFICATION_COVERAGE_SQL))
                ).mappings().one()
                recent_rows = (
                    await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL))
                ).mappings().all()
                recent_non_success_rows = (
                    await db.execute(text(_VERIFICATION_COVERAGE_NON_SUCCESS_SQL))
                ).mappings().all()
        except Exception as exc:
            logger.warning("adr100_verification_coverage_query_error", error=str(exc))
            return {
                "schema_version": "adr100_verification_coverage_v1",
                "source": "postgresql",
                "window": "24h",
                "status": "error",
                "reason": "postgresql_query_error",
                "evaluable": False,
                "total_auto": 0,
                "successful_auto": 0,
                "verified_auto": 0,
                "verified_success": 0,
                "verified_non_success": 0,
                "unverified_auto": 0,
                "coverage_rate": None,
                "verification_success_rate": None,
                "last_auto_at": None,
                "last_verified_auto_at": None,
                "last_verification_evidence_at": None,
                "latest_auto_age_seconds": None,
                "last_verified_auto_age_seconds": None,
                "recent_unverified": [],
                "recent_non_success": [],
                "non_success_breakdown": {
                    "by_verification_result": [],
                    "by_failure_class": [],
                    # Present on the success payload too; kept here so the
                    # shape is stable for consumers of this schema version.
                    "by_remediation_status": [],
                },
                "remediation_queue": _remediation_queue_payload([]),
            }
        return _build_verification_coverage_payload(
            summary_row,
            recent_rows,
            recent_non_success_rows,
        )
# One-row 24h summary of auto-repair verification coverage.
# recent_auto: auto_repair_executions created in the last 24 hours.
# per_auto: each execution joined (LEFT JOIN LATERAL, LIMIT 1) to the most
#   recently collected incident_evidence row for the same incident that has
#   a non-NULL verification_result; executions without one keep NULLs.
# The final SELECT counts totals, verified/unverified executions and
# success/non-success verifications, plus the latest timestamps and their
# age in seconds. verified_non_success only counts the results 'degraded',
# 'failed' and 'timeout'.
_VERIFICATION_COVERAGE_SQL = """
WITH recent_auto AS (
SELECT id, incident_id, success, created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id,
are.incident_id,
are.success,
are.created_at,
latest.verification_result,
latest.collected_at AS verification_collected_at,
latest.self_healing_score
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT ev.verification_result, ev.collected_at, ev.self_healing_score
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT
count(*)::int AS total_auto,
count(*) FILTER (WHERE success)::int AS successful_auto,
count(*) FILTER (WHERE verification_result IS NOT NULL)::int AS verified_auto,
count(*) FILTER (WHERE verification_result = 'success')::int AS verified_success,
count(*) FILTER (WHERE verification_result IN ('degraded','failed','timeout'))::int AS verified_non_success,
count(*) FILTER (WHERE verification_result IS NULL)::int AS unverified_auto,
max(created_at) AS last_auto_at,
max(created_at) FILTER (WHERE verification_result IS NOT NULL) AS last_verified_auto_at,
max(verification_collected_at) AS last_verification_evidence_at,
EXTRACT(EPOCH FROM (NOW() - max(created_at)))::int AS latest_auto_age_seconds,
EXTRACT(EPOCH FROM (NOW() - (max(created_at) FILTER (WHERE verification_result IS NOT NULL))))::int
AS last_verified_auto_age_seconds
FROM per_auto
"""
# The five most recent auto-repair executions from the last 24 hours that
# have no incident_evidence row with a non-NULL verification_result for
# their incident (i.e. the unverified backlog), newest first.
_VERIFICATION_COVERAGE_RECENT_SQL = """
WITH recent_auto AS (
SELECT id, incident_id, success, created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id,
are.incident_id,
are.success,
are.created_at,
latest.verification_result
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT ev.verification_result
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT id, incident_id, success, created_at
FROM per_auto
WHERE verification_result IS NULL
ORDER BY created_at DESC
LIMIT 5
"""
# The eight most recent auto-repair executions from the last 24 hours whose
# latest verification evidence exists and is not 'success', newest first.
# Each row carries the execution details, truncated text excerpts
# (error message 240 chars, post_execution_state 700, evidence_summary 300)
# and, via LEFT JOIN on incidents, the incident status, severity,
# alert_category and alertname.
_VERIFICATION_COVERAGE_NON_SUCCESS_SQL = """
WITH recent_auto AS (
SELECT
id,
incident_id,
success,
playbook_id,
playbook_name,
triggered_by,
risk_level,
error_message,
created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id AS auto_repair_id,
are.incident_id,
are.success AS auto_success,
are.playbook_id,
are.playbook_name,
are.triggered_by,
are.risk_level,
left(coalesce(are.error_message, ''), 240) AS auto_error,
are.created_at AS auto_created_at,
latest.verification_result,
latest.collected_at AS verification_collected_at,
left(coalesce(latest.post_execution_state::text, ''), 700) AS post_state_text,
left(coalesce(latest.evidence_summary, ''), 300) AS evidence_summary
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT
ev.verification_result,
ev.collected_at,
ev.post_execution_state,
ev.evidence_summary
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT
p.*,
i.status::text AS incident_status,
i.severity::text AS incident_severity,
i.alert_category,
i.alertname
FROM per_auto p
LEFT JOIN incidents i ON i.incident_id = p.incident_id
WHERE p.verification_result IS NOT NULL
AND p.verification_result <> 'success'
ORDER BY p.auto_created_at DESC
LIMIT 8
"""
async def _query_prometheus_value(
    client: httpx.AsyncClient,
    prom_url: str,
    query: str,
) -> dict[str, Any]:
    """Run an instant Prometheus query and return its first value.

    Never raises: any exception is logged and reported as an error result.

    Args:
        client: HTTP client used for the GET request.
        prom_url: Prometheus base URL, without a trailing slash.
        query: PromQL expression.

    Returns:
        A dict with ``status`` set to one of:
        ``ok`` (with ``value``), ``no_data`` (empty result set),
        ``skipped`` (value is NaN or infinite; includes ``raw_value``), or
        ``error`` (Prometheus reported failure, or an exception occurred).
        Every non-``ok`` result carries a ``reason``.
    """
    try:
        response = await client.get(
            f"{prom_url}/api/v1/query",
            params={"query": query},
        )
        payload = response.json()
        if payload.get("status") != "success":
            return {"status": "error", "reason": "prometheus_query_failed"}
        series = payload.get("data", {}).get("result", [])
        if not series:
            return {
                "status": "no_data",
                "reason": "prometheus_empty_result_metric_not_emitted",
            }
        # Only the first series is read; its value is a [timestamp, "text"] pair.
        raw_value = series[0]["value"][1]
        value = float(raw_value)
        if math.isfinite(value):
            return {"status": "ok", "value": value}
        return {
            "status": "skipped",
            "reason": "prometheus_nan_or_inf",
            "raw_value": raw_value,
        }
    except Exception as exc:
        logger.warning("adr100_slo_prometheus_query_error", query=query, error=str(exc))
        return {"status": "error", "reason": "prometheus_query_error"}
def _metric_payload(
    definition: Adr100SloDefinition,
    *,
    value: float | None,
    status: str,
    reason: str | None,
    denominator_value: float | None,
    sample_count: float | None,
) -> dict[str, Any]:
    """Assemble the report entry for one SLO.

    Static fields are copied from ``definition``; the measured fields come
    from the keyword arguments. ``evaluable`` is true only for the statuses
    ``ok``, ``warning`` and ``violated``.
    """
    is_evaluable = status in {"ok", "warning", "violated"}
    payload: dict[str, Any] = {
        "name": definition.name,
        "query": definition.query,
        "value": value,
    }
    # Thresholds and presentation metadata copied straight from the definition.
    for attribute in ("target", "hard_red_line", "direction", "unit", "window"):
        payload[attribute] = getattr(definition, attribute)
    payload["status"] = status
    payload["evaluable"] = is_evaluable
    payload["reason"] = reason
    payload["denominator_query"] = definition.denominator_query
    payload["denominator_value"] = denominator_value
    payload["sample_count"] = sample_count
    return payload
def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
    """Classify an SLI value as ``ok``, ``warning`` or ``violated``.

    For direction ``"above"`` a value below the hard red line is a violation
    and a value below the target is a warning. For any other direction the
    comparisons are inverted. Values equal to a threshold do not breach it.
    """
    higher_is_better = definition.direction == "above"

    def breaches(threshold: float) -> bool:
        return value < threshold if higher_is_better else value > threshold

    if breaches(definition.hard_red_line):
        return "violated"
    if breaches(definition.target):
        return "warning"
    return "ok"
def _build_verification_coverage_payload(
    summary_row: Any,
    recent_unverified_rows: Any,
    recent_non_success_rows: Any = (),
) -> dict[str, Any]:
    """Turn the verification coverage query results into the report payload.

    Status rules, in order: no executions gives ``skipped_low_volume`` (not
    evaluable); any unverified execution gives ``warning``; any non-success
    verification gives ``warning``; otherwise ``ok``.

    Args:
        summary_row: Mapping with the aggregate counts and timestamps.
        recent_unverified_rows: Mappings describing unverified executions.
        recent_non_success_rows: Mappings describing non-success findings.

    Returns:
        The ``adr100_verification_coverage_v1`` payload dict.
    """
    summary = dict(summary_row)

    def count_of(column: str) -> int:
        # Missing or NULL counts are treated as zero.
        return int(summary.get(column) or 0)

    total_auto = count_of("total_auto")
    verified_auto = count_of("verified_auto")
    verified_success = count_of("verified_success")
    verified_non_success = count_of("verified_non_success")
    unverified_auto = count_of("unverified_auto")

    if total_auto == 0:
        status, reason, evaluable = (
            "skipped_low_volume",
            "no_auto_repair_executions_24h",
            False,
        )
    elif unverified_auto > 0:
        status, reason, evaluable = "warning", "verification_backlog_present", True
    elif verified_non_success > 0:
        status, reason, evaluable = "warning", "non_success_verification_present", True
    else:
        status, reason, evaluable = "ok", None, True

    coverage_rate = None
    if total_auto:
        coverage_rate = verified_auto / total_auto
    verification_success_rate = None
    if verified_auto:
        verification_success_rate = verified_success / verified_auto

    non_success_findings = [
        _non_success_finding_payload(dict(raw)) for raw in recent_non_success_rows
    ]
    queue = _remediation_queue_payload(non_success_findings)
    unverified_items = [
        {
            "id": str(item.get("id")),
            "incident_id": str(item.get("incident_id")),
            "success": bool(item.get("success")),
            "created_at": _iso(item.get("created_at")),
        }
        for item in map(dict, recent_unverified_rows)
    ]
    breakdown = {
        "by_verification_result": _count_breakdown(
            finding["verification_result"] for finding in non_success_findings
        ),
        "by_failure_class": _count_breakdown(
            finding["failure_class"] for finding in non_success_findings
        ),
        "by_remediation_status": _count_breakdown(
            queued["remediation_status"] for queued in queue["items"]
        ),
    }
    return {
        "schema_version": "adr100_verification_coverage_v1",
        "source": "postgresql",
        "window": "24h",
        "status": status,
        "reason": reason,
        "evaluable": evaluable,
        "total_auto": total_auto,
        "successful_auto": count_of("successful_auto"),
        "verified_auto": verified_auto,
        "verified_success": verified_success,
        "verified_non_success": verified_non_success,
        "unverified_auto": unverified_auto,
        "coverage_rate": coverage_rate,
        "verification_success_rate": verification_success_rate,
        "last_auto_at": _iso(summary.get("last_auto_at")),
        "last_verified_auto_at": _iso(summary.get("last_verified_auto_at")),
        "last_verification_evidence_at": _iso(
            summary.get("last_verification_evidence_at")
        ),
        "latest_auto_age_seconds": _int_or_none(summary.get("latest_auto_age_seconds")),
        "last_verified_auto_age_seconds": _int_or_none(
            summary.get("last_verified_auto_age_seconds")
        ),
        "recent_unverified": unverified_items,
        "recent_non_success": non_success_findings,
        "non_success_breakdown": breakdown,
        "remediation_queue": queue,
    }
def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]:
    """Describe one non-success verification as a dashboard finding.

    The row is classified into a failure class, which selects the next step
    and the remediation metadata. Identifiers are stringified, missing
    status/severity/result values become ``"unknown"``, and the text
    excerpts are capped at 180 characters.
    """
    failure_class = _classify_non_success_failure(row)
    plan = _remediation_for_failure_class(failure_class)

    def text_or_unknown(column: str) -> str:
        return str(row.get(column) or "unknown")

    finding: dict[str, Any] = {
        "auto_repair_id": str(row.get("auto_repair_id")),
        "incident_id": str(row.get("incident_id")),
        "incident_status": text_or_unknown("incident_status"),
        "incident_severity": text_or_unknown("incident_severity"),
        "alert_category": row.get("alert_category"),
        "alertname": row.get("alertname"),
        "auto_success": bool(row.get("auto_success")),
        "playbook_id": row.get("playbook_id"),
        "playbook_name": row.get("playbook_name"),
        "triggered_by": row.get("triggered_by"),
        "risk_level": row.get("risk_level"),
        "verification_result": text_or_unknown("verification_result"),
        "failure_class": failure_class,
        "next_step": _next_step_for_failure_class(failure_class),
    }
    # Remediation metadata, in the order it appears in the payload.
    for plan_key in ("status", "action", "owner", "reason"):
        finding[f"remediation_{plan_key}"] = plan[plan_key]
    finding["auto_error_excerpt"] = _short_text(row.get("auto_error"), 180)
    finding["evidence_excerpt"] = _short_text(row.get("evidence_summary"), 180)
    finding["auto_created_at"] = _iso(row.get("auto_created_at"))
    finding["verification_collected_at"] = _iso(row.get("verification_collected_at"))
    return finding
def _classify_non_success_failure(row: dict[str, Any]) -> str:
    """Map a non-success verification row to a failure class.

    Text markers are checked first, case-insensitively, across the error,
    post-state and evidence text. If none match, a failed auto-repair wins
    over the verification result; ``failed`` and ``timeout`` results map to
    ``verification_<result>`` and everything else to
    ``verification_degraded``.
    """
    haystack = " ".join(
        str(row.get(column) or "")
        for column in ("auto_error", "post_state_text", "evidence_summary")
    ).lower()
    marker_classes = (
        ("unsupported scheme", "unsupported_action_scheme"),
        ("missing_query_parameter", "verifier_missing_promql"),
        ("empty_pod_name", "verifier_target_missing_pod"),
    )
    for marker, failure_class in marker_classes:
        if marker in haystack:
            return failure_class
    if not row.get("auto_success"):
        return "auto_repair_execution_failed"
    verification = str(row.get("verification_result") or "").lower()
    if verification in {"failed", "timeout"}:
        return f"verification_{verification}"
    return "verification_degraded"
def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
    """Look up the read-only remediation work item for a failure class.

    The result is dashboard triage metadata only: it does not auto-close
    incidents, replay repairs, or approve write actions.

    Args:
        failure_class: Class produced by the non-success classifier.

    Returns:
        A new dict with ``status``, ``action``, ``owner`` and ``reason``.
        Unrecognized classes get the degraded-evidence manual review entry.
    """
    hard_failure_plan = {
        "status": "manual_review",
        "action": "escalate_verification_failure",
        "owner": "sre_operator",
        "reason": "verifier_returned_hard_failure",
    }
    plans: dict[str, dict[str, str]] = {
        "unsupported_action_scheme": {
            "status": "ready_for_replay",
            "action": "replay_with_supported_executor",
            "owner": "auto_repair_executor",
            "reason": "executor_gateway_available_after_t23",
        },
        "verifier_missing_promql": {
            "status": "ready_for_reverify",
            "action": "reverify_with_promql_template",
            "owner": "post_execution_verifier",
            "reason": "promql_template_available_after_t23",
        },
        "verifier_target_missing_pod": {
            "status": "needs_target_mapping",
            "action": "map_target_and_reverify",
            "owner": "post_execution_verifier",
            "reason": "verifier_target_missing",
        },
        "auto_repair_execution_failed": {
            "status": "needs_playbook_ticket",
            "action": "create_playbook_ticket",
            "owner": "solver_or_operator",
            "reason": "execution_failed_after_route_normalization",
        },
        "verification_failed": hard_failure_plan,
        "verification_timeout": hard_failure_plan,
    }
    degraded_plan = {
        "status": "manual_review",
        "action": "inspect_degraded_evidence",
        "owner": "sre_operator",
        "reason": "degraded_evidence_requires_human_context",
    }
    # Copy so callers always receive their own dict.
    return dict(plans.get(failure_class, degraded_plan))
def _next_step_for_failure_class(failure_class: str) -> str:
    """Return the suggested next-step identifier for a failure class.

    Unrecognized classes fall back to ``review_degraded_verification``.
    """
    next_steps = {
        "unsupported_action_scheme": "normalize_playbook_executor",
        "verifier_missing_promql": "add_verifier_query_template",
        "verifier_target_missing_pod": "map_verifier_target",
        "auto_repair_execution_failed": "review_auto_repair_execution",
        "verification_failed": "escalate_verification_failure",
        "verification_timeout": "escalate_verification_failure",
    }
    return next_steps.get(failure_class, "review_degraded_verification")
def _remediation_queue_payload(recent_non_success: list[dict[str, Any]]) -> dict[str, Any]:
    """Build the remediation queue read model from non-success findings.

    Each finding becomes one queue item with a synthetic ``work_item_id`` of
    the form ``verification:<incident_id>:<auto_repair_id>``. Items are also
    summarized as ready for AI (replay / reverify), needing a human, and
    broken down by status and by action.
    """
    fields_before_source = (
        "incident_id",
        "auto_repair_id",
        "alertname",
        "playbook_id",
        "failure_class",
        "verification_result",
        "remediation_status",
        "remediation_action",
        "remediation_owner",
        "remediation_reason",
    )
    fields_after_source = ("auto_created_at", "verification_collected_at")

    items: list[dict[str, Any]] = []
    for finding in recent_non_success:
        queued: dict[str, Any] = {
            "work_item_id": (
                f"verification:{finding.get('incident_id')}:{finding.get('auto_repair_id')}"
            ),
        }
        queued.update((name, finding.get(name)) for name in fields_before_source)
        queued["source"] = "adr100_verification_coverage"
        queued.update((name, finding.get(name)) for name in fields_after_source)
        items.append(queued)

    statuses = [queued.get("remediation_status") for queued in items]
    ai_ready_statuses = {"ready_for_replay", "ready_for_reverify"}
    human_statuses = {
        "needs_target_mapping",
        "needs_playbook_ticket",
        "manual_review",
    }
    ready_for_ai = len([status for status in statuses if status in ai_ready_statuses])
    needs_human = len([status for status in statuses if status in human_statuses])
    return {
        "schema_version": "adr100_remediation_queue_v1",
        "source": "recent_non_success_read_model",
        "total": len(items),
        "ready_for_ai": ready_for_ai,
        "needs_human": needs_human,
        "items": items,
        "by_status": _count_breakdown(
            queued.get("remediation_status") for queued in items
        ),
        "by_action": _count_breakdown(
            queued.get("remediation_action") for queued in items
        ),
    }
def _count_breakdown(values: Any) -> list[dict[str, Any]]:
    """Count occurrences and return them as ``{"name", "count"}`` dicts.

    Falsy values are counted under ``"unknown"``; all others are
    stringified. Results are ordered by descending count, then by name.
    """
    tally: dict[str, int] = {}
    for label in (str(raw or "unknown") for raw in values):
        tally.setdefault(label, 0)
        tally[label] += 1
    ordered_names = sorted(tally, key=lambda name: (-tally[name], name))
    return [{"name": name, "count": tally[name]} for name in ordered_names]
def _short_text(value: Any, limit: int) -> str | None:
    """Collapse whitespace in ``value`` and cut it to ``limit`` characters.

    Returns ``None`` when ``value`` is ``None`` or has no non-whitespace
    content after being stringified.
    """
    if value is None:
        return None
    collapsed = " ".join(str(value).split())
    return collapsed[:limit] if collapsed else None
def _iso(value: Any) -> str | None:
    """Return ``value.isoformat()`` if the object has that method, else ``None``."""
    if hasattr(value, "isoformat"):
        return value.isoformat()
    return None
def _int_or_none(value: Any) -> int | None:
    """Convert ``value`` with ``int()``, passing ``None`` through unchanged."""
    if value is None:
        return None
    return int(value)
def _overall_status(
    metrics: list[dict[str, Any]],
    evaluable: list[dict[str, Any]],
    verification_coverage: dict[str, Any] | None = None,
) -> str:
    """Reduce per-metric statuses and verification coverage to one status.

    Precedence, highest first: any violated metric; a violated or warning
    verification coverage; any warning metric; ``partial`` when some metrics
    are evaluable but others were skipped for low volume; ``ok`` when
    anything is evaluable; ``no_data`` when any metric has no data; and
    finally ``skipped_low_volume``.
    """
    statuses = [metric.get("status") for metric in metrics]
    if "violated" in statuses:
        return "violated"
    if verification_coverage:
        coverage_status = verification_coverage.get("status")
        if coverage_status in {"violated", "warning"}:
            return str(coverage_status)
    if "warning" in statuses:
        return "warning"
    if evaluable:
        return "partial" if "skipped_low_volume" in statuses else "ok"
    if "no_data" in statuses:
        return "no_data"
    return "skipped_low_volume"
# Lazily created module-level instance shared by every caller.
_adr100_slo_status_service: Adr100SloStatusService | None = None


def get_adr100_slo_status_service() -> Adr100SloStatusService:
    """Return the shared ADR-100 SLO status service, creating it on first use."""
    global _adr100_slo_status_service
    if _adr100_slo_status_service is not None:
        return _adr100_slo_status_service
    _adr100_slo_status_service = Adr100SloStatusService()
    return _adr100_slo_status_service

View File

@@ -27,7 +27,7 @@ from __future__ import annotations
import asyncio
import dataclasses
import os
import json
import time
import uuid
from datetime import UTC, datetime
@@ -63,25 +63,11 @@ if TYPE_CHECKING:
logger = structlog.get_logger(__name__)
def _agent_debate_global_timeout_seconds() -> float:
    """Return the full Phase 2 debate timeout in seconds.

    GCP Ollama incident analysis can legitimately take longer than the old
    90s guard, so the value is an explicit deployment knob read from the
    ``AGENT_DEBATE_GLOBAL_TIMEOUT_SEC`` environment variable.

    Returns:
        The configured timeout, never less than 90 seconds. The default of
        420 seconds is used when the variable is unset, is not a number, or
        parses to a non-finite float (``nan``, ``inf``, ``-inf``).
    """
    # Local import: this block only relies on `os` being imported at file level.
    import math

    default_timeout = 420.0
    minimum_timeout = 90.0
    raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "420.0")
    try:
        timeout = float(raw)
    except (TypeError, ValueError):
        timeout = default_timeout
    # float() accepts "nan" and "inf"; max() would pass both through and
    # leave the debate with no usable timeout.
    if not math.isfinite(timeout):
        timeout = default_timeout
    return max(timeout, minimum_timeout)
# Global timeout (all agents combined)
# 2026-05-06 Codex: configurable for GCP-A/GCP-B/111 Ollama-first incident
# diagnosis. The old 90s guard was cutting off valid deep diagnosis runs.
GLOBAL_TIMEOUT_SEC = _agent_debate_global_timeout_seconds()
# 2026-04-16 Claude Sonnet 4.6: deepseek-r1:14b measured at 2.2-27.3s, avg 10.6s
# The original 30s left each of the 3 sequential agents only 10s -> frequent timeouts -> confidence=20%
# Adjustment: 25s per agent; 3 sequential + 1 parallel group = worst case 75s + buffer = 90s
GLOBAL_TIMEOUT_SEC = 90.0
# 2026-04-16 ogt + Claude Sonnet 4.6: removed _PER_AGENT_TIMEOUT_SEC
# The LLM must be allowed to finish its full response and must not be cut off artificially. Degradation is triggered only by genuine failures (connection failure, model crash).

View File

@@ -29,62 +29,6 @@ from src.services.model_registry import get_model_registry
logger = structlog.get_logger(__name__)
settings = get_settings()
# Model names that _resolve_model_for_endpoint never coerces on the GCP alert
# lane: a request for one of these is passed through unchanged.
_GCP_LIGHTWEIGHT_MODELS = {
"gemma3:4b",
}
def _normalized_url(value: str | None) -> str:
return (value or "").rstrip("/")
def _is_gcp_alert_lane(endpoint_url: str) -> bool:
    """Return true for the CPU-only GCP-A/B synchronous alert lane.

    The endpoint is compared, ignoring trailing slashes, against the
    ``OLLAMA_URL`` and ``OLLAMA_SECONDARY_URL`` settings.

    Args:
        endpoint_url: Base URL of the Ollama endpoint about to be called.

    Returns:
        ``True`` when the endpoint equals one of the configured lane URLs.
        An empty endpoint never matches.
    """
    endpoint = _normalized_url(endpoint_url)
    if not endpoint:
        # A missing setting normalizes to "", so without this guard an empty
        # endpoint would be classified as the GCP lane whenever either lane
        # URL is simply not configured.
        return False
    gcp_lanes = {
        _normalized_url(getattr(settings, "OLLAMA_URL", "")),
        _normalized_url(getattr(settings, "OLLAMA_SECONDARY_URL", "")),
    }
    return endpoint in gcp_lanes
def _resolve_model_for_endpoint(
    *,
    requested_model: str,
    endpoint_url: str,
    context: dict | None,
) -> str:
    """Choose the model that is allowed to run on ``endpoint_url``.

    Deep incident diagnosis may use the requested model on the GCP alert lane,
    because correctness matters more there than a fast Telegram card. Any
    other workload that lands on that lane with a non-lightweight model is
    switched to the health-check model, so embedding/Hermes/background calls
    cannot occupy the diagnosis lane.

    Args:
        requested_model: Model name asked for by the caller; surrounding
            whitespace is stripped.
        endpoint_url: Ollama endpoint the request will be sent to.
        context: Optional request context. ``allow_gcp_heavy_model`` opts out
            of coercion; ``task_type`` (or ``intent_hint`` as fallback)
            identifies deep-diagnosis calls.

    Returns:
        The stripped requested model, or the configured
        ``OLLAMA_HEALTH_CHECK_MODEL`` (default ``gemma3:4b``) when the request
        is coerced.
    """
    model_name = requested_model.strip()
    ctx = context or {}
    heavy_allowed = bool(ctx.get("allow_gcp_heavy_model"))
    task_type = str(ctx.get("task_type") or ctx.get("intent_hint") or "").lower()
    deep_diagnosis = task_type in {"diagnose", "alert_deep", "incident_diagnosis"}

    # Coerce only on the GCP lane, and only when nothing exempts the request.
    must_coerce = _is_gcp_alert_lane(endpoint_url) and not (
        heavy_allowed or deep_diagnosis or model_name in _GCP_LIGHTWEIGHT_MODELS
    )
    if not must_coerce:
        return model_name

    configured = str(getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "gemma3:4b")).strip()
    fallback_model = configured if configured else "gemma3:4b"
    logger.warning(
        "ollama_gcp_non_diagnosis_model_coerced",
        endpoint=endpoint_url,
        requested_model=model_name,
        safe_model=fallback_model,
        task_type=task_type,
    )
    return fallback_model
class OllamaProvider:
"""
@@ -133,17 +77,11 @@ class OllamaProvider:
client = await self._get_client()
registry = get_model_registry()
endpoint_url = self._endpoint_url()
requested_model = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
model_name = _resolve_model_for_endpoint(
requested_model=requested_model,
endpoint_url=endpoint_url,
context=context,
)
model_name = registry.get_model("ollama", "rca")
options = registry.get_provider_options("ollama")
# P0 2026-04-04 Claude Code: per-task timeout (Option C 分情境)
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS (200s實測 ~173s)
# 其他 → OPENCLAW_TIMEOUT既有設定
task_type = (context or {}).get("task_type", "")
if task_type in ("diagnose", "force_local"):
@@ -151,6 +89,7 @@ class OllamaProvider:
else:
read_timeout = float(settings.OPENCLAW_TIMEOUT)
endpoint_url = self._endpoint_url()
response = await client.post(
f"{endpoint_url}/api/generate",
json={
@@ -173,13 +112,7 @@ class OllamaProvider:
tokens = data.get("eval_count", 0) + data.get("prompt_eval_count", 0)
latency = (time.perf_counter() - start) * 1000
logger.info(
"ollama_provider_success",
response_length=len(result),
tokens=tokens,
latency_ms=round(latency, 1),
model=model_name,
)
logger.info("ollama_provider_success", response_length=len(result), tokens=tokens, latency_ms=round(latency, 1))
return AIResult(
raw_response=result,
success=True,
@@ -225,7 +158,7 @@ class OllamaProvider:
total_tokens = 0
messages: list[dict] = [{"role": "user", "content": prompt}]
registry = get_model_registry()
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
model_name = registry.get_model("ollama", "rca")
options = registry.get_provider_options("ollama")
task_type = (context or {}).get("task_type", "")
if task_type in ("diagnose", "force_local"):
@@ -382,9 +315,9 @@ class OllamaLocalProvider(OllamaProvider):
registry = get_model_registry()
# 嘗試取本地 fallback 專屬設定fallback 到 ollama 預設。
try:
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama_local", "rca")).strip()
model_name = registry.get_model("ollama_local", "rca")
except Exception:
model_name = str((context or {}).get("ollama_model") or getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")).strip()
model_name = getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")
try:
options = registry.get_provider_options("ollama_local")
@@ -425,7 +358,6 @@ class OllamaLocalProvider(OllamaProvider):
tokens=tokens,
latency_ms=round(latency, 1),
endpoint=fallback_url,
model=model_name,
)
return AIResult(
raw_response=result,

View File

@@ -274,13 +274,14 @@ class AIRateLimiter:
try:
from src.core.config import settings
from src.services.telegram_gateway import get_telegram_gateway
target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID
if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id:
logger.warning("telegram_not_configured_for_cost_alert")
return
import httpx
message = (
f"🚨🚨🚨 <b>AI 成本超限警報</b> 🚨🚨🚨\n\n"
f"Provider: <code>{provider.upper()}</code>\n"
@@ -291,15 +292,15 @@ class AIRateLimiter:
f"<code>redis-cli DEL ai_rate:total_cost:{provider}</code>"
)
gateway = get_telegram_gateway()
await gateway._send_request(
"sendMessage",
{
"chat_id": target_chat_id,
"text": message,
"parse_mode": "HTML",
},
)
async with httpx.AsyncClient(timeout=10.0) as client:
await client.post(
f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
json={
"chat_id": target_chat_id,
"text": message,
"parse_mode": "HTML",
},
)
logger.error(
"ai_cost_alert_sent",
@@ -326,12 +327,13 @@ class AIRateLimiter:
try:
from src.core.config import settings
from src.services.telegram_gateway import get_telegram_gateway
target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID
if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id:
return
import httpx
limit = COST_LIMITS[provider]["total_cost_usd"]
remaining = limit - current_cost
@@ -343,15 +345,15 @@ class AIRateLimiter:
f"接近上限,請注意監控!"
)
gateway = get_telegram_gateway()
await gateway._send_request(
"sendMessage",
{
"chat_id": target_chat_id,
"text": message,
"parse_mode": "HTML",
},
)
async with httpx.AsyncClient(timeout=10.0) as client:
await client.post(
f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
json={
"chat_id": target_chat_id,
"text": message,
"parse_mode": "HTML",
},
)
logger.warning(
"ai_cost_warning_sent",

View File

@@ -842,13 +842,14 @@ class AIRouter:
空 dict 代表無資料或查詢失敗(caller 應降級為忽略)。
"""
try:
from src.db.base import get_db_context
from src.db.base import get_session_factory
from src.repositories.aider_event_repository import AiderEventRepository
except ImportError:
return {}
try:
async with get_db_context() as sess:
sf = get_session_factory()
async with sf() as sess:
repo_obj = AiderEventRepository(sess)
stats = await repo_obj.model_stats_since(days=days)
except Exception:
@@ -1089,11 +1090,6 @@ class AIRouterExecutor:
)
and bool(provider_order)
and provider_order[0].startswith("ollama")
) or (
bool(context)
and bool(context.get("enforce_ollama_first"))
and bool(provider_order)
and provider_order[0].startswith("ollama")
)
if (
cached_provider == "ollama"
@@ -1140,10 +1136,6 @@ class AIRouterExecutor:
_lf_trace_ctx = None
errors: list[str] = []
attempted_providers: set[str] = set()
alert_requires_ollama_before_cloud = bool(
(context or {}).get("alert_requires_ollama_before_cloud")
)
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback metric 追蹤
# 透過 context.get("intent_hint") 判斷是否為 DIAGNOSE避免改動 execute() 簽名
@@ -1193,31 +1185,13 @@ class AIRouterExecutor:
errors.append(f"{provider_name}: privacy_skip(non_local)")
continue
if alert_requires_ollama_before_cloud and provider.privacy_level == "cloud":
if "ollama_local" not in attempted_providers:
errors.append(f"{provider_name}: blocked_until_ollama_local_attempted")
logger.warning(
"ai_router_cloud_blocked_until_ollama_local_attempted",
provider=provider_name,
provider_order=provider_order,
attempted_providers=sorted(attempted_providers),
)
continue
# 閘門 1: Circuit Breaker (per-provider, C2 修復)
cb = self._get_circuit_breaker(provider_name)
if cb.is_open():
if alert_requires_ollama_before_cloud and provider_name.startswith("ollama"):
logger.warning(
"ai_router_alert_ollama_circuit_bypassed",
provider=provider_name,
reason="alert_requires_ollama_before_cloud",
)
else:
errors.append(f"{provider_name}: circuit_open")
logger.warning("ai_router_circuit_open", provider=provider_name)
# 2026-04-27 Claude Sonnet 4.6: F6 — circuit_open 不設 _last_attempted_provider未嘗試
continue
errors.append(f"{provider_name}: circuit_open")
logger.warning("ai_router_circuit_open", provider=provider_name)
# 2026-04-27 Claude Sonnet 4.6: F6 — circuit_open 不設 _last_attempted_provider未嘗試
continue
# 閘門 2: Rate Limiter
# 2026-04-02 Claude Code: Phase 24 B3 + C1 修復 — Rate Limiter (含 openclaw_nemo)
@@ -1237,7 +1211,6 @@ class AIRouterExecutor:
sem = self._get_semaphore(provider_name)
async with sem:
try:
attempted_providers.add(provider_name)
result = await provider.analyze(prompt, context)
if result.success:

View File

@@ -28,7 +28,7 @@ from datetime import timedelta
import structlog
from sqlalchemy import func, select, text
from src.db.base import get_db_context
from src.db.base import get_session_factory
from src.db.models import AiGovernanceEvent, AutoRepairExecution, ApprovalRecord
from src.utils.timezone import now_taipei
@@ -127,7 +127,7 @@ class AiSloCalculator:
try:
since = now_taipei() - timedelta(days=SLO_WINDOW_DAYS)
async with get_db_context() as session:
async with get_session_factory()() as session:
slo1 = await self._calc_auto_success_rate(session, since)
slo2 = await self._calc_human_override_rate(session, since)
slo3 = await self._calc_false_neg_rate(session, since)
@@ -210,7 +210,7 @@ class AiSloCalculator:
只在 any_violated=True 時呼叫。不管舊違反是否解決。
"""
try:
async with get_db_context() as session:
async with get_session_factory()() as session:
event = AiGovernanceEvent(
event_type="slo_violation",
details=report.to_dict(),

View File

@@ -1,151 +0,0 @@
"""Alert approval guardrails for AI-generated remediation actions.
This service runs before an Alertmanager-derived action becomes an
ApprovalRecord. It prevents a known failure mode: an LLM invents a kubectl
target that does not belong to the current alert domain, then the approval
pipeline faithfully executes or displays that bad command.
"""
from __future__ import annotations
from dataclasses import dataclass, field
import structlog
from src.services.action_parser import ActionKind, parse_kubectl_action
logger = structlog.get_logger(__name__)
# Namespaces a kubectl action may explicitly target; guard_alert_approval_action
# blocks any other requested namespace.
_ALLOWED_K8S_NAMESPACES = frozenset({"awoooi-prod", "observability", "signoz", "langfuse"})
@dataclass(frozen=True)
class ApprovalActionGuardResult:
"""Guarded action payload returned to approval creation."""
action: str
blocked: bool = False
reason: str | None = None
metadata: dict[str, object] = field(default_factory=dict)
async def guard_alert_approval_action(
    *,
    action: str,
    alert_namespace: str | None,
    alertname: str,
    alert_category: str,
) -> ApprovalActionGuardResult:
    """Screen an AI/rule action before it is persisted as an approval.

    Actions that do not start with ``kubectl`` are returned untouched; their
    own domain-specific gates are expected to handle them. Kubectl actions
    must pass the structured parser, stay inside the allowed namespaces, and
    must not jump to a namespace other than the alert's own (``observability``
    is always tolerated). Workload-like targets are additionally checked for
    existence through the resource resolver.

    Args:
        action: Proposed action text.
        alert_namespace: Namespace of the alert; blank or ``None`` falls back
            to ``awoooi-prod``.
        alertname: Alert name, used for logging.
        alert_category: Alert category, used for logging.

    Returns:
        A result carrying the original action when it is accepted, or a
        blocked placeholder produced by ``_blocked`` when it is rejected. If
        the resource lookup itself fails, the action is accepted with an
        ``action_guard_warning`` entry in the metadata.
    """
    raw_action = (action or "").strip()
    is_kubectl = raw_action.lower().startswith("kubectl")
    if not is_kubectl:
        return ApprovalActionGuardResult(action=action)

    parsed = parse_kubectl_action(raw_action)
    if not parsed.ok:
        return _blocked(raw_action, f"invalid_kubectl:{parsed.reason}", alertname)

    requested_namespace = parsed.namespace
    expected_namespace = (alert_namespace or "awoooi-prod").strip() or "awoooi-prod"

    if requested_namespace:
        if requested_namespace not in _ALLOWED_K8S_NAMESPACES:
            return _blocked(
                raw_action,
                f"namespace_not_allowed:{requested_namespace}",
                alertname,
                expected_namespace=expected_namespace,
            )
        leaves_alert_domain = (
            expected_namespace in _ALLOWED_K8S_NAMESPACES
            and requested_namespace not in (expected_namespace, "observability")
        )
        if leaves_alert_domain:
            return _blocked(
                raw_action,
                f"namespace_mismatch:{requested_namespace}!={expected_namespace}",
                alertname,
                expected_namespace=expected_namespace,
            )

    # Read-only commands are safe enough to display once the namespace is sane.
    # Mutating commands still need resource existence checks to avoid executing
    # hallucinated deployments like "flywheelexecutionratemissing".
    is_plain_read = parsed.kind == ActionKind.READONLY and parsed.verb in {"get", "version"}
    if is_plain_read:
        return ApprovalActionGuardResult(action=action)

    needs_existence_check = parsed.resource_name and parsed.resource_type in {
        "deployment",
        "statefulset",
        "daemonset",
        "pod",
        "service",
    }
    if not needs_existence_check:
        return ApprovalActionGuardResult(action=action)

    try:
        from src.services.resource_resolver import get_resource_resolver

        resolved = await get_resource_resolver().resolve(
            raw_resource=parsed.resource_name,
            namespace=requested_namespace or expected_namespace,
            resource_kind=parsed.resource_type,
        )
        if not resolved.success:
            return _blocked(
                raw_action,
                f"k8s_resource_not_found:{parsed.resource_type}/{parsed.resource_name}",
                alertname,
                expected_namespace=expected_namespace,
                candidates=resolved.candidates,
            )
    except Exception as exc:
        # Best effort: a failing resolver must not stop the approval flow, so
        # the action is let through with a warning marker instead.
        logger.warning(
            "approval_action_resource_guard_unavailable",
            alertname=alertname,
            alert_category=alert_category,
            action=raw_action[:160],
            error=str(exc),
        )
        return ApprovalActionGuardResult(
            action=action,
            metadata={"action_guard_warning": "resource_guard_unavailable"},
        )

    return ApprovalActionGuardResult(action=action)
def _blocked(
    raw_action: str,
    reason: str,
    alertname: str,
    *,
    expected_namespace: str | None = None,
    candidates: list[str] | None = None,
) -> ApprovalActionGuardResult:
    """Log a rejected action and build the matching blocked guard result.

    Args:
        raw_action: The stripped action text that was rejected.
        reason: Machine-readable rejection reason.
        alertname: Alert name, used for logging.
        expected_namespace: Namespace the action was expected to stay in.
        candidates: Alternative resource names to record; ``None`` is stored
            as an empty list.

    Returns:
        A result with ``blocked=True`` whose ``action`` is a ``NO_ACTION``
        placeholder embedding the reason and a truncated copy of the original
        action, and whose metadata records the blocked action details.
    """
    log_fields = {
        "alertname": alertname,
        "reason": reason,
        "action": raw_action[:160],
        "expected_namespace": expected_namespace,
        "candidates": candidates or [],
    }
    logger.warning("approval_action_blocked_before_persist", **log_fields)

    placeholder = f"NO_ACTION - INVALID_TARGET: {reason}; original={raw_action[:180]}"
    details: dict[str, object] = {
        "action_guard": "blocked_before_persist",
        "blocked_action": raw_action[:300],
        "blocked_reason": reason,
        "expected_namespace": expected_namespace,
        "candidates": candidates or [],
    }
    return ApprovalActionGuardResult(
        action=placeholder,
        blocked=True,
        reason=reason,
        metadata=details,
    )

Some files were not shown because too many files have changed in this diff Show More