Compare commits
249 Commits
drift/adop
...
drift/adop
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c88d82f2ac | ||
|
|
395cf742b9 | ||
|
|
72d86ba70b | ||
|
|
a26ccf8d80 | ||
|
|
77ef400598 | ||
|
|
08097f4070 | ||
|
|
32e8a045f4 | ||
|
|
814f5d8c6c | ||
|
|
4f0d677e18 | ||
|
|
5d38115d2f | ||
|
|
200b760512 | ||
|
|
83f4ab0dad | ||
|
|
2df36b11e2 | ||
|
|
1b7f46f02c | ||
|
|
6ae3a55aed | ||
|
|
94e680add4 | ||
|
|
4810125e9a | ||
|
|
3df23112ef | ||
|
|
2ccc9d3071 | ||
|
|
624c1b26c3 | ||
|
|
beba668a4c | ||
|
|
c52ebfc042 | ||
|
|
8b9a974c66 | ||
|
|
f960a4a19b | ||
|
|
9d85ec5e96 | ||
|
|
c00c7be9ae | ||
|
|
336fd76774 | ||
|
|
cd637ef616 | ||
|
|
66e22e26cb | ||
|
|
f10ab71c52 | ||
|
|
d5555697a1 | ||
|
|
3f69e03fcb | ||
|
|
57df3582dd | ||
|
|
14180182d3 | ||
|
|
6ac61ab6d7 | ||
|
|
968de38a94 | ||
|
|
e5fd9395f7 | ||
|
|
251554c044 | ||
|
|
1a1dea00eb | ||
|
|
8485d99336 | ||
|
|
c49246b8c6 | ||
|
|
67c70c071b | ||
|
|
18b34fed31 | ||
|
|
1f4a16e625 | ||
|
|
1a72f771de | ||
|
|
68e741e0c3 | ||
|
|
341c3b6523 | ||
|
|
f046742a4f | ||
|
|
b1167edde7 | ||
|
|
82e9aea057 | ||
|
|
2a8b96cc7f | ||
|
|
328b24de6a | ||
|
|
de4d35e184 | ||
|
|
ecc65be6e1 | ||
|
|
7b98f71393 | ||
|
|
cf0b6be695 | ||
|
|
9365bdab93 | ||
|
|
012cd27b4a | ||
|
|
678d489978 | ||
|
|
c5964fbcd3 | ||
|
|
886657473e | ||
|
|
d2d29185c9 | ||
|
|
7f4f5b24ba | ||
|
|
d2205dc1c0 | ||
|
|
19e721d4af | ||
|
|
9dfecc4d1b | ||
|
|
53994e75f0 | ||
|
|
2e06077337 | ||
|
|
8396d37275 | ||
|
|
150f17b219 | ||
|
|
9a3afa11ed | ||
|
|
edef1aa4c7 | ||
|
|
780a742110 | ||
|
|
a0179cec6e | ||
|
|
ea6b7d8f27 | ||
|
|
dd75a3b943 | ||
|
|
ea5ad040da | ||
|
|
b2f0db0717 | ||
|
|
93c4b62826 | ||
|
|
a132bee1d7 | ||
|
|
d0e98192de | ||
|
|
bcb9397c38 | ||
|
|
1a1ab0df6e | ||
|
|
572e7640cd | ||
|
|
2ece75935e | ||
|
|
2aaaa5654f | ||
|
|
8882301243 | ||
|
|
3aba5c7f9a | ||
|
|
2ef54ccc94 | ||
|
|
d90414ddfa | ||
|
|
a158b77422 | ||
|
|
d79ec4f647 | ||
|
|
ef3b05439a | ||
|
|
0e2e856f12 | ||
|
|
9b0f55fd90 | ||
|
|
7473a01322 | ||
|
|
38b61e290e | ||
|
|
fa0e956c0e | ||
|
|
76aaaf480c | ||
|
|
c1ac157aaf | ||
|
|
73d7e332a4 | ||
|
|
33f85ec8ca | ||
|
|
38a4748e17 | ||
|
|
8f715fd3f2 | ||
|
|
a94435f143 | ||
|
|
a7a9ba996d | ||
|
|
fcf93aac11 | ||
|
|
1d9dbac112 | ||
|
|
4e9981c182 | ||
|
|
7ed8c95409 | ||
|
|
1e68d45659 | ||
|
|
60c00d7a5d | ||
|
|
72811b967e | ||
|
|
927c2a758d | ||
|
|
e5094c5c53 | ||
|
|
154aec849e | ||
|
|
22453161e9 | ||
|
|
d3e1b61096 | ||
|
|
f88a3a846b | ||
|
|
2adbf1e6cd | ||
|
|
6c4f8379ad | ||
|
|
d441f70693 | ||
|
|
033ac8129b | ||
|
|
4111ea4f9f | ||
|
|
578bf3bc7c | ||
|
|
ffd767d4bb | ||
|
|
6e2ab7cedc | ||
|
|
c4f40235f4 | ||
|
|
4753099155 | ||
|
|
eb71bc61ed | ||
|
|
8ae7789e93 | ||
|
|
2c2bf9d665 | ||
|
|
56b4d8165b | ||
|
|
c696b99ccf | ||
|
|
e6eae5cdc4 | ||
|
|
072cc23a42 | ||
|
|
682c0b9995 | ||
|
|
96ad3a18ee | ||
|
|
9ef9633aff | ||
|
|
df5e6c6626 | ||
|
|
d2aebdd477 | ||
|
|
09256be62c | ||
|
|
a4fece11cc | ||
|
|
c2c0b1ec82 | ||
|
|
1d0e80c091 | ||
|
|
3b64d66836 | ||
|
|
5890fffd7f | ||
|
|
eced8617d3 | ||
|
|
587551c1f1 | ||
|
|
a2c4b3d47e | ||
|
|
20ef0c1455 | ||
|
|
cb9551fb00 | ||
|
|
5ed396e390 | ||
|
|
6e96623884 | ||
|
|
87ce02f34d | ||
|
|
0315c2b510 | ||
|
|
2aa31c205a | ||
|
|
23932773ef | ||
|
|
2f50c67f5c | ||
|
|
85d5b5c823 | ||
|
|
25b1923d2e | ||
|
|
e208798531 | ||
|
|
1ba36697ca | ||
|
|
405b8b8ef9 | ||
|
|
1cc215ec30 | ||
|
|
83daeb3f87 | ||
|
|
c4854bb355 | ||
|
|
1dcc6d61dc | ||
|
|
ed7c6946cb | ||
|
|
7baa316224 | ||
|
|
31fd9cbf48 | ||
|
|
e8f279280f | ||
|
|
787acd3bda | ||
|
|
86bd6432ee | ||
|
|
bf847ad045 | ||
|
|
a4e9a04982 | ||
|
|
72a1d33f9d | ||
|
|
bec82127e7 | ||
|
|
8f83773431 | ||
|
|
8495a45002 | ||
|
|
333c8a9cfd | ||
|
|
1baeb7ee61 | ||
|
|
ee5e3bc94f | ||
|
|
7b0a4bce98 | ||
|
|
2221fd3256 | ||
|
|
84a661beaf | ||
|
|
6b93c8f454 | ||
|
|
3a17a860a0 | ||
|
|
6ec5c06bad | ||
|
|
44d8322c4d | ||
|
|
819734f655 | ||
|
|
1cc9de5722 | ||
|
|
96c1ba20da | ||
|
|
855a39ad95 | ||
|
|
209da7ba33 | ||
|
|
d08d1e4951 | ||
|
|
e24c8ea051 | ||
|
|
72d66e4ae6 | ||
|
|
5e625f777d | ||
|
|
df72c77880 | ||
|
|
7d45f0cb58 | ||
|
|
fc1a6196df | ||
|
|
3b73cc7f94 | ||
|
|
96b860dc2c | ||
|
|
2e128f90db | ||
|
|
228768ff68 | ||
|
|
ab0f0a8a62 | ||
|
|
0e14935351 | ||
|
|
a5192d4e03 | ||
|
|
34d1c76be9 | ||
|
|
2b93975d37 | ||
|
|
fe618960a8 | ||
|
|
8e22110030 | ||
|
|
2ff0ef3bb6 | ||
|
|
bb1995f349 | ||
|
|
e8e6748f70 | ||
|
|
a57e3d3d75 | ||
|
|
b00a7b050a | ||
|
|
506744ba3a | ||
|
|
869646459c | ||
|
|
33d4326cce | ||
|
|
b3d412f9eb | ||
|
|
f78b1b0690 | ||
|
|
0ebd0d8a92 | ||
|
|
2e17325c3f | ||
|
|
e22b8e7ab2 | ||
|
|
aa4ccec429 | ||
|
|
3f853accf2 | ||
|
|
d934242846 | ||
|
|
10e665a540 | ||
|
|
40badc42cf | ||
|
|
ec013f662d | ||
|
|
a1b61289f5 | ||
|
|
45f6f17558 | ||
|
|
00bc3b0cc9 | ||
|
|
8629ac709b | ||
|
|
0a90dab1e9 | ||
|
|
855819652e | ||
|
|
f6b698c873 | ||
|
|
72cd79ed8b | ||
|
|
54a4e59af9 | ||
|
|
ccffaa5f3e | ||
|
|
439c432c7c | ||
|
|
898d7b0ff2 | ||
|
|
f2f5148ca6 | ||
|
|
2b2359e367 | ||
|
|
14bf86a462 | ||
|
|
13e51802fe | ||
|
|
b4055c5915 |
@@ -563,21 +563,21 @@
|
||||
"mcp__plugin_playwright_playwright__browser_navigate",
|
||||
"mcp__plugin_playwright_playwright__browser_take_screenshot",
|
||||
"Bash(open \"http://192.168.0.110:3001/wooo/awoooi/actions\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=5\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/166/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=10\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runners\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/admin/runners\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=3\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/169/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/179/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" JOB_ID=180 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=2\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" JOB_ID=181 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/172/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/182/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"2fa33d4e6d8ef1806c18875ed6fec216c8a10e78\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/178\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=5\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/166/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=10\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runners\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/admin/runners\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=3\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/169/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/179/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" JOB_ID=180 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs?limit=2\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" JOB_ID=181 curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/$JOB_ID/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/172/jobs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/jobs/182/logs\" -H \"Authorization: token $TOKEN\")",
|
||||
"Bash(TOKEN=\"REDACTED_GITEA_TOKEN\" curl -s \"http://192.168.0.110:3001/api/v1/repos/wooo/awoooi/actions/runs/178\" -H \"Authorization: token $TOKEN\")",
|
||||
"mcp__plugin_playwright_playwright__browser_snapshot",
|
||||
"mcp__plugin_playwright_playwright__browser_fill_form",
|
||||
"mcp__plugin_playwright_playwright__browser_click",
|
||||
|
||||
@@ -108,7 +108,9 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
|
||||
# worker and its local kubeconfig points at 127.0.0.1:6443.
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
@@ -138,10 +140,10 @@ jobs:
|
||||
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
|
||||
run: |
|
||||
cat k8s/awoooi-dev/02-configmap.yaml | \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
|
||||
@@ -16,8 +16,9 @@ on:
|
||||
# 只有實際影響部署的程式碼才觸發 CD
|
||||
- 'apps/**'
|
||||
- 'k8s/**'
|
||||
- '.gitea/workflows/**'
|
||||
- '.dockerignore'
|
||||
# Workflow-only changes do not rebuild runtime images. Use workflow_dispatch
|
||||
# when an operator explicitly wants to test the CD pipeline itself.
|
||||
# docs/、memory/、ADR 等不觸發
|
||||
# ops/monitoring/alerts-unified.yml 由 deploy-alerts.yaml 獨立處理 (I3)
|
||||
workflow_dispatch:
|
||||
@@ -41,6 +42,15 @@ env:
|
||||
OTEL_SERVICE_NAME: awoooi-cd
|
||||
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
|
||||
CI_IMAGE: 192.168.0.110:5000/awoooi/ci-runner:act-22.04
|
||||
# 2026-05-06 Codex: deploy through the 120 control-plane node. After dirty
|
||||
# reboots, 121 host-key prompts can block the non-interactive host runner.
|
||||
# Both nodes support the sudo kubectl path, but 120 removes the extra hop.
|
||||
K8S_SSH_HOST: 192.168.0.120
|
||||
K8S_API_SERVER: https://192.168.0.120:6443
|
||||
# 2026-05-05 Codex: health/smoke probes use the keepalived VIP instead of a
|
||||
# fixed node. Kubectl still tunnels through K8S_SSH_HOST with --server=120.
|
||||
API_HEALTH_URL: http://192.168.0.125:32334/api/v1/health
|
||||
ALERT_CHAIN_API_URL: http://192.168.0.125:32334
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
@@ -52,6 +62,15 @@ jobs:
|
||||
# 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告
|
||||
# Gitea act runner 的 services: container name 為空,導致 CI 失敗
|
||||
steps:
|
||||
- name: Bootstrap Host Runner Tools
|
||||
# 2026-05-05 Codex: awoooi-host maps to the long-lived act-runner
|
||||
# container. After dirty reboots it may not contain node/curl/git, and
|
||||
# actions/checkout@v4 fails before tests can start.
|
||||
run: |
|
||||
if command -v apk >/dev/null 2>&1; then
|
||||
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# 2026-03-31 ogt: 優化告警格式 - 提高可讀性
|
||||
@@ -150,6 +169,9 @@ jobs:
|
||||
exit $PYTEST_EXIT
|
||||
CI_SCRIPT
|
||||
docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
|
||||
--cpus "2.0" \
|
||||
--memory "2g" \
|
||||
-v "$PWD:/workspace" \
|
||||
-v /tmp/awoooi-api-tests.sh:/tmp/awoooi-api-tests.sh:ro \
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
@@ -213,6 +235,9 @@ jobs:
|
||||
docker rm -f pg-test-b5 || true
|
||||
CI_SCRIPT
|
||||
docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
|
||||
--cpus "2.0" \
|
||||
--memory "2g" \
|
||||
-v "$PWD:/workspace" \
|
||||
-v /tmp/awoooi-b5-tests.sh:/tmp/awoooi-b5-tests.sh:ro \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
@@ -242,6 +267,14 @@ jobs:
|
||||
timeout-minutes: 60
|
||||
runs-on: awoooi-host
|
||||
steps:
|
||||
- name: Bootstrap Host Runner Tools
|
||||
# 2026-05-05 Codex: keep the host-mode runner self-healing before
|
||||
# actions/checkout@v4 and Telegram failure notifications run.
|
||||
run: |
|
||||
if command -v apk >/dev/null 2>&1; then
|
||||
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get Commit Info
|
||||
@@ -267,6 +300,7 @@ jobs:
|
||||
run: |
|
||||
LOCK_NAME="awoooi-cd-docker-build-lock"
|
||||
STALE_SECONDS=7200
|
||||
EMPTY_LOCK_SECONDS=300
|
||||
WAIT_ATTEMPTS=180
|
||||
|
||||
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
|
||||
@@ -287,16 +321,25 @@ jobs:
|
||||
# 修法:sed 去除奈秒 (.NNN...) 和末尾縮寫 (空格+大寫字母),GNU date 才能正確解析
|
||||
CREATED_CLEAN=$(echo "$CREATED_AT" | sed 's/\.[0-9]*//' | sed 's/ [A-Z][A-Z]*$//')
|
||||
CREATED_EPOCH=$(date -d "$CREATED_CLEAN" +%s 2>/dev/null || \
|
||||
python3 -c "
|
||||
import sys, datetime, re
|
||||
ts = re.sub(r'\.\d+', '', sys.argv[1])
|
||||
ts = re.sub(r'\s+[A-Z]{2,4}$', '', ts.strip())
|
||||
print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
" "$CREATED_AT" 2>/dev/null || echo 0)
|
||||
python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \
|
||||
"$CREATED_AT" 2>/dev/null || echo 0)
|
||||
NOW_EPOCH=$(date +%s)
|
||||
LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH))
|
||||
# 2026-05-05 Codex: dirty reboot / cancelled Actions can leave
|
||||
# the Docker-network lock behind with no active build or push.
|
||||
# Waiting the full 30m CD timeout keeps deploys queued even
|
||||
# though no job is protected, so clear empty locks after 5m.
|
||||
ACTIVE_DOCKER_WORK=$(ps -eo args | grep -E 'docker (build|push)|buildx build' | grep -v grep || true)
|
||||
if [ "$CREATED_EPOCH" -gt 0 ] && \
|
||||
[ $((NOW_EPOCH - CREATED_EPOCH)) -gt "$STALE_SECONDS" ]; then
|
||||
echo "⚠️ stale Docker build lock detected (age=$((NOW_EPOCH - CREATED_EPOCH))s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
|
||||
[ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \
|
||||
[ -z "$ACTIVE_DOCKER_WORK" ]; then
|
||||
echo "⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}"
|
||||
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
|
||||
continue
|
||||
fi
|
||||
if [ "$CREATED_EPOCH" -gt 0 ] && \
|
||||
[ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then
|
||||
echo "⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
|
||||
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
|
||||
continue
|
||||
fi
|
||||
@@ -312,8 +355,8 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
# ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
|
||||
# 2026-04-01 ogt: CACHE_BUST=git_sha 確保 src/ 和 models.json 層每次重建
|
||||
# deps 層 (pip install) 仍可 cache → 加速;代碼/配置層強制失效
|
||||
# 首席架構師 Review C1 (2026-04-05 Claude Code): 補 DOCKER_BUILDKIT=1
|
||||
# BUILDKIT_INLINE_CACHE=1 只有在 BuildKit 啟用時才有效
|
||||
# 2026-05-05 Codex: host runner bootstrap installs docker-cli-buildx;
|
||||
# keep BuildKit enabled because the web Dockerfile uses RUN --mount.
|
||||
- name: Build and Push API
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
@@ -335,7 +378,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
# 2026-04-01 Claude Code: CACHE_BUST=git_sha 取代 --no-cache
|
||||
# - deps 層 (pnpm install) 仍可 cache → 節省 ~2-3 min
|
||||
# - COPY . . 以下由 CACHE_BUST 強制失效 → 業務邏輯/CSRF 等變更正確進入 bundle
|
||||
# 2026-04-12 ogt: 實測 --no-cache=10m50s;CACHE_BUST=5m50s,恢復此方案
|
||||
# 2026-05-05 Codex: mirror API build mode; BuildKit required for cache mounts.
|
||||
- name: Build and Push Web
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
@@ -390,6 +433,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
JWT_SECRET: ${{ secrets.JWT_SECRET }}
|
||||
JWT_ALGORITHM: ${{ secrets.JWT_ALGORITHM }}
|
||||
WEBHOOK_HMAC_SECRET: ${{ secrets.WEBHOOK_HMAC_SECRET }}
|
||||
AWOOOP_OPERATOR_API_KEY: ${{ secrets.AWOOOP_OPERATOR_API_KEY }}
|
||||
SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
|
||||
CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
|
||||
# AWOOOI_ 前綴避開 Gitea 保留字(同 AWOOOI_GITEA_WEBHOOK_SECRET 模式)
|
||||
@@ -401,15 +445,17 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
run: |
|
||||
# S1/S2: 統一命名 deploy_key,改用 ssh-keyscan(比 StrictHostKeyChecking=no 更安全)
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
echo "$SSH_PRIVATE_KEY" > "${HOME}/.ssh/deploy_key"
|
||||
chmod 600 "${HOME}/.ssh/deploy_key"
|
||||
ssh-keyscan -T 5 "${{ env.K8S_SSH_HOST }}" > ~/.ssh/known_hosts 2>/dev/null
|
||||
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
|
||||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
|
||||
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}"
|
||||
|
||||
# 注入 Telegram Secrets (ADR-035 鐵律)
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"'$(echo -n "${TG_BOT_TOKEN}" | base64 -w 0)'"},
|
||||
{"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"'$(echo -n "${TG_CHAT_ID}" | base64 -w 0)'"}
|
||||
]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }
|
||||
@@ -418,7 +464,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
# 2026-04-01 Claude Code: base64 -w 0 防止長 key 換行破壞 JSON
|
||||
# NVIDIA NIM (免費 tier)
|
||||
if [ -n "${NVIDIA_API_KEY}" ] && [ "${NVIDIA_API_KEY}" != "" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/NVIDIA_API_KEY","value":"'$(echo -n "${NVIDIA_API_KEY}" | base64 -w 0)'"}
|
||||
]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
|
||||
else
|
||||
@@ -427,7 +473,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# Gemini (備援)
|
||||
if [ -n "${GEMINI_API_KEY}" ] && [ "${GEMINI_API_KEY}" != "" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/GEMINI_API_KEY","value":"'$(echo -n "${GEMINI_API_KEY}" | base64 -w 0)'"}
|
||||
]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
|
||||
else
|
||||
@@ -436,7 +482,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# 2026-04-01 Claude Code: Langfuse LLMOps keys (補齊 CD 注入,之前只有手動設定)
|
||||
if [ -n "${LANGFUSE_PUBLIC_KEY}" ] && [ -n "${LANGFUSE_SECRET_KEY}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"'$(echo -n "${LANGFUSE_PUBLIC_KEY}" | base64 -w 0)'"},
|
||||
{"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"'$(echo -n "${LANGFUSE_SECRET_KEY}" | base64 -w 0)'"}
|
||||
]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
|
||||
@@ -446,14 +492,14 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# 2026-04-02 Claude Code: Telegram Whitelist (授權簽核用戶 ID)
|
||||
if [ -n "${TG_USER_WHITELIST}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"'$(echo -n "${TG_USER_WHITELIST}" | base64 -w 0)'"}
|
||||
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
|
||||
fi
|
||||
|
||||
# Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
|
||||
if [ -n "${SENTRY_AUTH_TOKEN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"'$(echo -n "${SENTRY_AUTH_TOKEN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
|
||||
else
|
||||
@@ -462,7 +508,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# ADR-059 2026-04-05 Claude Code: Gitea Webhook Secret
|
||||
if [ -n "${GITEA_WEBHOOK_SECRET}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/GITEA_WEBHOOK_SECRET","value":"'$(echo -n "${GITEA_WEBHOOK_SECRET}" | base64 -w 0)'"}
|
||||
]' && echo "✅ GITEA_WEBHOOK_SECRET 已注入" || echo "⚠️ GITEA_WEBHOOK_SECRET patch 失敗"
|
||||
else
|
||||
@@ -471,7 +517,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
|
||||
if [ -n "${ARGOCD_API_TOKEN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/ARGOCD_API_TOKEN","value":"'$(echo -n "${ARGOCD_API_TOKEN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ ARGOCD_API_TOKEN 已注入" || echo "⚠️ ARGOCD_API_TOKEN patch 失敗"
|
||||
else
|
||||
@@ -486,7 +532,7 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# DATABASE_URL — PG 應用連線串(2026-04-18 輪替)
|
||||
if [ -n "${DATABASE_URL}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/DATABASE_URL","value":"'$(echo -n "${DATABASE_URL}" | base64 -w 0)'"}
|
||||
]' && echo "✅ DATABASE_URL 已注入" || echo "⚠️ DATABASE_URL patch 失敗"
|
||||
else
|
||||
@@ -495,14 +541,14 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# MIGRATION_DATABASE_URL — CI migration 用 awoooi_migrator 限權帳號(ADR-090-B)
|
||||
if [ -n "${MIGRATION_DATABASE_URL}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/MIGRATION_DATABASE_URL","value":"'$(echo -n "${MIGRATION_DATABASE_URL}" | base64 -w 0)'"}
|
||||
]' && echo "✅ MIGRATION_DATABASE_URL 已注入" || echo "⚠️ MIGRATION_DATABASE_URL patch 失敗"
|
||||
fi
|
||||
|
||||
# REDIS_URL — Redis 連線(6380 on 188)
|
||||
if [ -n "${REDIS_URL}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/REDIS_URL","value":"'$(echo -n "${REDIS_URL}" | base64 -w 0)'"}
|
||||
]' && echo "✅ REDIS_URL 已注入" || echo "⚠️ REDIS_URL patch 失敗"
|
||||
else
|
||||
@@ -511,64 +557,71 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
|
||||
# JWT_SECRET / JWT_ALGORITHM — API 認證
|
||||
if [ -n "${JWT_SECRET}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/JWT_SECRET","value":"'$(echo -n "${JWT_SECRET}" | base64 -w 0)'"}
|
||||
]' && echo "✅ JWT_SECRET 已注入" || echo "⚠️ JWT_SECRET patch 失敗"
|
||||
fi
|
||||
if [ -n "${JWT_ALGORITHM}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/JWT_ALGORITHM","value":"'$(echo -n "${JWT_ALGORITHM}" | base64 -w 0)'"}
|
||||
]' && echo "✅ JWT_ALGORITHM 已注入" || echo "⚠️ JWT_ALGORITHM patch 失敗"
|
||||
fi
|
||||
|
||||
# WEBHOOK_HMAC_SECRET — Alertmanager webhook HMAC 簽章
|
||||
if [ -n "${WEBHOOK_HMAC_SECRET}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/WEBHOOK_HMAC_SECRET","value":"'$(echo -n "${WEBHOOK_HMAC_SECRET}" | base64 -w 0)'"}
|
||||
]' && echo "✅ WEBHOOK_HMAC_SECRET 已注入" || echo "⚠️ WEBHOOK_HMAC_SECRET patch 失敗"
|
||||
fi
|
||||
|
||||
# AWOOOP_OPERATOR_API_KEY — AwoooP Operator mutation endpoints
|
||||
if [ -n "${AWOOOP_OPERATOR_API_KEY}" ]; then
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/AWOOOP_OPERATOR_API_KEY","value":"'$(echo -n "${AWOOOP_OPERATOR_API_KEY}" | base64 -w 0)'"}
|
||||
]' && echo "✅ AWOOOP_OPERATOR_API_KEY 已注入" || echo "⚠️ AWOOOP_OPERATOR_API_KEY patch 失敗"
|
||||
fi
|
||||
|
||||
# SENTRY_DSN — Sentry 錯誤追蹤(不是 auth token)
|
||||
if [ -n "${SENTRY_DSN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/SENTRY_DSN","value":"'$(echo -n "${SENTRY_DSN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ SENTRY_DSN 已注入" || echo "⚠️ SENTRY_DSN patch 失敗"
|
||||
fi
|
||||
|
||||
# CLAUDE_API_KEY — Claude 備援 LLM
|
||||
if [ -n "${CLAUDE_API_KEY}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/CLAUDE_API_KEY","value":"'$(echo -n "${CLAUDE_API_KEY}" | base64 -w 0)'"}
|
||||
]' && echo "✅ CLAUDE_API_KEY 已注入" || echo "⚠️ CLAUDE_API_KEY patch 失敗"
|
||||
fi
|
||||
|
||||
# GITEA_API_TOKEN — Gitea API Token(從 AWOOOI_GITEA_API_TOKEN 映射)
|
||||
if [ -n "${GITEA_API_TOKEN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/GITEA_API_TOKEN","value":"'$(echo -n "${GITEA_API_TOKEN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ GITEA_API_TOKEN 已注入" || echo "⚠️ GITEA_API_TOKEN patch 失敗"
|
||||
fi
|
||||
|
||||
# NEMOTRON_BOT_TOKEN / OPENCLAW_BOT_TOKEN — 多 Bot 架構
|
||||
if [ -n "${NEMOTRON_BOT_TOKEN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/NEMOTRON_BOT_TOKEN","value":"'$(echo -n "${NEMOTRON_BOT_TOKEN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ NEMOTRON_BOT_TOKEN 已注入" || echo "⚠️ NEMOTRON_BOT_TOKEN patch 失敗"
|
||||
fi
|
||||
if [ -n "${OPENCLAW_BOT_TOKEN}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/OPENCLAW_BOT_TOKEN","value":"'$(echo -n "${OPENCLAW_BOT_TOKEN}" | base64 -w 0)'"}
|
||||
]' && echo "✅ OPENCLAW_BOT_TOKEN 已注入" || echo "⚠️ OPENCLAW_BOT_TOKEN patch 失敗"
|
||||
fi
|
||||
|
||||
# SMTP_HOST / SRE_GROUP_CHAT_ID
|
||||
if [ -n "${SMTP_HOST}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/SMTP_HOST","value":"'$(echo -n "${SMTP_HOST}" | base64 -w 0)'"}
|
||||
]' && echo "✅ SMTP_HOST 已注入" || echo "⚠️ SMTP_HOST patch 失敗"
|
||||
fi
|
||||
if [ -n "${SRE_GROUP_CHAT_ID}" ]; then
|
||||
sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||||
{"op":"add","path":"/data/SRE_GROUP_CHAT_ID","value":"'$(echo -n "${SRE_GROUP_CHAT_ID}" | base64 -w 0)'"}
|
||||
]' && echo "✅ SRE_GROUP_CHAT_ID 已注入" || echo "⚠️ SRE_GROUP_CHAT_ID patch 失敗"
|
||||
fi
|
||||
@@ -587,26 +640,27 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
EXPECTED_HOSTS=4
|
||||
PRESENT=0
|
||||
for ip in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
|
||||
if grep -qE "^${ip}[[:space:]]" /tmp/known_hosts_repair 2>/dev/null; then
|
||||
PRESENT=$((PRESENT + 1))
|
||||
if grep -qE "^\${ip}[[:space:]]" /tmp/known_hosts_repair 2>/dev/null; then
|
||||
PRESENT=\$((PRESENT + 1))
|
||||
else
|
||||
echo "⚠️ ssh-keyscan 缺主機 ${ip}"
|
||||
echo "⚠️ ssh-keyscan 缺主機 \${ip}"
|
||||
fi
|
||||
done
|
||||
if [ "$PRESENT" -eq "$EXPECTED_HOSTS" ]; then
|
||||
sudo kubectl create secret generic awoooi-repair-known-hosts \
|
||||
if [ "\$PRESENT" -eq "\$EXPECTED_HOSTS" ]; then
|
||||
\$KUBECTL create secret generic awoooi-repair-known-hosts \
|
||||
-n awoooi-prod \
|
||||
--from-file=known_hosts=/tmp/known_hosts_repair \
|
||||
--dry-run=client -o yaml | sudo kubectl apply -f - \
|
||||
--dry-run=client -o yaml | \$KUBECTL apply -f - \
|
||||
&& echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \
|
||||
|| echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
|
||||
sudo kubectl patch secret ssh-mcp-key -n awoooi-prod --type=merge \
|
||||
-p='{"data":{"known_hosts":"'$(base64 -w 0 /tmp/known_hosts_repair)'"}}' \
|
||||
KNOWN_HOSTS_B64=\$(base64 -w 0 /tmp/known_hosts_repair)
|
||||
\$KUBECTL patch secret ssh-mcp-key -n awoooi-prod --type=merge \
|
||||
-p="{\"data\":{\"known_hosts\":\"\${KNOWN_HOSTS_B64}\"}}" \
|
||||
&& echo "✅ ssh-mcp-key known_hosts 已更新(4 台主機完整)" \
|
||||
|| echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
|
||||
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
|
||||
else
|
||||
echo "❌ ssh-keyscan 只抓到 ${PRESENT}/${EXPECTED_HOSTS} 台主機,跳過 patch(保留現有 secret)"
|
||||
echo "❌ ssh-keyscan 只抓到 \${PRESENT}/\${EXPECTED_HOSTS} 台主機,跳過 patch(保留現有 secret)"
|
||||
cat /tmp/known_hosts_scan_err 2>/dev/null | head -10
|
||||
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
|
||||
fi
|
||||
@@ -629,22 +683,23 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
GITEA_TOKEN: ${{ secrets.CD_PUSH_TOKEN }}
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
echo "$SSH_PRIVATE_KEY" > "${HOME}/.ssh/deploy_key"
|
||||
chmod 600 "${HOME}/.ssh/deploy_key"
|
||||
ssh-keyscan -T 5 "${{ env.K8S_SSH_HOST }}" > ~/.ssh/known_hosts 2>/dev/null
|
||||
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
|
||||
|
||||
IMAGE_TAG="${{ github.sha }}"
|
||||
HARBOR=192.168.0.110:5000
|
||||
|
||||
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ───
|
||||
cat k8s/awoooi-prod/04-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
|
||||
"KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -"
|
||||
echo "✅ ConfigMap 已更新"
|
||||
|
||||
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
|
||||
"KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -"
|
||||
echo "✅ Service Registry ConfigMap 已更新"
|
||||
|
||||
# ─── Step 2: 更新 kustomization.yaml image tag ───
|
||||
@@ -685,23 +740,24 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
}
|
||||
|
||||
# ─── Step 4: 等待 ArgoCD sync + rollout ───
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
|
||||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
|
||||
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
|
||||
|
||||
# 等待 ArgoCD Application Synced(最多 180s)。只看
|
||||
# Synced/Healthy 可能誤判成上一個 revision 已同步,因此有
|
||||
# deploy commit 時必須同時確認 status.sync.revision。
|
||||
echo "⏳ 等待 ArgoCD sync..."
|
||||
sudo kubectl annotate application awoooi-prod -n argocd \
|
||||
$KUBECTL annotate application awoooi-prod -n argocd \
|
||||
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
|
||||
for i in $(seq 1 36); do
|
||||
SYNC=$(sudo kubectl get application awoooi-prod -n argocd \
|
||||
SYNC=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
|
||||
HEALTH=$(sudo kubectl get application awoooi-prod -n argocd \
|
||||
HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
|
||||
REVISION=$(sudo kubectl get application awoooi-prod -n argocd \
|
||||
REVISION=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
|
||||
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
|
||||
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
|
||||
@@ -720,15 +776,15 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
done
|
||||
|
||||
# 確認 rollout 完成
|
||||
sudo kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
|
||||
sudo kubectl rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
|
||||
sudo kubectl rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
|
||||
$KUBECTL rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
|
||||
$KUBECTL rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
|
||||
$KUBECTL rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
|
||||
echo "✅ 部署完成"
|
||||
|
||||
# Health Check
|
||||
HEALTH_PASS=0
|
||||
for i in 1 2 3; do
|
||||
HTTP_CODE=$(curl -s -w "%{http_code}" -o /dev/null --connect-timeout 10 "http://localhost:32334/api/v1/health")
|
||||
HTTP_CODE=$(curl -s -w "%{http_code}" -o /dev/null --connect-timeout 10 "${{ env.API_HEALTH_URL }}")
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
echo "✅ API 健康檢查通過"
|
||||
HEALTH_PASS=1
|
||||
@@ -752,26 +808,48 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
SSH_KEY_188: ${{ secrets.DEPLOY_SSH_KEY_188 }}
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_KEY_188" > ~/.ssh/deploy_key_188
|
||||
chmod 600 ~/.ssh/deploy_key_188
|
||||
ssh-keyscan 192.168.0.188 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
echo "$SSH_KEY_188" > "${HOME}/.ssh/deploy_key_188"
|
||||
chmod 600 "${HOME}/.ssh/deploy_key_188"
|
||||
timeout -k 5s 10s ssh-keyscan 192.168.0.188 >> ~/.ssh/known_hosts 2>/dev/null \
|
||||
|| echo "⚠️ 188 host key scan 失敗,改用 StrictHostKeyChecking=accept-new"
|
||||
SSH_188_COMMON_OPTS=(
|
||||
-i "${HOME}/.ssh/deploy_key_188"
|
||||
-o BatchMode=yes
|
||||
-o StrictHostKeyChecking=accept-new
|
||||
-o ConnectTimeout=10
|
||||
-o ServerAliveInterval=10
|
||||
-o ServerAliveCountMax=3
|
||||
-o LogLevel=ERROR
|
||||
)
|
||||
SSH_188_OPTS=(
|
||||
"${SSH_188_COMMON_OPTS[@]}"
|
||||
-n
|
||||
)
|
||||
# scp 不支援 ssh 的 -n 參數,避免 188 ops 腳本同步被參數解析擋下。
|
||||
SCP_188_OPTS=(
|
||||
"${SSH_188_COMMON_OPTS[@]}"
|
||||
)
|
||||
|
||||
timeout -k 5s 30s ssh "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
|
||||
"mkdir -p ~/awoooi-ops" \
|
||||
|| echo "⚠️ 188 ops 目錄確認失敗"
|
||||
|
||||
# 同步 docker-health-monitor.sh
|
||||
scp -i ~/.ssh/deploy_key_188 \
|
||||
timeout -k 5s 60s scp "${SCP_188_OPTS[@]}" \
|
||||
scripts/ops/docker-health-monitor.sh \
|
||||
ollama@192.168.0.188:~/awoooi-ops/docker-health-monitor.sh \
|
||||
&& echo "✅ docker-health-monitor.sh 已同步" \
|
||||
|| echo "⚠️ docker-health-monitor.sh 同步失敗"
|
||||
|
||||
# 同步 pg-backup.sh
|
||||
scp -i ~/.ssh/deploy_key_188 \
|
||||
timeout -k 5s 60s scp "${SCP_188_OPTS[@]}" \
|
||||
scripts/ops/pg-backup.sh \
|
||||
ollama@192.168.0.188:~/awoooi-ops/pg-backup.sh \
|
||||
&& echo "✅ pg-backup.sh 已同步" \
|
||||
|| echo "⚠️ pg-backup.sh 同步失敗"
|
||||
|
||||
# 確保執行權限
|
||||
ssh -i ~/.ssh/deploy_key_188 ollama@192.168.0.188 \
|
||||
timeout -k 5s 30s ssh "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
|
||||
"chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \
|
||||
|| echo "⚠️ 權限設定失敗"
|
||||
|
||||
@@ -795,6 +873,14 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
# install-deps can also kill the act-managed job container with RWLayer=nil.
|
||||
runs-on: awoooi-host
|
||||
steps:
|
||||
- name: Bootstrap Host Runner Tools
|
||||
# 2026-05-05 Codex: post-deploy also uses checkout and curl-based
|
||||
# notifications, so it needs the same runner bootstrap as earlier jobs.
|
||||
run: |
|
||||
if command -v apk >/dev/null 2>&1; then
|
||||
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get Commit Info
|
||||
@@ -811,14 +897,17 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
- name: Alert Chain Smoke Test
|
||||
id: alert_chain_smoke
|
||||
run: |
|
||||
# 2026-04-05 Claude Code: 使用真實 API 地址(192.168.0.121:32334 NodePort)
|
||||
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
|
||||
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
|
||||
if docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
|
||||
--cpus "1.0" \
|
||||
--memory "1g" \
|
||||
-v "$PWD:/workspace" \
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
-w /workspace \
|
||||
"${{ env.CI_IMAGE }}" \
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then
|
||||
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
|
||||
@@ -830,6 +919,9 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
id: monitoring_coverage
|
||||
run: |
|
||||
if docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-coverage" \
|
||||
--cpus "1.0" \
|
||||
--memory "1g" \
|
||||
-v "$PWD:/workspace" \
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
-w /workspace \
|
||||
@@ -898,6 +990,9 @@ print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))
|
||||
touch "$SMOKE_OUTPUT"
|
||||
chmod 666 "$SMOKE_OUTPUT"
|
||||
docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-e2e-smoke" \
|
||||
--cpus "1.5" \
|
||||
--memory "2g" \
|
||||
-v "$PWD:/workspace" \
|
||||
-v /tmp/awoooi-smoke.sh:/tmp/awoooi-smoke.sh:ro \
|
||||
-v awoooi-pnpm-store:/opt/pnpm-store \
|
||||
|
||||
@@ -6,6 +6,7 @@ on:
|
||||
paths:
|
||||
- 'apps/**'
|
||||
- 'k8s/**'
|
||||
- '!k8s/awoooi-prod/kustomization.yaml'
|
||||
- 'ops/**'
|
||||
- 'scripts/**'
|
||||
- '.gitea/workflows/**'
|
||||
@@ -29,8 +30,26 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 50
|
||||
|
||||
- name: Skip Stale Main Push
|
||||
id: stale
|
||||
run: |
|
||||
set -euo pipefail
|
||||
BRANCH="${GITHUB_REF_NAME:-${GITHUB_REF#refs/heads/}}"
|
||||
if [ "${GITHUB_EVENT_NAME:-}" != "push" ] || [ "$BRANCH" != "main" ]; then
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
LATEST="$(git ls-remote origin refs/heads/main | awk '{print $1}')"
|
||||
if [ -n "$LATEST" ] && [ "$LATEST" != "$GITHUB_SHA" ]; then
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
echo "Skip stale code review: current=$GITHUB_SHA latest=$LATEST"
|
||||
else
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Prepare Review Context
|
||||
id: ctx
|
||||
if: steps.stale.outputs.skip != 'true'
|
||||
env:
|
||||
BASE_SHA: ${{ github.event.before }}
|
||||
run: |
|
||||
@@ -81,6 +100,7 @@ jobs:
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Notify Code Review Start
|
||||
if: steps.stale.outputs.skip != 'true'
|
||||
env:
|
||||
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
|
||||
TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }}
|
||||
@@ -104,6 +124,7 @@ jobs:
|
||||
>/dev/null
|
||||
|
||||
- name: Run Deterministic Review
|
||||
if: steps.stale.outputs.skip != 'true'
|
||||
env:
|
||||
BASE_SHA: ${{ steps.ctx.outputs.base_sha }}
|
||||
run: |
|
||||
@@ -116,7 +137,7 @@ jobs:
|
||||
jq . /tmp/code-review-report.json
|
||||
|
||||
- name: Notify Code Review Completion
|
||||
if: always()
|
||||
if: always() && steps.stale.outputs.skip != 'true'
|
||||
env:
|
||||
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
|
||||
TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }}
|
||||
|
||||
@@ -17,6 +17,7 @@ on:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'apps/api/migrations/*.sql'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
TELEGRAM_ALERT_CHAT_ID: "-1003711974679"
|
||||
@@ -66,8 +67,11 @@ jobs:
|
||||
- name: Apply new migrations
|
||||
if: steps.diff.outputs.new_files != ''
|
||||
env:
|
||||
# 從 Gitea secrets 取,不直接明碼
|
||||
# 從 Gitea secrets 取,不直接明碼輸出。
|
||||
# MIGRATION_DATABASE_URL 是限權帳號;DATABASE_URL 只在 PostgreSQL
|
||||
# 明確回報「必須是 table owner」時作為受控 fallback。
|
||||
PGURL: ${{ secrets.MIGRATION_DATABASE_URL }}
|
||||
OWNER_PGURL: ${{ secrets.DATABASE_URL }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -z "$PGURL" ]; then
|
||||
@@ -75,15 +79,37 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
|
||||
OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
|
||||
|
||||
apply_migration() {
|
||||
local url="$1"
|
||||
local file="$2"
|
||||
psql "$url" \
|
||||
-v ON_ERROR_STOP=1 \
|
||||
--single-transaction \
|
||||
-f "$file"
|
||||
}
|
||||
|
||||
# 套用每個新檔 (single transaction per file)
|
||||
echo "${{ steps.diff.outputs.new_files }}" | while IFS= read -r file; do
|
||||
[ -z "$file" ] && continue
|
||||
echo "=== Applying: $file ==="
|
||||
psql "$PGURL_PSQL" \
|
||||
-v ON_ERROR_STOP=1 \
|
||||
--single-transaction \
|
||||
-f "$file"
|
||||
migration_err="$(mktemp)"
|
||||
if ! apply_migration "$PGURL_PSQL" "$file" 2>"$migration_err"; then
|
||||
if grep -q "must be owner of table" "$migration_err"; then
|
||||
if [ -z "$OWNER_PGURL_PSQL" ]; then
|
||||
cat "$migration_err" >&2
|
||||
echo "::error::migration requires table owner but DATABASE_URL secret is not set"
|
||||
exit 1
|
||||
fi
|
||||
echo "::warning::migration requires table owner; retrying with owner connection"
|
||||
apply_migration "$OWNER_PGURL_PSQL" "$file"
|
||||
else
|
||||
cat "$migration_err" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
rm -f "$migration_err"
|
||||
echo "=== OK: $file ==="
|
||||
done
|
||||
|
||||
@@ -91,10 +117,24 @@ jobs:
|
||||
if: steps.diff.outputs.new_files != ''
|
||||
env:
|
||||
PGURL: ${{ secrets.MIGRATION_DATABASE_URL }}
|
||||
OWNER_PGURL: ${{ secrets.DATABASE_URL }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -z "$PGURL" ]; then
|
||||
echo "::error::MIGRATION_DATABASE_URL secret not set in Gitea"
|
||||
exit 1
|
||||
fi
|
||||
PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
|
||||
OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}"
|
||||
FILES_JSON=$(echo "${{ steps.diff.outputs.new_files }}" | jq -Rn '[inputs | select(length > 0)]')
|
||||
psql "$PGURL_PSQL" -c "
|
||||
|
||||
seed_audit() {
|
||||
local url="$1"
|
||||
psql "$url" \
|
||||
-v ON_ERROR_STOP=1 \
|
||||
-v commit_sha="${{ github.sha }}" \
|
||||
-v files_json="$FILES_JSON" \
|
||||
-c "
|
||||
INSERT INTO asset_discovery_run (
|
||||
run_id, triggered_by, scope, scan_depth, status,
|
||||
started_at, ended_at, tools_used, summary
|
||||
@@ -109,11 +149,29 @@ jobs:
|
||||
'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb,
|
||||
jsonb_build_object(
|
||||
'type', 'ci_migration',
|
||||
'commit_sha', '${{ github.sha }}',
|
||||
'files', $FILES_JSON
|
||||
'commit_sha', :'commit_sha',
|
||||
'files', :'files_json'::jsonb
|
||||
)
|
||||
);
|
||||
"
|
||||
}
|
||||
|
||||
audit_err="$(mktemp)"
|
||||
if ! seed_audit "$PGURL_PSQL" 2>"$audit_err"; then
|
||||
if grep -q "permission denied for table asset_discovery_run" "$audit_err"; then
|
||||
if [ -z "$OWNER_PGURL_PSQL" ]; then
|
||||
cat "$audit_err" >&2
|
||||
echo "::error::audit requires table insert privilege but DATABASE_URL secret is not set"
|
||||
exit 1
|
||||
fi
|
||||
echo "::warning::audit requires owner connection; retrying with owner connection"
|
||||
seed_audit "$OWNER_PGURL_PSQL"
|
||||
else
|
||||
cat "$audit_err" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
rm -f "$audit_err"
|
||||
|
||||
- name: Notify Telegram (if configured)
|
||||
if: always()
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -92,3 +92,4 @@ tsconfig.tsbuildinfo
|
||||
.aider*
|
||||
!.aiderignore
|
||||
.claude/settings.local.json
|
||||
.claude/settings.json
|
||||
|
||||
@@ -31,6 +31,9 @@
|
||||
|
||||
## 🔴 絕對禁止 → [HARD_RULES.md](docs/HARD_RULES.md)
|
||||
|
||||
## 🔴 文件語言鐵律 → [文件語言規範](docs/HARD_RULES.md#文件語言規範)
|
||||
Markdown、ADR、LOGBOOK、Runbook、交接文件與計畫文件一律使用繁體中文;程式符號、API、指令、錯誤碼、服務名稱與原始 log 可保留英文。
|
||||
|
||||
## 🔴 紅區治理 → [RED_ZONES.md](docs/RED_ZONES.md)
|
||||
Tier 3 核心檔案 (decision_manager, trust_engine, config 等) 修改需首席架構師授權
|
||||
|
||||
|
||||
@@ -60,6 +60,9 @@ COPY k8s/ ./k8s/
|
||||
# 2026-04-10 Claude Sonnet 4.6: RAG 知識庫索引來源 (ADR-067 Phase 33)
|
||||
COPY docs/ ./docs/
|
||||
COPY .agents/skills/ ./.agents/skills/
|
||||
# 2026-05-04 Claude Sonnet 4.6 (Task 1.2): hermes agent_loader 的 system prompt 來源
|
||||
# agent_loader.py 預設讀 /app/.claude/agents/,對應 K8s AGENTS_DIR 環境變數
|
||||
COPY .claude/agents/ ./.claude/agents/
|
||||
# 2026-04-12 ogt (ADR-073 P2-1): CronJob 腳本 — 獨立腳本取代 inline Python
|
||||
COPY scripts/ ./scripts/
|
||||
|
||||
|
||||
@@ -163,6 +163,68 @@ rules:
|
||||
responsibility: INFRA
|
||||
reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。"
|
||||
|
||||
# 2026-05-05 ogt + Codex: 110/188 長時間過載事故後補 Docker Compose 過載與 restart spike 路由。
|
||||
# 原則:過載與重啟暴增只能先診斷,禁止通用 docker restart;由 LLM + Playbook trust 決定 service-specific 修復。
|
||||
- id: docker_baseline_overload_alert
|
||||
priority: 44
|
||||
description: Docker Compose 服務過載 / restart spike 基線告警(cadvisor + textfile exporter)
|
||||
match:
|
||||
alertname:
|
||||
- HostLoadAverageSustainedHigh
|
||||
- DockerContainerCpuSustainedHigh
|
||||
- DockerContainerCpuRunawayCritical
|
||||
- DockerContainerMemoryLimitPressure
|
||||
- DockerContainerMissingResourceLimit
|
||||
- DockerContainerRestartSpike
|
||||
- DockerGiteaActionsJobStale
|
||||
response:
|
||||
action_title: "🔍 Docker/Host 過載自動診斷 — 禁止通用重啟"
|
||||
description: "110/188 Docker Compose 或主機 load 長時間偏離 baseline。AI 需先收集容器 CPU、restart、logs、ClickHouse/Kafka/爬蟲狀態,再選擇限流、降併發或服務專屬 playbook。"
|
||||
suggested_action: SSH_DIAGNOSE
|
||||
kubectl_command: "ssh {host} 'echo \"=== LOAD ===\"; uptime; echo \"=== TOP ===\"; ps aux --sort=-%cpu | head -20; echo \"=== DOCKER ===\"; docker stats --no-stream | head -40'"
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
responsibility_reasoning: "Docker Compose / bare-metal 過載屬主機與平台資源治理,不能交給 K8s restart 處理"
|
||||
secondary_teams: [BE, SRE]
|
||||
optimization:
|
||||
- type: BASELINE_CHECK
|
||||
description: "比較 load5/core、單容器 CPU core、restart spike 與 24h 動態基線"
|
||||
command: "Prometheus query: node_load5/core + rate(container_cpu_usage_seconds_total[5m]) + increase(docker_container_restart_count[15m])"
|
||||
- type: SERVICE_SPECIFIC_REPAIR
|
||||
description: "依服務選擇專屬修復:ClickHouse 降 merge / scheduler 限 concurrency / litellm 修 health 或路由 / exporter 降 collector"
|
||||
command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook"
|
||||
reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流,禁止通用 docker restart;修復必須服務專屬且可回寫 Playbook trust。"
|
||||
|
||||
# 2026-05-05 ogt + Codex: 110 self-hosted runner 是 systemd service,不在 Docker/cAdvisor 覆蓋內。
|
||||
# 原則:AI 可自動診斷 watchdog/quota/restart storm;套用 systemd drop-in 需要 sudo,必須走人工批准或 sudo playbook。
|
||||
- id: systemd_runner_baseline_alert
|
||||
priority: 43
|
||||
description: 110 self-hosted runner systemd watchdog / restart / quota 基線告警
|
||||
match:
|
||||
alertname:
|
||||
- SystemdRunnerRestartSpike
|
||||
- SystemdRunnerWatchdogEnabled
|
||||
- SystemdRunnerMissingResourceQuota
|
||||
response:
|
||||
action_title: "🔍 Systemd Runner 基線診斷 — 需要 sudo 才可修復"
|
||||
description: "110 self-hosted runner 發生 watchdog/restart storm 或缺 CPU/Memory quota。這會讓 CI 與 Sentry/ClickHouse/Gitea 搶主機資源,且 Docker/cAdvisor 看不到。"
|
||||
suggested_action: SSH_DIAGNOSE
|
||||
kubectl_command: "ssh {host} 'systemctl show {unit} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState; journalctl -u {unit} --since \"20 minutes ago\" --no-pager | tail -120'"
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
responsibility_reasoning: "self-hosted runner 是 bare-metal systemd 資源治理,非 K8s 或 Docker workload"
|
||||
secondary_teams: [SRE]
|
||||
optimization:
|
||||
- type: SYSTEMD_GUARDRAIL
|
||||
description: "人工批准後停用錯誤 watchdog drop-in,並為 runner 加 CPUQuota=200%、MemoryMax=2G"
|
||||
command: "sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply"
|
||||
- type: CI_CAPACITY
|
||||
description: "若 110 同時承載 Sentry/ClickHouse/Gitea,不應讓多個 runner 無限制並行"
|
||||
command: "檢查 active jobs、runner 數量與 Gitea Actions concurrency,必要時分流 runner"
|
||||
reasoning: "[規則匹配] systemd runner 過載先 read-only 診斷;改 systemd drop-in 需 sudo 與人工批准,避免 AI 擅自改 host unit。"
|
||||
|
||||
- id: high_cpu
|
||||
priority: 40
|
||||
description: K8s Pod/Deployment CPU 使用率過高
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
-- ADR-090 capacity_violation_event metric violation types
|
||||
-- 日期:2026-05-07(台北)
|
||||
-- 目的:讓 capacity_scanner_job.py 寫入的 cpu/mem/swap 細項違規符合 DB constraint。
|
||||
--
|
||||
-- 背景:
|
||||
-- capacity_scanner_job.py 會寫入:
|
||||
-- - cpu_over_threshold
|
||||
-- - mem_over_threshold
|
||||
-- - swap_over_threshold
|
||||
-- 但原始 ADR-090 DDL 只允許較粗的 host_saturation,導致 production 出現
|
||||
-- capacity_violation_event_type_valid check violation,容量治理事件漏記。
|
||||
|
||||
BEGIN;
|
||||
|
||||
ALTER TABLE capacity_violation_event
|
||||
DROP CONSTRAINT IF EXISTS capacity_violation_event_type_valid;
|
||||
|
||||
ALTER TABLE capacity_violation_event
|
||||
ADD CONSTRAINT capacity_violation_event_type_valid
|
||||
CHECK (violation_type IN (
|
||||
'no_limit_set',
|
||||
'over_request',
|
||||
'over_limit',
|
||||
'host_saturation',
|
||||
'over_sla_budget',
|
||||
'unauthorized_new_deploy',
|
||||
'cpu_over_threshold',
|
||||
'mem_over_threshold',
|
||||
'swap_over_threshold',
|
||||
'load_over_threshold'
|
||||
));
|
||||
|
||||
COMMIT;
|
||||
|
||||
-- Rollback(需人工確認後執行):
|
||||
-- BEGIN;
|
||||
-- ALTER TABLE capacity_violation_event
|
||||
-- DROP CONSTRAINT IF EXISTS capacity_violation_event_type_valid;
|
||||
-- ALTER TABLE capacity_violation_event
|
||||
-- ADD CONSTRAINT capacity_violation_event_type_valid
|
||||
-- CHECK (violation_type IN (
|
||||
-- 'no_limit_set',
|
||||
-- 'over_request',
|
||||
-- 'over_limit',
|
||||
-- 'host_saturation',
|
||||
-- 'over_sla_budget',
|
||||
-- 'unauthorized_new_deploy'
|
||||
-- ));
|
||||
-- COMMIT;
|
||||
271
apps/api/migrations/awooop_phase1_batch1_rls_2026-05-04.sql
Normal file
271
apps/api/migrations/awooop_phase1_batch1_rls_2026-05-04.sql
Normal file
@@ -0,0 +1,271 @@
|
||||
-- AwoooP Phase 1 Batch 1: 現有四表加 project_id + RLS
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6(ADR-118 Batch 1,C-3/C-4 db-expert 修正版)
|
||||
-- 2026-05-04 critic 修正版:ADD CONSTRAINT IF NOT EXISTS 不存在於 PG → 改用 DO 塊檢查 pg_constraint
|
||||
--
|
||||
-- 對象:incidents / knowledge_entries / playbooks / audit_logs
|
||||
-- 這四張表是高頻寫入表,採「三步式 migration」避免長時間鎖表:
|
||||
--
|
||||
-- Step A: ADD COLUMN nullable(metadata-only,瞬間)
|
||||
-- Step B: 分批回填(每批 5000 筆,外部腳本呼叫)
|
||||
-- Step C: NOT VALID CHECK → VALIDATE(SHARE UPDATE EXCLUSIVE,不擋讀寫)
|
||||
-- → SET NOT NULL(PG 12+ 利用已驗證 check,不掃表)
|
||||
-- → SET DEFAULT 'awoooi'
|
||||
--
|
||||
-- ⚠️ 執行前必確認:
|
||||
-- 1. awooop_phase1_control_plane_2026-05-04.sql 已執行(awooop_projects 表存在)
|
||||
-- 2. apps/api 已 deploy 「SET LOCAL app.project_id」版本,rollout 100%
|
||||
-- 3. 31 個 background loop 改用 awooop_platform_admin role(PR-10)
|
||||
-- 4. 量測各表體量(見下方 pre-migration check query)
|
||||
--
|
||||
-- Pre-migration check:
|
||||
-- SELECT relname, n_live_tup, pg_size_pretty(pg_total_relation_size(oid))
|
||||
-- FROM pg_class
|
||||
-- WHERE relname IN ('incidents','knowledge_entries','playbooks','audit_logs');
|
||||
--
|
||||
-- 分批回填腳本:
|
||||
-- apps/api/scripts/awooop_phase1_batch1_backfill.py(另行提供)
|
||||
--
|
||||
-- ⚠️ RLS 是 fail-closed:
|
||||
-- SET LOCAL app.project_id 未設 → 讀不到任何資料(C-4 修正)
|
||||
-- WITH CHECK 防止 INSERT 寫入錯誤 tenant
|
||||
--
|
||||
-- 回滾路徑:
|
||||
-- (依序執行:先 DROP POLICY,再 DISABLE ROW LEVEL SECURITY,最後 DROP COLUMN)
|
||||
-- DROP POLICY IF EXISTS incidents_tenant_isolation ON incidents;
|
||||
-- DROP POLICY IF EXISTS knowledge_entries_tenant_isolation ON knowledge_entries;
|
||||
-- DROP POLICY IF EXISTS playbooks_tenant_isolation ON playbooks;
|
||||
-- DROP POLICY IF EXISTS audit_logs_tenant_isolation ON audit_logs;
|
||||
-- ALTER TABLE incidents DISABLE ROW LEVEL SECURITY;
|
||||
-- ALTER TABLE knowledge_entries DISABLE ROW LEVEL SECURITY;
|
||||
-- ALTER TABLE playbooks DISABLE ROW LEVEL SECURITY;
|
||||
-- ALTER TABLE audit_logs DISABLE ROW LEVEL SECURITY;
|
||||
-- ALTER TABLE incidents DROP COLUMN IF EXISTS project_id;
|
||||
-- ALTER TABLE knowledge_entries DROP COLUMN IF EXISTS project_id;
|
||||
-- ALTER TABLE playbooks DROP COLUMN IF EXISTS project_id;
|
||||
-- ALTER TABLE audit_logs DROP COLUMN IF EXISTS project_id;
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- STEP A: ADD COLUMN(nullable,瞬間取鎖,不重寫表)
|
||||
-- ===========================
|
||||
-- 一次只做 ADD COLUMN,讓 AccessExclusiveLock 最短
|
||||
|
||||
-- Step A (incidents): add the nullable project_id column only when it is missing.
-- The existence check resolves the table through regclass, i.e. the same
-- search_path lookup ALTER TABLE itself uses (and the same style as the Step C
-- pg_constraint checks). Filtering information_schema.columns by table_name
-- alone would also match a same-named table in any other visible schema and
-- could wrongly skip the ALTER.
-- When the column already exists no ALTER TABLE is issued at all.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_attribute
        WHERE attrelid = 'incidents'::regclass
          AND attname = 'project_id'
          AND NOT attisdropped
    ) THEN
        ALTER TABLE incidents ADD COLUMN project_id VARCHAR(64);
    END IF;
END $$;
|
||||
|
||||
-- Step A (knowledge_entries): add the nullable project_id column only when it
-- is missing. The existence check resolves the table through regclass, i.e.
-- the same search_path lookup ALTER TABLE itself uses (and the same style as
-- the Step C pg_constraint checks). Filtering information_schema.columns by
-- table_name alone would also match a same-named table in any other visible
-- schema and could wrongly skip the ALTER.
-- When the column already exists no ALTER TABLE is issued at all.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_attribute
        WHERE attrelid = 'knowledge_entries'::regclass
          AND attname = 'project_id'
          AND NOT attisdropped
    ) THEN
        ALTER TABLE knowledge_entries ADD COLUMN project_id VARCHAR(64);
    END IF;
END $$;
|
||||
|
||||
-- Step A (playbooks): add the nullable project_id column only when it is
-- missing. The existence check resolves the table through regclass, i.e. the
-- same search_path lookup ALTER TABLE itself uses (and the same style as the
-- Step C pg_constraint checks). Filtering information_schema.columns by
-- table_name alone would also match a same-named table in any other visible
-- schema and could wrongly skip the ALTER.
-- When the column already exists no ALTER TABLE is issued at all.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_attribute
        WHERE attrelid = 'playbooks'::regclass
          AND attname = 'project_id'
          AND NOT attisdropped
    ) THEN
        ALTER TABLE playbooks ADD COLUMN project_id VARCHAR(64);
    END IF;
END $$;
|
||||
|
||||
-- Step A (audit_logs): add the nullable project_id column only when it is
-- missing. The existence check resolves the table through regclass, i.e. the
-- same search_path lookup ALTER TABLE itself uses (and the same style as the
-- Step C pg_constraint checks). Filtering information_schema.columns by
-- table_name alone would also match a same-named table in any other visible
-- schema and could wrongly skip the ALTER.
-- When the column already exists no ALTER TABLE is issued at all.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_attribute
        WHERE attrelid = 'audit_logs'::regclass
          AND attname = 'project_id'
          AND NOT attisdropped
    ) THEN
        ALTER TABLE audit_logs ADD COLUMN project_id VARCHAR(64);
    END IF;
END $$;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- STEP B: 分批回填(外部腳本)
|
||||
-- ===========================
|
||||
-- 此步驟由 apps/api/scripts/awooop_phase1_batch1_backfill.py 執行
|
||||
-- 每批以子查詢限制 5000 筆(PostgreSQL 的 UPDATE 不支援 LIMIT):UPDATE ... WHERE <pk> IN (SELECT <pk> ... WHERE project_id IS NULL LIMIT 5000)
|
||||
-- 完成條件:SELECT count(*) FROM incidents WHERE project_id IS NULL; → 0
|
||||
--
|
||||
-- 快速驗證(執行此 SQL 前必須確認回填完成):
|
||||
-- SELECT
|
||||
-- 'incidents' as tbl, count(*) as null_count FROM incidents WHERE project_id IS NULL
|
||||
-- UNION ALL SELECT 'knowledge_entries', count(*) FROM knowledge_entries WHERE project_id IS NULL
|
||||
-- UNION ALL SELECT 'playbooks', count(*) FROM playbooks WHERE project_id IS NULL
|
||||
-- UNION ALL SELECT 'audit_logs', count(*) FROM audit_logs WHERE project_id IS NULL;
|
||||
-- 所有 null_count 必須為 0,否則停止。
|
||||
--
|
||||
-- ⚠️ 回填完成確認後才可繼續執行 Step C
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- STEP C: NOT NULL 強制 + DEFAULT + Index + RLS
|
||||
-- ===========================
|
||||
-- PostgreSQL 12+:NOT VALID CHECK → VALIDATE → SET NOT NULL
|
||||
-- VALIDATE 只取 SHARE UPDATE EXCLUSIVE,不擋讀寫
|
||||
-- SET NOT NULL 在 VALIDATE 後不再掃表(利用 check constraint 證明)
|
||||
|
||||
-- --- incidents ---
|
||||
|
||||
-- PostgreSQL 無 ADD CONSTRAINT IF NOT EXISTS,改用 DO 塊檢查 pg_constraint
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_constraint
|
||||
WHERE conname = 'chk_incidents_project_id_not_null'
|
||||
AND conrelid = 'incidents'::regclass
|
||||
) THEN
|
||||
ALTER TABLE incidents
|
||||
ADD CONSTRAINT chk_incidents_project_id_not_null
|
||||
CHECK (project_id IS NOT NULL) NOT VALID;
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
ALTER TABLE incidents
|
||||
VALIDATE CONSTRAINT chk_incidents_project_id_not_null;
|
||||
|
||||
ALTER TABLE incidents ALTER COLUMN project_id SET NOT NULL;
|
||||
ALTER TABLE incidents ALTER COLUMN project_id SET DEFAULT 'awoooi';
|
||||
ALTER TABLE incidents DROP CONSTRAINT IF EXISTS chk_incidents_project_id_not_null;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_incidents_project_id ON incidents (project_id);
|
||||
|
||||
ALTER TABLE incidents ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE incidents FORCE ROW LEVEL SECURITY;
|
||||
DROP POLICY IF EXISTS incidents_tenant_isolation ON incidents;
|
||||
CREATE POLICY incidents_tenant_isolation ON incidents
|
||||
FOR ALL TO awooop_app
|
||||
USING (project_id = current_setting('app.project_id', TRUE))
|
||||
WITH CHECK (project_id = current_setting('app.project_id', TRUE));
|
||||
|
||||
|
||||
-- --- knowledge_entries ---
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_constraint
|
||||
WHERE conname = 'chk_km_project_id_not_null'
|
||||
AND conrelid = 'knowledge_entries'::regclass
|
||||
) THEN
|
||||
ALTER TABLE knowledge_entries
|
||||
ADD CONSTRAINT chk_km_project_id_not_null
|
||||
CHECK (project_id IS NOT NULL) NOT VALID;
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
ALTER TABLE knowledge_entries
|
||||
VALIDATE CONSTRAINT chk_km_project_id_not_null;
|
||||
|
||||
ALTER TABLE knowledge_entries ALTER COLUMN project_id SET NOT NULL;
|
||||
ALTER TABLE knowledge_entries ALTER COLUMN project_id SET DEFAULT 'awoooi';
|
||||
ALTER TABLE knowledge_entries DROP CONSTRAINT IF EXISTS chk_km_project_id_not_null;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_knowledge_entries_project_id ON knowledge_entries (project_id);
|
||||
|
||||
ALTER TABLE knowledge_entries ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE knowledge_entries FORCE ROW LEVEL SECURITY;
|
||||
DROP POLICY IF EXISTS knowledge_entries_tenant_isolation ON knowledge_entries;
|
||||
CREATE POLICY knowledge_entries_tenant_isolation ON knowledge_entries
|
||||
FOR ALL TO awooop_app
|
||||
USING (project_id = current_setting('app.project_id', TRUE))
|
||||
WITH CHECK (project_id = current_setting('app.project_id', TRUE));
|
||||
|
||||
|
||||
-- --- playbooks ---
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_constraint
|
||||
WHERE conname = 'chk_playbooks_project_id_not_null'
|
||||
AND conrelid = 'playbooks'::regclass
|
||||
) THEN
|
||||
ALTER TABLE playbooks
|
||||
ADD CONSTRAINT chk_playbooks_project_id_not_null
|
||||
CHECK (project_id IS NOT NULL) NOT VALID;
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
ALTER TABLE playbooks
|
||||
VALIDATE CONSTRAINT chk_playbooks_project_id_not_null;
|
||||
|
||||
ALTER TABLE playbooks ALTER COLUMN project_id SET NOT NULL;
|
||||
ALTER TABLE playbooks ALTER COLUMN project_id SET DEFAULT 'awoooi';
|
||||
ALTER TABLE playbooks DROP CONSTRAINT IF EXISTS chk_playbooks_project_id_not_null;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_playbooks_project_id ON playbooks (project_id);
|
||||
|
||||
ALTER TABLE playbooks ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE playbooks FORCE ROW LEVEL SECURITY;
|
||||
DROP POLICY IF EXISTS playbooks_tenant_isolation ON playbooks;
|
||||
CREATE POLICY playbooks_tenant_isolation ON playbooks
|
||||
FOR ALL TO awooop_app
|
||||
USING (project_id = current_setting('app.project_id', TRUE))
|
||||
WITH CHECK (project_id = current_setting('app.project_id', TRUE));
|
||||
|
||||
|
||||
-- --- audit_logs ---
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_constraint
|
||||
WHERE conname = 'chk_audit_project_id_not_null'
|
||||
AND conrelid = 'audit_logs'::regclass
|
||||
) THEN
|
||||
ALTER TABLE audit_logs
|
||||
ADD CONSTRAINT chk_audit_project_id_not_null
|
||||
CHECK (project_id IS NOT NULL) NOT VALID;
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
ALTER TABLE audit_logs
|
||||
VALIDATE CONSTRAINT chk_audit_project_id_not_null;
|
||||
|
||||
ALTER TABLE audit_logs ALTER COLUMN project_id SET NOT NULL;
|
||||
ALTER TABLE audit_logs ALTER COLUMN project_id SET DEFAULT 'awoooi';
|
||||
ALTER TABLE audit_logs DROP CONSTRAINT IF EXISTS chk_audit_project_id_not_null;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_audit_logs_project_id ON audit_logs (project_id);
|
||||
|
||||
ALTER TABLE audit_logs ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE audit_logs FORCE ROW LEVEL SECURITY;
|
||||
DROP POLICY IF EXISTS audit_logs_tenant_isolation ON audit_logs;
|
||||
CREATE POLICY audit_logs_tenant_isolation ON audit_logs
|
||||
FOR ALL TO awooop_app
|
||||
USING (project_id = current_setting('app.project_id', TRUE))
|
||||
WITH CHECK (project_id = current_setting('app.project_id', TRUE));
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- 驗收查詢
|
||||
-- ===========================
|
||||
-- SELECT tablename, rowsecurity, forcerowsecurity FROM pg_tables
|
||||
-- WHERE tablename IN ('incidents','knowledge_entries','playbooks','audit_logs');
|
||||
--
|
||||
-- -- RLS fail-closed 測試(需 awooop_app role 執行;SET LOCAL 只在交易區塊內生效,以下語句須包在 BEGIN; … ROLLBACK; 之中):
|
||||
-- SET ROLE awooop_app;
|
||||
-- SET LOCAL app.project_id = 'ewoooc';
|
||||
-- SELECT count(*) FROM incidents; -- 應 = 0(無 ewoooc 資料)
|
||||
-- SET LOCAL app.project_id = 'awoooi';
|
||||
-- SELECT count(*) FROM incidents; -- 應 = 全部既有資料筆數
|
||||
-- RESET ROLE;
|
||||
--
|
||||
-- -- 確認無 NULL project_id:
|
||||
-- SELECT count(*) FROM incidents WHERE project_id IS NULL; -- = 0
|
||||
-- SELECT count(*) FROM knowledge_entries WHERE project_id IS NULL; -- = 0
|
||||
-- SELECT count(*) FROM playbooks WHERE project_id IS NULL; -- = 0
|
||||
-- SELECT count(*) FROM audit_logs WHERE project_id IS NULL; -- = 0
|
||||
546
apps/api/migrations/awooop_phase1_control_plane_2026-05-04.sql
Normal file
546
apps/api/migrations/awooop_phase1_control_plane_2026-05-04.sql
Normal file
@@ -0,0 +1,546 @@
|
||||
-- AwoooP Phase 1: Control Plane Schema Foundation
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6(ADR-111~118,Phase 1 Task 1.3~1.7)
|
||||
-- 2026-05-04 db-expert review 修正版:C-1/C-2/C-4/C-5/M-1/M-2/M-4/M-5/Mi-1/Mi-2/Mi-3
|
||||
-- 2026-05-04 critic review 修正版:awooop_app role 建立 + GRANT、移除 __platform__ 後門、
|
||||
-- active_pointer_guard SECURITY DEFINER、pg_partman 冪等、immutability 強化
|
||||
--
|
||||
-- ⚠️ 部署順序鎖死(ADR-118 RLS 前置條件):
|
||||
-- 1. apps/api 必須先 deploy「會 SET LOCAL app.project_id」的版本
|
||||
-- 2. K8s rollout 完成(kubectl rollout status deploy/api = 100%)
|
||||
-- 3. 31 個 background loop 改用 awooop_platform_admin role(PR-10 完成)
|
||||
-- 4. 以上完成後,才執行此 migration SQL
|
||||
--
|
||||
-- ⚠️ 不包含 Batch 1 高流量表(incidents/knowledge_entries/playbooks/audit_logs)
|
||||
-- → 請執行 awooop_phase1_batch1_rls_2026-05-04.sql(三步式 migration)
|
||||
--
|
||||
-- 執行前確認:
|
||||
-- SELECT relname, n_live_tup, pg_size_pretty(pg_total_relation_size(oid))
|
||||
-- FROM pg_class WHERE relname IN ('incidents','knowledge_entries','playbooks','audit_logs');
|
||||
--
|
||||
-- 執行角色:awooop_migration(BYPASSRLS)
|
||||
-- 預估執行時間:< 30 秒(全為新表,無既有資料修改)
|
||||
--
|
||||
-- 回滾路徑:
|
||||
-- 見 awooop_phase1_control_plane_ROLLBACK.sql
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
||||
-- pgcrypto is installed before any table that defaults to gen_random_uuid().
CREATE EXTENSION IF NOT EXISTS pgcrypto;

-- ===========================
-- Step 1: DB roles (ADR-118 D1)
-- ===========================
--   awooop_platform_admin : platform management, BYPASSRLS (background loops)
--   awooop_migration      : migration execution, BYPASSRLS (migration time only)
--   awooop_app            : application role, subject to RLS; the session must
--                           SET LOCAL app.project_id. It is NOLOGIN, so the
--                           app's connection user has to SET ROLE awooop_app.
-- All three roles must exist before the GRANT statements later in this script.
DO $$
DECLARE
    role_name TEXT;
BEGIN
    -- Create each role only when it is missing so the script can be re-run.
    FOREACH role_name IN ARRAY ARRAY[
        'awooop_platform_admin',
        'awooop_migration',
        'awooop_app'
    ] LOOP
        IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = role_name) THEN
            EXECUTE format('CREATE ROLE %I NOLOGIN', role_name);
        END IF;
    END LOOP;

    -- Re-asserted on every run; awooop_app deliberately does not get it.
    ALTER ROLE awooop_platform_admin BYPASSRLS;
    ALTER ROLE awooop_migration BYPASSRLS;
END $$;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 2: awooop_projects(租戶主表)
|
||||
-- ===========================
|
||||
|
||||
-- Tenant master table: one row per project. Other awooop_* tables reference
-- project_id as a foreign key.
CREATE TABLE IF NOT EXISTS awooop_projects (
    project_id       VARCHAR(64)    PRIMARY KEY,
    display_name     VARCHAR(256)   NOT NULL,
    migration_mode   VARCHAR(32)    NOT NULL DEFAULT 'legacy_awoooi_default',
    -- NULL is allowed; a value, when present, must be non-negative.
    budget_limit_usd NUMERIC(14, 4) CHECK (budget_limit_usd IS NULL OR budget_limit_usd >= 0),
    -- Must always be a JSON array.
    allowed_channels JSONB          NOT NULL DEFAULT '[]' CHECK (jsonb_typeof(allowed_channels) = 'array'),
    is_active        BOOLEAN        NOT NULL DEFAULT TRUE,
    created_at       TIMESTAMPTZ    NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ    NOT NULL DEFAULT NOW(),

    CONSTRAINT chk_migration_mode CHECK (
        migration_mode IN ('legacy_awoooi_default', 'shadow', 'canary', 'active')
    )
);

-- Partial index that only contains active projects.
CREATE INDEX IF NOT EXISTS idx_awooop_projects_active
    ON awooop_projects (is_active)
    WHERE is_active = TRUE;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 3: awooop_contract_revisions(六合約共用 revision,append-only)
|
||||
-- ===========================
|
||||
|
||||
-- Revision store shared by all contract families. Rows are append-only:
-- the immutability and no-delete triggers defined later in this script
-- enforce that.
CREATE TABLE IF NOT EXISTS awooop_contract_revisions (
    revision_id         UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id          VARCHAR(64)  NOT NULL REFERENCES awooop_projects (project_id),
    contract_family     VARCHAR(32)  NOT NULL,
    contract_id         VARCHAR(128) NOT NULL,
    version_major       SMALLINT     NOT NULL DEFAULT 1 CHECK (version_major >= 0),
    version_minor       SMALLINT     NOT NULL DEFAULT 0 CHECK (version_minor >= 0),
    lifecycle_status    VARCHAR(16)  NOT NULL DEFAULT 'draft',
    body_json           JSONB        NOT NULL,
    -- SHA-256 as 64 lowercase hex characters; the format is enforced.
    body_hash           VARCHAR(64)  NOT NULL CHECK (body_hash ~ '^[0-9a-f]{64}$'),
    body_schema_version VARCHAR(16)  NOT NULL DEFAULT 'v1.0',
    -- HMAC-SHA256 as lowercase hex; NULL while the revision is a draft.
    publish_signature   VARCHAR(128) CHECK (
        publish_signature IS NULL OR publish_signature ~ '^[0-9a-f]+$'
    ),
    publisher_id        VARCHAR(128),
    published_at        TIMESTAMPTZ,
    created_at          TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    -- One row per (contract, major, minor) within a project.
    CONSTRAINT uq_revision_version
        UNIQUE (project_id, contract_family, contract_id, version_major, version_minor),

    CONSTRAINT chk_contract_family CHECK (
        contract_family IN (
            'project_tenant', 'agent', 'mcp_gateway', 'policy_routing',
            'runtime_run_state', 'channel_event', 'platform_resource'
        )
    ),

    CONSTRAINT chk_lifecycle CHECK (
        lifecycle_status IN ('draft', 'published', 'active', 'revoked')
    )
);

-- Runtime read path: find the newest published/active version of a contract.
CREATE INDEX IF NOT EXISTS idx_revisions_lookup
    ON awooop_contract_revisions (
        project_id,
        contract_family,
        contract_id,
        lifecycle_status,
        version_major DESC,
        version_minor DESC
    );

-- Forensic reverse lookup by body hash when verifying signatures.
CREATE INDEX IF NOT EXISTS idx_revisions_hash
    ON awooop_contract_revisions (body_hash);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 4: awooop_active_revisions(active pointer)
|
||||
-- ===========================
|
||||
|
||||
-- Active pointer: at most one row per (project, family, contract), pointing
-- at the revision currently in force.
CREATE TABLE IF NOT EXISTS awooop_active_revisions (
    pointer_id         UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id         VARCHAR(64)  NOT NULL REFERENCES awooop_projects (project_id),
    contract_family    VARCHAR(32)  NOT NULL,
    contract_id        VARCHAR(128) NOT NULL,
    -- NOT NULL + ON DELETE RESTRICT (review fix C-1): a pointer always has a
    -- target, and the target cannot be deleted from under it.
    active_revision_id UUID         NOT NULL
        REFERENCES awooop_contract_revisions (revision_id) ON DELETE RESTRICT,
    updated_at         TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_active_pointer
        UNIQUE (project_id, contract_family, contract_id)
);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 5: awooop_contract_outbox(ADR-113,C-2 修正版)
|
||||
-- ===========================
|
||||
|
||||
-- Transactional outbox for contract events (ADR-113, review fix C-2).
CREATE TABLE IF NOT EXISTS awooop_contract_outbox (
    event_id        UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    event_type      VARCHAR(64)  NOT NULL,
    -- FK to projects (C-2): an outbox event may not be orphaned.
    project_id      VARCHAR(64)  NOT NULL REFERENCES awooop_projects (project_id),
    contract_family VARCHAR(32)  NOT NULL,
    contract_id     VARCHAR(128) NOT NULL,
    old_revision_id UUID         REFERENCES awooop_contract_revisions (revision_id),
    new_revision_id UUID         NOT NULL REFERENCES awooop_contract_revisions (revision_id),
    created_at      TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    delivered_at    TIMESTAMPTZ,
    relay_attempts  INT          NOT NULL DEFAULT 0,
    -- Added in C-2 to support exponential backoff.
    next_retry_at   TIMESTAMPTZ,
    last_error      TEXT,

    -- Added in C-2: de-duplicates upstream publisher retries, so a given
    -- event type is recorded once per revision.
    CONSTRAINT uq_outbox_event UNIQUE (new_revision_id, event_type)
);

-- Main relay-worker query: undelivered rows that are due for a retry.
-- A NULL next_retry_at means "retry immediately", hence NULLS FIRST.
CREATE INDEX IF NOT EXISTS idx_outbox_pending
    ON awooop_contract_outbox (next_retry_at NULLS FIRST, created_at)
    WHERE delivered_at IS NULL;

-- Observability: size of the undelivered backlog per project.
CREATE INDEX IF NOT EXISTS idx_outbox_backlog_per_project
    ON awooop_contract_outbox (project_id, created_at)
    WHERE delivered_at IS NULL;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 6: awooop_channel_event_dedupe(ADR-114,M-1 Partition 版)
|
||||
-- ===========================
|
||||
-- pg_partman 維護 1 天 partition,retention 7 天,DROP PARTITION 毫秒清完
|
||||
|
||||
-- Channel event de-duplication table (ADR-114, review fix M-1),
-- range-partitioned on created_at.
CREATE TABLE IF NOT EXISTS awooop_channel_event_dedupe (
    dedupe_id         UUID         NOT NULL DEFAULT gen_random_uuid(),
    project_id        VARCHAR(64)  NOT NULL,
    channel_type      VARCHAR(32)  NOT NULL,
    provider_event_id VARCHAR(256) NOT NULL,
    run_id            UUID         NOT NULL,
    created_at        TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    -- Declarative partitioning requires the partition key in every
    -- PRIMARY KEY / UNIQUE constraint, hence created_at in both.
    PRIMARY KEY (dedupe_id, created_at),

    CONSTRAINT uq_channel_event_dedupe
        UNIQUE (project_id, channel_type, provider_event_id, created_at)
)
PARTITION BY RANGE (created_at);
|
||||
|
||||
-- 初始化 pg_partman(若 pg_partman 已安裝)
|
||||
-- Bootstrap the partitions of awooop_channel_event_dedupe.
--   * pg_partman installed : register the table (daily partitions, 4 pre-made)
--                            and set a 7-day retention that drops old partitions.
--   * pg_partman missing   : create 15 daily partitions by hand
--                            (today - 7 days .. today + 7 days, inclusive).
--                            NOTE(review): nothing here creates later partitions
--                            in this mode; confirm a scheduled job does.
DO $$
DECLARE
    partman_major INT;   -- pg_partman major version; NULL when not installed
    d             DATE;
BEGIN
    SELECT split_part(extversion, '.', 1)::INT
      INTO partman_major
      FROM pg_extension
     WHERE extname = 'pg_partman';

    IF partman_major IS NOT NULL THEN
        -- Idempotent: skip create_parent when the table is already registered,
        -- so re-running the migration is safe.
        IF NOT EXISTS (
            SELECT 1
              FROM partman.part_config
             WHERE parent_table = 'public.awooop_channel_event_dedupe'
        ) THEN
            PERFORM partman.create_parent(
                p_parent_table := 'public.awooop_channel_event_dedupe',
                p_control      := 'created_at',
                -- pg_partman 5.x removed the 'native' type and expects 'range';
                -- 4.x expects 'native' for declarative partitioning.
                p_type         := CASE WHEN partman_major >= 5 THEN 'range' ELSE 'native' END,
                p_interval     := '1 day',
                p_premake      := 4
            );
        END IF;

        -- Re-applied on every run so the retention settings stay in place.
        UPDATE partman.part_config
           SET retention            = '7 days',
               retention_keep_table = false
         WHERE parent_table = 'public.awooop_channel_event_dedupe';
    ELSE
        FOR d IN
            SELECT generate_series(
                CURRENT_DATE - INTERVAL '7 days',
                CURRENT_DATE + INTERVAL '7 days',
                INTERVAL '1 day'
            )::DATE
        LOOP
            -- Partition name suffix is the day (YYYYMMDD); bounds are
            -- [day, day + 1) evaluated in the session time zone.
            EXECUTE format(
                'CREATE TABLE IF NOT EXISTS awooop_channel_event_dedupe_%s
                 PARTITION OF awooop_channel_event_dedupe
                 FOR VALUES FROM (%L) TO (%L)',
                to_char(d, 'YYYYMMDD'),
                d::TIMESTAMPTZ,
                (d + INTERVAL '1 day')::TIMESTAMPTZ
            );
        END LOOP;
    END IF;
END $$;

-- Reverse lookup by run_id (review item Mi-5).
CREATE INDEX IF NOT EXISTS idx_dedupe_run
    ON awooop_channel_event_dedupe (run_id);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 7: awooop_platform_subjects(ADR-115)
|
||||
-- ===========================
|
||||
|
||||
-- Maps a channel-level user identity to a platform subject (ADR-115).
CREATE TABLE IF NOT EXISTS awooop_platform_subjects (
    subject_id          UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id          VARCHAR(64)  NOT NULL REFERENCES awooop_projects (project_id),
    channel_type        VARCHAR(32)  NOT NULL,
    channel_user_id     VARCHAR(256) NOT NULL,
    channel_chat_id     VARCHAR(256),
    platform_subject_id VARCHAR(128) NOT NULL,
    display_name        VARCHAR(256),
    -- Must always be a JSON array.
    roles               JSONB        NOT NULL DEFAULT '[]' CHECK (jsonb_typeof(roles) = 'array'),
    first_seen_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    last_seen_at        TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    -- The unique index backing this constraint also serves lookups by
    -- (project_id, channel_type, channel_user_id).
    CONSTRAINT uq_platform_subject
        UNIQUE (project_id, channel_type, channel_user_id)
);

-- idx_platform_subjects_lookup covered exactly the same columns as the
-- unique index behind uq_platform_subject, so it only added write cost.
-- It is no longer created, and is dropped where an earlier run created it.
DROP INDEX IF EXISTS idx_platform_subjects_lookup;

-- Reverse lookup by platform_subject_id (used by Operator Console M2).
CREATE INDEX IF NOT EXISTS idx_platform_subjects_resolve
    ON awooop_platform_subjects (project_id, platform_subject_id);

-- Recently active users per project.
CREATE INDEX IF NOT EXISTS idx_platform_subjects_last_seen
    ON awooop_platform_subjects (project_id, last_seen_at DESC);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 8: awooop_project_migration_state(Strangler Fig 追蹤)
|
||||
-- ===========================
|
||||
|
||||
-- Strangler-fig tracking: one row per (project, capability) recording which
-- migration phase that capability is in.
CREATE TABLE IF NOT EXISTS awooop_project_migration_state (
    state_id         UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id       VARCHAR(64) NOT NULL REFERENCES awooop_projects (project_id),
    capability       VARCHAR(64) NOT NULL,
    current_phase    VARCHAR(32) NOT NULL DEFAULT 'legacy_awoooi_default',
    phase_entered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_project_capability UNIQUE (project_id, capability),

    CONSTRAINT chk_capability CHECK (
        capability IN (
            'run_execution', 'contract_governance',
            'budget_tracking', 'principal_mapping'
        )
    ),

    CONSTRAINT chk_phase CHECK (
        current_phase IN (
            'legacy_awoooi_default', 'shadow', 'canary',
            'read_only', 'suggest', 'auto_remediate'
        )
    )
);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 9: awooop_published_revisions VIEW(ADR-112 D6 draft 隔離)
|
||||
-- ===========================
|
||||
|
||||
-- Draft-isolation view (ADR-112 D6): exposes only published/active revisions.
--
-- A plain view reads its base table with the privileges and row-security
-- context of the view OWNER, not the caller. The owner here is the migration
-- role, which bypasses RLS, so without an explicit predicate awooop_app could
-- read every tenant's revisions through this view even though the base table
-- has FORCE ROW LEVEL SECURITY.
--
-- The tenant filter is therefore repeated inside the view. It is fail-closed:
-- an unset app.project_id yields NULL and matches no row. security_barrier
-- stops caller-supplied predicates from being evaluated before the filter.
-- On PostgreSQL 15+ `WITH (security_invoker = true)` is an alternative that
-- defers to the base-table policy instead.
-- NOTE(review): BYPASSRLS roles reading through this view are now filtered
-- too; they can query awooop_contract_revisions directly.
CREATE OR REPLACE VIEW awooop_published_revisions
WITH (security_barrier = true) AS
SELECT *
  FROM awooop_contract_revisions
 WHERE lifecycle_status IN ('published', 'active')
   AND project_id = current_setting('app.project_id', TRUE);
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 10: updated_at 自動更新 trigger(Mi-1)
|
||||
-- ===========================
|
||||
|
||||
-- Row trigger function: stamps updated_at with the current transaction time.
CREATE OR REPLACE FUNCTION awooop_set_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$;

-- Attach the trigger to every table in this migration that has an
-- updated_at column. Drop-then-create keeps the step re-runnable.
DO $$
DECLARE
    target_table TEXT;
BEGIN
    FOREACH target_table IN ARRAY ARRAY[
        'awooop_projects',
        'awooop_active_revisions',
        'awooop_platform_subjects',
        'awooop_project_migration_state'
    ] LOOP
        EXECUTE format(
            'DROP TRIGGER IF EXISTS trg_%s_updated_at ON %I',
            target_table, target_table
        );
        EXECUTE format(
            'CREATE TRIGGER trg_%s_updated_at
                 BEFORE UPDATE ON %I
                 FOR EACH ROW EXECUTE FUNCTION awooop_set_updated_at()',
            target_table, target_table
        );
    END LOOP;
END $$;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 11: Immutability Trigger(C-5 完整版,ADR-112 D2)
|
||||
-- ===========================
|
||||
-- 允許的 lifecycle 流轉:
|
||||
-- draft → published(publish 操作)
|
||||
-- published → active (activate 操作)
|
||||
-- active → revoked (revoke 操作)
|
||||
-- 禁止:body/hash/signature/version 在 published/active/revoked 後修改
|
||||
|
||||
-- BEFORE UPDATE guard for awooop_contract_revisions.
--   1. Identity columns (project_id / contract_family / contract_id) can
--      never change, whatever the lifecycle status.
--   2. Once a row has left 'draft', body, hash, signature, version, publisher
--      and schema-version columns are frozen.
--   3. lifecycle_status may only move draft -> published -> active -> revoked,
--      one step at a time.
-- Raises an exception on any violation; otherwise returns NEW unchanged.
CREATE OR REPLACE FUNCTION awooop_revision_immutability_guard()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    -- (1) Identity columns are immutable in every state.
    IF (NEW.project_id, NEW.contract_family, NEW.contract_id)
       IS DISTINCT FROM
       (OLD.project_id, OLD.contract_family, OLD.contract_id)
    THEN
        RAISE EXCEPTION
            'revision % identity fields (project_id/contract_family/contract_id) are immutable',
            OLD.revision_id;
    END IF;

    -- (2) Drafts are freely editable; anything past draft has a frozen payload.
    IF OLD.lifecycle_status IN ('published', 'active', 'revoked')
       AND (NEW.body_json, NEW.body_hash, NEW.publish_signature,
            NEW.version_major, NEW.version_minor,
            NEW.publisher_id, NEW.published_at, NEW.body_schema_version)
           IS DISTINCT FROM
           (OLD.body_json, OLD.body_hash, OLD.publish_signature,
            OLD.version_major, OLD.version_minor,
            OLD.publisher_id, OLD.published_at, OLD.body_schema_version)
    THEN
        RAISE EXCEPTION
            'revision % (%) is immutable: body/signature/version cannot be changed',
            OLD.revision_id, OLD.lifecycle_status;
    END IF;

    -- (3) Whitelist of allowed status transitions.
    IF NEW.lifecycle_status IS DISTINCT FROM OLD.lifecycle_status
       AND (OLD.lifecycle_status, NEW.lifecycle_status) NOT IN (
               ('draft',     'published'),
               ('published', 'active'),
               ('active',    'revoked')
           )
    THEN
        RAISE EXCEPTION
            'illegal lifecycle transition on revision %: % -> %',
            OLD.revision_id, OLD.lifecycle_status, NEW.lifecycle_status;
    END IF;

    RETURN NEW;
END;
$$;

DROP TRIGGER IF EXISTS trg_revision_immutability ON awooop_contract_revisions;
CREATE TRIGGER trg_revision_immutability
    BEFORE UPDATE
    ON awooop_contract_revisions
    FOR EACH ROW
    EXECUTE FUNCTION awooop_revision_immutability_guard();
|
||||
|
||||
-- DELETE 完全禁止(append-only 語意)
|
||||
-- BEFORE DELETE guard: revisions are append-only, so every row-level DELETE
-- is rejected with an exception naming the revision.
CREATE OR REPLACE FUNCTION awooop_revision_no_delete()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    RAISE EXCEPTION
        'awooop_contract_revisions is append-only: DELETE forbidden on revision %',
        OLD.revision_id;
END;
$$;

DROP TRIGGER IF EXISTS trg_revision_no_delete ON awooop_contract_revisions;
CREATE TRIGGER trg_revision_no_delete
    BEFORE DELETE
    ON awooop_contract_revisions
    FOR EACH ROW
    EXECUTE FUNCTION awooop_revision_no_delete();
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 12: Active Pointer Guard(M-5,確保 active_revision_id 指向正確的 active revision)
|
||||
-- ===========================
|
||||
|
||||
-- SECURITY DEFINER:trigger 以 migration 擁有者執行,繞過 awooop_contract_revisions 的 RLS,
|
||||
-- 確保跨租戶指向檢測(FORCE RLS 下 SECURITY INVOKER 只能看自己租戶的 revision)
|
||||
-- BEFORE INSERT/UPDATE guard for awooop_active_revisions.
-- Rejects a pointer unless the referenced revision
--   * exists,
--   * has the same (project_id, contract_family, contract_id) as the pointer,
--   * is in lifecycle_status 'active'.
--
-- SECURITY DEFINER: the lookup runs as the function owner rather than the
-- caller, so it can see revisions of other tenants and detect cross-tenant
-- pointers. NOTE(review): this relies on the owner being exempt from the
-- FORCE RLS policy on awooop_contract_revisions (BYPASSRLS or superuser).
--
-- search_path hardening: when pg_temp is not listed, temporary tables are
-- searched FIRST, which would let a caller shadow awooop_contract_revisions
-- with a temp table of forged rows. pg_temp is therefore pinned last and the
-- table is schema-qualified as well.
CREATE OR REPLACE FUNCTION awooop_active_pointer_guard()
RETURNS TRIGGER
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    rev RECORD;
BEGIN
    SELECT project_id, contract_family, contract_id, lifecycle_status
      INTO rev
      FROM public.awooop_contract_revisions
     WHERE revision_id = NEW.active_revision_id;

    IF NOT FOUND THEN
        RAISE EXCEPTION 'revision % not found', NEW.active_revision_id;
    END IF;

    -- The pointer and its target must describe the same contract.
    IF rev.project_id <> NEW.project_id
       OR rev.contract_family <> NEW.contract_family
       OR rev.contract_id <> NEW.contract_id
    THEN
        RAISE EXCEPTION
            'active pointer contract identity mismatch: pointer=(%,%,%) revision=(%,%,%)',
            NEW.project_id, NEW.contract_family, NEW.contract_id,
            rev.project_id, rev.contract_family, rev.contract_id;
    END IF;

    IF rev.lifecycle_status <> 'active' THEN
        RAISE EXCEPTION
            'active pointer must reference an active revision (got %)', rev.lifecycle_status;
    END IF;

    RETURN NEW;
END;
$$;

DROP TRIGGER IF EXISTS trg_active_pointer_guard ON awooop_active_revisions;
CREATE TRIGGER trg_active_pointer_guard
    BEFORE INSERT OR UPDATE ON awooop_active_revisions
    FOR EACH ROW EXECUTE FUNCTION awooop_active_pointer_guard();
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 13: GRANT awooop_app 基本操作權限
|
||||
-- ===========================
|
||||
-- awooop_app 受 RLS 約束,需設定 app.project_id 才能存取資料
|
||||
-- awooop_platform_admin / awooop_migration 有 BYPASSRLS,不需 GRANT(直接用 superuser 連線)
|
||||
|
||||
-- Table privileges for awooop_app. Row visibility is further restricted by
-- the RLS policies in this migration (the session must set app.project_id).
--
-- NOTE(review): BYPASSRLS only skips row-security policies; it does not grant
-- table privileges. awooop_platform_admin / awooop_migration receive no
-- GRANTs here, so sessions that SET ROLE to them need separate grants unless
-- they own the tables or connect as superuser - confirm.

-- awooop_contract_revisions is append-only (trg_revision_no_delete rejects
-- every DELETE), so DELETE is not granted. The REVOKE removes it where an
-- earlier run of this script granted it.
REVOKE DELETE ON awooop_contract_revisions FROM awooop_app;
GRANT SELECT, INSERT, UPDATE ON awooop_contract_revisions TO awooop_app;

GRANT SELECT, INSERT, UPDATE ON awooop_active_revisions TO awooop_app;
GRANT SELECT, INSERT ON awooop_contract_outbox TO awooop_app;
GRANT SELECT, INSERT ON awooop_channel_event_dedupe TO awooop_app;
GRANT SELECT, INSERT, UPDATE ON awooop_platform_subjects TO awooop_app;

-- Read-only reference data.
GRANT SELECT ON awooop_projects TO awooop_app;
GRANT SELECT ON awooop_project_migration_state TO awooop_app;
GRANT SELECT ON awooop_published_revisions TO awooop_app;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 14: awooop_* 表 RLS(ADR-118,C-4 fail-closed 修正版)
|
||||
-- ===========================
|
||||
-- ⚠️ fail-closed:沒有 SET LOCAL app.project_id 的 session 看不到任何資料
|
||||
-- ⚠️ awooop_platform_admin / awooop_migration 已 BYPASSRLS,不受 policy 約束
|
||||
-- ⚠️ WITH CHECK 防止 INSERT 時塞入不同 tenant 的 project_id
|
||||
-- ⚠️ 移除 __platform__ 後門(critic C-3 修正):平台層改用 BYPASSRLS 角色,不靠 GUC 魔術字串
|
||||
|
||||
-- Tenant isolation for every awooop_* table that carries project_id and is
-- writable by awooop_app.
--   * Fail-closed: current_setting(..., TRUE) returns NULL when
--     app.project_id is unset, so the predicate matches no row.
--   * WITH CHECK stops a session from writing rows for another tenant.
--   * FORCE applies the policy to the table owner as well; roles with
--     BYPASSRLS are not restricted.
--   * No magic "platform" project_id: platform code uses a BYPASSRLS role.
--
-- awooop_contract_outbox and awooop_channel_event_dedupe are covered here
-- too: awooop_app holds SELECT/INSERT on both, so without a policy they
-- would be readable and writable across tenants.

ALTER TABLE awooop_contract_revisions ENABLE ROW LEVEL SECURITY;
ALTER TABLE awooop_contract_revisions FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS contract_revisions_tenant ON awooop_contract_revisions;
CREATE POLICY contract_revisions_tenant ON awooop_contract_revisions
    FOR ALL TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

ALTER TABLE awooop_active_revisions ENABLE ROW LEVEL SECURITY;
ALTER TABLE awooop_active_revisions FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS active_revisions_tenant ON awooop_active_revisions;
CREATE POLICY active_revisions_tenant ON awooop_active_revisions
    FOR ALL TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

ALTER TABLE awooop_platform_subjects ENABLE ROW LEVEL SECURITY;
ALTER TABLE awooop_platform_subjects FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS platform_subjects_tenant ON awooop_platform_subjects;
CREATE POLICY platform_subjects_tenant ON awooop_platform_subjects
    FOR ALL TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

ALTER TABLE awooop_contract_outbox ENABLE ROW LEVEL SECURITY;
ALTER TABLE awooop_contract_outbox FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS contract_outbox_tenant ON awooop_contract_outbox;
CREATE POLICY contract_outbox_tenant ON awooop_contract_outbox
    FOR ALL TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

-- Set on the partitioned parent, which is the only relation awooop_app is
-- granted access to.
ALTER TABLE awooop_channel_event_dedupe ENABLE ROW LEVEL SECURITY;
ALTER TABLE awooop_channel_event_dedupe FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS channel_event_dedupe_tenant ON awooop_channel_event_dedupe;
CREATE POLICY channel_event_dedupe_tenant ON awooop_channel_event_dedupe
    FOR ALL TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- Step 15: AWOOOI 種子資料(ADR-111 bootstrap)
|
||||
-- ===========================
|
||||
|
||||
-- Bootstrap tenant (ADR-111). ON CONFLICT DO NOTHING makes both inserts
-- safe to re-run and leaves existing rows untouched.
INSERT INTO awooop_projects
    (project_id, display_name, migration_mode, is_active)
VALUES
    ('awoooi', 'AWOOOI', 'legacy_awoooi_default', TRUE)
ON CONFLICT (project_id) DO NOTHING;

-- One migration-state row per capability, all starting in the legacy phase.
INSERT INTO awooop_project_migration_state
    (project_id, capability, current_phase)
VALUES
    ('awoooi', 'run_execution',       'legacy_awoooi_default'),
    ('awoooi', 'contract_governance', 'legacy_awoooi_default'),
    ('awoooi', 'budget_tracking',     'legacy_awoooi_default'),
    ('awoooi', 'principal_mapping',   'legacy_awoooi_default')
ON CONFLICT (project_id, capability) DO NOTHING;
|
||||
|
||||
|
||||
-- ===========================
|
||||
-- 驗收查詢(執行後人工確認)
|
||||
-- ===========================
|
||||
-- \dt awooop_*
|
||||
-- SELECT project_id, display_name, migration_mode FROM awooop_projects;
|
||||
-- SELECT project_id, capability, current_phase FROM awooop_project_migration_state;
|
||||
-- SELECT tablename, rowsecurity, forcerowsecurity FROM pg_tables
|
||||
-- WHERE tablename LIKE 'awooop_%';
|
||||
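-- -- RLS fail-closed 測試(必須在 transaction 內、以 awooop_app role 執行;
-- -- SET LOCAL 在 transaction 外無效,BYPASSRLS 角色會看到所有資料):
-- BEGIN;
-- SET LOCAL ROLE awooop_app;
-- SET LOCAL app.project_id = 'ewoooc';
-- SELECT count(*) FROM awooop_contract_revisions; -- 應回傳 0('ewoooc' 不存在 projects)
-- SET LOCAL app.project_id = 'awoooi';
-- SELECT count(*) FROM awooop_projects; -- 應回傳 1(此表未啟用 RLS,僅確認種子資料存在)
-- ROLLBACK;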
|
||||
@@ -0,0 +1,66 @@
|
||||
-- AwoooP Phase 2.6: budget_ledger 建表 + 欄位定義
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6(ADR-120 D5 實作)
|
||||
--
|
||||
-- 防止 $47k 事故的三層 Hard Kill 架構中的 accounting 層:
|
||||
-- - 每次 LLM call 完成後寫入一筆 ledger record
|
||||
-- - 供 Tenant Budget Cache 計算 / 儀表板消費統計 / 告警閾值觸發
|
||||
--
|
||||
-- Phase 1 Control Plane migration 必須先執行(awooop_projects 表存在)
|
||||
-- awooop_run_state 欄位在 Phase 3 SAGA 實作後補加
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 1: 建立 budget_ledger 表
|
||||
-- =========================================================
|
||||
-- Accounting ledger (ADR-120): one row is written per completed LLM call.
CREATE TABLE IF NOT EXISTS budget_ledger (
    id                UUID           DEFAULT gen_random_uuid() PRIMARY KEY,
    project_id        VARCHAR(64)    NOT NULL DEFAULT 'awoooi',
    agent_id          VARCHAR(128),
    run_id            UUID,
    model             VARCHAR(64),
    provider          VARCHAR(32),
    prompt_tokens     INT,
    completion_tokens INT,
    -- NOTE(review): four decimal places; costs below 0.00005 USD round to 0.
    cost_usd          NUMERIC(10, 4) NOT NULL DEFAULT 0.0000,
    recorded_at       TIMESTAMPTZ    NOT NULL DEFAULT NOW()
);

COMMENT ON TABLE budget_ledger
    IS 'ADR-120: 每次 LLM call 的 token/cost accounting 記錄';
COMMENT ON COLUMN budget_ledger.cost_usd
    IS 'prompt + completion token 的估算費用(USD)';

-- =========================================================
-- STEP 2: Indexes (analytics and lookup paths)
-- =========================================================

-- Per-project spend, newest first.
CREATE INDEX IF NOT EXISTS idx_budget_ledger_project_date
    ON budget_ledger (project_id, recorded_at DESC);

-- Lookup by run; rows without a run_id are left out of the index.
CREATE INDEX IF NOT EXISTS idx_budget_ledger_run
    ON budget_ledger (run_id)
    WHERE run_id IS NOT NULL;

-- Per-agent spend within a project; rows without an agent_id are left out.
CREATE INDEX IF NOT EXISTS idx_budget_ledger_agent
    ON budget_ledger (project_id, agent_id, recorded_at DESC)
    WHERE agent_id IS NOT NULL;
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 3: RLS(ADR-118 多租戶隔離)
|
||||
-- =========================================================
|
||||
-- Tenant isolation (ADR-118): awooop_app only sees and writes rows whose
-- project_id equals the session's app.project_id. FORCE applies the policy
-- to the table owner as well.
ALTER TABLE budget_ledger
    ENABLE ROW LEVEL SECURITY,
    FORCE ROW LEVEL SECURITY;

DROP POLICY IF EXISTS budget_ledger_tenant_isolation ON budget_ledger;
CREATE POLICY budget_ledger_tenant_isolation
    ON budget_ledger
    FOR ALL
    TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

-- =========================================================
-- STEP 4: GRANT
-- =========================================================
-- The application role may read and append; UPDATE and DELETE are not granted.
GRANT SELECT, INSERT ON budget_ledger TO awooop_app;
|
||||
|
||||
-- =========================================================
|
||||
-- 驗收查詢
|
||||
-- =========================================================
|
||||
-- SELECT tablename, rowsecurity FROM pg_tables WHERE tablename = 'budget_ledger';
|
||||
-- -- 結果:rowsecurity = true
|
||||
-- SELECT count(*) FROM budget_ledger; -- = 0(剛建)
|
||||
200
apps/api/migrations/awooop_phase4_run_state_2026-05-04.sql
Normal file
200
apps/api/migrations/awooop_phase4_run_state_2026-05-04.sql
Normal file
@@ -0,0 +1,200 @@
|
||||
-- AwoooP Phase 4: Platform Shell in Shadow Mode
|
||||
-- Run State Machine 持久化表
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6(ADR-114/ADR-119)
|
||||
--
|
||||
-- 前置:Phase 1 control plane(awooop_projects)必須已執行
|
||||
--
|
||||
-- 三表:
|
||||
-- awooop_run_state — Run FSM 主表(lease + heartbeat + SKIP LOCKED)
|
||||
-- awooop_run_step_journal — SAGA step journal(tool call + 補償指令,ADR-119)
|
||||
-- awooop_run_idempotency — 去重冪等表(ADR-114)
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 1: awooop_run_state
|
||||
-- =========================================================
|
||||
-- Run FSM main table (ADR-114): one row per run, with worker lease and
-- heartbeat columns.
CREATE TABLE IF NOT EXISTS awooop_run_state (
    run_id        UUID           PRIMARY KEY,
    project_id    VARCHAR(64)    NOT NULL REFERENCES awooop_projects (project_id),
    agent_id      VARCHAR(128)   NOT NULL,

    -- FSM state
    state         VARCHAR(32)    NOT NULL DEFAULT 'pending'
        CHECK (state IN (
            'pending', 'running', 'waiting_tool',
            'waiting_approval', 'completed', 'failed',
            'cancelled', 'timeout'
        )),

    -- Worker lease (used with SKIP LOCKED to prevent double pickup)
    lease_until   TIMESTAMPTZ,
    heartbeat_at  TIMESTAMPTZ,
    worker_id     VARCHAR(128),

    -- Retry accounting
    attempt_count SMALLINT       NOT NULL DEFAULT 0,
    max_attempts  SMALLINT       NOT NULL DEFAULT 3,

    -- Observability
    trace_id      VARCHAR(128),

    -- Trigger source
    trigger_type  VARCHAR(32),
    trigger_ref   VARCHAR(256),  -- channel_event_id / schedule_id / etc.

    -- Shadow-mode flag
    is_shadow     BOOLEAN        NOT NULL DEFAULT TRUE,

    -- Artifact integrity (ADR-112)
    input_sha256  CHAR(64),
    output_sha256 CHAR(64),

    -- Budget
    cost_usd      NUMERIC(10, 4) NOT NULL DEFAULT 0.0000,
    step_count    SMALLINT       NOT NULL DEFAULT 0,

    -- Outcome
    error_code    VARCHAR(64),
    error_detail  TEXT,

    -- Timestamps
    created_at    TIMESTAMPTZ    NOT NULL DEFAULT NOW(),
    started_at    TIMESTAMPTZ,
    completed_at  TIMESTAMPTZ,
    timeout_at    TIMESTAMPTZ
);

COMMENT ON TABLE awooop_run_state
    IS 'ADR-114: Run FSM 主表,SKIP LOCKED worker lease';
COMMENT ON COLUMN awooop_run_state.is_shadow
    IS 'Phase 4 shadow mode:TRUE = 不產生 user response,不執行 destructive tool';

-- Worker scan for pending, unleased runs (used with SKIP LOCKED).
CREATE INDEX IF NOT EXISTS idx_run_state_pending
    ON awooop_run_state (project_id, created_at)
    WHERE state = 'pending' AND lease_until IS NULL;

-- Stale-run reaper: running runs that hold a lease, ordered by its expiry.
CREATE INDEX IF NOT EXISTS idx_run_state_stale
    ON awooop_run_state (lease_until)
    WHERE state = 'running' AND lease_until IS NOT NULL;

-- Per-project timeline for dashboard queries.
CREATE INDEX IF NOT EXISTS idx_run_state_project_timeline
    ON awooop_run_state (project_id, created_at DESC);

-- Cross-system tracing by trace_id.
CREATE INDEX IF NOT EXISTS idx_run_state_trace_id
    ON awooop_run_state (trace_id)
    WHERE trace_id IS NOT NULL;
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 2: awooop_run_step_journal(SAGA step journal,ADR-119)
|
||||
-- =========================================================
|
||||
-- SAGA step journal (ADR-119): one row per tool call of a run, including
-- the compensation instruction for that step.
CREATE TABLE IF NOT EXISTS awooop_run_step_journal (
    step_id           UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Journal rows are removed together with their run.
    run_id            UUID         NOT NULL REFERENCES awooop_run_state (run_id) ON DELETE CASCADE,
    project_id        VARCHAR(64)  NOT NULL,

    -- Step order, increasing within a run
    step_seq          SMALLINT     NOT NULL,

    -- Tool call
    tool_name         VARCHAR(128) NOT NULL,
    mcp_gateway_id    VARCHAR(128),

    -- Artifact integrity (ADR-112)
    input_hash        CHAR(64),
    output_hash       CHAR(64),

    -- SAGA compensation instruction (JSON)
    compensation_json JSONB,

    -- Execution result
    result_status     VARCHAR(16)  NOT NULL DEFAULT 'pending'
        CHECK (result_status IN ('pending', 'success', 'failed', 'compensated')),
    error_code        VARCHAR(64),

    -- Shadow-mode interception record
    was_blocked       BOOLEAN      NOT NULL DEFAULT FALSE,
    block_reason      VARCHAR(128),

    -- Timing
    created_at        TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    completed_at      TIMESTAMPTZ,
    latency_ms        INTEGER
);

COMMENT ON TABLE awooop_run_step_journal IS
    'ADR-119 SAGA step journal:每個 tool call 獨立記錄 + 補償指令';

-- Enforces one row per (run, step_seq) and also serves ordered reads of a
-- run's steps.
CREATE UNIQUE INDEX IF NOT EXISTS uix_run_step_seq
    ON awooop_run_step_journal (run_id, step_seq);

-- idx_run_step_run_id covered exactly the same columns as uix_run_step_seq,
-- so it only added write cost. It is no longer created, and is dropped where
-- an earlier run created it.
DROP INDEX IF EXISTS idx_run_step_run_id;
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 3: awooop_run_idempotency(ADR-114 去重冪等)
|
||||
-- =========================================================
|
||||
-- Idempotency map (ADR-114): a provider event, identified by
-- (project_id, channel_type, provider_event_id), resolves to one run.
CREATE TABLE IF NOT EXISTS awooop_run_idempotency (
    idempotency_id    UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id        VARCHAR(64)  NOT NULL,
    channel_type      VARCHAR(32)  NOT NULL,
    provider_event_id VARCHAR(256) NOT NULL,

    -- The run this event was mapped to
    run_id            UUID         NOT NULL REFERENCES awooop_run_state (run_id),

    created_at        TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);

COMMENT ON TABLE awooop_run_idempotency
    IS 'ADR-114: (project_id, channel_type, provider_event_id) → run_id 去重';

-- The de-duplication key itself.
CREATE UNIQUE INDEX IF NOT EXISTS uix_run_idempotency_key
    ON awooop_run_idempotency (project_id, channel_type, provider_event_id);

-- Reverse lookup from a run to the events mapped to it.
CREATE INDEX IF NOT EXISTS idx_run_idempotency_run_id
    ON awooop_run_idempotency (run_id);
|
||||
|
||||
-- =========================================================
|
||||
-- STEP 4: RLS(ADR-118 多租戶隔離)
|
||||
-- =========================================================
|
||||
-- Tenant isolation (ADR-118) for the three run tables. Each table gets the
-- same fail-closed policy: awooop_app only sees and writes rows whose
-- project_id equals the session's app.project_id. FORCE applies the policy
-- to the table owner as well.

-- awooop_run_state
ALTER TABLE awooop_run_state
    ENABLE ROW LEVEL SECURITY,
    FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS run_state_tenant_isolation ON awooop_run_state;
CREATE POLICY run_state_tenant_isolation
    ON awooop_run_state
    FOR ALL
    TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

-- awooop_run_step_journal
ALTER TABLE awooop_run_step_journal
    ENABLE ROW LEVEL SECURITY,
    FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS run_step_journal_tenant_isolation ON awooop_run_step_journal;
CREATE POLICY run_step_journal_tenant_isolation
    ON awooop_run_step_journal
    FOR ALL
    TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

-- awooop_run_idempotency
ALTER TABLE awooop_run_idempotency
    ENABLE ROW LEVEL SECURITY,
    FORCE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS run_idempotency_tenant_isolation ON awooop_run_idempotency;
CREATE POLICY run_idempotency_tenant_isolation
    ON awooop_run_idempotency
    FOR ALL
    TO awooop_app
    USING      (project_id = current_setting('app.project_id', TRUE))
    WITH CHECK (project_id = current_setting('app.project_id', TRUE));

-- =========================================================
-- STEP 5: GRANT
-- =========================================================
-- No DELETE on any of the three tables; the idempotency map is insert-only.
GRANT SELECT, INSERT, UPDATE ON awooop_run_state        TO awooop_app;
GRANT SELECT, INSERT, UPDATE ON awooop_run_step_journal TO awooop_app;
GRANT SELECT, INSERT         ON awooop_run_idempotency  TO awooop_app;
|
||||
|
||||
-- =========================================================
|
||||
-- 驗收查詢
|
||||
-- =========================================================
|
||||
-- SELECT tablename, rowsecurity FROM pg_tables
|
||||
-- WHERE tablename IN ('awooop_run_state','awooop_run_step_journal','awooop_run_idempotency');
|
||||
-- 預期:所有 rowsecurity = true
|
||||
198
apps/api/migrations/awooop_phase5_mcp_gateway_2026-05-04.sql
Normal file
198
apps/api/migrations/awooop_phase5_mcp_gateway_2026-05-04.sql
Normal file
@@ -0,0 +1,198 @@
|
||||
-- =============================================================================
|
||||
-- AwoooP Phase 5: MCP Gateway 四表
|
||||
-- ADR-116(五閘門 enforcement)+ ADR-118(credential isolation)
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
-- =============================================================================
|
||||
-- 執行順序:
|
||||
-- 1. awooop_mcp_tool_registry — Tool 白名單
|
||||
-- 2. awooop_mcp_grants — Agent × Tool 授權記錄
|
||||
-- 3. awooop_mcp_credential_refs — k8s Secret 參照(不儲存明文)
|
||||
-- 4. awooop_mcp_gateway_audit — 每次 gateway call 稽核
|
||||
-- =============================================================================
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 1. awooop_mcp_tool_registry — Tool 白名單(Gate 3: Tool)
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_mcp_tool_registry (
|
||||
tool_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id VARCHAR(64) NOT NULL
|
||||
REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
|
||||
tool_name VARCHAR(128) NOT NULL,
|
||||
tool_type VARCHAR(32) NOT NULL, -- 'builtin' | 'mcp_server' | 'custom'
|
||||
description TEXT,
|
||||
allowed_scopes JSONB NOT NULL DEFAULT '[]'::jsonb, -- ["read","write","admin"]
|
||||
environment_tags JSONB NOT NULL DEFAULT '{}'::jsonb, -- {"env": "prod"} gate 4 用
|
||||
is_active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT chk_tool_type
|
||||
CHECK (tool_type IN ('builtin','mcp_server','custom')),
|
||||
CONSTRAINT chk_allowed_scopes_array
|
||||
CHECK (jsonb_typeof(allowed_scopes) = 'array'),
|
||||
CONSTRAINT uix_tool_registry_project_name
|
||||
UNIQUE (project_id, tool_name)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_tool_registry_project
|
||||
ON awooop_mcp_tool_registry (project_id, is_active);
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 2. awooop_mcp_grants — Agent × Tool 授權(Gate 2: Agent + Gate 3: Tool)
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_mcp_grants (
|
||||
grant_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id VARCHAR(64) NOT NULL
|
||||
REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
|
||||
agent_id VARCHAR(128) NOT NULL, -- awooop_agents.agent_id
|
||||
tool_id UUID NOT NULL
|
||||
REFERENCES awooop_mcp_tool_registry(tool_id) ON DELETE CASCADE,
|
||||
granted_by VARCHAR(128) NOT NULL, -- principal(human user / system)
|
||||
granted_scopes JSONB NOT NULL DEFAULT '[]'::jsonb, -- subset of tool.allowed_scopes
|
||||
expires_at TIMESTAMPTZ, -- NULL = 永不過期
|
||||
is_revoked BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
revoked_at TIMESTAMPTZ,
|
||||
revoked_by VARCHAR(128),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT chk_grant_scopes_array
|
||||
CHECK (jsonb_typeof(granted_scopes) = 'array'),
|
||||
CONSTRAINT chk_revoke_consistency
|
||||
CHECK (
|
||||
(is_revoked = FALSE AND revoked_at IS NULL AND revoked_by IS NULL)
|
||||
OR
|
||||
(is_revoked = TRUE AND revoked_at IS NOT NULL)
|
||||
),
|
||||
CONSTRAINT uix_mcp_grant_agent_tool
|
||||
UNIQUE (project_id, agent_id, tool_id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_grants_lookup
|
||||
ON awooop_mcp_grants (project_id, agent_id, tool_id)
|
||||
WHERE is_revoked = FALSE;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_grants_expiry
|
||||
ON awooop_mcp_grants (expires_at)
|
||||
WHERE is_revoked = FALSE AND expires_at IS NOT NULL;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 3. awooop_mcp_credential_refs — k8s Secret 參照(ADR-118 credential isolation)
|
||||
-- 只儲存 ref 路徑 + sha256 指紋;明文絕不入庫
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_mcp_credential_refs (
|
||||
ref_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tool_id UUID NOT NULL
|
||||
REFERENCES awooop_mcp_tool_registry(tool_id) ON DELETE CASCADE,
|
||||
project_id VARCHAR(64) NOT NULL
|
||||
REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
|
||||
-- k8s secret ref:格式 "namespace/secret-name#key"
|
||||
k8s_secret_ref VARCHAR(256) NOT NULL,
|
||||
-- sha256(actual_secret_value) — 用於 audit;不可還原原值
|
||||
value_sha256 VARCHAR(64),
|
||||
description TEXT,
|
||||
is_active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
rotated_at TIMESTAMPTZ,
|
||||
|
||||
CONSTRAINT chk_k8s_ref_format
|
||||
CHECK (k8s_secret_ref ~ '^[a-z0-9-]+/[a-z0-9-]+#[a-zA-Z0-9_-]+$'),
|
||||
CONSTRAINT chk_value_sha256_hex
|
||||
CHECK (value_sha256 IS NULL OR value_sha256 ~ '^[0-9a-f]{64}$'),
|
||||
CONSTRAINT uix_credential_ref_tool
|
||||
UNIQUE (tool_id, k8s_secret_ref)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_cred_refs_tool
|
||||
ON awooop_mcp_credential_refs (tool_id)
|
||||
WHERE is_active = TRUE;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 4. awooop_mcp_gateway_audit — Gateway call 稽核日誌(ADR-116 P1-09)
|
||||
-- 不儲存 raw input/output;只儲存 hash + 結果狀態
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_mcp_gateway_audit (
|
||||
call_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id VARCHAR(64) NOT NULL,
|
||||
run_id UUID, -- FK soft(run 可能不存在)
|
||||
trace_id VARCHAR(128),
|
||||
agent_id VARCHAR(128),
|
||||
tool_id UUID NOT NULL
|
||||
REFERENCES awooop_mcp_tool_registry(tool_id),
|
||||
tool_name VARCHAR(128) NOT NULL,
|
||||
credential_ref VARCHAR(256), -- k8s_secret_ref 路徑(不含 key value)
|
||||
input_hash VARCHAR(64), -- sha256(canonical input JSON)
|
||||
output_hash VARCHAR(64), -- sha256(canonical output JSON)
|
||||
gate_result JSONB NOT NULL DEFAULT '{}'::jsonb,
|
||||
-- {"gate1_project": true, "gate2_agent": true, "gate3_tool": true,
|
||||
-- "gate4_env": true, "gate5_approval": true}
|
||||
result_status VARCHAR(16) NOT NULL, -- 'success' | 'blocked' | 'failed' | 'timeout'
|
||||
block_gate SMALLINT, -- 哪個 gate 攔截(1-5,NULL=未攔截)
|
||||
block_reason VARCHAR(256),
|
||||
latency_ms INTEGER,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT chk_gateway_result_status
|
||||
CHECK (result_status IN ('success','blocked','failed','timeout')),
|
||||
CONSTRAINT chk_block_gate_range
|
||||
CHECK (block_gate IS NULL OR (block_gate >= 1 AND block_gate <= 5)),
|
||||
CONSTRAINT chk_input_hash_hex
|
||||
CHECK (input_hash IS NULL OR input_hash ~ '^[0-9a-f]{64}$'),
|
||||
CONSTRAINT chk_output_hash_hex
|
||||
CHECK (output_hash IS NULL OR output_hash ~ '^[0-9a-f]{64}$')
|
||||
);
|
||||
|
||||
-- 查詢熱路徑:by project + run
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_audit_run
|
||||
ON awooop_mcp_gateway_audit (project_id, run_id, created_at DESC);
|
||||
|
||||
-- 查詢熱路徑:blocked calls 分析
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_audit_blocked
|
||||
ON awooop_mcp_gateway_audit (project_id, block_gate, created_at DESC)
|
||||
WHERE result_status = 'blocked';
|
||||
|
||||
-- 時序熱路徑(recent calls)
|
||||
CREATE INDEX IF NOT EXISTS idx_mcp_audit_recent
|
||||
ON awooop_mcp_gateway_audit (project_id, created_at DESC);
|
||||
|
||||
-- =============================================================================
|
||||
-- Row Level Security
|
||||
-- =============================================================================
|
||||
|
||||
ALTER TABLE awooop_mcp_tool_registry ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_grants ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_credential_refs ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_gateway_audit ENABLE ROW LEVEL SECURITY;
|
||||
|
||||
ALTER TABLE awooop_mcp_tool_registry FORCE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_grants FORCE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_credential_refs FORCE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_mcp_gateway_audit FORCE ROW LEVEL SECURITY;
|
||||
|
||||
-- Tenant isolation: a session may only access rows whose project_id equals
-- its app.project_id setting.
-- NOTE(review): no TO clause is given, so each policy applies to every role
-- (not only awooop_app), and the "IS NULL" branch opens all rows to a session
-- that never set app.project_id. Confirm this fail-open behaviour is intended
-- before tightening it.
-- DROP POLICY IF EXISTS keeps the migration re-runnable: CREATE POLICY has no
-- IF NOT EXISTS form and would abort the whole transaction on a second run.
DROP POLICY IF EXISTS mcp_tool_registry_tenant_isolation ON awooop_mcp_tool_registry;
CREATE POLICY mcp_tool_registry_tenant_isolation ON awooop_mcp_tool_registry
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );

DROP POLICY IF EXISTS mcp_grants_tenant_isolation ON awooop_mcp_grants;
CREATE POLICY mcp_grants_tenant_isolation ON awooop_mcp_grants
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );

DROP POLICY IF EXISTS mcp_credential_refs_tenant_isolation ON awooop_mcp_credential_refs;
CREATE POLICY mcp_credential_refs_tenant_isolation ON awooop_mcp_credential_refs
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );

DROP POLICY IF EXISTS mcp_gateway_audit_tenant_isolation ON awooop_mcp_gateway_audit;
CREATE POLICY mcp_gateway_audit_tenant_isolation ON awooop_mcp_gateway_audit
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );
|
||||
|
||||
COMMIT;
|
||||
@@ -0,0 +1,14 @@
|
||||
-- AwoooP Phase 5b:MCP Gateway blocked call 稽核覆蓋
|
||||
-- 日期:2026-05-06
|
||||
-- 維護者:Codex
|
||||
--
|
||||
-- Gate 1 / Gate 2 / 未知工具的 blocked call 可能發生在 tool registry row
|
||||
-- 取得之前。這些安全決策仍必須落稽核紀錄,因此 tool_id 允許為 NULL,
|
||||
-- 但 tool_name 仍維持必填,作為未知工具與早期 gate block 的追蹤線索。
|
||||
|
||||
BEGIN;
|
||||
|
||||
ALTER TABLE awooop_mcp_gateway_audit
|
||||
ALTER COLUMN tool_id DROP NOT NULL;
|
||||
|
||||
COMMIT;
|
||||
@@ -0,0 +1,93 @@
|
||||
-- =============================================================================
|
||||
-- AwoooP Phase 6: EwoooC Tenant Onboarding
|
||||
-- ADR-115(Tenant Onboarding 模板)
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
-- =============================================================================
|
||||
-- 執行前提:Phase 1 migration(awooop_phase1_control_plane_2026-05-04.sql)已執行
|
||||
-- 說明:
|
||||
-- EwoooC 是第二個接入 AwoooP 的租戶(awoooi 為第一個)
|
||||
-- migration_mode = 'shadow' 啟動,進入 canary 前需通過 shadow run 驗證
|
||||
-- budget_limit_usd = 50.0(初始限制,可調整)
|
||||
-- 4 個 read-only MCP tools 預先在白名單中(不需 approval)
|
||||
-- =============================================================================
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Step 1: INSERT awooop_projects(EwoooC 租戶)
|
||||
-- ---------------------------------------------------------------------------
|
||||
INSERT INTO awooop_projects (
|
||||
project_id,
|
||||
display_name,
|
||||
migration_mode,
|
||||
budget_limit_usd,
|
||||
allowed_channels,
|
||||
metadata
|
||||
) VALUES (
|
||||
'ewoooc',
|
||||
'EwoooC Business Platform',
|
||||
'shadow', -- Phase 6 啟動模式;通過驗證後升級為 canary
|
||||
50.00, -- 初始 USD 預算上限
|
||||
'["telegram","api"]'::jsonb,
|
||||
'{
|
||||
"onboarded_at": "2026-05-04",
|
||||
"tier": "business",
|
||||
"ollama_topology": "gcp_three_tier",
|
||||
"note": "ADR-115 EwoooC 接入,共用 GCP Ollama 三層拓撲"
|
||||
}'::jsonb
|
||||
) ON CONFLICT (project_id) DO NOTHING;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Step 2: awooop_mcp_tool_registry — 4 個 read-only MCP tools
|
||||
-- (ewoooc 初始只允許唯讀工具,write/admin 需另外建 grant)
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
||||
-- Seed the four read-only built-in tools for the ewoooc tenant in a single
-- multi-row INSERT. Rows that already exist (same project_id + tool_name)
-- are left untouched by ON CONFLICT ... DO NOTHING.
--   Tool 1: k8s_get        read-only k8s resource lookup
--   Tool 2: signoz_query   read-only SigNoz metrics/traces query
--   Tool 3: incident_read  read-only incident lookup
--   Tool 4: km_read        read-only Knowledge Management lookup
INSERT INTO awooop_mcp_tool_registry (
    project_id, tool_name, tool_type, description, allowed_scopes, environment_tags
) VALUES
    (
        'ewoooc',
        'k8s_get',
        'builtin',
        'kubectl get 唯讀查詢(pod/deployment/service 狀態)',
        '["read"]'::jsonb,
        '{"env": "any"}'::jsonb
    ),
    (
        'ewoooc',
        'signoz_query',
        'builtin',
        'SigNoz metrics/traces 查詢(唯讀,無告警修改)',
        '["read"]'::jsonb,
        '{"env": "any"}'::jsonb
    ),
    (
        'ewoooc',
        'incident_read',
        'builtin',
        'Incident 查詢(僅限 ewoooc 租戶資料,RLS 強制隔離)',
        '["read"]'::jsonb,
        '{"env": "any"}'::jsonb
    ),
    (
        'ewoooc',
        'km_read',
        'builtin',
        'Knowledge Management 讀取(ewoooc 租戶 KM,RLS 隔離)',
        '["read"]'::jsonb,
        '{"env": "any"}'::jsonb
    )
ON CONFLICT (project_id, tool_name) DO NOTHING;
|
||||
|
||||
COMMIT;
|
||||
131
apps/api/migrations/awooop_phase7_channel_hub_2026-05-04.sql
Normal file
131
apps/api/migrations/awooop_phase7_channel_hub_2026-05-04.sql
Normal file
@@ -0,0 +1,131 @@
|
||||
-- =============================================================================
|
||||
-- AwoooP Phase 7: Channel Hub 雙表
|
||||
-- ADR-106(channel_event family)+ Progressive Feedback Policy
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
-- =============================================================================
|
||||
-- 兩張表:
|
||||
-- awooop_conversation_event — 入站事件鏡像(Telegram/LINE inbound)
|
||||
-- awooop_outbound_message — 出站訊息記錄(interim + final reply)
|
||||
-- =============================================================================
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 1. awooop_conversation_event — 入站 Channel Event 鏡像
|
||||
-- 目的:AwoooP 平台保留所有入站事件的不可變記錄,與 legacy 系統解耦
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_conversation_event (
|
||||
event_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id VARCHAR(64) NOT NULL
|
||||
REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
|
||||
-- Channel 原始身份
|
||||
channel_type VARCHAR(32) NOT NULL, -- 'telegram' | 'line' | 'slack' | 'api' | 'internal'
|
||||
provider_event_id VARCHAR(256) NOT NULL, -- Telegram: message_id, LINE: webhook event_id
|
||||
-- 統一身份(由 ProviderProxy 注入)
|
||||
platform_subject_id VARCHAR(128),
|
||||
channel_user_id VARCHAR(256),
|
||||
channel_chat_id VARCHAR(256),
|
||||
-- 關聯 run(若已建立)
|
||||
run_id UUID, -- FK soft(run 可能晚於 event 建立)
|
||||
-- 事件內容(只存摘要/hash,不存明文)
|
||||
content_type VARCHAR(32) NOT NULL DEFAULT 'text', -- 'text' | 'photo' | 'document' | 'command' | 'callback_query'
|
||||
content_hash VARCHAR(64), -- sha256(raw_content),明文不入庫
|
||||
content_preview VARCHAR(256), -- 前 256 字元(無 PII/secret)
|
||||
attachment_sha256 VARCHAR(64), -- 附件 sha256
|
||||
-- 去重(與 awooop_run_idempotency 對應)
|
||||
is_duplicate BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
-- 時間
|
||||
provider_ts TIMESTAMPTZ, -- provider 原始時間戳
|
||||
received_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT chk_conv_event_channel_type
|
||||
CHECK (channel_type IN ('telegram','line','slack','api','internal')),
|
||||
CONSTRAINT chk_conv_event_content_type
|
||||
CHECK (content_type IN ('text','photo','document','command','callback_query')),
|
||||
CONSTRAINT uix_conv_event_dedup
|
||||
UNIQUE (project_id, channel_type, provider_event_id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_conv_event_run
|
||||
ON awooop_conversation_event (project_id, run_id, received_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_conv_event_subject
|
||||
ON awooop_conversation_event (project_id, platform_subject_id, received_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_conv_event_recent
|
||||
ON awooop_conversation_event (project_id, channel_type, received_at DESC);
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- 2. awooop_outbound_message — 出站訊息記錄(interim + final reply)
|
||||
-- 目的:追蹤 AwoooP 發出的每一條訊息(shadow 不發、canary/active 發)
|
||||
-- Progressive Feedback Policy:WAITING_TOOL 超過 30s → 發 interim message
|
||||
-- ---------------------------------------------------------------------------
|
||||
CREATE TABLE IF NOT EXISTS awooop_outbound_message (
|
||||
message_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
project_id VARCHAR(64) NOT NULL
|
||||
REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
|
||||
run_id UUID NOT NULL, -- FK soft
|
||||
conversation_event_id UUID, -- 觸發訊息的入站 event
|
||||
-- 出站目的地
|
||||
channel_type VARCHAR(32) NOT NULL,
|
||||
channel_chat_id VARCHAR(256) NOT NULL,
|
||||
-- 訊息分類
|
||||
message_type VARCHAR(32) NOT NULL, -- 'interim' | 'final' | 'error' | 'approval_request'
|
||||
-- 內容(只存 hash,不存明文)
|
||||
content_hash VARCHAR(64), -- sha256(rendered_content)
|
||||
content_preview VARCHAR(256), -- 前 256 字元(無 PII/secret)
|
||||
-- provider 回報的 message_id(Telegram: message.message_id)
|
||||
provider_message_id VARCHAR(64),
|
||||
-- 狀態
|
||||
send_status VARCHAR(16) NOT NULL DEFAULT 'pending', -- 'pending'|'sent'|'failed'|'shadow'
|
||||
send_error TEXT,
|
||||
-- 時間
|
||||
queued_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
sent_at TIMESTAMPTZ,
|
||||
-- Progressive Feedback Policy(WAITING_TOOL 超 30s 觸發 interim)
|
||||
triggered_by_state VARCHAR(32), -- 觸發本訊息的 run state('waiting_tool'等)
|
||||
waiting_since TIMESTAMPTZ, -- 開始等待的時間(計算 30s 超時用)
|
||||
|
||||
CONSTRAINT chk_outbound_channel_type
|
||||
CHECK (channel_type IN ('telegram','line','slack','api','internal')),
|
||||
CONSTRAINT chk_outbound_message_type
|
||||
CHECK (message_type IN ('interim','final','error','approval_request')),
|
||||
CONSTRAINT chk_outbound_send_status
|
||||
CHECK (send_status IN ('pending','sent','failed','shadow'))
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_outbound_msg_run
|
||||
ON awooop_outbound_message (project_id, run_id, queued_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_outbound_msg_pending
|
||||
ON awooop_outbound_message (project_id, channel_type, queued_at)
|
||||
WHERE send_status = 'pending';
|
||||
|
||||
-- Progressive Feedback Policy 查詢:找等待超過 30s 的 runs
|
||||
CREATE INDEX IF NOT EXISTS idx_outbound_msg_waiting
|
||||
ON awooop_outbound_message (project_id, triggered_by_state, waiting_since)
|
||||
WHERE triggered_by_state = 'waiting_tool' AND send_status = 'pending';
|
||||
|
||||
-- =============================================================================
|
||||
-- Row Level Security
|
||||
-- =============================================================================
|
||||
|
||||
ALTER TABLE awooop_conversation_event ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_outbound_message ENABLE ROW LEVEL SECURITY;
|
||||
|
||||
ALTER TABLE awooop_conversation_event FORCE ROW LEVEL SECURITY;
|
||||
ALTER TABLE awooop_outbound_message FORCE ROW LEVEL SECURITY;
|
||||
|
||||
-- Tenant isolation: a session may only access rows whose project_id equals
-- its app.project_id setting.
-- NOTE(review): the "IS NULL" branch opens all rows to a session that never
-- set app.project_id, and no TO clause restricts the policy to a role.
-- Confirm this fail-open behaviour is intended.
-- DROP POLICY IF EXISTS keeps the migration re-runnable: CREATE POLICY has no
-- IF NOT EXISTS form and would abort the whole transaction on a second run.
DROP POLICY IF EXISTS conv_event_tenant_isolation ON awooop_conversation_event;
CREATE POLICY conv_event_tenant_isolation ON awooop_conversation_event
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );

DROP POLICY IF EXISTS outbound_msg_tenant_isolation ON awooop_outbound_message;
CREATE POLICY outbound_msg_tenant_isolation ON awooop_outbound_message
    USING (
        project_id = current_setting('app.project_id', TRUE)
        OR current_setting('app.project_id', TRUE) IS NULL
    );
|
||||
|
||||
COMMIT;
|
||||
173
apps/api/migrations/embedding_bge_m3_1024.sql
Normal file
173
apps/api/migrations/embedding_bge_m3_1024.sql
Normal file
@@ -0,0 +1,173 @@
|
||||
-- ADR-110 GCP-A Primary Embedding 升級:nomic-embed-text 768 → bge-m3 1024 維
|
||||
-- 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
--
|
||||
-- 背景:
|
||||
-- GCP-A (34.143.170.20) 無 nomic-embed-text,改用 bge-m3:latest(專用 embedding 模型)
|
||||
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
|
||||
--
|
||||
-- 影響範圍:
|
||||
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
|
||||
-- 2. rag_chunks.embedding vector(768) → vector(1024)
|
||||
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
|
||||
--
|
||||
-- 遷移策略:僅在欄位不是 vector(1024) 時清空現有向量資料,切換維度後由 re-embed script 重新嵌入
|
||||
-- 已經是 vector(1024) 的環境重跑本 migration 時,必須保留既有向量資料。
|
||||
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
|
||||
--
|
||||
-- 執行前置條件:
|
||||
-- 1. pgvector >= 0.5.0 (已滿足)
|
||||
-- 2. 確認現有向量資料是否需要備份(重要 playbook 建議先備份)
|
||||
-- 3. embedding service 已切換到 bge-m3(models.json v1.4.0)
|
||||
--
|
||||
-- 回滾方式:執行 embedding_rollback_768.sql(需重新嵌入至 nomic-embed-text 格式)
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- 1. knowledge_entries: back up the old vectors, clear them, and change the
--    column dimension. Skipped entirely when the column is already vector(1024).
DO $$
DECLARE
    v_dim integer;  -- atttypmod of the embedding column (the configured vector dimension)
BEGIN
    -- Resolve the table through search_path, i.e. the same relation the
    -- unqualified ALTER TABLE below acts on. Matching pg_class.relname alone
    -- could pick up a same-named relation in another schema.
    SELECT a.atttypmod INTO v_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('knowledge_entries')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    -- IS DISTINCT FROM also takes this branch when the lookup found nothing
    -- (v_dim IS NULL), so a missing table fails loudly in the statements below.
    IF v_dim IS DISTINCT FROM 1024 THEN
        -- Keep a text copy of the old vectors; they cannot be cast to the new dimension.
        EXECUTE $sql$
            CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
            SELECT
                id,
                embedding::text AS embedding_768,
                NOW() AS backed_up_at
            FROM knowledge_entries
            WHERE embedding IS NOT NULL
        $sql$;

        -- USING NULL discards every existing value while changing the type.
        EXECUTE $sql$
            ALTER TABLE knowledge_entries
            ALTER COLUMN embedding TYPE vector(1024)
            USING NULL
        $sql$;

        RAISE NOTICE 'knowledge_entries.embedding migrated from vector(%) to vector(1024); old embeddings were backed up and cleared', v_dim;
    ELSE
        RAISE NOTICE 'knowledge_entries.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;
|
||||
|
||||
COMMENT ON COLUMN knowledge_entries.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
|
||||
|
||||
|
||||
-- 2. rag_chunks: clear the vectors and change the column dimension.
--    The ivfflat index has to be dropped before the column type can change.
--    Skipped entirely when the column is already vector(1024).
DO $$
DECLARE
    v_dim integer;  -- atttypmod of the embedding column (the configured vector dimension)
BEGIN
    -- Resolve the table through search_path, i.e. the same relation the
    -- unqualified ALTER TABLE below acts on. Matching pg_class.relname alone
    -- could pick up a same-named relation in another schema.
    SELECT a.atttypmod INTO v_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('rag_chunks')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    IF v_dim IS DISTINCT FROM 1024 THEN
        EXECUTE 'DROP INDEX IF EXISTS idx_rag_chunks_embedding';

        -- USING NULL discards every existing value while changing the type.
        EXECUTE $sql$
            ALTER TABLE rag_chunks
            ALTER COLUMN embedding TYPE vector(1024)
            USING NULL
        $sql$;

        RAISE NOTICE 'rag_chunks.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
    ELSE
        RAISE NOTICE 'rag_chunks.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;
|
||||
|
||||
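-- Rebuild the ivfflat index (lists = 100 suits roughly 10k rows or fewer).
-- NOTE: when the column was just cleared above, this index is built on empty
-- data; REINDEX it after the re-embed script has repopulated the column.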
|
||||
CREATE INDEX IF NOT EXISTS idx_rag_chunks_embedding
|
||||
ON rag_chunks
|
||||
USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 100);
|
||||
|
||||
COMMENT ON COLUMN rag_chunks.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
|
||||
|
||||
|
||||
-- 3. playbook_embeddings: clear the vectors and change the column dimension.
--    Skipped entirely when the column is already vector(1024).
DO $$
DECLARE
    v_dim integer;  -- atttypmod of the embedding column (the configured vector dimension)
BEGIN
    -- Resolve the table through search_path, i.e. the same relation the
    -- unqualified ALTER TABLE below acts on. Matching pg_class.relname alone
    -- could pick up a same-named relation in another schema.
    SELECT a.atttypmod INTO v_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('playbook_embeddings')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    IF v_dim IS DISTINCT FROM 1024 THEN
        -- Drop the vector index first so the column type can be altered.
        EXECUTE 'DROP INDEX IF EXISTS ix_playbook_embeddings_vec';

        -- USING NULL discards every existing value while changing the type.
        EXECUTE $sql$
            ALTER TABLE playbook_embeddings
            ALTER COLUMN embedding TYPE vector(1024)
            USING NULL
        $sql$;

        RAISE NOTICE 'playbook_embeddings.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
    ELSE
        RAISE NOTICE 'playbook_embeddings.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec
|
||||
ON playbook_embeddings
|
||||
USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 100);
|
||||
|
||||
COMMENT ON COLUMN playbook_embeddings.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
|
||||
|
||||
COMMENT ON TABLE playbook_embeddings IS
|
||||
'Playbook 向量索引 — ADR-110 GCP-A bge-m3 1024 維 (2026-05-04)';
|
||||
|
||||
|
||||
-- 4. Verify the migration result: all three embedding columns must now be
--    vector(1024); otherwise raise and roll the whole transaction back.
DO $$
DECLARE
    v_km_dim  integer;  -- knowledge_entries.embedding dimension
    v_rag_dim integer;  -- rag_chunks.embedding dimension
    v_pb_dim  integer;  -- playbook_embeddings.embedding dimension
BEGIN
    -- to_regclass resolves each name through search_path and yields NULL for a
    -- missing table, which leaves the matching variable NULL.
    SELECT a.atttypmod INTO v_km_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('knowledge_entries')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    SELECT a.atttypmod INTO v_rag_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('rag_chunks')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    SELECT a.atttypmod INTO v_pb_dim
      FROM pg_attribute a
     WHERE a.attrelid = to_regclass('playbook_embeddings')
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    -- pgvector atttypmod stores the configured dimension.
    -- IS DISTINCT FROM is used instead of != so that a NULL (column not found)
    -- fails the check; "NULL != 1024" evaluates to NULL and would be skipped.
    IF v_km_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗:expected 1024, got %', v_km_dim;
    END IF;
    IF v_rag_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1024, got %', v_rag_dim;
    END IF;
    IF v_pb_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1024, got %', v_pb_dim;
    END IF;

    RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)';
END $$;
|
||||
|
||||
COMMIT;
|
||||
@@ -25,7 +25,7 @@
|
||||
"log_anomaly": "deepseek-r1:14b",
|
||||
"nemoclaw": "deepseek-r1:14b",
|
||||
"playbook_draft": "qwen3:14b",
|
||||
"code_review": "qwen2.5-coder:32b",
|
||||
"code_review": "qwen2.5-coder:7b",
|
||||
"embedding": "bge-m3:latest",
|
||||
"rag_generate": "qwen3:14b",
|
||||
"image_analysis": "minicpm-v:latest",
|
||||
@@ -175,7 +175,7 @@
|
||||
},
|
||||
"pr_code_review": {
|
||||
"phase": 32,
|
||||
"model": "qwen2.5-coder:32b",
|
||||
"model": "qwen2.5-coder:7b",
|
||||
"timeout_seconds": 120,
|
||||
"purpose": "Gitea PR 自動審查"
|
||||
},
|
||||
|
||||
113
apps/api/scripts/awooop_phase1_batch1_backfill.py
Normal file
113
apps/api/scripts/awooop_phase1_batch1_backfill.py
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AwoooP Phase 1 Batch 1 回填腳本
|
||||
================================
|
||||
對 incidents / knowledge_entries / playbooks / audit_logs 四張表
|
||||
分批將 project_id IS NULL 的列回填為 'awoooi'。
|
||||
|
||||
前置條件:
|
||||
awooop_phase1_batch1_rls_2026-05-04.sql Step A(ADD COLUMN nullable)已執行
|
||||
|
||||
執行方式:
|
||||
export DATABASE_URL="postgresql+asyncpg://awoooi:<password>@192.168.0.188:5432/awoooi_prod"
|
||||
cd apps/api && python scripts/awooop_phase1_batch1_backfill.py
|
||||
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(ADR-118 Batch 1 C-3 修正)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
DATABASE_URL = os.environ["DATABASE_URL"]
|
||||
|
||||
TABLES = [
|
||||
("incidents", "incident_id"),
|
||||
("knowledge_entries", "id"),
|
||||
("playbooks", "id"),
|
||||
("audit_logs", "id"),
|
||||
]
|
||||
|
||||
BATCH_SIZE = 5000
|
||||
SLEEP_MS = 100 # 批次間休眠 ms,降低對正常流量的影響
|
||||
|
||||
|
||||
async def count_nulls(conn, table: str) -> int:
    """Return how many rows of ``table`` have a NULL ``project_id``.

    ``table`` is interpolated straight into the SQL text, so it must be a
    trusted identifier rather than user input.
    """
    query = text(f"SELECT count(*) FROM {table} WHERE project_id IS NULL")  # noqa: S608
    outcome = await conn.execute(query)
    return outcome.scalar()
|
||||
|
||||
|
||||
async def backfill_table(engine, table: str, pk_col: str) -> int:
    """Set ``project_id = 'awoooi'`` on every NULL row of ``table``, in batches.

    Each batch of up to BATCH_SIZE rows runs in its own transaction and skips
    rows locked by other sessions. The loop ends on the first batch that
    updates nothing, so rows that stay locked may be left behind.

    Returns the total number of rows updated.
    """
    # table / pk_col are interpolated into the SQL text; they must be trusted
    # identifiers.
    statement = text(f"""
        UPDATE {table}
        SET project_id = 'awoooi'
        WHERE {pk_col} IN (
            SELECT {pk_col} FROM {table}
            WHERE project_id IS NULL
            LIMIT :batch_size
            FOR UPDATE SKIP LOCKED
        )
    """)
    params = {"batch_size": BATCH_SIZE}
    pause_seconds = SLEEP_MS / 1000

    filled = 0
    print(f"\n[{table}] 開始回填...")

    while True:
        # One transaction per batch keeps row locks short-lived.
        async with engine.begin() as conn:
            outcome = await conn.execute(statement, params)
            batch_rows = outcome.rowcount

        if batch_rows == 0:
            break
        filled += batch_rows

        print(f" [{table}] 已回填 {filled} 筆...")
        # Pause between batches to reduce impact on regular traffic.
        await asyncio.sleep(pause_seconds)

    print(f" [{table}] 回填完成,共 {filled} 筆")
    return filled
|
||||
|
||||
|
||||
async def verify(engine) -> bool:
    """Print the remaining NULL ``project_id`` count for every table in TABLES.

    Returns True only when every table reports zero NULL rows.
    """
    print("\n=== 驗收確認 ===")
    all_clean = True
    async with engine.connect() as conn:
        for table, _pk in TABLES:
            null_count = await count_nulls(conn, table)
            clean = null_count == 0
            marker = "✅" if clean else "❌"
            print(f" {marker} {table}: {null_count} 筆 NULL project_id")
            all_clean = all_clean and clean
    return all_clean
|
||||
|
||||
|
||||
async def main():
    """Backfill every table in TABLES, verify the result, and print a summary.

    The engine is disposed in a ``finally`` block so its connection pool is
    released even when a backfill or the verification step raises.
    """
    print("=" * 60)
    print("AwoooP Phase 1 Batch 1 Backfill")
    print("=" * 60)

    engine = create_async_engine(DATABASE_URL, echo=False)
    try:
        t0 = time.monotonic()

        for table, pk_col in TABLES:
            await backfill_table(engine, table, pk_col)

        passed = await verify(engine)
        elapsed = time.monotonic() - t0

        print(f"\n{'✅ 所有表回填完成' if passed else '❌ 仍有 NULL,請重跑'}")
        print(f"耗時:{elapsed:.1f}s")
        print()
        if passed:
            print("下一步:執行 awooop_phase1_batch1_rls_2026-05-04.sql 的 Step C")
        else:
            print("⚠️ 請確認無長 transaction 持有 SKIP LOCKED 的列後重跑")
    finally:
        # Always release pooled connections, including on failure.
        await engine.dispose()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
187
apps/api/scripts/reembed_bge_m3.py
Normal file
187
apps/api/scripts/reembed_bge_m3.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Re-embed Script: bge-m3:latest 1024 維重新嵌入
|
||||
===============================================
|
||||
遷移 embedding_bge_m3_1024.sql 後執行,重新嵌入:
|
||||
1. rag_chunks(embedding IS NULL 的筆數)
|
||||
2. playbook_embeddings(embedding IS NULL 的筆數)
|
||||
|
||||
用法:
|
||||
cd apps/api
|
||||
python scripts/reembed_bge_m3.py [--dry-run] [--batch 50]
|
||||
|
||||
前置條件:
|
||||
1. embedding_bge_m3_1024.sql 已執行(schema 已升為 vector(1024))
|
||||
2. GCP-A Ollama (34.143.170.20:11434) 可連線且有 bge-m3:latest
|
||||
3. DATABASE_URL 環境變數已設定(或 .env 存在)
|
||||
|
||||
2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP-A Primary Embedding 升級
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 確保 src 在 import 路徑
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
logging = structlog.get_logger(__name__)
|
||||
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://34.143.170.20:11434")
|
||||
EMBEDDING_MODEL = "bge-m3:latest"
|
||||
EXPECTED_DIM = 1024
|
||||
|
||||
|
||||
async def embed_text(client: httpx.AsyncClient, text: str) -> list[float]:
    """Embed a single text by POSTing it to the Ollama ``/api/embeddings`` endpoint.

    Args:
        client: HTTP client used to send the request.
        text: Text to embed; sent as the request's ``prompt``.

    Returns:
        The ``embedding`` list from the JSON response.

    Raises:
        ValueError: If the embedding length is not EXPECTED_DIM (a response
            without an ``embedding`` key counts as length 0).
        Exception: Whatever ``raise_for_status`` raises for an error response.
    """
    response = await client.post(
        f"{OLLAMA_URL}/api/embeddings",
        json={"model": EMBEDDING_MODEL, "prompt": text},
        timeout=60.0,
    )
    response.raise_for_status()

    payload = response.json()
    vector = payload.get("embedding", [])
    got = len(vector)
    if got == EXPECTED_DIM:
        return vector
    raise ValueError(f"bge-m3 維度錯誤: got {got}, expected {EXPECTED_DIM}")
|
||||
|
||||
|
||||
async def reembed_rag_chunks(
    conn: asyncpg.Connection,
    client: httpx.AsyncClient,
    batch_size: int,
    dry_run: bool,
) -> int:
    """Re-embed every ``rag_chunks`` row whose ``embedding`` is NULL.

    Rows are read in pages of ``batch_size`` using keyset pagination on
    ``id`` and processed until no candidate rows remain. Paging by ``id``
    (instead of re-running the same ``IS NULL`` query) guarantees forward
    progress even when rows stay NULL, i.e. in dry-run mode or after a
    per-row failure.

    Args:
        conn: Open asyncpg connection used for both reads and updates.
        client: HTTP client passed through to ``embed_text``.
        batch_size: Number of rows fetched per page (values < 1 become 1).
        dry_run: When true, embeddings are computed but nothing is written.

    Returns:
        Number of rows embedded successfully (written unless ``dry_run``).
    """
    page_size = max(1, batch_size)
    # Stop early when the embedding backend looks down, rather than waiting
    # for a 60 s timeout on every remaining row.
    max_consecutive_failures = 10

    first_page_sql = (
        "SELECT id, content FROM rag_chunks "
        "WHERE embedding IS NULL ORDER BY id LIMIT $1"
    )
    next_page_sql = (
        "SELECT id, content FROM rag_chunks "
        "WHERE embedding IS NULL AND id > $1 ORDER BY id LIMIT $2"
    )

    done = 0
    scanned = 0
    consecutive_failures = 0
    last_id = None

    while True:
        if last_id is None:
            rows = await conn.fetch(first_page_sql, page_size)
        else:
            rows = await conn.fetch(next_page_sql, last_id, page_size)
        if not rows:
            break

        last_id = rows[-1]["id"]
        scanned += len(rows)

        for row in rows:
            try:
                vec = await embed_text(client, row["content"])
                if not dry_run:
                    vec_str = "[" + ",".join(f"{v:.8f}" for v in vec) + "]"
                    await conn.execute(
                        "UPDATE rag_chunks SET embedding = $1::vector WHERE id = $2",
                        vec_str, row["id"],
                    )
                done += 1
                consecutive_failures = 0
                if done % 10 == 0:
                    logging.info("rag_chunks_progress", done=done, scanned=scanned)
            except Exception as e:
                # Best effort: one bad row must not stop the whole migration.
                consecutive_failures += 1
                logging.error("rag_chunk_embed_failed", id=row["id"], error=str(e))
                if consecutive_failures >= max_consecutive_failures:
                    logging.error(
                        "rag_chunks_aborted",
                        consecutive_failures=consecutive_failures,
                        done=done,
                        scanned=scanned,
                    )
                    return done

    if scanned == 0:
        logging.info("rag_chunks_all_embedded")
    return done
|
||||
|
||||
|
||||
async def reembed_playbook_embeddings(
    conn: asyncpg.Connection,
    client: httpx.AsyncClient,
    batch_size: int,
    dry_run: bool,
) -> int:
    """Re-embed every ``playbook_embeddings`` row whose ``embedding`` is NULL.

    The text to embed is rebuilt from the joined ``playbooks`` row: title,
    description and steps, joined by newlines with empty parts dropped.
    Rows are read in pages of ``batch_size`` using keyset pagination on
    ``playbook_id`` until none remain, so progress is guaranteed even when
    rows stay NULL (dry-run mode or per-row failures).

    Args:
        conn: Open asyncpg connection used for both reads and updates.
        client: HTTP client passed through to ``embed_text``.
        batch_size: Number of rows fetched per page (values < 1 become 1).
        dry_run: When true, embeddings are computed but nothing is written.

    Returns:
        Number of rows embedded successfully (written unless ``dry_run``).
    """
    page_size = max(1, batch_size)
    # Stop early when the embedding backend looks down, rather than waiting
    # for a 60 s timeout on every remaining row.
    max_consecutive_failures = 10

    # The original content lives in the playbooks table.
    first_page_sql = """
        SELECT pe.playbook_id, p.title, p.description, p.steps
        FROM playbook_embeddings pe
        JOIN playbooks p ON pe.playbook_id = p.id
        WHERE pe.embedding IS NULL
        ORDER BY pe.playbook_id
        LIMIT $1
    """
    next_page_sql = """
        SELECT pe.playbook_id, p.title, p.description, p.steps
        FROM playbook_embeddings pe
        JOIN playbooks p ON pe.playbook_id = p.id
        WHERE pe.embedding IS NULL AND pe.playbook_id > $1
        ORDER BY pe.playbook_id
        LIMIT $2
    """

    done = 0
    scanned = 0
    consecutive_failures = 0
    last_id = None

    while True:
        if last_id is None:
            rows = await conn.fetch(first_page_sql, page_size)
        else:
            rows = await conn.fetch(next_page_sql, last_id, page_size)
        if not rows:
            break

        last_id = rows[-1]["playbook_id"]
        scanned += len(rows)

        for row in rows:
            text_parts = [row["title"] or "", row["description"] or ""]
            if row["steps"]:
                if isinstance(row["steps"], list):
                    text_parts.extend(str(s) for s in row["steps"])
                else:
                    text_parts.append(str(row["steps"]))
            text = "\n".join(p for p in text_parts if p)

            try:
                vec = await embed_text(client, text)
                if not dry_run:
                    vec_str = "[" + ",".join(f"{v:.8f}" for v in vec) + "]"
                    await conn.execute(
                        "UPDATE playbook_embeddings SET embedding = $1::vector WHERE playbook_id = $2",
                        vec_str, row["playbook_id"],
                    )
                done += 1
                consecutive_failures = 0
                if done % 10 == 0:
                    logging.info("playbook_embed_progress", done=done, scanned=scanned)
            except Exception as e:
                # Best effort: one bad row must not stop the whole migration.
                consecutive_failures += 1
                logging.error("playbook_embed_failed", playbook_id=row["playbook_id"], error=str(e))
                if consecutive_failures >= max_consecutive_failures:
                    logging.error(
                        "playbook_embed_aborted",
                        consecutive_failures=consecutive_failures,
                        done=done,
                        scanned=scanned,
                    )
                    return done

    if scanned == 0:
        logging.info("playbook_embeddings_all_embedded")
    return done
|
||||
|
||||
|
||||
async def main(dry_run: bool, batch_size: int) -> None:
    """Run the bge-m3 re-embedding for rag_chunks and playbook_embeddings.

    Steps: resolve ``DATABASE_URL`` (environment first, then the ``.env``
    file next to the ``scripts`` directory), verify the embedding endpoint
    with a probe request, count the pending rows, re-embed both tables and
    print a summary.

    Args:
        dry_run: When true, embeddings are computed but nothing is written.
        batch_size: Batch size forwarded to the per-table re-embed functions.

    Exits with status 1 when no database URL can be found or when the probe
    embedding request fails.
    """
    database_url = os.getenv("DATABASE_URL")
    if not database_url:
        # Fall back to the .env file.
        env_file = Path(__file__).parent.parent / ".env"
        if env_file.exists():
            for line in env_file.read_text(encoding="utf-8").splitlines():
                # Tolerate indentation and a shell-style "export " prefix.
                entry = line.strip()
                if entry.startswith("export "):
                    entry = entry[len("export "):].lstrip()
                if entry.startswith("DATABASE_URL="):
                    database_url = entry.split("=", 1)[1].strip().strip('"\'')
                    break
    if not database_url:
        print("❌ DATABASE_URL 未設定,請設定環境變數或 .env 檔案", file=sys.stderr)
        sys.exit(1)

    # asyncpg only accepts the postgresql:// / postgres:// schemes; strip a
    # SQLAlchemy-style driver suffix such as "postgresql+asyncpg://".
    scheme, sep, rest = database_url.partition("://")
    if sep and "+" in scheme:
        database_url = scheme.split("+", 1)[0] + sep + rest

    if dry_run:
        print("🔍 DRY RUN 模式 — 不會實際更新 DB")

    async with httpx.AsyncClient() as http_client:
        # Verify that bge-m3 is reachable and returns the expected dimension
        # before touching the database.
        print(f"🔗 驗證 GCP-A Ollama ({OLLAMA_URL}) bge-m3 連線...")
        try:
            test_vec = await embed_text(http_client, "連線測試")
            print(f"✅ bge-m3 可用,維度 = {len(test_vec)}")
        except Exception as e:
            print(f"❌ bge-m3 連線失敗: {e}", file=sys.stderr)
            sys.exit(1)

        conn = await asyncpg.connect(database_url)
        try:
            # Count the rows still waiting for an embedding.
            rag_null = await conn.fetchval("SELECT COUNT(*) FROM rag_chunks WHERE embedding IS NULL")
            pb_null = await conn.fetchval("SELECT COUNT(*) FROM playbook_embeddings WHERE embedding IS NULL")
            print(f"📊 待嵌入:rag_chunks={rag_null} 筆,playbook_embeddings={pb_null} 筆")

            if rag_null == 0 and pb_null == 0:
                print("✅ 所有向量已嵌入,無需重新處理")
                return

            rag_done = await reembed_rag_chunks(conn, http_client, batch_size, dry_run)
            pb_done = await reembed_playbook_embeddings(conn, http_client, batch_size, dry_run)

            print(f"{'[DRY RUN] ' if dry_run else ''}✅ 完成: rag_chunks={rag_done}, playbook_embeddings={pb_done}")

            # Do not let the success line hide rows that were not processed
            # (per-row failures or rows the re-embed functions did not reach).
            rag_left = max((rag_null or 0) - rag_done, 0)
            pb_left = max((pb_null or 0) - pb_done, 0)
            if rag_left or pb_left:
                print(
                    f"⚠️ 尚有未處理:rag_chunks={rag_left} 筆,playbook_embeddings={pb_left} 筆,"
                    "請檢查日誌後重跑",
                    file=sys.stderr,
                )
        finally:
            await conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse options and run the async migration.
    parser = argparse.ArgumentParser(description="Re-embed script for bge-m3 1024 維遷移")
    parser.add_argument("--dry-run", action="store_true", help="只統計,不寫 DB")
    parser.add_argument("--batch", type=int, default=50, help="每批次處理筆數")
    args = parser.parse_args()
    # The batch size ends up in a SQL LIMIT: 0 would select nothing (and be
    # reported as "all embedded"), a negative value is a SQL error.
    if args.batch < 1:
        parser.error("--batch must be >= 1")
    asyncio.run(main(dry_run=args.dry_run, batch_size=args.batch))
|
||||
@@ -28,7 +28,7 @@ except ImportError:
|
||||
# ============================================================================
|
||||
|
||||
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434")
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.110:11435")
|
||||
|
||||
if not NVIDIA_API_KEY:
|
||||
print("❌ 請設定 NVIDIA_API_KEY 環境變數")
|
||||
|
||||
@@ -177,34 +177,42 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
interpretation = await interpreter.analyze(report)
|
||||
repo = get_drift_repository()
|
||||
await repo.update_interpretation(report.report_id, interpretation)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: 修根因 — report 是 in-memory 物件,
|
||||
# update_interpretation 只更新 DB,不會回寫 report.interpretation,
|
||||
# 導致 auto_adopt_if_safe 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」條件
|
||||
report.interpretation = interpretation
|
||||
|
||||
# 2026-04-24: 嘗試低風險自動採納
|
||||
auto_adopted = False
|
||||
auto_block_reason = ""
|
||||
from src.core.config import get_settings as _gs
|
||||
_drift_auto_enabled = _gs().DRIFT_AUTO_ADOPT_ENABLED
|
||||
# flag=False 視為「停用」,不設 auto_block_reason 避免誤觸 escalation
|
||||
try:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
auto_result = await adopt_svc.auto_adopt_if_safe(report)
|
||||
if auto_result.get("success"):
|
||||
# 自動採納成功:更新狀態,跳過人工卡片
|
||||
await repo.update_status(
|
||||
report.report_id,
|
||||
DriftStatus.ADOPTED,
|
||||
resolved_at=now_taipei(),
|
||||
)
|
||||
auto_adopted = True
|
||||
_logger.info(
|
||||
"drift_auto_adopted",
|
||||
report_id=report.report_id,
|
||||
pr_url=auto_result.get("pr_url"),
|
||||
)
|
||||
else:
|
||||
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
|
||||
_logger.info(
|
||||
"drift_auto_adopt_skipped",
|
||||
report_id=report.report_id,
|
||||
reason=auto_block_reason,
|
||||
skipped=auto_result.get("skipped", True),
|
||||
)
|
||||
if _drift_auto_enabled:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
auto_result = await adopt_svc.auto_adopt_if_safe(report)
|
||||
if auto_result.get("success"):
|
||||
# 自動採納成功:更新狀態,跳過人工卡片
|
||||
await repo.update_status(
|
||||
report.report_id,
|
||||
DriftStatus.ADOPTED,
|
||||
resolved_at=now_taipei(),
|
||||
)
|
||||
auto_adopted = True
|
||||
_logger.info(
|
||||
"drift_auto_adopted",
|
||||
report_id=report.report_id,
|
||||
pr_url=auto_result.get("pr_url"),
|
||||
)
|
||||
else:
|
||||
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
|
||||
_logger.info(
|
||||
"drift_auto_adopt_skipped",
|
||||
report_id=report.report_id,
|
||||
reason=auto_block_reason,
|
||||
skipped=auto_result.get("skipped", True),
|
||||
)
|
||||
except Exception as e:
|
||||
auto_block_reason = f"auto adopt error: {str(e)[:120]}"
|
||||
_logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e))
|
||||
|
||||
@@ -11,7 +11,7 @@ Endpoints:
|
||||
Components Checked:
|
||||
- PostgreSQL (192.168.0.188:5432)
|
||||
- Redis (192.168.0.188:6380)
|
||||
- Ollama (192.168.0.188:11434)
|
||||
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
|
||||
- OpenClaw (192.168.0.188:8089)
|
||||
- SigNoz (192.168.0.188:3301)
|
||||
"""
|
||||
|
||||
@@ -17,9 +17,10 @@ Phase 6.4 核心功能:
|
||||
- Proposal 必須關聯到 Incident
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from fastapi import APIRouter, HTTPException, Query, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.logging import get_logger
|
||||
@@ -148,18 +149,26 @@ class IncidentTimelineResponse(BaseModel):
|
||||
|
||||
Phase 6.5 升級:
|
||||
- 每個事件自動附帶 decision_token
|
||||
- 確保 UI 永遠有決策可操作
|
||||
- 雙軌引擎: LLM (主) + Expert System (備)
|
||||
- 預設只讀取已存在的 decision_token
|
||||
- 需要新決策時改由明確的 proposal / operator run 入口觸發
|
||||
""",
|
||||
)
|
||||
async def list_incidents() -> IncidentListResponse:
|
||||
async def list_incidents(
|
||||
generate_missing_decisions: bool = Query(
|
||||
False,
|
||||
description=(
|
||||
"預設 false,列表查詢只讀既有 decision token;"
|
||||
"true 僅供明確維運操作使用,會背景產生缺少的決策。"
|
||||
),
|
||||
),
|
||||
) -> IncidentListResponse:
|
||||
"""
|
||||
取得活躍事件清單
|
||||
|
||||
Phase 6.5: 自動為每個事件生成決策令牌
|
||||
- P0/P1 事件優先處理
|
||||
- 30 秒內保證有決策
|
||||
- LLM 失敗時 Expert System 保底
|
||||
Phase 6.5: 附帶既有決策令牌
|
||||
- 列表查詢必須是低成本純讀路徑
|
||||
- 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw
|
||||
- 需要新決策時,呼叫 POST /api/v1/incidents/{incident_id}/proposal
|
||||
|
||||
Returns:
|
||||
IncidentListResponse: 事件清單與計數 (含決策令牌)
|
||||
@@ -174,8 +183,6 @@ async def list_incidents() -> IncidentListResponse:
|
||||
|
||||
# 按時間排序 (最新優先)
|
||||
# 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題
|
||||
from datetime import UTC
|
||||
|
||||
def safe_created_at(i: Incident) -> float:
|
||||
"""安全取得 timestamp,處理 timezone 混合問題"""
|
||||
dt = i.created_at
|
||||
@@ -189,15 +196,24 @@ async def list_incidents() -> IncidentListResponse:
|
||||
# 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
|
||||
# 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸
|
||||
# 修復: 只取已存在的決策 token,若無則背景觸發生成,前端 poll 單筆 GET 取得結果
|
||||
import asyncio
|
||||
#
|
||||
# 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。
|
||||
# 根因: 多個前端頁面會輪詢 GET /incidents;若列表查詢偷偷 create_task,
|
||||
# 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽,甚至 fallback 到 Gemini。
|
||||
# 新規則: GET list 是純讀;生成新修復建議必須走明確 proposal/operator-run 入口。
|
||||
if generate_missing_decisions:
|
||||
import asyncio
|
||||
|
||||
responses = []
|
||||
background_tasks = []
|
||||
existing_tokens = await decision_manager._find_existing_tokens_for_incidents(
|
||||
[incident.incident_id for incident in incidents]
|
||||
)
|
||||
|
||||
for incident in incidents:
|
||||
try:
|
||||
# 只查已快取的決策 (不等待 AI,立即返回)
|
||||
existing = await decision_manager._find_existing_token(incident.incident_id)
|
||||
existing = existing_tokens.get(incident.incident_id)
|
||||
if existing:
|
||||
decision_info = DecisionInfo(
|
||||
token=existing.token,
|
||||
@@ -207,17 +223,20 @@ async def list_incidents() -> IncidentListResponse:
|
||||
)
|
||||
responses.append(IncidentResponse.from_incident(incident, decision_info))
|
||||
else:
|
||||
# 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll)
|
||||
# 無快取 → 本次返回 None。列表查詢預設不觸發 AI;
|
||||
# 前端若需要修復建議,必須呼叫明確的 proposal 入口。
|
||||
responses.append(IncidentResponse.from_incident(incident, None))
|
||||
if not generate_missing_decisions:
|
||||
continue
|
||||
|
||||
# 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析
|
||||
# 舊 incident token 每小時過期,若不限制會反覆重新分析歷史事件 → Telegram 洪水
|
||||
from datetime import datetime, timezone, timedelta
|
||||
_created = getattr(incident, "created_at", None)
|
||||
_too_old = False
|
||||
if _created:
|
||||
if _created.tzinfo is None:
|
||||
_created = _created.replace(tzinfo=timezone.utc)
|
||||
_too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48))
|
||||
_created = _created.replace(tzinfo=UTC)
|
||||
_too_old = (_created < datetime.now(UTC) - timedelta(hours=48))
|
||||
if not _too_old:
|
||||
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
|
||||
background_tasks.append(
|
||||
@@ -240,6 +259,7 @@ async def list_incidents() -> IncidentListResponse:
|
||||
"incidents_listed",
|
||||
count=len(incidents),
|
||||
with_decisions=sum(1 for r in responses if r.decision is not None),
|
||||
generate_missing_decisions=generate_missing_decisions,
|
||||
)
|
||||
|
||||
return IncidentListResponse(
|
||||
|
||||
27
apps/api/src/api/v1/platform/__init__.py
Normal file
27
apps/api/src/api/v1/platform/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
AwoooP Platform API — Operator Console Router 彙整
|
||||
===================================================
|
||||
Phase 4 Shadow Mode + Phase 8 Operator Console
|
||||
ADR-106/ADR-107/ADR-114/ADR-115/ADR-116
|
||||
2026-05-05 ogt + Claude Sonnet 4.6(新增 Operator Console 四 router)
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from src.api.v1.platform.contracts import router as contracts_router
|
||||
from src.api.v1.platform.events import router as events_router
|
||||
from src.api.v1.platform.operator_runs import router as operator_runs_router
|
||||
from src.api.v1.platform.runs import router as runs_router
|
||||
from src.api.v1.platform.tenants import router as tenants_router
|
||||
|
||||
# Aggregate router for the platform API; every sub-router is attached below.
router = APIRouter()
router.include_router(events_router)
# 2026-05-06 Codex: FastAPI matches routes in registration order. The
# Operator Console's `/runs/list` must be registered before `/runs/{run_id}`,
# otherwise `list` is taken as a run_id and the frontend Run monitoring page
# gets HTTP 422.
router.include_router(operator_runs_router)
router.include_router(runs_router)
router.include_router(tenants_router)
router.include_router(contracts_router)
|
||||
|
||||
__all__ = ["router"]
|
||||
53
apps/api/src/api/v1/platform/contracts.py
Normal file
53
apps/api/src/api/v1/platform/contracts.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
AwoooP Operator Console — Contracts List API
|
||||
=============================================
|
||||
ADR-106(AwoooP Agent Platform),ADR-107/ADR-112(Contract Revision)
|
||||
2026-05-05 ogt + Claude Sonnet 4.6
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from src.services.platform_operator_service import list_contracts as list_contracts_svc
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class ContractItem(BaseModel):
    # One contract revision entry of the GET /contracts response.
    revision_id: UUID
    contract_id: str
    contract_family: str
    # The endpoint's filter description lists draft/published/active/revoked.
    lifecycle_status: str
    body_hash: str
    version_major: int
    version_minor: int
    created_at: datetime
    # Tenant identifier (same value as the project_id query filter).
    project_id: str
|
||||
|
||||
|
||||
class ListContractsResponse(BaseModel):
    # Response envelope of GET /contracts.
    contracts: list[ContractItem]
    # NOTE(review): presumably the number of matching revisions; computed by
    # the service layer, which is not visible here.
    total: int
|
||||
|
||||
|
||||
@router.get(
    "/contracts",
    response_model=ListContractsResponse,
    summary="列出合約 Revisions",
    description=(
        "返回 awooop_contract_revisions,支援 project_id / lifecycle_status filter。\n\n"
        "- 按 created_at DESC 排序,最多 200 筆\n"
        "- ADR-107/ADR-112:append-only revision 表,只查不寫"
    ),
)
async def list_contracts(
    project_id: str | None = Query(default=None, description="租戶 ID(可選)"),
    lifecycle_status: str | None = Query(
        default=None,
        description="lifecycle status filter(draft/published/active/revoked)",
    ),
) -> dict[str, Any]:
    """List contract revisions, optionally filtered by tenant and lifecycle status.

    Pure HTTP forwarding: both filters are handed unchanged to the operator
    service and its result is returned as the response payload.
    """
    filters = {"project_id": project_id, "lifecycle_status": lifecycle_status}
    payload = await list_contracts_svc(**filters)
    return payload
|
||||
58
apps/api/src/api/v1/platform/events.py
Normal file
58
apps/api/src/api/v1/platform/events.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
AwoooP Operator Console — Channel Events API
|
||||
============================================
|
||||
提供 Operator Console 讀取 Communication Hub / legacy mirror 的事件摘要。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from src.services.platform_operator_service import list_recent_channel_events
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class ChannelEventItem(BaseModel):
    # One channel event entry of the GET /events/recent response.
    event_id: UUID
    # Tenant identifier.
    project_id: str
    channel_type: str
    # Event ID as issued by the channel provider; the endpoint can filter on
    # a prefix of this value.
    provider_event_id: str
    channel_chat_id: str | None
    content_preview: str | None
    is_duplicate: bool
    received_at: datetime
|
||||
|
||||
|
||||
class RecentEventsResponse(BaseModel):
    # Response envelope of GET /events/recent.
    events: list[ChannelEventItem]
    total: int
    # NOTE(review): presumably echoes the limit applied to the query; set by
    # the service layer, which is not visible here.
    limit: int
|
||||
|
||||
|
||||
@router.get(
    "/events/recent",
    response_model=RecentEventsResponse,
    summary="列出最近 Channel Events",
    description=(
        "返回 awooop_conversation_event 最近事件。"
        "可用 channel_type / provider_prefix 過濾,例如 alert-group 收斂事件。"
    ),
)
async def list_recent_events(
    project_id: str | None = Query(default=None, description="租戶 ID(可選)"),
    channel_type: str | None = Query(default=None, description="通道類型(可選)"),
    provider_prefix: str | None = Query(default=None, description="provider_event_id 前綴(可選)"),
    limit: int = Query(default=20, ge=1, le=100, description="最多返回筆數"),
) -> dict[str, Any]:
    """Return the most recent channel events for the Operator Console.

    Pure HTTP forwarding: the optional filters and the row limit (validated
    to 1..100 by FastAPI) are handed unchanged to the operator service.
    """
    query_args = {
        "project_id": project_id,
        "channel_type": channel_type,
        "provider_prefix": provider_prefix,
        "limit": limit,
    }
    payload = await list_recent_channel_events(**query_args)
    return payload
|
||||
167
apps/api/src/api/v1/platform/operator_runs.py
Normal file
167
apps/api/src/api/v1/platform/operator_runs.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
AwoooP Operator Console — Runs List & Approval API
|
||||
====================================================
|
||||
GET /runs/list — 列出 runs(可 filter)
|
||||
GET /approvals — 列出待審核 runs(state=waiting_approval)
|
||||
POST /approvals/{run_id}/decide — 核准或拒絕 run
|
||||
ADR-106(AwoooP Agent Platform),ADR-114(Run State Machine),ADR-116(Gate 5 Approval)
|
||||
2026-05-05 ogt + Claude Sonnet 4.6
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any, Literal
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.awooop_operator_auth import (
|
||||
AwoooPOperatorPrincipal,
|
||||
verify_awooop_operator,
|
||||
)
|
||||
from src.services.platform_operator_service import (
|
||||
decide_approval as decide_approval_svc,
|
||||
)
|
||||
from src.services.platform_operator_service import (
|
||||
get_run_detail as get_run_detail_svc,
|
||||
)
|
||||
from src.services.platform_operator_service import (
|
||||
list_approvals as list_approvals_svc,
|
||||
)
|
||||
from src.services.platform_operator_service import (
|
||||
list_runs as list_runs_svc,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_DEFAULT_PER_PAGE = 50
|
||||
_MAX_PER_PAGE = 200
|
||||
|
||||
|
||||
class RunItem(BaseModel):
    # One run entry of the GET /runs/list response.
    run_id: UUID
    # Tenant identifier.
    project_id: str
    agent_id: str
    # Run state name, e.g. "waiting_approval" (see the approvals endpoint).
    state: str
    is_shadow: bool
    cost_usd: Decimal
    step_count: int
    created_at: datetime
    timeout_at: datetime | None
|
||||
|
||||
|
||||
class ListRunsResponse(BaseModel):
    # Paginated response envelope of GET /runs/list.
    runs: list[RunItem]
    total: int
    # Page number; the endpoint's query parameter starts at 1.
    page: int
    per_page: int
|
||||
|
||||
|
||||
class ApprovalItem(BaseModel):
    # One run awaiting manual approval, as listed by GET /approvals.
    run_id: UUID
    # Tenant identifier.
    project_id: str
    agent_id: str
    created_at: datetime
    timeout_at: datetime | None
|
||||
|
||||
|
||||
class ListApprovalsResponse(BaseModel):
    # Response envelope of GET /approvals.
    items: list[ApprovalItem]
    total: int
|
||||
|
||||
|
||||
class DecideApprovalRequest(BaseModel):
    # Request body of POST /approvals/{run_id}/decide.
    project_id: str = Field(..., description="租戶 ID")
    decision: Literal["approve", "reject"] = Field(..., description="核准或拒絕")
    # Accepted for backward compatibility only: the endpoint ignores it and
    # takes the approver from the authenticated operator principal.
    approver_id: str | None = Field(
        default=None,
        description="Deprecated. Ignored; approver comes from trusted operator headers.",
    )
    reason: str | None = Field(None, description="決策原因(可選)")
|
||||
|
||||
|
||||
class DecideApprovalResponse(BaseModel):
    # Response of POST /approvals/{run_id}/decide.
    run_id: str
    decision: str
    # Run state after the decision was applied.
    new_state: str
    # NOTE(review): presumably only set when an approval token was issued
    # (approve path) — confirm against the service implementation.
    approval_token_jti: str | None
|
||||
|
||||
|
||||
@router.get(
    "/runs/list",
    response_model=ListRunsResponse,
    summary="列出 Runs",
    description=(
        "返回 awooop_run_state 記錄,支援 project_id / state filter 與分頁。\n\n"
        "- 按 created_at DESC 排序\n"
        "- 注意:此路徑為 /runs/list 以避免與 runs.py 的 /runs/{run_id} 衝突"
    ),
)
async def list_runs(
    project_id: str | None = Query(default=None, description="租戶 ID(可選)"),
    state: str | None = Query(default=None, description="Run 狀態 filter(可選)"),
    page: int = Query(default=1, ge=1, description="頁碼,從 1 開始"),
    per_page: int = Query(
        default=_DEFAULT_PER_PAGE,
        ge=1,
        le=_MAX_PER_PAGE,
        description="每頁筆數",
    ),
) -> dict[str, Any]:
    """List runs with optional tenant/state filters and pagination.

    Pure HTTP forwarding: the validated query parameters are handed
    unchanged to the operator service.
    """
    query_args = {
        "project_id": project_id,
        "state": state,
        "page": page,
        "per_page": per_page,
    }
    payload = await list_runs_svc(**query_args)
    return payload
|
||||
|
||||
|
||||
@router.get(
    "/runs/{run_id}/detail",
    summary="查詢 Run 詳細時間線",
    description=(
        "返回單一 Run 的主狀態、Step Journal、MCP Gateway audit、"
        "入站 Channel Event 與出站訊息,供 Operator Console 顯示完整處置脈絡。"
    ),
)
async def get_run_detail(
    run_id: str,
    project_id: str | None = Query(default=None, description="租戶 ID(可選)"),
) -> dict[str, Any]:
    """Return the detailed timeline of a single run.

    Pure HTTP forwarding: ``run_id`` is passed on as the raw path string
    together with the optional tenant filter.
    """
    detail = await get_run_detail_svc(run_id=run_id, project_id=project_id)
    return detail
|
||||
|
||||
|
||||
@router.get(
    "/approvals",
    response_model=ListApprovalsResponse,
    summary="列出待審核 Runs",
    description=(
        "返回 state=waiting_approval 的 runs,即需要人工審核的任務清單。\n\n"
        "ADR-116 Gate 5:人工審核關卡"
    ),
)
async def list_approvals(
    project_id: str | None = Query(default=None, description="租戶 ID(可選)"),
    run_id: str | None = Query(default=None, description="Run ID(可選,M8 詳情頁查單筆)"),
) -> dict[str, Any]:
    """List runs that are waiting for manual approval.

    Pure HTTP forwarding: the optional tenant and run filters are handed
    unchanged to the operator service.
    """
    filters = {"project_id": project_id, "run_id": run_id}
    payload = await list_approvals_svc(**filters)
    return payload
|
||||
|
||||
|
||||
@router.post(
    "/approvals/{run_id}/decide",
    response_model=DecideApprovalResponse,
    summary="核准或拒絕 Run",
    description=(
        "對 waiting_approval 狀態的 run 做出審核決定。\n\n"
        "- approve:發行 approval token → record_approval → run 轉為 running\n"
        "- reject:直接 transition → cancelled\n\n"
        "ADR-116 Gate 5:Operator Console 人工審核"
    ),
)
async def decide_approval(
    run_id: str,
    body: DecideApprovalRequest,
    operator: AwoooPOperatorPrincipal = Depends(verify_awooop_operator),
) -> dict[str, Any]:
    """Approve or reject a run on behalf of the authenticated operator.

    The approver identity always comes from the verified operator principal;
    ``body.approver_id`` is deliberately not read.
    """
    approver = operator.operator_id
    outcome = await decide_approval_svc(
        run_id=run_id,
        project_id=body.project_id,
        decision=body.decision,
        approver_id=approver,
        reason=body.reason,
    )
    return outcome
|
||||
149
apps/api/src/api/v1/platform/runs.py
Normal file
149
apps/api/src/api/v1/platform/runs.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Platform Runs API
|
||||
==================
|
||||
AwoooP Phase 4: POST /v1/platform/runs — Shadow mode run 建立
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(ADR-106/ADR-114)
|
||||
|
||||
禁止碰:
|
||||
- /v1/incidents/ — legacy 路由
|
||||
- /v1/webhooks/ — legacy 路由
|
||||
- Telegram bot handler — legacy 維持
|
||||
|
||||
Shadow mode 保證(Phase 4):
|
||||
- 建立的 run 全部 is_shadow=True
|
||||
- 不發送任何 user-visible response
|
||||
- 不執行任何 destructive tool call
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.services.audit_sink import write_audit
|
||||
from src.services.platform_runtime import create_run
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Request / Response models
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class CreateRunRequest(BaseModel):
    """POST /v1/platform/runs request body"""

    project_id: str = Field(..., description="租戶 ID")
    agent_id: str = Field(..., description="執行此 run 的 agent ID")
    # Restricted by regex to the five known trigger sources.
    trigger_type: str = Field(
        ...,
        pattern="^(channel_event|schedule|api|sub_agent|retry)$",
        description="觸發來源類型",
    )
    trigger_ref: str | None = Field(None, description="觸發來源 ref(channel_event_id 等)")
    input_payload: dict[str, Any] | None = Field(None, description="Run 輸入 payload")
    # channel_type and provider_event_id are the inputs used for
    # idempotency / de-duplication (see the field descriptions).
    channel_type: str | None = Field(None, description="Channel 類型(idempotency 用)")
    provider_event_id: str | None = Field(
        None, max_length=256,
        description="Channel provider 原始事件 ID(idempotency 去重用)",
    )
    # Defaults to 600 seconds; accepted range is 30..3600 seconds.
    timeout_seconds: int = Field(600, ge=30, le=3600, description="Run 超時秒數")
|
||||
|
||||
|
||||
class CreateRunResponse(BaseModel):
    """POST /v1/platform/runs response"""

    run_id: str
    # True means an idempotency hit: the ID of an existing run is returned.
    is_duplicate: bool = Field(description="True = 冪等命中,返回既有 run_id")
    # Always True in Phase 4 (shadow mode).
    is_shadow: bool = Field(True, description="Phase 4 固定 True")
    message: str
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Routes
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post(
    "/runs",
    response_model=CreateRunResponse,
    status_code=status.HTTP_202_ACCEPTED,
    summary="建立 Platform Run(Shadow Mode)",
    description=(
        "AwoooP Phase 4 Shadow Mode:建立新 run,非同步執行。\n\n"
        "- `is_shadow=true`:不產生任何 user-visible response\n"
        "- `is_duplicate=true`:冪等命中,返回既有 run_id(不建立新 run)\n"
        "- provider_event_id + channel_type 構成冪等 key(24h 視窗)"
    ),
)
async def create_platform_run(
    request: CreateRunRequest,
) -> CreateRunResponse:
    """Create a shadow run and record an audit entry for it.

    Args:
        request: Validated request body describing the run to create.

    Returns:
        The run ID plus a flag telling whether an existing run was returned
        because of an idempotency hit.

    Raises:
        HTTPException: Passed through unchanged when raised by the runtime
            service; any other failure while creating the run becomes a 500.
    """
    try:
        run_id, is_duplicate = await create_run(
            project_id=request.project_id,
            agent_id=request.agent_id,
            trigger_type=request.trigger_type,
            trigger_ref=request.trigger_ref,
            input_payload=request.input_payload,
            channel_type=request.channel_type,
            provider_event_id=request.provider_event_id,
            timeout_seconds=request.timeout_seconds,
        )
    except HTTPException:
        # Keep deliberate HTTP errors (status and detail) from lower layers
        # instead of masking them as a generic 500.
        raise
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Run 建立失敗: {exc}",
        ) from exc

    # Audit log. NOTE(review): described as non-blocking in the original, but
    # the call is awaited here — confirm that write_audit swallows its own
    # errors, otherwise an audit failure fails an already-created run.
    await write_audit(
        project_id=request.project_id,
        action="run.created",
        resource_type="run",
        resource_id=str(run_id),
        details={
            "agent_id": request.agent_id,
            "trigger_type": request.trigger_type,
            "is_duplicate": is_duplicate,
            "is_shadow": True,
        },
    )

    return CreateRunResponse(
        run_id=str(run_id),
        is_duplicate=is_duplicate,
        is_shadow=True,
        message="Run 已接受(shadow mode)" if not is_duplicate else "冪等命中,返回既有 run_id",
    )
|
||||
|
||||
|
||||
@router.get(
    "/runs/{run_id}",
    summary="查詢 Run 狀態",
)
async def get_run_status(run_id: str, project_id: str) -> dict[str, Any]:
    """查詢單一 run 的 FSM 狀態。"""
    # The docstring above is kept verbatim: with no explicit description= on
    # the route, FastAPI publishes it as the OpenAPI description.
    # Responds 422 when run_id is not a valid UUID and 404 when the service
    # finds no run for this run_id / project_id pair.
    from src.services.platform_runtime import get_run_status as _svc_get_run_status

    try:
        parsed_id = uuid.UUID(run_id)
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"run_id 格式錯誤: {exc}",
        ) from exc

    run_state = await _svc_get_run_status(parsed_id, project_id)
    if run_state is not None:
        return run_state

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"run {run_id!r} 不存在",
    )
|
||||
47
apps/api/src/api/v1/platform/tenants.py
Normal file
47
apps/api/src/api/v1/platform/tenants.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
AwoooP Operator Console — Tenants List API
|
||||
==========================================
|
||||
ADR-106(AwoooP Agent Platform),ADR-115(Tenant Onboarding)
|
||||
2026-05-05 ogt + Claude Sonnet 4.6
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
|
||||
from src.services.platform_operator_service import list_tenants as list_tenants_svc
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class TenantItem(BaseModel):
    # One tenant entry of the GET /tenants response.
    project_id: str
    display_name: str
    migration_mode: str
    budget_limit_usd: Decimal | None
    # The endpoint also returns deactivated tenants, so this may be False.
    is_active: bool
    created_at: datetime
|
||||
|
||||
|
||||
class ListTenantsResponse(BaseModel):
    # Response envelope of GET /tenants.
    tenants: list[TenantItem]
    total: int
|
||||
|
||||
|
||||
@router.get(
    "/tenants",
    response_model=ListTenantsResponse,
    summary="列出所有租戶",
    description=(
        "返回所有 awooop_projects 記錄(含已停用)。\n\n"
        "ADR-106/ADR-115:Operator Console 使用,不依 RLS 過濾。"
    ),
)
async def list_tenants() -> dict[str, Any]:
    """List every tenant for the Operator Console.

    Pure HTTP forwarding: the operator service result is returned as is.
    """
    payload = await list_tenants_svc()
    return payload
|
||||
@@ -8,9 +8,10 @@ leWOOOgo 原則: Router 只做 HTTP 轉發,業務邏輯在 KnowledgeRAGService
|
||||
建立者: Claude Code (Phase 33 ADR-067)
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
||||
from fastapi import APIRouter, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.services.knowledge_rag_service import get_knowledge_rag_service
|
||||
|
||||
router = APIRouter(prefix="/rag", tags=["RAG Knowledge Base"])
|
||||
@@ -43,9 +44,10 @@ async def trigger_index(background_tasks: BackgroundTasks) -> RagIndexResponse:
|
||||
- .agents/skills/*.md
|
||||
"""
|
||||
background_tasks.add_task(_run_index)
|
||||
model = get_settings().OLLAMA_EMBEDDING_MODEL
|
||||
return RagIndexResponse(
|
||||
status="accepted",
|
||||
message="索引已排程,背景執行中(nomic-embed-text @ Ollama 111)",
|
||||
message=f"索引已排程,背景執行中({model} @ Ollama GCP-A/GCP-B/111)",
|
||||
)
|
||||
|
||||
|
||||
@@ -76,15 +78,16 @@ async def rag_debug() -> dict:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as c:
|
||||
from src.core.config import get_settings as _gs
|
||||
settings = _gs()
|
||||
r = await c.post(
|
||||
f"{_gs().OLLAMA_URL}/api/embeddings",
|
||||
json={"model": "nomic-embed-text", "prompt": "test"},
|
||||
f"{settings.OLLAMA_URL}/api/embeddings",
|
||||
json={"model": settings.OLLAMA_EMBEDDING_MODEL, "prompt": "test"},
|
||||
)
|
||||
ollama_ok = r.status_code == 200 if r.status_code == 200 else f"http_{r.status_code}"
|
||||
except Exception as e:
|
||||
ollama_ok = f"error: {type(e).__name__}: {e}"
|
||||
|
||||
return {"cwd": os.getcwd(), "paths": paths_check, "ollama_111_embed": ollama_ok}
|
||||
return {"cwd": os.getcwd(), "paths": paths_check, "ollama_embedding": ollama_ok}
|
||||
|
||||
|
||||
@router.get("/stats", summary="索引統計")
|
||||
|
||||
@@ -33,13 +33,8 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
|
||||
from src.services.alert_rule_engine import get_incident_type, match_rule
|
||||
from src.services.action_parser import is_safe_kubectl_action
|
||||
from src.core.logging import get_logger
|
||||
from src.core.metrics import record_alert_chain_success
|
||||
|
||||
# Phase 15.2: Trace Context (moved to SignalProducerService)
|
||||
# get_trace_context 已移至 Service 層
|
||||
from src.models.approval import (
|
||||
ApprovalRequestCreate,
|
||||
BlastRadius,
|
||||
@@ -47,31 +42,40 @@ from src.models.approval import (
|
||||
DryRunCheck,
|
||||
RiskLevel,
|
||||
)
|
||||
|
||||
# R4 #129 (2026-04-01 ogt): AlertPayload/AlertResponse 移至 models 層,AlertAnalyzer 移至 services 層
|
||||
# ogt 更新 v1.1 2026-04-01 台北時間: generate_alert_fingerprint 移至 alert_analyzer_service (ADR-024)
|
||||
# [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
|
||||
from src.models.webhook import AlertPayload, AlertResponse
|
||||
from src.services.action_parser import is_safe_kubectl_action
|
||||
from src.services.alert_analyzer_service import AlertAnalyzer
|
||||
from src.services.alert_approval_guard import guard_alert_approval_action
|
||||
from src.services.alert_grouping_service import get_alert_grouping_service
|
||||
from src.services.alert_rule_engine import get_incident_type, match_rule
|
||||
from src.services.alertmanager_llm_guard import (
|
||||
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
try_acquire_alertmanager_llm_lock,
|
||||
)
|
||||
from src.services.approval_db import get_approval_service
|
||||
from src.services.auto_approve import get_auto_approve_policy
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
from src.services.channel_hub import record_grouped_alert_event
|
||||
|
||||
# Phase 15.2: Trace Context (moved to SignalProducerService)
|
||||
# get_trace_context 已移至 Service 層
|
||||
|
||||
# R4 #129 (2026-04-01 ogt): AlertPayload/AlertResponse 移至 models 層,AlertAnalyzer 移至 services 層
|
||||
# ogt 更新 v1.1 2026-04-01 台北時間: generate_alert_fingerprint 移至 alert_analyzer_service (ADR-024)
|
||||
# [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
|
||||
|
||||
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
|
||||
# C2 修正 (首席架構師審查 2026-04-10): create_incident_for_approval + extract_affected_services 已移入 Service 層
|
||||
from src.services.incident_service import (
|
||||
classify_alert_early,
|
||||
create_incident_for_approval,
|
||||
extract_affected_services,
|
||||
get_incident_service,
|
||||
)
|
||||
from src.services.auto_approve import get_auto_approve_policy
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
|
||||
# Phase 5: OpenClaw AI Engine
|
||||
from src.services.openclaw import get_openclaw
|
||||
from src.services.playbook_match_resolver import resolve_playbook_id_for_alert
|
||||
from src.services.security_interceptor import check_webhook_nonce # P0-06: nonce dedup via Service 層
|
||||
from src.services.signal_producer import SignalData, get_signal_producer
|
||||
|
||||
# Phase 5: Telegram Gateway (行動戰情室)
|
||||
@@ -80,9 +84,6 @@ from src.services.telegram_gateway import TelegramGatewayError, get_telegram_gat
|
||||
# Phase 18.1.7: K8s 資源名稱正規化 已移至 alert_analyzer_service (R4 #129)
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
# ADR-076: 告警聚合引擎 (2026-04-14 Claude Haiku 4.5 Asia/Taipei)
|
||||
from src.services.alert_grouping_service import get_alert_grouping_service
|
||||
|
||||
router = APIRouter(prefix="/webhooks", tags=["Webhooks"])
|
||||
logger = get_logger("awoooi.webhooks")
|
||||
|
||||
@@ -648,6 +649,8 @@ class HMACVerificationError(Exception):
|
||||
async def verify_webhook_signature(
|
||||
request: Request,
|
||||
x_signature_256: str | None = Header(None, alias="X-Signature-256"),
|
||||
x_webhook_timestamp: str | None = Header(None, alias="X-Webhook-Timestamp"),
|
||||
x_webhook_nonce: str | None = Header(None, alias="X-Webhook-Nonce"),
|
||||
) -> bool:
|
||||
"""
|
||||
驗證 Webhook 請求的 HMAC-SHA256 簽章
|
||||
@@ -657,6 +660,11 @@ async def verify_webhook_signature(
|
||||
- 簽章格式: sha256=<hex_digest>
|
||||
- 使用 WEBHOOK_HMAC_SECRET 進行驗證
|
||||
|
||||
ADR-116 Replay 防護(向後相容):
|
||||
- X-Webhook-Timestamp: Unix epoch 秒,若提供則驗證 ±300 秒範圍
|
||||
- X-Webhook-Nonce: 隨機字串,若提供則用 Redis NX 去重(TTL=600s)
|
||||
- 兩個 Header 均可選;過渡期不提供時僅記錄 warning
|
||||
|
||||
安全鐵律 (Fail-Closed):
|
||||
- 生產環境: HMAC Secret 未設定 → 直接拒絕 (不可跳過)
|
||||
- 開發環境: 可跳過驗證 (僅供本地測試)
|
||||
@@ -664,6 +672,8 @@ async def verify_webhook_signature(
|
||||
Args:
|
||||
request: FastAPI Request 物件
|
||||
x_signature_256: X-Signature-256 Header 值
|
||||
x_webhook_timestamp: X-Webhook-Timestamp Header 值(Unix epoch 秒,可選)
|
||||
x_webhook_nonce: X-Webhook-Nonce Header 值(隨機字串,可選)
|
||||
|
||||
Returns:
|
||||
bool: 驗證是否通過
|
||||
@@ -671,6 +681,8 @@ async def verify_webhook_signature(
|
||||
Raises:
|
||||
HMACVerificationError: 簽章驗證失敗
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
# ==========================================================================
|
||||
# Fail-Closed 安全策略 (CISO 要求)
|
||||
# ==========================================================================
|
||||
@@ -725,6 +737,54 @@ async def verify_webhook_signature(
|
||||
raise HMACVerificationError("Invalid signature")
|
||||
|
||||
logger.info("hmac_verification_success")
|
||||
|
||||
# ==========================================================================
|
||||
# ADR-116: Replay 防護(向後相容,HMAC 驗證成功後才執行)
|
||||
# ==========================================================================
|
||||
|
||||
# --- Timestamp 驗證(±300 秒) ---
|
||||
if x_webhook_timestamp is not None:
|
||||
try:
|
||||
req_ts = int(x_webhook_timestamp)
|
||||
now_ts = int(_time.time())
|
||||
skew = abs(now_ts - req_ts)
|
||||
if skew > 300:
|
||||
logger.warning(
|
||||
"webhook_timestamp_out_of_window",
|
||||
request_ts=req_ts,
|
||||
server_ts=now_ts,
|
||||
skew_seconds=skew,
|
||||
)
|
||||
raise HMACVerificationError(
|
||||
f"Timestamp out of acceptable window (skew={skew}s > 300s)"
|
||||
)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"webhook_timestamp_invalid_format",
|
||||
raw_value=x_webhook_timestamp,
|
||||
)
|
||||
raise HMACVerificationError("X-Webhook-Timestamp must be a Unix epoch integer")
|
||||
else:
|
||||
# 過渡期:沒有提供 Timestamp 則記錄 warning 但允許通過
|
||||
logger.warning(
|
||||
"webhook_replay_protection_missing",
|
||||
header="X-Webhook-Timestamp",
|
||||
note="transition period: request allowed but sender should add replay headers",
|
||||
)
|
||||
|
||||
# --- Nonce 去重(透過 security_interceptor.check_webhook_nonce,fail open) ---
|
||||
if x_webhook_nonce is not None:
|
||||
valid = await check_webhook_nonce(x_webhook_nonce)
|
||||
if not valid:
|
||||
raise HMACVerificationError("Nonce replay detected")
|
||||
else:
|
||||
# 過渡期:沒有提供 Nonce 則記錄 warning 但允許通過
|
||||
logger.warning(
|
||||
"webhook_replay_protection_missing",
|
||||
header="X-Webhook-Nonce",
|
||||
note="transition period: request allowed but sender should add replay headers",
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -1087,15 +1147,33 @@ async def receive_alert(
|
||||
data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE)
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
||||
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
|
||||
_alertname_cs1 = str((alert.labels or {}).get("alertname") or alert.alert_type or "")
|
||||
_guarded_action_cs1 = await guard_alert_approval_action(
|
||||
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
alert_namespace=alert.namespace,
|
||||
alertname=_alertname_cs1,
|
||||
alert_category=get_incident_type(_alertname_cs1),
|
||||
)
|
||||
_matched_playbook_id_cs1 = await resolve_playbook_id_for_alert(
|
||||
alertname=_alertname_cs1,
|
||||
affected_services=analysis_result.affected_services
|
||||
or ([alert.target_resource] if alert.target_resource else []),
|
||||
severity=risk_level.value,
|
||||
)
|
||||
if _guarded_action_cs1.blocked:
|
||||
risk_level = RiskLevel.LOW
|
||||
_cmd_cs1 = ""
|
||||
|
||||
_approval_metadata_cs1 = {
|
||||
"source": ai_provider,
|
||||
"confidence_score": analysis_result.confidence,
|
||||
"is_rule_based": False,
|
||||
"playbook_id": None,
|
||||
"playbook_id": _matched_playbook_id_cs1,
|
||||
**_guarded_action_cs1.metadata,
|
||||
}
|
||||
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
action=_guarded_action_cs1.action,
|
||||
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
|
||||
risk_level=risk_level,
|
||||
blast_radius=BlastRadius(
|
||||
@@ -1112,6 +1190,7 @@ async def receive_alert(
|
||||
],
|
||||
requested_by=f"OpenClaw ({ai_provider})",
|
||||
metadata=_approval_metadata_cs1,
|
||||
matched_playbook_id=_matched_playbook_id_cs1,
|
||||
)
|
||||
suggested_action = analysis_result.kubectl_command
|
||||
else:
|
||||
@@ -1158,7 +1237,7 @@ async def receive_alert(
|
||||
# 設計:confidence ≥ 0.85 + 非 CRITICAL + 非破壞性 + 有 kubectl 指令 → 直接執行
|
||||
# 安全防線:CRITICAL / destructive patterns / NO_ACTION/INVESTIGATE/OBSERVE / 空 kubectl → 降級 PENDING
|
||||
if analysis_result:
|
||||
_cs1_kubectl = analysis_result.kubectl_command.strip() if analysis_result.kubectl_command else ""
|
||||
_cs1_kubectl = _cmd_cs1
|
||||
_cs1_can_auto = (
|
||||
bool(_cs1_kubectl)
|
||||
and analysis_result.confidence >= 0.85
|
||||
@@ -1179,7 +1258,7 @@ async def receive_alert(
|
||||
required_signatures=0,
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=risk_level.value,
|
||||
matched_playbook_id=None,
|
||||
matched_playbook_id=_matched_playbook_id_cs1,
|
||||
metadata={
|
||||
**_approval_metadata_cs1,
|
||||
"is_high_confidence": True,
|
||||
@@ -1429,7 +1508,6 @@ async def _process_new_alert_background(
|
||||
str(blast.get("data_impact", "NONE")).upper(),
|
||||
DataImpact.NONE,
|
||||
)
|
||||
rule_action_title = str(rule_response.get("action_title", "人工排查主機告警"))
|
||||
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
|
||||
rule_description = str(rule_response.get("description", message))
|
||||
rule_action = (
|
||||
@@ -1437,13 +1515,31 @@ async def _process_new_alert_background(
|
||||
if rule_kubectl else
|
||||
f"NO_ACTION - {rule_description[:120]}"
|
||||
)
|
||||
_matched_playbook_id_cs2 = await resolve_playbook_id_for_alert(
|
||||
rule_id=str(rule_response.get("rule_id", "")),
|
||||
alertname=alertname,
|
||||
affected_services=[target_resource] if target_resource else [],
|
||||
severity=rule_risk.value,
|
||||
)
|
||||
_guarded_action_cs2 = await guard_alert_approval_action(
|
||||
action=rule_action,
|
||||
alert_namespace=namespace,
|
||||
alertname=alertname,
|
||||
alert_category=alert_category,
|
||||
)
|
||||
if _guarded_action_cs2.blocked:
|
||||
rule_action = _guarded_action_cs2.action
|
||||
rule_kubectl = ""
|
||||
rule_risk = RiskLevel.LOW
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
||||
_approval_metadata_cs2 = {
|
||||
"source": "rule_engine",
|
||||
"confidence_score": float(rule_response.get("confidence", 0.0) or 0.0),
|
||||
"is_rule_based": True,
|
||||
"playbook_id": str(rule_response.get("rule_id", "")) or None,
|
||||
"rule_id": str(rule_response.get("rule_id", "")) or None,
|
||||
"playbook_id": _matched_playbook_id_cs2,
|
||||
**_guarded_action_cs2.metadata,
|
||||
}
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=rule_action,
|
||||
@@ -1474,6 +1570,7 @@ async def _process_new_alert_background(
|
||||
],
|
||||
requested_by="OpenClaw (rule-engine)",
|
||||
metadata=_approval_metadata_cs2,
|
||||
matched_playbook_id=_matched_playbook_id_cs2,
|
||||
)
|
||||
|
||||
approval = await service.create_approval_with_fingerprint(
|
||||
@@ -1524,7 +1621,7 @@ async def _process_new_alert_background(
|
||||
required_signatures=0,
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=rule_risk.value,
|
||||
matched_playbook_id=_approval_metadata_cs2.get("playbook_id"),
|
||||
matched_playbook_id=_matched_playbook_id_cs2,
|
||||
)
|
||||
# 使用 DB 中剛建立的 approval.id 讓 executor 可回寫
|
||||
_auto_approval.id = approval.id
|
||||
@@ -1664,15 +1761,34 @@ async def _process_new_alert_background(
|
||||
data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
||||
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
|
||||
_guarded_action_cs3 = await guard_alert_approval_action(
|
||||
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
alert_namespace=namespace,
|
||||
alertname=alertname,
|
||||
alert_category=alert_category,
|
||||
)
|
||||
_matched_playbook_id_cs3 = await resolve_playbook_id_for_alert(
|
||||
rule_id=str(rule_response.get("rule_id", "")),
|
||||
alertname=alertname,
|
||||
affected_services=analysis_result.affected_services
|
||||
or ([target_resource] if target_resource else []),
|
||||
severity=risk_level.value,
|
||||
)
|
||||
if _guarded_action_cs3.blocked:
|
||||
risk_level = RiskLevel.LOW
|
||||
_cmd_cs3 = ""
|
||||
|
||||
_approval_metadata_cs3 = {
|
||||
"source": ai_provider,
|
||||
"confidence_score": analysis_result.confidence,
|
||||
"is_rule_based": False,
|
||||
"playbook_id": None,
|
||||
"rule_id": str(rule_response.get("rule_id", "")) or None,
|
||||
"playbook_id": _matched_playbook_id_cs3,
|
||||
**_guarded_action_cs3.metadata,
|
||||
}
|
||||
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
action=_guarded_action_cs3.action,
|
||||
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
|
||||
risk_level=risk_level,
|
||||
blast_radius=BlastRadius(
|
||||
@@ -1687,6 +1803,7 @@ async def _process_new_alert_background(
|
||||
],
|
||||
requested_by=f"OpenClaw ({ai_provider})",
|
||||
metadata=_approval_metadata_cs3,
|
||||
matched_playbook_id=_matched_playbook_id_cs3,
|
||||
)
|
||||
|
||||
approval = await service.create_approval_with_fingerprint(
|
||||
@@ -1700,7 +1817,7 @@ async def _process_new_alert_background(
|
||||
"risk_level": risk_level.value,
|
||||
"confidence": analysis_result.confidence,
|
||||
"action": approval_create.action,
|
||||
"kubectl_command": analysis_result.kubectl_command,
|
||||
"kubectl_command": _cmd_cs3,
|
||||
"is_rule_based": False,
|
||||
"source": ai_provider,
|
||||
}
|
||||
@@ -1716,7 +1833,7 @@ async def _process_new_alert_background(
|
||||
logger.warning("shadow_auto_approve_failed", error=str(_shadow_err_cs3))
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: CS3 LLM 高信心自動執行(修法3擴展)
|
||||
_cs3_kubectl = (analysis_result.kubectl_command or "").strip()
|
||||
_cs3_kubectl = _cmd_cs3
|
||||
_cs3_can_auto = (
|
||||
bool(_cs3_kubectl)
|
||||
and analysis_result.confidence >= 0.85
|
||||
@@ -1733,7 +1850,7 @@ async def _process_new_alert_background(
|
||||
required_signatures=0,
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=risk_level.value,
|
||||
matched_playbook_id=None,
|
||||
matched_playbook_id=_matched_playbook_id_cs3,
|
||||
metadata={
|
||||
**_approval_metadata_cs3,
|
||||
"is_high_confidence": True,
|
||||
@@ -1835,7 +1952,7 @@ async def _process_new_alert_background(
|
||||
risk_level=risk_level.value,
|
||||
resource_name=target_resource,
|
||||
root_cause=root_cause,
|
||||
suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value,
|
||||
suggested_action=approval_create.action,
|
||||
estimated_downtime=estimated_downtime,
|
||||
hit_count=1,
|
||||
primary_responsibility=primary_responsibility,
|
||||
@@ -2206,6 +2323,19 @@ async def alertmanager_webhook(
|
||||
parent_fingerprint=grouping_result.parent_fingerprint,
|
||||
reason="Alert storm suppressed — child alert within 5-min window",
|
||||
)
|
||||
background_tasks.add_task(
|
||||
record_grouped_alert_event,
|
||||
project_id="awoooi",
|
||||
alert_id=alert_id,
|
||||
alertname=alertname,
|
||||
severity=severity,
|
||||
namespace=namespace,
|
||||
target_resource=target_resource,
|
||||
group_key=grouping_result.group_key,
|
||||
count=grouping_result.count,
|
||||
parent_fingerprint=grouping_result.parent_fingerprint,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
return AlertResponse(
|
||||
success=True,
|
||||
message=(
|
||||
|
||||
126
apps/api/src/core/awooop_operator_auth.py
Normal file
126
apps/api/src/core/awooop_operator_auth.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""
|
||||
AwoooP Operator authentication boundary.
|
||||
|
||||
ADR-116 Gate 5 approval decisions must not trust browser-supplied identities.
|
||||
This module accepts a short-lived operator identity only when it is paired with
|
||||
the server-side AwoooP operator key.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import secrets
|
||||
from dataclasses import dataclass
|
||||
from typing import Annotated
|
||||
|
||||
import structlog
|
||||
from fastapi import Header, HTTPException, status
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
_OPERATOR_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.:@-]{1,127}$")
|
||||
_PROD_ENVS = {"prod", "production"}
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class AwoooPOperatorPrincipal:
|
||||
"""Authenticated AwoooP operator principal."""
|
||||
|
||||
operator_id: str
|
||||
auth_method: str
|
||||
|
||||
|
||||
def _auth_error(detail: str = "Operator authentication required") -> HTTPException:
    """Build (without raising) the 401 error used for authentication failures."""
    error = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail=detail,
    )
    return error
|
||||
|
||||
|
||||
def _clean_operator_id(operator_id: str | None) -> str:
    """Return the operator id with surrounding whitespace removed, once validated.

    Args:
        operator_id: Raw header value, or ``None`` when the header is absent.

    Returns:
        The stripped operator id.

    Raises:
        HTTPException: 401 when the header is absent; 422 when the stripped
            value does not fully match ``_OPERATOR_ID_RE``.
    """
    if operator_id is None:
        raise _auth_error()

    candidate = operator_id.strip()
    if _OPERATOR_ID_RE.fullmatch(candidate) is None:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
            detail="Invalid operator identity",
        )
    return candidate
|
||||
|
||||
|
||||
def authenticate_awooop_operator_headers(
    operator_id: str | None,
    operator_key: str | None,
    *,
    configured_key: str | None = None,
    environment: str | None = None,
) -> AwoooPOperatorPrincipal:
    """Validate trusted AwoooP operator headers.

    The operator id is validated first, then the supplied key is compared in
    constant time against the server-side key. When no server-side key is
    configured the request is rejected in production; in any other
    environment it is accepted with ``auth_method="dev_header"`` and a
    warning is logged.

    Args:
        operator_id: Value from ``X-AwoooP-Operator-Id``.
        operator_key: Value from ``X-AwoooP-Operator-Key``.
        configured_key: Server-side shared key. Defaults to settings.
        environment: Runtime environment. Defaults to settings.

    Returns:
        Authenticated operator principal.

    Raises:
        HTTPException: 401 when authentication is missing/invalid, or 422 for
            malformed operator identity.
    """
    cleaned_operator_id = _clean_operator_id(operator_id)
    expected_key = (
        settings.AWOOOP_OPERATOR_API_KEY
        if configured_key is None
        else configured_key
    )
    runtime_env = (environment or settings.ENVIRONMENT or "").lower()

    if not expected_key:
        if runtime_env in _PROD_ENVS:
            # Fail closed: production must never run without an operator key.
            logger.critical(
                "awooop_operator_key_missing_in_production",
                environment=runtime_env,
            )
            raise _auth_error()
        logger.warning(
            "awooop_operator_key_skipped_dev_only",
            environment=runtime_env,
            operator_id=cleaned_operator_id,
        )
        return AwoooPOperatorPrincipal(
            operator_id=cleaned_operator_id,
            auth_method="dev_header",
        )

    if not operator_key:
        logger.warning("awooop_operator_key_missing", operator_id=cleaned_operator_id)
        raise _auth_error()

    # compare_digest() accepts str operands only when they are pure ASCII and
    # raises TypeError otherwise. Header values are caller-controlled, so
    # compare the encoded bytes: a non-ASCII key must yield 401, not a 500.
    keys_match = secrets.compare_digest(
        operator_key.encode("utf-8", "surrogatepass"),
        expected_key.encode("utf-8", "surrogatepass"),
    )
    if not keys_match:
        logger.warning("awooop_operator_key_invalid", operator_id=cleaned_operator_id)
        raise _auth_error()

    return AwoooPOperatorPrincipal(
        operator_id=cleaned_operator_id,
        auth_method="operator_api_key",
    )
|
||||
|
||||
|
||||
async def verify_awooop_operator(
    x_awooop_operator_id: Annotated[
        str | None,
        Header(alias="X-AwoooP-Operator-Id"),
    ] = None,
    x_awooop_operator_key: Annotated[
        str | None,
        Header(alias="X-AwoooP-Operator-Key"),
    ] = None,
) -> AwoooPOperatorPrincipal:
    """FastAPI dependency for operator mutation endpoints.

    Reads the two AwoooP operator headers and delegates validation to
    ``authenticate_awooop_operator_headers``; its ``HTTPException`` errors
    propagate unchanged.
    """
    principal = authenticate_awooop_operator_headers(
        operator_id=x_awooop_operator_id,
        operator_key=x_awooop_operator_key,
    )
    return principal
|
||||
@@ -145,7 +145,7 @@ class Settings(BaseSettings):
|
||||
# ==========================================================================
|
||||
# ADR-104: LLM Playbook Generator
|
||||
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
|
||||
# 成本護欄:實作層只走 local provider(Ollama 111 → Ollama 188),不新增雲端 fallback。
|
||||
# 成本護欄:實作層只走 local provider(GCP-A → GCP-B → 111),不新增雲端 fallback。
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
|
||||
# ==========================================================================
|
||||
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
|
||||
@@ -215,8 +215,8 @@ class Settings(BaseSettings):
|
||||
description="Phase 25 P0: DIAGNOSE NIM timeout (秒),實測 2.2-27.3s avg 10.6s,60s 含 buffer",
|
||||
)
|
||||
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
|
||||
default=200,
|
||||
description="Phase 25 P0: Ollama timeout (秒),實測 CPU-only 238s,保留欄位但 DIAGNOSE 不再走 Ollama",
|
||||
default=300,
|
||||
description="Ollama diagnose timeout (秒)。GCP qwen3:14b CPU-only can exceed the old 120s proxy limit.",
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
@@ -362,7 +362,7 @@ class Settings(BaseSettings):
|
||||
raise ValueError(
|
||||
f"OLLAMA URL host 不允許的外部域名:{host!r}(完整 URL:{v!r})"
|
||||
",必須使用私網 IP 或已知 K8s Service hostname"
|
||||
)
|
||||
) from None
|
||||
if not (ip.is_private or ip.is_loopback):
|
||||
raise ValueError(
|
||||
f"OLLAMA URL 必須是私網/loopback IP、已知 K8s SVC 或 GCP 白名單 IP,"
|
||||
@@ -370,11 +370,16 @@ class Settings(BaseSettings):
|
||||
)
|
||||
return v
|
||||
|
||||
# 2026-04-25 Claude Engineer-C (P1.1): Ollama 健康檢測推理測試模型
|
||||
# 2026-05-05 Codex: health inference must stay on alert-fast model; qwen2.5
|
||||
# keeps reloading a 7B model on CPU-only GCP and slows incident fallback.
|
||||
OLLAMA_HEALTH_CHECK_MODEL: str = Field(
|
||||
default="qwen2.5:7b-instruct",
|
||||
default="gemma3:4b",
|
||||
description="OllamaHealthMonitor 推理測試使用模型(P1.1)",
|
||||
)
|
||||
OLLAMA_EMBEDDING_MODEL: str = Field(
|
||||
default="bge-m3:latest",
|
||||
description="Ollama embedding model. ADR-110 migrated embeddings from nomic-embed-text to bge-m3.",
|
||||
)
|
||||
# 2026-04-12 ogt: 心跳必須確認載入的 Ollama 模型清單
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 升級,更新必要模型清單(nomic→bge-m3 + 新增 qwen3:14b + hermes3)
|
||||
OLLAMA_REQUIRED_MODELS: list[str] = Field(
|
||||
@@ -429,7 +434,8 @@ class Settings(BaseSettings):
|
||||
|
||||
# ==========================================================================
|
||||
# OpenTelemetry (可觀測性鐵律)
|
||||
# 四主機架構強制校驗: OTEL 必須指向 192.168.0.188
|
||||
# 四主機架構強制校驗: OTEL 必須指向 192.168.0.188(AWOOOI 主站)
|
||||
# ADR-121 + P0-08 修正:改為 config-driven,允許 EwoooC 指向不同 host
|
||||
# ==========================================================================
|
||||
OTEL_ENABLED: bool = Field(
|
||||
default=True,
|
||||
@@ -439,6 +445,18 @@ class Settings(BaseSettings):
|
||||
default="192.168.0.188:24317",
|
||||
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317) - NO http:// prefix for gRPC",
|
||||
)
|
||||
OTEL_ALLOWED_ENDPOINTS: list[str] = Field(
|
||||
default=["192.168.0.188"],
|
||||
description="允許的 OTEL endpoint host 列表(逗號分隔可用 env 覆寫)。EwoooC 可設自己的 SigNoz host。",
|
||||
)
|
||||
OTEL_FORBIDDEN_ENDPOINTS: list[str] = Field(
|
||||
default=["192.168.0.110", "192.168.0.112", "192.168.0.120", "192.168.0.121"],
|
||||
description="明確禁止的 OTEL endpoint host 列表(不允許誤指向非 SigNoz 主機)",
|
||||
)
|
||||
AWOOOI_K8S_NAMESPACE: str = Field(
|
||||
default="awoooi-prod",
|
||||
description="K8s namespace(P0-13 修正:不再硬碼,EwoooC/Tsenyang 可設自己的 namespace)",
|
||||
)
|
||||
OTEL_SERVICE_NAME: str = Field(
|
||||
default="awoooi-api",
|
||||
description="Service name for tracing",
|
||||
@@ -483,6 +501,46 @@ class Settings(BaseSettings):
|
||||
)
|
||||
GEMINI_API_KEY: str = Field(default="", description="Google Gemini API key")
|
||||
CLAUDE_API_KEY: str = Field(default="", description="Anthropic Claude API key")
|
||||
LOCAL_CODE_REVIEW_ALLOW_GEMINI_FALLBACK: bool = Field(
|
||||
default=False,
|
||||
description=(
|
||||
"Allow LocalCodeReviewService to fall back to Gemini when the "
|
||||
"local Ollama code-review lane fails. Default false to avoid "
|
||||
"unexpected cloud spend from Gitea push/PR alerts."
|
||||
),
|
||||
)
|
||||
ALERT_AI_ALLOW_CLOUD_FALLBACK: bool = Field(
|
||||
default=True,
|
||||
description=(
|
||||
"Allow incident/alert OpenClaw analysis to use cloud fallback "
|
||||
"providers after the GCP-A/GCP-B/111 Ollama lane is exhausted. "
|
||||
"Default true so Gemini can act as the final backup, after the "
|
||||
"ordered Ollama lane is exhausted."
|
||||
),
|
||||
)
|
||||
ALERT_AI_ENFORCE_OLLAMA_FIRST: bool = Field(
|
||||
default=True,
|
||||
description=(
|
||||
"Force incident/alert OpenClaw analysis to try GCP-A, then GCP-B, "
|
||||
"then local 111 before cloud backup providers such as Gemini."
|
||||
),
|
||||
)
|
||||
ALERT_OLLAMA_MODEL: str = Field(
|
||||
default="qwen3:14b",
|
||||
description=(
|
||||
"Ollama model used for incident/alert deep diagnosis. Alert cards "
|
||||
"may wait for this model; Gemini remains a backup after GCP-A, "
|
||||
"GCP-B, and 111 fail."
|
||||
),
|
||||
)
|
||||
INCIDENT_LLM_TIMEOUT_SECONDS: int = Field(
|
||||
default=360,
|
||||
description=(
|
||||
"Outer timeout for incident OpenClaw proposal generation. This must "
|
||||
"be long enough for the GCP-A/GCP-B/111 Ollama lane to complete "
|
||||
"before Gemini backup is considered useful."
|
||||
),
|
||||
)
|
||||
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling 整合
|
||||
NVIDIA_API_KEY: str = Field(
|
||||
default="",
|
||||
@@ -544,6 +602,13 @@ class Settings(BaseSettings):
|
||||
default="",
|
||||
description="API Key for K8s admin endpoints (X-K8s-Api-Key header)",
|
||||
)
|
||||
AWOOOP_OPERATOR_API_KEY: str = Field(
|
||||
default="",
|
||||
description=(
|
||||
"API key for AwoooP operator mutation endpoints "
|
||||
"(X-AwoooP-Operator-Key header)"
|
||||
),
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# 統帥鐵律:禁止 SQLite (AWOOOI 憲法)
|
||||
@@ -649,6 +714,24 @@ class Settings(BaseSettings):
|
||||
default=True,
|
||||
description="ADR-091 T1: True=AI 自學規則雙寫 alert_rule_catalog DB, False=僅 YAML(回滾用)",
|
||||
)
|
||||
# ==========================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Drift 自動採納開關
|
||||
# 根因修復後啟用(report.interpretation in-memory 未更新 bug 已修)
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api DRIFT_AUTO_ADOPT_ENABLED=false
|
||||
# ==========================================================================
|
||||
DRIFT_AUTO_ADOPT_ENABLED: bool = Field(
|
||||
default=True,
|
||||
description="2026-05-04: True=啟用 drift auto_adopt_if_safe 自動採納低風險漂移, False=回滾停用",
|
||||
)
|
||||
# ==========================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成
|
||||
# evaluate_once() 末段:對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api COVERAGE_AUTO_RULE_ENABLED=false
|
||||
# ==========================================================================
|
||||
COVERAGE_AUTO_RULE_ENABLED: bool = Field(
|
||||
default=True,
|
||||
description="2026-05-04: True=coverage 缺口自動生成 alert_rule_catalog(source='ai_generated',review_status='pending_review'), False=停用",
|
||||
)
|
||||
# 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator 信號分類層補 PDI
|
||||
# 路徑 A 已啟用:DA 只取 PDI 已收集的 raw 資料做業務邏輯分類(OOMKilled/CrashLoop 等),
|
||||
# 不重複呼叫 K8s/SignOz API(純邏輯分類,不打外部服務)。
|
||||
@@ -690,6 +773,13 @@ class Settings(BaseSettings):
|
||||
default="",
|
||||
description="HMAC secret for webhook signature verification",
|
||||
)
|
||||
# ADR-116 P0-05: Callback Nonce 防偽造 HMAC Secret
|
||||
# 2026-05-04 Claude Sonnet 4.6 (ADR-116): 附加至 callback nonce 末尾的 HMAC-SHA256[:16]
|
||||
# 空字串 → 過渡期跳過驗證並記錄 warning
|
||||
CALLBACK_HMAC_SECRET: str = Field(
|
||||
default="",
|
||||
description="ADR-116: HMAC secret for callback nonce anti-forgery (HMAC-SHA256 appended to nonce)",
|
||||
)
|
||||
# 2026-04-24 Claude Sonnet 4.6 (ADR-094): Telegram Webhook Secret Token
|
||||
# 與 setWebhook API 呼叫時的 secret_token 相同;空字串 → dev 環境跳過驗證
|
||||
TELEGRAM_WEBHOOK_SECRET: str = Field(
|
||||
@@ -809,7 +899,7 @@ class Settings(BaseSettings):
|
||||
# ==========================================================================
|
||||
# MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6)
|
||||
# ==========================================================================
|
||||
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — 188 是 Ollama Hub,Prometheus 實際在 110
|
||||
# 2026-04-29 ogt + Claude Opus 4.7: drift fix — Prometheus 實際在 110
|
||||
# ConfigMap 04-configmap.yaml 也是 110;governance_agent / SLO check 連 188 會 timeout
|
||||
# 此 drift 是 SPF-4 (governance_agent silently fail) 根因之一
|
||||
PROMETHEUS_URL: str = Field(
|
||||
@@ -883,7 +973,7 @@ class Settings(BaseSettings):
|
||||
"devops": "192.168.0.110", # Harbor, GH Runner
|
||||
"security": "192.168.0.112", # Kali Scanner
|
||||
"k3s_master": "192.168.0.120", # K3s Master
|
||||
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, Ollama
|
||||
"ai_web": "192.168.0.188", # Nginx, Postgres, Redis, SignOz
|
||||
}
|
||||
|
||||
|
||||
|
||||
22
apps/api/src/core/context.py
Normal file
22
apps/api/src/core/context.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""AwoooP Phase 2.4: Project ID Context Variable
|
||||
================================================
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(ADR-123 background loop tagging)
|
||||
|
||||
設計原則:
|
||||
- Python asyncio.create_task() 自動繼承父任務的 ContextVar 值
|
||||
- startup handler 設一次 PROJECT_ID.set("awoooi"),所有 31 個 loop 自動繼承
|
||||
- get_db_context() 讀此 contextvar 作為 fallback,確保 RLS SET LOCAL 正確
|
||||
- 多租戶未來:呼叫端傳入不同 project_id 即可隔離,無需改 loop 本體
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from contextvars import ContextVar
|
||||
|
||||
# Tracks the project_id of the currently running async task.
# default="awoooi" keeps queries working even when the value was never set
# (noted by the original author as RLS fail-open protection).
PROJECT_ID: ContextVar[str] = ContextVar("project_id", default="awoooi")


def get_current_project_id() -> str:
    """Return the project_id bound to the current task (for the service layer)."""
    current = PROJECT_ID.get()
    return current
|
||||
@@ -11,6 +11,7 @@ Features:
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
@@ -19,6 +20,28 @@ from structlog.types import Processor
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")


def _redact_sensitive_log_text(text: str) -> str:
    """Mask the bot token of any Telegram Bot API URL found in *text*."""
    return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)


def _redact_if_text(value: Any) -> Any:
    """Redact *value* when it is a string; return any other object untouched."""
    if isinstance(value, str):
        return _redact_sensitive_log_text(value)
    return value


class SensitiveURLRedactionFilter(logging.Filter):
    """Standard logging filter that keeps token-bearing URLs out of log output.

    Third-party loggers such as httpx log full request URLs; Telegram Bot API
    URLs embed the bot token in the path.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Redact Telegram bot tokens from *record*; never drops the record.

        The message is rendered once with its original arguments and the
        rendered text is redacted. Arguments are deliberately not coerced to
        ``str`` up front: doing so breaks numeric format specifiers such as
        ``%d`` and makes the handler fail while formatting.
        """
        try:
            rendered = record.getMessage()
        except Exception:
            # msg/args do not fit together; the handler will report that
            # itself. Still scrub the string parts so that the error report
            # cannot leak a token.
            record.msg = _redact_if_text(record.msg)
            if isinstance(record.args, tuple):
                record.args = tuple(_redact_if_text(arg) for arg in record.args)
            elif isinstance(record.args, dict):
                record.args = {
                    key: _redact_if_text(value)
                    for key, value in record.args.items()
                }
            return True

        redacted = _redact_sensitive_log_text(rendered)
        if redacted != rendered:
            # Freeze the already-formatted, redacted text. Clearing args stops
            # logging from applying %-formatting a second time.
            record.msg = redacted
            record.args = None
        return True
|
||||
|
||||
|
||||
def setup_logging() -> None:
|
||||
"""Configure structlog for the application"""
|
||||
@@ -68,6 +91,15 @@ def setup_logging() -> None:
|
||||
stream=sys.stdout,
|
||||
level=logging.getLevelName(settings.LOG_LEVEL),
|
||||
)
|
||||
redaction_filter = SensitiveURLRedactionFilter()
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.addFilter(redaction_filter)
|
||||
for handler in root_logger.handlers:
|
||||
handler.addFilter(redaction_filter)
|
||||
|
||||
# httpx INFO 會輸出完整 request URL;Telegram Bot API URL 內含 token。
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def get_logger(name: str | None = None, **initial_context: Any) -> structlog.BoundLogger:
|
||||
|
||||
@@ -108,10 +108,11 @@ The `alertname` field is your PRIMARY signal. Use it to determine the problem ty
|
||||
|
||||
| Alert category / alertname pattern | suggested_action | kubectl_command guidance |
|
||||
|-------------------------------------|-----------------|--------------------------|
|
||||
| starts with "Host" (HostHighCpuLoad, HostHighMemoryUsage, HostHighLoad, HostOutOfMemory, HostDisk*, etc.) | INVESTIGATE | `ssh <instance_ip> 'ps aux --sort=-%cpu \| head -15; free -h; uptime'` — use labels.instance for host IP; do NOT use kubectl |
|
||||
| contains "Disk", "Storage", "PVC", "Volume" | NO_ACTION | `kubectl exec <pod> -- df -h` or `kubectl get pvc -n <ns>` |
|
||||
| contains "Postgres", "MySQL", "Redis", "DB", "Database" | NO_ACTION | `kubectl exec <pod> -- psql` or `kubectl logs <pod>` |
|
||||
| contains "CrashLoop", "OOMKilled", "Pod" | DELETE_POD or RESTART_DEPLOYMENT | `kubectl delete pod <pod> -n <ns>` |
|
||||
| contains "CPU", "Memory", "Resource" | TUNE_RESOURCES or SCALE_DEPLOYMENT | `kubectl top pod -n <ns>` or HPA command |
|
||||
| contains "CPU", "Memory", "Resource" (K8s Pod alerts only — NOT Host* alerts) | TUNE_RESOURCES or SCALE_DEPLOYMENT | `kubectl top pod -n <ns>` or HPA command |
|
||||
| contains "Node", "NodeNotReady" | NO_ACTION | `kubectl describe node <node>` |
|
||||
| contains "SSL", "Certificate", "Cert" | NO_ACTION | `kubectl get certificate -n <ns>` |
|
||||
| alert_category = "database" | NO_ACTION | DB investigation commands only |
|
||||
@@ -184,10 +185,11 @@ You are an SRE AI. Analyze the alert and respond with ONLY valid JSON.
|
||||
|
||||
## CRITICAL: Read alertname first
|
||||
The `alertname` field tells you what kind of problem this is. Use it:
|
||||
- starts with "Host" (HostHighCpuLoad, HostHighMemoryUsage, HostHighLoad, HostOutOfMemory, HostDisk*, etc.) → suggested_action=INVESTIGATE, kubectl_command="ssh <labels.instance_ip> 'ps aux --sort=-%cpu | head -15; free -h; uptime'" — NO kubectl commands for host alerts
|
||||
- "Disk/Storage/PVC/Volume" → suggested_action=NO_ACTION, kubectl_command="kubectl get pvc" or "kubectl exec <pod> -- df -h"
|
||||
- "Postgres/MySQL/Redis/DB/Database" → suggested_action=NO_ACTION, DB investigation commands
|
||||
- "CrashLoop/OOM/Pod" → suggested_action=DELETE_POD or RESTART_DEPLOYMENT
|
||||
- "CPU/Memory/Resource" → suggested_action=TUNE_RESOURCES or SCALE_DEPLOYMENT
|
||||
- "CPU/Memory/Resource" (K8s Pod alerts only) → suggested_action=TUNE_RESOURCES or SCALE_DEPLOYMENT
|
||||
- "SSL/Cert" → suggested_action=NO_ACTION
|
||||
NEVER use "kubectl rollout restart deployment/awoooi-prod" (that is the NAMESPACE, not a deployment).
|
||||
Make action_title describe the ACTUAL problem (not generic "自動修復 AWOOOI 服務").
|
||||
|
||||
@@ -5,14 +5,18 @@ P0 基礎設施: 可觀測性鐵律
|
||||
|
||||
Traces + Metrics → SigNoz (192.168.0.188:24317)
|
||||
|
||||
四主機架構強制校驗:
|
||||
四主機架構強制校驗(允許 host 由 OTEL_ALLOWED_ENDPOINTS 設定,預設 192.168.0.188):
|
||||
| IP | 允許 OTEL? |
|
||||
|-----------------|-----------|
|
||||
| 192.168.0.110 | ❌ 禁止 |
|
||||
| 192.168.0.112 | ❌ 禁止 |
|
||||
| 192.168.0.188 | ✅ 唯一 |
|
||||
| 192.168.0.188 | ✅ 預設 |
|
||||
| 192.168.0.120 | ❌ 禁止 |
|
||||
|
||||
P0-08 修正(ADR-121,2026-05-04 ogt + Claude Sonnet 4.6):
|
||||
移除硬碼 IP assert,改為 config-driven allowed/forbidden 清單。
|
||||
EwoooC 可用 OTEL_ALLOWED_ENDPOINTS env 覆寫指向自己的 SigNoz host。
|
||||
|
||||
優雅降級 (Graceful Degradation):
|
||||
- OTEL 連線失敗不會導致 API 崩潰
|
||||
- 使用 BatchSpanProcessor 非同步傳輸
|
||||
@@ -61,30 +65,34 @@ _initialized: bool = False
|
||||
|
||||
def _endpoint_host(value: str) -> str:
    """Return the lower-cased host part of an endpoint or host entry ("" if none)."""
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    candidate = (value or "").strip()
    if "://" not in candidate:
        # A bare "host" / "host:port" only parses as a network location after "//".
        candidate = f"//{candidate}"
    try:
        host = urlsplit(candidate).hostname
    except ValueError:
        # urlsplit rejects e.g. malformed bracketed IPv6 literals.
        return ""
    return (host or "").lower()


def _validate_endpoint() -> bool:
    """Validate the OTEL endpoint against the config-driven host lists.

    Allowed hosts come from ``settings.OTEL_ALLOWED_ENDPOINTS`` and forbidden
    hosts from ``settings.OTEL_FORBIDDEN_ENDPOINTS`` (P0-08 / ADR-121).

    The endpoint's *parsed hostname* is compared for equality with each entry.
    A raw substring test would also match an entry that merely appears inside
    a longer host, a path or a query string.

    Returns:
        ``True`` when the endpoint host is not forbidden and is in the
        allow-list; ``False`` otherwise (an error is logged in that case).
    """
    endpoint = settings.OTEL_EXPORTER_OTLP_ENDPOINT
    allowed = settings.OTEL_ALLOWED_ENDPOINTS
    forbidden = settings.OTEL_FORBIDDEN_ENDPOINTS

    endpoint_host = _endpoint_host(endpoint)

    # Explicitly forbidden hosts take precedence over the allow-list.
    for host in forbidden:
        if endpoint_host and _endpoint_host(host) == endpoint_host:
            _logger.error(
                "otel_endpoint_forbidden_host",
                endpoint=endpoint,
                forbidden_host=host,
            )
            return False

    # An endpoint without a parseable host can never be in the allow-list.
    if not endpoint_host or not any(
        _endpoint_host(h) == endpoint_host for h in allowed
    ):
        _logger.error(
            "otel_endpoint_not_in_allowlist",
            endpoint=endpoint,
            allowed=allowed,
        )
        return False

    return True
|
||||
|
||||
|
||||
|
||||
691
apps/api/src/db/awooop_models.py
Normal file
691
apps/api/src/db/awooop_models.py
Normal file
@@ -0,0 +1,691 @@
|
||||
"""
|
||||
AwoooP Control Plane Models
|
||||
============================
|
||||
Phase 1 新表:六合約 control plane、tenant 隔離、principal mapping。
|
||||
ADR-111~118,2026-05-04 ogt + Claude Sonnet 4.6
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import (
|
||||
Boolean,
|
||||
CheckConstraint,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
Numeric,
|
||||
SmallInteger,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from src.db.base import Base
|
||||
|
||||
|
||||
class AwoooPProject(Base):
    """Tenant master table (ADR-111 bootstrap, ADR-115 tenant onboarding)."""

    __tablename__ = "awooop_projects"
    __table_args__ = (
        # migration_mode is restricted to the four listed rollout modes.
        CheckConstraint(
            "migration_mode IN ('legacy_awoooi_default','shadow','canary','active')",
            name="chk_migration_mode",
        ),
        # A budget is optional, but may not be negative when present.
        CheckConstraint(
            "budget_limit_usd IS NULL OR budget_limit_usd >= 0",
            name="chk_budget_non_negative",
        ),
        # allowed_channels must hold a JSON array (not an object or scalar).
        CheckConstraint(
            "jsonb_typeof(allowed_channels) = 'array'",
            name="chk_allowed_channels_array",
        ),
    )

    # Primary key; other awooop_* tables reference it via foreign keys.
    project_id: Mapped[str] = mapped_column(String(64), primary_key=True)
    display_name: Mapped[str] = mapped_column(String(256), nullable=False)
    migration_mode: Mapped[str] = mapped_column(
        String(32), nullable=False, default="legacy_awoooi_default"
    )
    budget_limit_usd: Mapped[Decimal | None] = mapped_column(
        Numeric(14, 4), nullable=True
    )
    # Defaults to an empty JSON array on the database side.
    allowed_channels: Mapped[list[Any]] = mapped_column(
        JSONB, nullable=False, server_default=text("'[]'::jsonb")
    )
    is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    # NOTE(review): only an insert-time server default is declared here; nothing
    # in this model refreshes updated_at on UPDATE — confirm a trigger or the
    # service layer maintains it.
    updated_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPContractRevision(Base):
    """Revision table shared by the six contracts (append-only, ADR-107/ADR-112)."""

    __tablename__ = "awooop_contract_revisions"
    __table_args__ = (
        # One row per (project, family, contract, major, minor) version.
        UniqueConstraint(
            "project_id", "contract_family", "contract_id",
            "version_major", "version_minor",
            name="uq_revision_version",
        ),
        # NOTE(review): seven family values are accepted here although the
        # docstring speaks of six contracts — confirm which is intended.
        CheckConstraint(
            "contract_family IN ("
            "'project_tenant','agent','mcp_gateway','policy_routing',"
            "'runtime_run_state','channel_event','platform_resource')",
            name="chk_contract_family",
        ),
        CheckConstraint(
            "lifecycle_status IN ('draft','published','active','revoked')",
            name="chk_lifecycle",
        ),
        CheckConstraint("version_major >= 0", name="chk_version_major_non_neg"),
        CheckConstraint("version_minor >= 0", name="chk_version_minor_non_neg"),
        # body_hash must be exactly 64 lowercase hexadecimal characters.
        CheckConstraint(
            r"body_hash ~ '^[0-9a-f]{64}$'", name="chk_body_hash_format"
        ),
        Index(
            "idx_revisions_lookup",
            "project_id", "contract_family", "contract_id",
            "lifecycle_status", "version_major", "version_minor",
        ),
        Index("idx_revisions_hash", "body_hash"),
    )

    # Primary key generated by the database.
    revision_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    contract_family: Mapped[str] = mapped_column(String(32), nullable=False)
    contract_id: Mapped[str] = mapped_column(String(128), nullable=False)
    version_major: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=1)
    version_minor: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=0)
    lifecycle_status: Mapped[str] = mapped_column(
        String(16), nullable=False, default="draft"
    )
    # Contract body plus its hash; the hash format is enforced by
    # chk_body_hash_format above.
    body_json: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False)
    body_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    body_schema_version: Mapped[str] = mapped_column(
        String(16), nullable=False, default="v1.0"
    )
    # Publishing metadata; all three are optional.
    publish_signature: Mapped[str | None] = mapped_column(String(128), nullable=True)
    publisher_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    published_at: Mapped[datetime | None] = mapped_column(nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPActiveRevision(Base):
    """Active revision pointer (ADR-107/ADR-113)."""

    __tablename__ = "awooop_active_revisions"
    __table_args__ = (
        # At most one pointer row per (project, family, contract).
        UniqueConstraint(
            "project_id", "contract_family", "contract_id",
            name="uq_active_pointer",
        ),
    )

    pointer_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    contract_family: Mapped[str] = mapped_column(String(32), nullable=False)
    contract_id: Mapped[str] = mapped_column(String(128), nullable=False)
    # ondelete="RESTRICT": a revision cannot be deleted while a pointer
    # still references it.
    active_revision_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_contract_revisions.revision_id", ondelete="RESTRICT"),
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPContractOutbox(Base):
    """Transactional outbox for contract revision invalidation (ADR-113)."""

    __tablename__ = "awooop_contract_outbox"
    __table_args__ = (
        # A given event type is recorded at most once per new revision.
        UniqueConstraint("new_revision_id", "event_type", name="uq_outbox_event"),
        # Both partial indexes cover only rows that are not yet delivered.
        Index(
            "idx_outbox_pending",
            "next_retry_at", "created_at",
            postgresql_where=text("delivered_at IS NULL"),
        ),
        Index(
            "idx_outbox_backlog_per_project",
            "project_id", "created_at",
            postgresql_where=text("delivered_at IS NULL"),
        ),
    )

    event_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    event_type: Mapped[str] = mapped_column(String(64), nullable=False)
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    contract_family: Mapped[str] = mapped_column(String(32), nullable=False)
    contract_id: Mapped[str] = mapped_column(String(128), nullable=False)
    # The previous revision is optional; the new revision is mandatory.
    old_revision_id: Mapped[UUID | None] = mapped_column(
        ForeignKey("awooop_contract_revisions.revision_id"), nullable=True
    )
    new_revision_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_contract_revisions.revision_id"), nullable=False
    )
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    # Delivery bookkeeping: delivered_at stays NULL until delivery; the
    # remaining columns track retries and the most recent error text.
    delivered_at: Mapped[datetime | None] = mapped_column(nullable=True)
    relay_attempts: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    next_retry_at: Mapped[datetime | None] = mapped_column(nullable=True)
    last_error: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
|
||||
class AwoooPChannelEventDedupe(Base):
    """Channel event idempotency key (ADR-114, partitioned by created_at)."""

    __tablename__ = "awooop_channel_event_dedupe"
    __table_args__ = (
        # created_at is part of the uniqueness key, alongside the event identity.
        UniqueConstraint(
            "project_id", "channel_type", "provider_event_id", "created_at",
            name="uq_channel_event_dedupe",
        ),
        Index("idx_dedupe_run", "run_id"),
    )

    # Composite PK (the partition key must be part of the PK).
    # SQLAlchemy 2.x requires primary_key=True to be set on mapped_column;
    # a list of strings in __mapper_args__ cannot be used.
    dedupe_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # No foreign keys are declared on project_id or run_id in this model.
    project_id: Mapped[str] = mapped_column(String(64), nullable=False)
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    provider_event_id: Mapped[str] = mapped_column(String(256), nullable=False)
    run_id: Mapped[UUID] = mapped_column(nullable=False)
    # Second half of the composite primary key.
    created_at: Mapped[datetime] = mapped_column(
        primary_key=True, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPPlatformSubject(Base):
    """Canonical principal mapping (ADR-115)."""

    __tablename__ = "awooop_platform_subjects"
    __table_args__ = (
        # One subject row per channel user within a project.
        UniqueConstraint(
            "project_id", "channel_type", "channel_user_id",
            name="uq_platform_subject",
        ),
        # roles must hold a JSON array.
        CheckConstraint(
            "jsonb_typeof(roles) = 'array'", name="chk_roles_array"
        ),
        # NOTE(review): this index covers the same columns as
        # uq_platform_subject — confirm it is intentionally kept.
        Index(
            "idx_platform_subjects_lookup",
            "project_id", "channel_type", "channel_user_id",
        ),
        Index(
            "idx_platform_subjects_resolve",
            "project_id", "platform_subject_id",
        ),
        Index(
            "idx_platform_subjects_last_seen",
            "project_id", "last_seen_at",
        ),
    )

    subject_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    # Channel-side identity; the chat id is optional.
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    channel_user_id: Mapped[str] = mapped_column(String(256), nullable=False)
    channel_chat_id: Mapped[str | None] = mapped_column(String(256), nullable=True)
    # Platform-side identifier that the channel identity maps to.
    platform_subject_id: Mapped[str] = mapped_column(String(128), nullable=False)
    display_name: Mapped[str | None] = mapped_column(String(256), nullable=True)
    # Defaults to an empty JSON array on the database side.
    roles: Mapped[list[str]] = mapped_column(
        JSONB, nullable=False, server_default=text("'[]'::jsonb")
    )
    first_seen_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    last_seen_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPProjectMigrationState(Base):
    """Strangler Fig migration state per project × capability (ADR-106 migration tracking)."""

    __tablename__ = "awooop_project_migration_state"
    __table_args__ = (
        # One state row per (project, capability).
        UniqueConstraint("project_id", "capability", name="uq_project_capability"),
        CheckConstraint(
            "capability IN ("
            "'run_execution','contract_governance',"
            "'budget_tracking','principal_mapping')",
            name="chk_capability",
        ),
        CheckConstraint(
            "current_phase IN ("
            "'legacy_awoooi_default','shadow','canary',"
            "'read_only','suggest','auto_remediate')",
            name="chk_phase",
        ),
    )

    state_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    capability: Mapped[str] = mapped_column(String(64), nullable=False)
    # New rows start in the legacy phase unless the caller says otherwise.
    current_phase: Mapped[str] = mapped_column(
        String(32), nullable=False, default="legacy_awoooi_default"
    )
    phase_entered_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    updated_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Phase 4: Run State Machine(ADR-114/ADR-119)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class AwoooPRunState(Base):
    """Main table of the run FSM (SKIP LOCKED worker lease, ADR-114)."""

    __tablename__ = "awooop_run_state"
    __table_args__ = (
        # The eight states a run row may be in.
        CheckConstraint(
            "state IN ("
            "'pending','running','waiting_tool',"
            "'waiting_approval','completed','failed','cancelled','timeout')",
            name="chk_run_state",
        ),
        # Partial index over pending runs that hold no lease.
        Index("idx_run_state_pending", "project_id", "created_at",
              postgresql_where=text("state = 'pending' AND lease_until IS NULL")),
        # Partial index over running runs that hold a lease.
        Index("idx_run_state_stale", "lease_until",
              postgresql_where=text("state = 'running' AND lease_until IS NOT NULL")),
        Index("idx_run_state_project_timeline", "project_id", "created_at"),
        Index("idx_run_state_trace_id", "trace_id",
              postgresql_where=text("trace_id IS NOT NULL")),
    )

    # No server default: the run_id must be supplied by the caller.
    run_id: Mapped[UUID] = mapped_column(primary_key=True)
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id"), nullable=False
    )
    agent_id: Mapped[str] = mapped_column(String(128), nullable=False)
    state: Mapped[str] = mapped_column(String(32), nullable=False, default="pending")
    # Worker lease bookkeeping; all three are NULL while no worker holds the run.
    lease_until: Mapped[datetime | None] = mapped_column(nullable=True)
    heartbeat_at: Mapped[datetime | None] = mapped_column(nullable=True)
    worker_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    attempt_count: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=0)
    max_attempts: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=3)
    trace_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    trigger_type: Mapped[str | None] = mapped_column(String(32), nullable=True)
    trigger_ref: Mapped[str | None] = mapped_column(String(256), nullable=True)
    # Runs are flagged as shadow runs unless explicitly created otherwise.
    is_shadow: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
    input_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    output_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    cost_usd: Mapped[Decimal] = mapped_column(
        Numeric(10, 4), nullable=False, default=Decimal("0.0000")
    )
    step_count: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=0)
    error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
    error_detail: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    started_at: Mapped[datetime | None] = mapped_column(nullable=True)
    completed_at: Mapped[datetime | None] = mapped_column(nullable=True)
    timeout_at: Mapped[datetime | None] = mapped_column(nullable=True)
|
||||
|
||||
|
||||
class AwoooPRunStepJournal(Base):
    """SAGA step journal (ADR-119) — each tool call is recorded separately."""

    __tablename__ = "awooop_run_step_journal"
    __table_args__ = (
        # Step sequence numbers are unique within a run.
        UniqueConstraint("run_id", "step_seq", name="uix_run_step_seq"),
        CheckConstraint(
            "result_status IN ('pending','success','failed','compensated')",
            name="chk_step_result_status",
        ),
        # NOTE(review): covers the same columns as uix_run_step_seq — confirm
        # this extra index is intentional.
        Index("idx_run_step_run_id", "run_id", "step_seq"),
    )

    step_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Steps are deleted together with their run (ON DELETE CASCADE).
    run_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_run_state.run_id", ondelete="CASCADE"), nullable=False
    )
    # No foreign key is declared on project_id in this model.
    project_id: Mapped[str] = mapped_column(String(64), nullable=False)
    step_seq: Mapped[int] = mapped_column(SmallInteger, nullable=False)
    tool_name: Mapped[str] = mapped_column(String(128), nullable=False)
    mcp_gateway_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    input_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    output_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    compensation_json: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    result_status: Mapped[str] = mapped_column(String(16), nullable=False, default="pending")
    error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
    was_blocked: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    block_reason: Mapped[str | None] = mapped_column(String(128), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    completed_at: Mapped[datetime | None] = mapped_column(nullable=True)
    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
|
||||
|
||||
|
||||
class AwoooPRunIdempotency(Base):
    """Run de-duplication / idempotency table (ADR-114): (project_id, channel_type, provider_event_id) → run_id."""

    __tablename__ = "awooop_run_idempotency"
    __table_args__ = (
        # The idempotency key: one row per provider event within a project/channel.
        UniqueConstraint(
            "project_id", "channel_type", "provider_event_id",
            name="uix_run_idempotency_key",
        ),
        Index("idx_run_idempotency_run_id", "run_id"),
    )

    idempotency_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # No foreign key is declared on project_id in this model.
    project_id: Mapped[str] = mapped_column(String(64), nullable=False)
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    provider_event_id: Mapped[str] = mapped_column(String(256), nullable=False)
    # The run that the key resolves to.
    run_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_run_state.run_id"), nullable=False
    )
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Phase 5: MCP Gateway 四表(ADR-116/ADR-118,2026-05-04)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class AwoooPMcpToolRegistry(Base):
    """MCP tool allowlist (Gate 3: Tool)."""

    __tablename__ = "awooop_mcp_tool_registry"
    __table_args__ = (
        CheckConstraint(
            "tool_type IN ('builtin','mcp_server','custom')",
            name="chk_tool_type",
        ),
        # allowed_scopes must hold a JSON array.
        CheckConstraint(
            "jsonb_typeof(allowed_scopes) = 'array'",
            name="chk_allowed_scopes_array",
        ),
        # Tool names are unique within a project.
        UniqueConstraint("project_id", "tool_name", name="uix_tool_registry_project_name"),
        Index("idx_mcp_tool_registry_project", "project_id", "is_active"),
    )

    tool_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Registry rows are removed with their project (ON DELETE CASCADE).
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False
    )
    tool_name: Mapped[str] = mapped_column(String(128), nullable=False)
    tool_type: Mapped[str] = mapped_column(String(32), nullable=False)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Python-side defaults: a fresh empty list / dict per inserted row.
    allowed_scopes: Mapped[list[Any]] = mapped_column(JSONB, nullable=False, default=list)
    environment_tags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    updated_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPMcpGrant(Base):
    """Agent × tool grant records (Gate 2 + Gate 3)."""

    __tablename__ = "awooop_mcp_grants"
    __table_args__ = (
        # granted_scopes must hold a JSON array.
        CheckConstraint(
            "jsonb_typeof(granted_scopes) = 'array'",
            name="chk_grant_scopes_array",
        ),
        # Either the grant is not revoked and both revoked_* columns are NULL,
        # or it is revoked and revoked_at is set (revoked_by may stay NULL).
        CheckConstraint(
            "(is_revoked = FALSE AND revoked_at IS NULL AND revoked_by IS NULL)"
            " OR (is_revoked = TRUE AND revoked_at IS NOT NULL)",
            name="chk_revoke_consistency",
        ),
        # At most one grant row per (project, agent, tool).
        UniqueConstraint("project_id", "agent_id", "tool_id", name="uix_mcp_grant_agent_tool"),
        # Partial index restricted to grants that are not revoked.
        Index(
            "idx_mcp_grants_lookup", "project_id", "agent_id", "tool_id",
            postgresql_where=text("is_revoked = FALSE"),
        ),
    )

    grant_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Grants are removed with their project and with their tool (both CASCADE).
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False
    )
    agent_id: Mapped[str] = mapped_column(String(128), nullable=False)
    tool_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_mcp_tool_registry.tool_id", ondelete="CASCADE"), nullable=False
    )
    granted_by: Mapped[str] = mapped_column(String(128), nullable=False)
    granted_scopes: Mapped[list[Any]] = mapped_column(JSONB, nullable=False, default=list)
    # Optional expiry; NULL means no expiry timestamp is recorded.
    expires_at: Mapped[datetime | None] = mapped_column(nullable=True)
    is_revoked: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    revoked_at: Mapped[datetime | None] = mapped_column(nullable=True)
    revoked_by: Mapped[str | None] = mapped_column(String(128), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
class AwoooPMcpCredentialRef(Base):
    """Kubernetes Secret reference (ADR-118 credential isolation) — stores only the path, never the plaintext."""

    __tablename__ = "awooop_mcp_credential_refs"
    __table_args__ = (
        # Reference must look like "<segment>/<segment>#<key>"
        # (presumably namespace/secret-name#key — confirm against the writer).
        CheckConstraint(
            r"k8s_secret_ref ~ '^[a-z0-9-]+/[a-z0-9-]+#[a-zA-Z0-9_-]+$'",
            name="chk_k8s_ref_format",
        ),
        # When present, value_sha256 must be 64 lowercase hex characters.
        CheckConstraint(
            r"value_sha256 IS NULL OR value_sha256 ~ '^[0-9a-f]{64}$'",
            name="chk_value_sha256_hex",
        ),
        # A tool may reference a given secret path only once.
        UniqueConstraint("tool_id", "k8s_secret_ref", name="uix_credential_ref_tool"),
        # Partial index restricted to active references.
        Index("idx_mcp_cred_refs_tool", "tool_id", postgresql_where=text("is_active = TRUE")),
    )

    ref_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # References are removed with their tool and with their project (both CASCADE).
    tool_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_mcp_tool_registry.tool_id", ondelete="CASCADE"), nullable=False
    )
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False
    )
    k8s_secret_ref: Mapped[str] = mapped_column(String(256), nullable=False)
    value_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    rotated_at: Mapped[datetime | None] = mapped_column(nullable=True)
|
||||
|
||||
|
||||
class AwoooPMcpGatewayAudit(Base):
    """Audit log of MCP Gateway calls (ADR-116 P1-09)."""

    __tablename__ = "awooop_mcp_gateway_audit"
    __table_args__ = (
        CheckConstraint(
            "result_status IN ('success','blocked','failed','timeout')",
            name="chk_gateway_result_status",
        ),
        # block_gate is optional; when set it must be a gate number from 1 to 5.
        CheckConstraint(
            "block_gate IS NULL OR (block_gate >= 1 AND block_gate <= 5)",
            name="chk_block_gate_range",
        ),
        Index("idx_mcp_audit_run", "project_id", "run_id", "created_at"),
        # Partial index restricted to blocked calls.
        Index(
            "idx_mcp_audit_blocked", "project_id", "block_gate", "created_at",
            postgresql_where=text("result_status = 'blocked'"),
        ),
    )

    call_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # No foreign keys are declared on project_id or run_id in this model.
    project_id: Mapped[str] = mapped_column(String(64), nullable=False)
    run_id: Mapped[UUID | None] = mapped_column(nullable=True)
    trace_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    agent_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # tool_id is optional while tool_name is always recorded.
    tool_id: Mapped[UUID | None] = mapped_column(
        ForeignKey("awooop_mcp_tool_registry.tool_id"), nullable=True
    )
    tool_name: Mapped[str] = mapped_column(String(128), nullable=False)
    credential_ref: Mapped[str | None] = mapped_column(String(256), nullable=True)
    input_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    output_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    gate_result: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    result_status: Mapped[str] = mapped_column(String(16), nullable=False)
    block_gate: Mapped[int | None] = mapped_column(SmallInteger, nullable=True)
    block_reason: Mapped[str | None] = mapped_column(String(256), nullable=True)
    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Phase 7: Channel Hub 雙表(ADR-106 channel_event family,2026-05-04)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class AwoooPConversationEvent(Base):
    """Mirror of an inbound channel event (Telegram/LINE inbound); plaintext is not stored."""

    __tablename__ = "awooop_conversation_event"

    # --- Identity -----------------------------------------------------------
    # Primary key is generated by the database, not by the application.
    event_id: Mapped[UUID] = mapped_column(
        server_default=text("gen_random_uuid()"),
        primary_key=True,
    )
    # Owning project; rows are removed when the project row is deleted.
    project_id: Mapped[str] = mapped_column(
        String(64),
        ForeignKey("awooop_projects.project_id", ondelete="CASCADE"),
        nullable=False,
    )

    # --- Source channel -----------------------------------------------------
    # Restricted to a fixed set of values by chk_conv_event_channel_type.
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    # Together with project_id and channel_type this forms the dedup key
    # enforced by uix_conv_event_dedup.
    provider_event_id: Mapped[str] = mapped_column(String(256), nullable=False)
    platform_subject_id: Mapped[str | None] = mapped_column(
        String(128), nullable=True
    )
    channel_user_id: Mapped[str | None] = mapped_column(String(256), nullable=True)
    channel_chat_id: Mapped[str | None] = mapped_column(String(256), nullable=True)
    run_id: Mapped[UUID | None] = mapped_column(nullable=True)

    # --- Content (hash and preview only) -------------------------------------
    # Restricted to a fixed set of values by chk_conv_event_content_type.
    # The "text" default is applied by the ORM at insert time, not by the DB.
    content_type: Mapped[str] = mapped_column(
        String(32), default="text", nullable=False
    )
    content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True)
    attachment_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    is_duplicate: Mapped[bool] = mapped_column(
        Boolean, default=False, nullable=False
    )

    # --- Timestamps -----------------------------------------------------------
    provider_ts: Mapped[datetime | None] = mapped_column(nullable=True)
    # Filled by the database (NOW()) when the row is inserted.
    received_at: Mapped[datetime] = mapped_column(
        server_default=text("NOW()"),
        nullable=False,
    )

    # --- Constraints and indexes ---------------------------------------------
    __table_args__ = (
        CheckConstraint(
            "channel_type IN ('telegram','line','slack','api','internal')",
            name="chk_conv_event_channel_type",
        ),
        CheckConstraint(
            "content_type IN ('text','photo','document','command','callback_query')",
            name="chk_conv_event_content_type",
        ),
        # One row per provider event within a project and channel.
        UniqueConstraint(
            "project_id",
            "channel_type",
            "provider_event_id",
            name="uix_conv_event_dedup",
        ),
        Index("idx_conv_event_run", "project_id", "run_id", "received_at"),
        Index(
            "idx_conv_event_subject",
            "project_id",
            "platform_subject_id",
            "received_at",
        ),
    )
|
||||
|
||||
|
||||
class AwoooPOutboundMessage(Base):
    """Outbound message record (interim/final/approval_request, plus shadow status)."""

    __tablename__ = "awooop_outbound_message"

    # --- Identity -----------------------------------------------------------
    # Primary key is generated by the database, not by the application.
    message_id: Mapped[UUID] = mapped_column(
        server_default=text("gen_random_uuid()"),
        primary_key=True,
    )
    # Owning project; rows are removed when the project row is deleted.
    project_id: Mapped[str] = mapped_column(
        String(64),
        ForeignKey("awooop_projects.project_id", ondelete="CASCADE"),
        nullable=False,
    )
    run_id: Mapped[UUID] = mapped_column(nullable=False)
    # NOTE(review): no ForeignKey is declared here; presumably this refers to
    # awooop_conversation_event.event_id — confirm whether that is intentional.
    conversation_event_id: Mapped[UUID | None] = mapped_column(nullable=True)

    # --- Destination ----------------------------------------------------------
    # Restricted to a fixed set of values by chk_outbound_channel_type.
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    channel_chat_id: Mapped[str] = mapped_column(String(256), nullable=False)

    # --- Payload (hash and preview only) --------------------------------------
    # Restricted to a fixed set of values by chk_outbound_message_type.
    message_type: Mapped[str] = mapped_column(String(32), nullable=False)
    content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True)

    # --- Delivery state ---------------------------------------------------------
    provider_message_id: Mapped[str | None] = mapped_column(
        String(64), nullable=True
    )
    # Restricted by chk_outbound_send_status. The "pending" default is applied
    # by the ORM at insert time, not by the DB.
    send_status: Mapped[str] = mapped_column(
        String(16), default="pending", nullable=False
    )
    send_error: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Filled by the database (NOW()) when the row is inserted.
    queued_at: Mapped[datetime] = mapped_column(
        server_default=text("NOW()"),
        nullable=False,
    )
    sent_at: Mapped[datetime | None] = mapped_column(nullable=True)
    triggered_by_state: Mapped[str | None] = mapped_column(
        String(32), nullable=True
    )
    waiting_since: Mapped[datetime | None] = mapped_column(nullable=True)

    # --- Constraints and indexes ---------------------------------------------
    __table_args__ = (
        CheckConstraint(
            "channel_type IN ('telegram','line','slack','api','internal')",
            name="chk_outbound_channel_type",
        ),
        CheckConstraint(
            "message_type IN ('interim','final','error','approval_request')",
            name="chk_outbound_message_type",
        ),
        CheckConstraint(
            "send_status IN ('pending','sent','failed','shadow')",
            name="chk_outbound_send_status",
        ),
        Index("idx_outbound_msg_run", "project_id", "run_id", "queued_at"),
        # Partial index (PostgreSQL): only rows still pending are indexed.
        Index(
            "idx_outbound_msg_pending",
            "project_id",
            "channel_type",
            "queued_at",
            postgresql_where=text("send_status = 'pending'"),
        ),
    )
|
||||
@@ -106,6 +106,11 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
try:
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): SET LOCAL app.project_id 讓 RLS Policy 生效
|
||||
# 預設 'awoooi',多租戶路由將在 middleware 注入實際 project_id
|
||||
await session.execute(
|
||||
text("SELECT set_config('app.project_id', 'awoooi', TRUE)")
|
||||
)
|
||||
yield session
|
||||
await session.commit()
|
||||
except Exception:
|
||||
@@ -114,17 +119,30 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def get_db_context() -> AsyncGenerator[AsyncSession, None]:
|
||||
async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncSession, None]:
|
||||
"""
|
||||
Context manager for database session (non-FastAPI usage)
|
||||
|
||||
AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar > "awoooi"
|
||||
- Phase 2.3: 啟用 RLS tenant isolation(SET LOCAL app.project_id)
|
||||
- Phase 2.4: 從 asyncio contextvar 讀取 background loop 的 project_id
|
||||
|
||||
Usage:
|
||||
async with get_db_context() as db:
|
||||
async with get_db_context() as db: # 繼承 contextvar 或預設 awoooi
|
||||
...
|
||||
async with get_db_context("other-tenant") as db: # 明確指定 tenant
|
||||
...
|
||||
"""
|
||||
from src.core.context import get_current_project_id
|
||||
effective_pid = project_id if project_id is not None else get_current_project_id()
|
||||
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
try:
|
||||
await session.execute(
|
||||
text("SELECT set_config('app.project_id', :pid, TRUE)"),
|
||||
{"pid": effective_pid},
|
||||
)
|
||||
yield session
|
||||
await session.commit()
|
||||
except Exception:
|
||||
@@ -299,6 +317,62 @@ async def init_db() -> None:
|
||||
"ON timeline_events(incident_id);"
|
||||
))
|
||||
|
||||
# AwoooP Phase 2.6 (2026-05-04 ogt): budget_ledger 建表(ADR-120 Token Budget Hard Kill)
|
||||
await conn.execute(text("""
|
||||
CREATE TABLE IF NOT EXISTS budget_ledger (
|
||||
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
||||
project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi',
|
||||
agent_id VARCHAR(128),
|
||||
run_id UUID,
|
||||
model VARCHAR(64),
|
||||
provider VARCHAR(32),
|
||||
prompt_tokens INT,
|
||||
completion_tokens INT,
|
||||
cost_usd NUMERIC(10, 4) NOT NULL DEFAULT 0.0000,
|
||||
recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
"""))
|
||||
await conn.execute(text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_budget_ledger_project_date "
|
||||
"ON budget_ledger(project_id, recorded_at DESC);"
|
||||
))
|
||||
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): 四表加 project_id(RLS 多租戶隔離)
|
||||
# 防禦性 ALTER — 已存在欄位為 no-op,安全。
|
||||
# Batch 1 RLS migration 執行後,app.project_id 由 get_db_context() 自動設置。
|
||||
await conn.execute(text(
|
||||
"ALTER TABLE incidents "
|
||||
"ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_incidents_project_id "
|
||||
"ON incidents (project_id);"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"ALTER TABLE knowledge_entries "
|
||||
"ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_knowledge_entries_project_id "
|
||||
"ON knowledge_entries (project_id);"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"ALTER TABLE playbooks "
|
||||
"ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_playbooks_project_id "
|
||||
"ON playbooks (project_id);"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"ALTER TABLE audit_logs "
|
||||
"ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';"
|
||||
))
|
||||
await conn.execute(text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_audit_logs_project_id "
|
||||
"ON audit_logs (project_id);"
|
||||
))
|
||||
|
||||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 自我治理閉環
|
||||
# ADR-087: ai_governance_events 不可變 Event Sourcing 表
|
||||
# asyncpg 不允許 prepared statement 內多條指令,必須分開 execute
|
||||
|
||||
@@ -11,8 +11,9 @@ Schema 設計原則:
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import (
|
||||
JSON,
|
||||
@@ -25,6 +26,7 @@ from sqlalchemy import (
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
Numeric,
|
||||
String,
|
||||
Text,
|
||||
text,
|
||||
@@ -34,6 +36,7 @@ from sqlalchemy import (
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import ENUM as PgEnum
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy.dialects.postgresql import UUID as pg_UUID
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from src.db.base import Base
|
||||
@@ -368,6 +371,13 @@ class AuditLog(Base):
|
||||
default="default",
|
||||
nullable=False,
|
||||
)
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration
|
||||
project_id: Mapped[str] = mapped_column(
|
||||
String(64),
|
||||
default="awoooi",
|
||||
nullable=False,
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Execution Result
|
||||
success: Mapped[bool] = mapped_column(default=False, nullable=False)
|
||||
@@ -671,6 +681,13 @@ class IncidentRecord(Base):
|
||||
primary_key=True,
|
||||
comment="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
|
||||
)
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration
|
||||
project_id: Mapped[str] = mapped_column(
|
||||
String(64),
|
||||
default="awoooi",
|
||||
nullable=False,
|
||||
index=True,
|
||||
)
|
||||
|
||||
# === 狀態與嚴重度 ===
|
||||
status: Mapped[str] = mapped_column(
|
||||
@@ -813,6 +830,13 @@ class KnowledgeEntryRecord(Base):
|
||||
primary_key=True,
|
||||
default=generate_uuid,
|
||||
)
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration
|
||||
project_id: Mapped[str] = mapped_column(
|
||||
String(64),
|
||||
default="awoooi",
|
||||
nullable=False,
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Core Fields
|
||||
title: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
@@ -1075,6 +1099,13 @@ class PlaybookRecord(Base):
|
||||
String(36), primary_key=True,
|
||||
comment="Playbook 唯一識別碼 (PB-YYYYMMDD-XXXXXX)",
|
||||
)
|
||||
# AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration
|
||||
project_id: Mapped[str] = mapped_column(
|
||||
String(64),
|
||||
default="awoooi",
|
||||
nullable=False,
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Core Fields
|
||||
name: Mapped[str] = mapped_column(String(256), nullable=False)
|
||||
@@ -1612,3 +1643,45 @@ class AIProviderVersionHistory(Base):
|
||||
__table_args__ = (
|
||||
Index("ix_provider_version_captured", "provider", "captured_at"),
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# BudgetLedgerRecord — ADR-120 Token Budget Hard Kill(Phase 2.6)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6
|
||||
# =============================================================================
|
||||
|
||||
class BudgetLedgerRecord(Base):
    """
    Cost ledger for LLM calls (ADR-120 D5).

    One row is inserted after each LLM call completes. The rows feed:
    - cumulative tenant budget calculation (cached in Redis, synced from this
      table every minute)
    - dashboard spend statistics
    - alert threshold triggers (80% / 95% / 100%)
    """

    __tablename__ = "budget_ledger"

    # Primary key is generated by the database, not by the application.
    id: Mapped[UUID] = mapped_column(
        pg_UUID(as_uuid=True),
        server_default=text("gen_random_uuid()"),
        primary_key=True,
    )
    # Tenant that incurred the cost. The "awoooi" default is applied by the
    # ORM at insert time.
    # NOTE(review): index=True declares a single-column index in addition to
    # the composite index below, whose leading column is also project_id —
    # confirm the extra index is wanted.
    project_id: Mapped[str] = mapped_column(
        String(64),
        default="awoooi",
        nullable=False,
        index=True,
    )
    agent_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    run_id: Mapped[UUID | None] = mapped_column(
        pg_UUID(as_uuid=True), nullable=True
    )
    model: Mapped[str | None] = mapped_column(String(64), nullable=True)
    provider: Mapped[str | None] = mapped_column(String(32), nullable=True)
    prompt_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    completion_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # NOTE(review): scale 4 keeps four decimal places, so a per-call cost
    # smaller than 0.00005 USD would be stored as 0.0000 — confirm this
    # precision is sufficient for the cheapest models in use.
    cost_usd: Mapped[Decimal] = mapped_column(
        Numeric(10, 4),
        default=Decimal("0.0000"),
        nullable=False,
    )
    # Timezone-aware timestamp filled by the database (NOW()) on insert.
    recorded_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=text("NOW()"),
        nullable=False,
    )

    # Composite index over tenant and record time.
    __table_args__ = (
        Index(
            "idx_budget_ledger_project_date",
            "project_id",
            "recorded_at",
        ),
    )
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
"""載入 .claude/agents/*.md 並解析 system prompt(ADR-095)
|
||||
|
||||
2026-04-24 Claude Sonnet 4.6 (WS4 Hermes NL)
|
||||
2026-05-04 Claude Sonnet 4.6 (Task 1.2): 移除本機絕對路徑,改用 AGENTS_DIR 環境變數
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import pathlib
|
||||
from functools import lru_cache
|
||||
|
||||
_AGENTS_DIR = pathlib.Path("/Users/ogt/awoooi/.claude/agents")
|
||||
# 本機預設: /Users/ogt/awoooi/.claude/agents(由 AGENTS_DIR 覆蓋)
|
||||
# K8s 容器預設: /app/.claude/agents(Dockerfile COPY .claude/agents/ ./.claude/agents/)
|
||||
_AGENTS_DIR = pathlib.Path(os.getenv("AGENTS_DIR", "/app/.claude/agents"))
|
||||
|
||||
|
||||
def _parse_agent_md(path: pathlib.Path) -> str:
|
||||
|
||||
@@ -9,6 +9,7 @@ Layer 1 意圖路由(關鍵字正則)→ Ollama 本地模型(111)→ Tel
|
||||
debugger/vuln → deepseek-r1:14b(推理); code agents → qwen2.5-coder:7b; 其他 → qwen2.5:7b-instruct
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
@@ -17,7 +18,6 @@ import httpx
|
||||
import structlog
|
||||
from sqlalchemy import text
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.base import get_db_context
|
||||
from src.hermes.agent_loader import get_agent_system_prompt
|
||||
@@ -139,11 +139,11 @@ async def _write_dispatch_log(
|
||||
# T2:per-chat_id 速率限制(ADR-094,fail-open)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _check_rate_limit(chat_id: str) -> bool:
|
||||
async def _check_rate_limit(chat_id: str, project_id: str = "awoooi") -> bool:
|
||||
"""True = 允許;False = 超過限制(20 req/min per chat_id)。Redis 不可用時放行。"""
|
||||
try:
|
||||
redis = get_redis()
|
||||
key = f"hermes:rl:{chat_id}"
|
||||
key = f"{project_id}:hermes:rl:{chat_id}"
|
||||
count = await redis.incr(key)
|
||||
if count == 1:
|
||||
await redis.expire(key, _RATE_LIMIT_WINDOW_SEC)
|
||||
@@ -156,12 +156,15 @@ async def _check_rate_limit(chat_id: str) -> bool:
|
||||
# T3:Multi-turn session(Redis Hash TTL=300s,ADR-094)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _load_session_context(chat_id: str, user_id: int) -> str:
|
||||
async def _load_session_context(chat_id: str, user_id: int, project_id: str = "awoooi") -> str:
|
||||
"""載入最近 3 輪對話歷史(最多 600 字),組成 context prefix。Redis 不可用時回空字串。"""
|
||||
try:
|
||||
redis = get_redis()
|
||||
key = f"hermes:session:{chat_id}:{user_id}"
|
||||
key = f"{project_id}:hermes:session:{chat_id}:{user_id}"
|
||||
data = await redis.hgetall(key)
|
||||
if not data:
|
||||
# Phase A: fallback 到舊 key(滾動部署相容)
|
||||
data = await redis.hgetall(f"hermes:session:{chat_id}:{user_id}")
|
||||
if not data:
|
||||
return ""
|
||||
turns = sorted(
|
||||
@@ -175,16 +178,19 @@ async def _load_session_context(chat_id: str, user_id: int) -> str:
|
||||
|
||||
|
||||
async def _save_session_turn(
    chat_id: str, user_id: int, user_msg: str, assistant_reply: str, project_id: str = "awoooi"
) -> None:
    """Store one conversation turn in the Redis session hash and reset its TTL to 300s.

    The turn is written to the tenant-scoped key
    ``{project_id}:hermes:session:{chat_id}:{user_id}``. The user message is
    truncated to 100 characters and the assistant reply to 200 characters.

    Phase A dual-write: the un-namespaced legacy key
    ``hermes:session:{chat_id}:{user_id}`` is also written, but only for the
    default tenant ``"awoooi"``. The legacy key carries no tenant prefix, so
    writing another tenant's turns there would mix them into the default
    tenant's session history.

    Args:
        chat_id: Channel chat identifier used in the session key.
        user_id: Channel user identifier used in the session key.
        user_msg: The user's message for this turn.
        assistant_reply: The assistant's reply for this turn.
        project_id: Tenant identifier used as the key prefix.

    Returns:
        None. Any exception (e.g. Redis unavailable) is deliberately swallowed:
        session persistence is best-effort and must not break the reply path.
    """
    try:
        redis = get_redis()
        key = f"{project_id}:hermes:session:{chat_id}:{user_id}"
        # Hash field is the current Unix time in whole seconds.
        turn_key = f"turn_{int(time.time())}"
        value = f"用戶:{user_msg[:100]}\nHermes:{assistant_reply[:200]}"
        await redis.hset(key, turn_key, value)
        await redis.expire(key, 300)

        # Legacy key has no tenant prefix: only the default tenant may use it.
        if project_id == "awoooi":
            legacy_key = f"hermes:session:{chat_id}:{user_id}"  # Phase A dual-write
            await redis.hset(legacy_key, turn_key, value)
            await redis.expire(legacy_key, 300)
    except Exception:
        # Best-effort: a failed session write is silently ignored by design.
        pass
|
||||
|
||||
@@ -199,6 +205,7 @@ async def process_nl_message(
|
||||
chat_id: str,
|
||||
user_id: int,
|
||||
username: str = "",
|
||||
project_id: str = "awoooi",
|
||||
) -> str:
|
||||
"""
|
||||
處理 NL 訊息,回傳 Telegram 格式的回覆文字。
|
||||
@@ -231,7 +238,7 @@ async def process_nl_message(
|
||||
)
|
||||
|
||||
# T2:速率限制
|
||||
if not await _check_rate_limit(chat_id):
|
||||
if not await _check_rate_limit(chat_id, project_id):
|
||||
return "⚠️ 請求太頻繁,請稍後再試(每分鐘上限 20 次)。"
|
||||
|
||||
# Layer 1 意圖路由
|
||||
@@ -249,7 +256,7 @@ async def process_nl_message(
|
||||
system_prompt = get_agent_system_prompt(agent_name) or ""
|
||||
|
||||
# T3:載入 session context(最近 3 輪)
|
||||
session_ctx = await _load_session_context(chat_id, user_id)
|
||||
session_ctx = await _load_session_context(chat_id, user_id, project_id)
|
||||
prompt_with_ctx = f"{session_ctx}{user_message}" if session_ctx else user_message
|
||||
|
||||
t0 = time.monotonic()
|
||||
@@ -259,7 +266,9 @@ async def process_nl_message(
|
||||
success = False
|
||||
error_type: str | None = None
|
||||
try:
|
||||
ollama_base = getattr(settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
ollama_base = resolve_ollama_endpoint("hermes")
|
||||
async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc:
|
||||
resp = await _hc.post(
|
||||
f"{ollama_base}/api/chat",
|
||||
@@ -306,7 +315,7 @@ async def process_nl_message(
|
||||
|
||||
# T3:儲存本輪對話(只在成功時存)
|
||||
if success:
|
||||
await _save_session_turn(chat_id, user_id, user_message, result_text)
|
||||
await _save_session_turn(chat_id, user_id, user_message, result_text, project_id)
|
||||
|
||||
# T1:非阻擋寫入 hermes_dispatch_log(失敗不影響回覆)
|
||||
asyncio.create_task(
|
||||
|
||||
@@ -6,6 +6,11 @@ ADR-092 (2026-04-20 ogt + Claude Opus 4.7 Asia/Taipei)
|
||||
ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei):
|
||||
W-2 修復:改用 telegram_message_id IS NULL 判斷真正靜默,排除 tg_sent TTL 過期誤判
|
||||
W-5 新增:Agent Debate 失敗導致告警卡在分析中(description='待分析')
|
||||
ADR-092 B4 (2026-05-05 ogt + Claude Sonnet 4.6 Asia/Taipei):
|
||||
A2 修復:新 Pod 啟動後 90s leading sleep,避免 rollout 時立即觸發告警
|
||||
A3 修復:grace period 改為 Redis cluster-shared(watchdog:cluster_grace),
|
||||
消除 replicas=2 時 Pod 間 grace period 不一致造成 violation_codes 分歧
|
||||
W6 修復:dedup key 移除動態 low_count,改為穩定 "W6:trust_drift"
|
||||
|
||||
檢查項目:
|
||||
W-1 AI SLO 違反(決策品質,7d 滾動)
|
||||
@@ -13,6 +18,7 @@ ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei):
|
||||
W-3 飛輪 execution_success_rate 低落(< 30%)
|
||||
W-4 無 APPROVED Playbook(自動修復鏈路斷裂)
|
||||
W-5 Agent Debate 失敗(PENDING 告警 description='待分析' 超過 1 小時)
|
||||
W-6 Trust Drift 偵測(Playbook 信任度漂移)
|
||||
|
||||
任一異常 → send_meta_alert(TYPE-8M,flywheel_health)
|
||||
去重:Redis watchdog:alert:{dedup_hash} TTL 1h,避免每 15 分鐘重複洗版
|
||||
@@ -40,25 +46,52 @@ _DEDUP_TTL_SEC = 3600 # 同一告警 1 小時內不重複發送
|
||||
_TG_SILENCE_THRESHOLD = 2 # PENDING telegram_message_id IS NULL 告警門檻
|
||||
_FLYWHEEL_SUCCESS_MIN = 0.30 # 執行成功率下限
|
||||
_STUCK_ANALYSIS_THRESHOLD = 3 # Agent Debate 失敗導致卡住的告警門檻
|
||||
_TRUST_DRIFT_META_MIN_RATIO = 0.20 # 低於此比例只記治理事件,不升 Meta System
|
||||
|
||||
# 2026-05-03 ogt + Claude Opus 4.7 — feedback_silencing_alerts_recurring_violation
|
||||
# 啟動寬限期:30 分鐘內可 skip「資料還沒到」噪音;超過寬限期仍空 = 真資料管線斷,必須告警
|
||||
# 不可單獨用 skip 吞告警 — 一定要配對打「初始化期過、資料應該來但沒來」新告警
|
||||
_INIT_GRACE_SEC = 1800
|
||||
# 2026-05-05 ogt A3:_PROCESS_START 僅作 Redis 故障時的 fallback
|
||||
_PROCESS_START = time.monotonic()
|
||||
|
||||
# 2026-05-05 ogt A2:新 Pod 啟動 leading sleep,避免 rollout 時立即觸發告警
|
||||
# 90s < dedup TTL(3600s),不影響正常告警時效
|
||||
_STARTUP_SLEEP_SEC = 90
|
||||
|
||||
def _grace_active() -> bool:
|
||||
"""啟動 30 分鐘內為寬限期;超過後資料缺失必須告警"""
|
||||
return (time.monotonic() - _PROCESS_START) < _INIT_GRACE_SEC
|
||||
# Redis key for cluster-shared grace period(A3)
|
||||
_GRACE_REDIS_KEY = "watchdog:cluster_grace"
|
||||
|
||||
|
||||
async def _is_grace_active() -> bool:
    """Return True while the cluster-wide startup grace period is in effect (A3 fix).

    The grace period is represented by the Redis key ``_GRACE_REDIS_KEY``.
    A process may create that key (``SET ... NX``) only while it is itself
    within its first ``_INIT_GRACE_SEC`` seconds of uptime; after that it only
    checks whether the key still exists.

    Arming the key unconditionally on every call would re-create it as soon as
    it expired, so the grace period would never end and the "data still
    missing after the grace period" violations could never be raised.

    The key's TTL is capped to the time left in this process's own startup
    window, so a freshly armed grace period cannot outlive that window.

    Returns:
        True if the grace key exists in Redis. If Redis raises, falls back to
        the process-local check (uptime below ``_INIT_GRACE_SEC``).

    2026-05-05 ogt + Claude Sonnet 4.6 — ADR-092 B4
    """
    uptime_sec = time.monotonic() - _PROCESS_START
    starting_up = uptime_sec < _INIT_GRACE_SEC
    try:
        redis = get_redis()
        if starting_up:
            # NX: never extend a window that another pod already opened.
            remaining_sec = max(1, int(_INIT_GRACE_SEC - uptime_sec))
            await redis.set(_GRACE_REDIS_KEY, "1", nx=True, ex=remaining_sec)
        return bool(await redis.exists(_GRACE_REDIS_KEY))
    except Exception:
        # Redis unavailable: degrade to the process-local monotonic check.
        return starting_up
|
||||
|
||||
|
||||
async def run_ai_slo_watchdog_loop() -> None:
|
||||
"""
|
||||
永久迴圈:每 15 分鐘自健診,異常時發送 TYPE-8M Meta-System 告警。
|
||||
由 main.py lifespan 透過 asyncio.create_task() 啟動。
|
||||
A2:先 sleep 90s 再開始第一次 check,避免新 Pod 上線立即觸發告警。
|
||||
"""
|
||||
logger.info("ai_slo_watchdog_started", interval_sec=_INTERVAL_SEC)
|
||||
logger.info(
|
||||
"ai_slo_watchdog_started",
|
||||
interval_sec=_INTERVAL_SEC,
|
||||
startup_sleep_sec=_STARTUP_SLEEP_SEC,
|
||||
)
|
||||
# A2 修復:Leading sleep — 讓服務先穩定,避免 rollout 時立即觸發
|
||||
await asyncio.sleep(_STARTUP_SLEEP_SEC)
|
||||
while True:
|
||||
try:
|
||||
await _check_once()
|
||||
@@ -68,7 +101,15 @@ async def run_ai_slo_watchdog_loop() -> None:
|
||||
|
||||
|
||||
async def _check_once() -> None:
|
||||
# violations = 顯示用(含動態數值,送 Telegram)
|
||||
# violation_codes = dedup 用(穩定 W-code,不含動態數值)
|
||||
# 2026-05-04 ogt: 分離 dedup key 與顯示字串
|
||||
# 根因:W-2/3/5/6 字串含動態數字(count/ratio/score),每次微變 → 不同 SHA256 → dedup 失效
|
||||
# 修法:dedup 用穩定 violation_codes(W-N:type 格式),Telegram 照常顯示動態值
|
||||
violations: list[str] = []
|
||||
violation_codes: list[str] = []
|
||||
# A3 修復:cluster-shared grace period,單次查詢供所有 W-check 使用,避免 Pod 間不一致
|
||||
grace = await _is_grace_active()
|
||||
|
||||
# W-1: AI SLO 違反(決策品質 7d 滾動)
|
||||
try:
|
||||
@@ -77,6 +118,7 @@ async def _check_once() -> None:
|
||||
if report.any_violated:
|
||||
violated = [m.name for m in report.metrics if m.violated]
|
||||
violations.append(f"SLO 違反: {', '.join(violated)}")
|
||||
violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w1_slo_check_failed", error=str(e))
|
||||
|
||||
@@ -91,6 +133,7 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
f"{silent_count} 個 PENDING 告警超 30 分鐘未送達 Telegram(未曾發送,非 TTL 過期)"
|
||||
)
|
||||
violation_codes.append("W2:tg_silence")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e))
|
||||
|
||||
@@ -104,20 +147,21 @@ async def _check_once() -> None:
|
||||
from src.services.flywheel_stats_service import FlywheelStatsService
|
||||
metrics = await FlywheelStatsService().compute()
|
||||
if metrics and metrics.execution_success_rate is None:
|
||||
if _grace_active():
|
||||
if grace:
|
||||
logger.debug(
|
||||
"watchdog_w3_init_grace_skip",
|
||||
reason="execution_sample_below_min",
|
||||
uptime_sec=int(time.monotonic() - _PROCESS_START),
|
||||
)
|
||||
else:
|
||||
violations.append(
|
||||
"飛輪執行成功率資料管線無流量(uptime > 30min 仍無樣本)"
|
||||
)
|
||||
violation_codes.append("W3:flywheel_no_data")
|
||||
elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN:
|
||||
violations.append(
|
||||
f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}"
|
||||
)
|
||||
violation_codes.append("W3:flywheel_low_rate")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w3_flywheel_check_failed", error=str(e))
|
||||
|
||||
@@ -129,18 +173,19 @@ async def _check_once() -> None:
|
||||
try:
|
||||
approved_count, total_playbook_count = await _count_approved_playbooks()
|
||||
if total_playbook_count == 0:
|
||||
if _grace_active():
|
||||
if grace:
|
||||
logger.info(
|
||||
"watchdog_w4_init_grace_skip",
|
||||
reason="playbook_table_empty_likely_initializing",
|
||||
uptime_sec=int(time.monotonic() - _PROCESS_START),
|
||||
)
|
||||
else:
|
||||
violations.append(
|
||||
"Playbook 表為空 — 初始化失敗或表被清空(uptime > 30min 仍 0 筆)"
|
||||
)
|
||||
violation_codes.append("W4:playbook_table_empty")
|
||||
elif approved_count == 0:
|
||||
violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂(evolver 可能全部封存)")
|
||||
violation_codes.append("W4:no_approved_playbook")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w4_playbook_check_failed", error=str(e))
|
||||
|
||||
@@ -154,27 +199,35 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
f"Agent Debate 失敗導致 {stuck_count} 個告警分析卡住(PENDING + description='待分析' 超過 1 小時)"
|
||||
)
|
||||
violation_codes.append("W5:stuck_analysis")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w5_stuck_analysis_check_failed", error=str(e))
|
||||
|
||||
# W-6: Trust Drift 偵測(Playbook 信任度漂移)
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6(亞太): 整併雙寫路徑
|
||||
# 原行為:呼叫 trust_drift_detector.run() 直接寫 event_type=trust_drift 到 PG
|
||||
# governance_agent.check_trust_drift() 每 1h 也寫同一 event_type → 雙寫
|
||||
# 整併:改呼叫 governance_agent.check_trust_drift() 為唯一 source-of-truth
|
||||
# W-6 watchdog 仍每 15 分鐘執行(感知器),violations 計數用於 meta-alert 觸發
|
||||
# PG 寫入由 governance_agent._alert() 統一處理,避免雙寫
|
||||
# 2026-05-05 Codex: Watchdog 仍透過 governance_agent 單一入口,
|
||||
# 但用 emit_alert=False 只取統計,避免與 hourly self-check 發出雙重 Telegram。
|
||||
try:
|
||||
from src.services.governance_agent import get_governance_agent
|
||||
trust_result = await get_governance_agent().check_trust_drift()
|
||||
if trust_result.get("drifted", 0) > 0:
|
||||
drifted = trust_result["drifted"]
|
||||
trust_result = await get_governance_agent().check_trust_drift(emit_alert=False)
|
||||
drifted = trust_result.get("drifted", 0)
|
||||
drift_ratio = float(trust_result.get("drift_ratio") or 0.0)
|
||||
if drifted > 0 and drift_ratio >= _TRUST_DRIFT_META_MIN_RATIO:
|
||||
auto_deprecated = trust_result.get("auto_deprecated", 0)
|
||||
kept = trust_result.get("kept", 0)
|
||||
violations.append(
|
||||
f"Trust Drift 偵測到 {drifted} 個 Playbook 信任度低落"
|
||||
f"(auto-deprecated: {auto_deprecated},待人工審核: {kept})"
|
||||
)
|
||||
# 2026-05-05 ogt W6 修復:移除動態 low_count,避免 count 微變繞過 dedup
|
||||
violation_codes.append("W6:trust_drift")
|
||||
elif drifted > 0:
|
||||
logger.info(
|
||||
"watchdog_w6_trust_drift_below_meta_threshold",
|
||||
drifted=drifted,
|
||||
drift_ratio=round(drift_ratio, 3),
|
||||
threshold=_TRUST_DRIFT_META_MIN_RATIO,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e))
|
||||
|
||||
@@ -182,14 +235,20 @@ async def _check_once() -> None:
|
||||
logger.debug("ai_slo_watchdog_all_ok", checks=6)
|
||||
return
|
||||
|
||||
# 去重:violations 相同內容 1 小時內不重複發
|
||||
dedup_hash = f"{hash(tuple(sorted(violations))) & 0xFFFFFF:06x}"
|
||||
# 去重:用穩定 violation_codes 計算 SHA256,避免動態數值(ratio/score)造成每次不同 hash
|
||||
# 2026-05-04 ogt: dedup 分離顯示字串與 dedup key
|
||||
# 根因:violations 字串含動態數字(count/ratio/score),每次微變 → SHA256 不同 → dedup 失效
|
||||
# 修法:violation_codes 只含 W-code + 穩定類型,不含浮點數值
|
||||
import hashlib
|
||||
_content = "|".join(sorted(violation_codes))
|
||||
dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12]
|
||||
dedup_key = f"watchdog:alert:{dedup_hash}"
|
||||
redis = get_redis()
|
||||
if await redis.exists(dedup_key):
|
||||
# setnx atomic — 同時多個 pod 只有第一個能 set,避免並發多發
|
||||
set_ok = await redis.set(dedup_key, "1", ex=_DEDUP_TTL_SEC, nx=True)
|
||||
if not set_ok:
|
||||
logger.debug("ai_slo_watchdog_deduped", key=dedup_key)
|
||||
return
|
||||
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
|
||||
|
||||
violation_lines = [
|
||||
f"{idx + 1}. {item}" for idx, item in enumerate(violations)
|
||||
@@ -198,7 +257,7 @@ async def _check_once() -> None:
|
||||
system_impact = "\n".join(
|
||||
[
|
||||
f"檢出 {len(violations)} 項 KPI 異常(W-1~W-6)",
|
||||
f"關鍵影響:飛輪自動化能力可能降級",
|
||||
"關鍵影響:飛輪自動化能力可能降級",
|
||||
*violation_lines,
|
||||
]
|
||||
)
|
||||
|
||||
@@ -479,7 +479,7 @@ async def _collect_all_k8s_assets() -> tuple[list[dict[str, Any]], list[dict[str
|
||||
|
||||
# 6. Prometheus targets — 補齊 host-install services (110/112/188/125 等非 K8s)
|
||||
# Gap 1 修補 (2026-04-19 audit): 原本 asset_inventory 只涵蓋 K8s,
|
||||
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis/Ollama host-install 全漏
|
||||
# 110 Harbor/Gitea/監控 + 188 PostgreSQL/Redis host-install 全漏
|
||||
# 用 Prometheus /api/v1/targets 自動發現全節點服務
|
||||
try:
|
||||
prom_assets, host_relationships = await _collect_prometheus_targets()
|
||||
|
||||
@@ -172,7 +172,7 @@ _LLM_FORECAST_PROMPT = """你是 AWOOOI 容量規劃專家。以下 host 過去
|
||||
{findings_json}
|
||||
|
||||
## 當前主機環境資訊
|
||||
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/Ollama/MinIO)
|
||||
- 主機架構: 110 (Harbor/Gitea/監控), 112 (Security), 120/121 (K3s), 125 (K3s backup), 188 (PG/Redis/MinIO)
|
||||
- 判斷請考慮: 該主機上跑什麼服務、常見瓶頸模式
|
||||
|
||||
## 輸出規格 (必須是合法 JSON,純 JSON 無前後文字)
|
||||
|
||||
@@ -86,6 +86,7 @@ async def evaluate_once() -> dict[str, int]:
|
||||
"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0,
|
||||
"playbook_updated": 0, "remediation_updated": 0,
|
||||
"rule_matching_updated": 0, "rule_creation_updated": 0,
|
||||
"rules_auto_created": 0,
|
||||
}
|
||||
error_msg: str | None = None
|
||||
|
||||
@@ -129,6 +130,13 @@ async def evaluate_once() -> dict[str, int]:
|
||||
stats["llm_analyzed"] = True
|
||||
await _send_telegram_gaps(red_summary, llm_analysis)
|
||||
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
|
||||
# 對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
|
||||
# COVERAGE_AUTO_RULE_ENABLED flag 控制(預設啟用)
|
||||
if getattr(settings, "COVERAGE_AUTO_RULE_ENABLED", True):
|
||||
created = await _auto_create_rules_for_uncovered_assets(run_id)
|
||||
stats["rules_auto_created"] = created
|
||||
|
||||
await _log_aol(stats, duration_ms, error_msg)
|
||||
|
||||
logger.info(
|
||||
@@ -140,6 +148,7 @@ async def evaluate_once() -> dict[str, int]:
|
||||
remediation=stats["remediation_updated"],
|
||||
rule_matching=stats["rule_matching_updated"],
|
||||
rule_creation=stats["rule_creation_updated"],
|
||||
rules_auto_created=stats.get("rules_auto_created", 0),
|
||||
llm_analyzed=bool(llm_analysis),
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
@@ -744,3 +753,179 @@ async def _log_aol(stats: dict[str, int], duration_ms: int, error: str | None) -
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("coverage_evaluator_aol_failed", error=str(e))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
|
||||
# ============================================================================
|
||||
|
||||
_COVERAGE_RULE_COOLDOWN_SEC = 86400  # 24h per-asset cooldown so rules are not re-created every run


def _promql_regex_literal(value: str) -> str:
    """Escape a whitelisted label value for a double-quoted PromQL regex matcher."""
    # The whitelist only lets `.` through as a regex metacharacter. The backslash is
    # doubled because a double-quoted PromQL string treats `\` as its own escape char.
    return value.replace(".", "\\\\.")


async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
    """
    Create pending-review alert rules for assets whose auto_alerting coverage is red.

    Flow:
      1. Select up to 5 host / k8s_workload assets with auto_alerting=red in the run.
      2. Skip assets that still have a Redis cooldown key (24h).
      3. Build a templated PromQL rule depending on asset_type.
      4. INSERT into alert_rule_catalog (source='ai_generated',
         review_status='pending_review'); an existing rule_name is left untouched
         (ON CONFLICT DO NOTHING).
      5. Return the number of rows actually inserted.

    Invariants:
      - Rules are only created as pending_review; nothing is approved here.
      - rule_name is the unique key: CoverageAuto_{HostDown|WorkloadDown}_{safe_key}.
      - Redis failures never abort the run; the cooldown check is skipped instead.

    Args:
        run_id: Coverage run identifier (cast to uuid in SQL). Falsy -> returns 0.

    Returns:
        Number of rules inserted by this call. Errors are logged and swallowed so the
        caller's main flow is never interrupted.
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context
    import json as _j
    import re

    if not run_id:
        return 0

    created = 0
    try:
        async with get_db_context() as db:
            # Red auto_alerting assets of type host / k8s_workload (at most 5).
            rows = await db.execute(
                _sql("""
                    SELECT ai.asset_id, ai.asset_key, ai.asset_type,
                           ai.name, ai.host, ai.namespace,
                           ai.metadata->>'internal_ip' AS internal_ip
                    FROM asset_coverage_snapshot cs
                    JOIN asset_inventory ai ON cs.asset_id = ai.asset_id
                    WHERE cs.run_id = CAST(:rid AS uuid)
                      AND cs.dimension = 'auto_alerting'
                      AND cs.coverage_status = 'red'
                      AND ai.asset_type IN ('host', 'k8s_workload')
                    ORDER BY ai.asset_type, ai.asset_key
                    LIMIT 5
                """),
                {"rid": run_id},
            )
            assets = rows.fetchall()

        # Whitelist for values interpolated into PromQL: hostname / IP / k8s-name
        # characters only. `\Z` instead of `$` so a trailing newline is rejected too.
        _safe_label_val = re.compile(r'^[a-zA-Z0-9._\-]+\Z')

        for asset in assets:
            asset_key = str(asset.asset_key or "")
            asset_type = str(asset.asset_type or "")
            name = str(asset.name or "")
            host = str(asset.host or "")
            namespace = str(asset.namespace or "")
            internal_ip = str(asset.internal_ip or "")

            # Redis 24h cooldown (best effort: any Redis problem just skips the check).
            cooldown_key = f"coverage_rule_created:{asset_key}"
            try:
                from src.core.redis_client import get_redis
                redis = get_redis()
                already = await redis.get(cooldown_key)
                if already:
                    logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key)
                    continue
            except RuntimeError as e:
                logger.warning("coverage_auto_rule_redis_unavailable", asset_key=asset_key, error=str(e))
            except Exception as e:
                # Still non-fatal, but no longer silent.
                logger.warning("coverage_auto_rule_cooldown_check_failed", asset_key=asset_key, error=str(e))

            # Build the PromQL rule; every interpolated value must pass the whitelist.
            safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60]
            if asset_type == "host":
                ip_for_match = internal_ip or host
                if not ip_for_match or not _safe_label_val.match(ip_for_match):
                    logger.debug("coverage_auto_rule_skip_unsafe_ip", asset_key=asset_key, ip=ip_for_match)
                    continue
                rule_name = f"CoverageAuto_HostDown_{safe_key}"
                # Dots are escaped so "10.0.0.1" cannot match e.g. "10a0b0c1:9100".
                expr = f'up{{instance=~"{_promql_regex_literal(ip_for_match)}:.*"}} == 0'
                severity = "warning"
                display_host = host if _safe_label_val.match(host) else ip_for_match
                labels = {"host": display_host, "layer": "infrastructure", "source": "coverage_auto"}
                annotations = {
                    "summary": f"主機 {display_host} 無 Prometheus 探測響應",
                    "description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve",
                }
                duration_seconds = 120
            elif asset_type == "k8s_workload":
                if not name or not _safe_label_val.match(name):
                    logger.debug("coverage_auto_rule_skip_unsafe_name", asset_key=asset_key, name=name)
                    continue
                if namespace and not _safe_label_val.match(namespace):
                    logger.debug("coverage_auto_rule_skip_unsafe_ns", asset_key=asset_key, namespace=namespace)
                    continue
                rule_name = f"CoverageAuto_WorkloadDown_{safe_key}"
                ns_selector = f',namespace="{namespace}"' if namespace else ""
                expr = f'kube_deployment_status_replicas_available{{deployment="{name}"{ns_selector}}} == 0'
                severity = "warning"
                labels = {"namespace": namespace or "default", "deployment": name, "source": "coverage_auto"}
                annotations = {
                    "summary": f"{name} 在 {namespace or 'default'} 無可用副本",
                    "description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve",
                }
                duration_seconds = 180
            else:
                continue

            # Insert-or-skip into alert_rule_catalog. RETURNING yields a row only when
            # the INSERT really happened (a conflict returns no row).
            try:
                async with get_db_context() as db:
                    row = await db.execute(
                        _sql("""
                            INSERT INTO alert_rule_catalog (
                                rule_name, source, expr, duration_seconds,
                                severity, labels, annotations,
                                created_by_agent, review_status,
                                created_at, updated_at
                            ) VALUES (
                                :rname, 'ai_generated', :expr, :dur,
                                :sev, CAST(:labels AS jsonb), CAST(:ann AS jsonb),
                                'coverage_evaluator', 'pending_review',
                                NOW(), NOW()
                            )
                            ON CONFLICT (rule_name) DO NOTHING
                            RETURNING rule_name
                        """),
                        {
                            "rname": rule_name[:200],
                            "expr": expr[:4000],
                            "dur": duration_seconds,
                            "sev": severity,
                            "labels": _j.dumps(labels, ensure_ascii=False),
                            "ann": _j.dumps(annotations, ensure_ascii=False),
                        },
                    )
                    actually_inserted = row.fetchone() is not None

                if actually_inserted:
                    created += 1
                    logger.info(
                        "coverage_auto_rule_created",
                        rule_name=rule_name,
                        asset_key=asset_key,
                        asset_type=asset_type,
                    )
                    # Arm the cooldown only after a real insert.
                    try:
                        from src.core.redis_client import get_redis
                        redis = get_redis()
                        await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC)
                    except Exception as e:
                        # Best effort: the rule exists; the unique rule_name still
                        # prevents duplicates on the next run.
                        logger.warning("coverage_auto_rule_cooldown_set_failed", asset_key=asset_key, error=str(e))
                else:
                    logger.debug("coverage_auto_rule_conflict_skip", rule_name=rule_name)

            except Exception as e:
                logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e))

    except Exception as e:
        logger.warning("coverage_auto_create_rules_failed", error=str(e))

    if created > 0:
        logger.info("coverage_auto_rules_summary", created=created)

    return created
|
||||
|
||||
@@ -25,7 +25,9 @@ Feature Flag:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
@@ -65,6 +65,7 @@ from src.api.v1 import (
|
||||
signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037)
|
||||
)
|
||||
from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection
|
||||
from src.api.v1 import platform as platform_v1 # AwoooP Phase 4: Platform Shell(Shadow Mode)
|
||||
from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫
|
||||
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
|
||||
from src.api.v1 import notifications as notifications_v1 # 2026-04-10: 通知頻道狀態
|
||||
@@ -185,6 +186,11 @@ else:
|
||||
@asynccontextmanager
|
||||
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
"""Application lifespan events"""
|
||||
# AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context
|
||||
# asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi
|
||||
from src.core.context import PROJECT_ID
|
||||
PROJECT_ID.set("awoooi")
|
||||
|
||||
# Startup
|
||||
logger.info(
|
||||
"api_startup",
|
||||
@@ -677,7 +683,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
logger.warning("ollama_failover_system_start_failed", error=str(e))
|
||||
|
||||
# 2026-04-27 P3.2.2 by Claude — AI Provider 版本追蹤(每 1 小時)
|
||||
# 探測 5 Provider(ollama/ollama_188/gemini/claude/openclaw_nemo)版本
|
||||
# 探測 5 Provider(ollama/ollama_local/gemini/claude/openclaw_nemo)版本
|
||||
# 寫入 ai_provider_version_history;版本變更時 log warning,P3.2.3 alerter 後續整合
|
||||
try:
|
||||
async def _run_model_version_tracker_loop() -> None:
|
||||
@@ -703,6 +709,16 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("model_version_tracker_schedule_failed", error=str(e))
|
||||
|
||||
# AwoooP Phase 4 (2026-05-04 ogt + Claude Sonnet 4.6): Platform Worker(Shadow Mode Shell)
|
||||
# ADR-106 Strangler Fig Phase 4:SKIP LOCKED run worker + stale run reaper
|
||||
# Shadow mode:is_shadow=True,0 user-visible response,0 destructive tool call
|
||||
try:
|
||||
from src.workers.platform_worker import start_platform_worker
|
||||
await start_platform_worker()
|
||||
logger.info("platform_worker_started", mode="shadow")
|
||||
except Exception as e:
|
||||
logger.warning("platform_worker_start_failed", error=str(e))
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
@@ -727,6 +743,14 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("auto_repair_drain_failed", error=str(e))
|
||||
|
||||
# AwoooP Phase 4: Platform Worker 優雅停機(2026-05-04 ogt)
|
||||
try:
|
||||
from src.workers.platform_worker import stop_platform_worker
|
||||
await stop_platform_worker()
|
||||
logger.info("platform_worker_stopped")
|
||||
except Exception as e:
|
||||
logger.warning("platform_worker_stop_failed", error=str(e))
|
||||
|
||||
# Phase 6.1: 關閉 Signal Worker (先關閉 Consumer)
|
||||
await close_signal_worker()
|
||||
await publisher.stop()
|
||||
@@ -968,6 +992,8 @@ app.include_router(agent.router, prefix="/api/v1/agent", tags=["Agent"])
|
||||
app.include_router(
|
||||
notifications.router, prefix="/api/v1/notifications", tags=["Notifications"]
|
||||
)
|
||||
# AwoooP Phase 4 (2026-05-04 ogt): Platform Shell — Shadow Mode Run API
|
||||
app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP Platform"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
437
apps/api/src/models/awooop_contracts.py
Normal file
437
apps/api/src/models/awooop_contracts.py
Normal file
@@ -0,0 +1,437 @@
|
||||
"""
|
||||
AwoooP Contract Pydantic Models
|
||||
================================
|
||||
Phase 3: 六合約家族 Pydantic v2 驗證模型(ADR-112)
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
六合約家族:
|
||||
1. ProjectTenantContract — 租戶/專案能力邊界
|
||||
2. AgentContract — Agent 模型、工具、治理
|
||||
3. MCPGatewayContract — MCP 工具閘道
|
||||
4. PolicyRoutingContract — LLM 路由規則
|
||||
5. RuntimeRunStateContract — Run FSM 狀態
|
||||
6. ChannelEventContract — Channel 事件(冪等)
|
||||
|
||||
所有含 artifact ref 的欄位都附 sha256(ADR-112 artifact integrity)。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator, model_validator
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 共用型別
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# All patterns end in `\Z` rather than `$`: with re.match, `$` also matches just
# before a final "\n", which would let "value\n" pass validation.
_SHA256_RE = re.compile(r"^[0-9a-f]{64}\Z")
_PROJECT_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{1,63}\Z")
_AGENT_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{1,127}\Z")
_UUID_RE = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\Z"
)


def _validate_sha256(v: str | None, field_name: str = "sha256") -> str | None:
    """Return *v* unchanged if it is None or a 64-char lowercase hex digest.

    Args:
        v: Candidate digest, or None when the field is optional.
        field_name: Name used in the error message.

    Raises:
        ValueError: If *v* is not exactly 64 characters of [0-9a-f].
    """
    if v is None:
        return v
    if not _SHA256_RE.match(v):
        raise ValueError(f"{field_name} 必須為 64 位 hex 字串")
    return v
|
||||
|
||||
|
||||
class MigrationMode(str, Enum):
    # String-valued enum: each member is also a str equal to its raw value.
    # LEGACY is the default migration_mode of ProjectTenantContract.
    LEGACY = "legacy_awoooi_default"
    SHADOW = "shadow"
    CANARY = "canary"
    ACTIVE = "active"
|
||||
|
||||
|
||||
class ChannelType(str, Enum):
    # Closed set of channel kinds; members are str-compatible.
    TELEGRAM = "telegram"
    SLACK = "slack"
    WEBHOOK = "webhook"
    API = "api"
|
||||
|
||||
|
||||
class Provider(str, Enum):
    # Closed set of LLM provider identifiers; members are str-compatible.
    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    OLLAMA = "ollama"
    GEMINI = "gemini"
    NVIDIA = "nvidia"
    OPENROUTER = "openrouter"
|
||||
|
||||
|
||||
class RunState(str, Enum):
    # States a run can be in; members are str-compatible.
    PENDING = "pending"
    RUNNING = "running"
    WAITING_APPROVAL = "waiting_approval"
    WAITING_TOOL = "waiting_tool"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    TIMEOUT = "timeout"
|
||||
|
||||
|
||||
class AuthScheme(str, Enum):
    # Authentication schemes; NONE is the default auth_scheme of MCPGatewayContract.
    NONE = "none"
    BEARER = "bearer"
    HMAC = "hmac"
|
||||
|
||||
|
||||
class Transport(str, Enum):
    # Gateway transports; HTTP and SSE require an endpoint (see MCPGatewayContract).
    STDIO = "stdio"
    HTTP = "http"
    SSE = "sse"
|
||||
|
||||
|
||||
class EventType(str, Enum):
    # Kinds of inbound channel events; members are str-compatible.
    MESSAGE_RECEIVED = "message_received"
    CALLBACK_QUERY = "callback_query"
    COMMAND_INVOKED = "command_invoked"
    WEBHOOK_POST = "webhook_post"
    API_REQUEST = "api_request"
    APPROVAL_RESPONSE = "approval_response"
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 1. Project Tenant Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class ProjectTenantContract(BaseModel):
    """Tenant/project contract (ADR-111/115)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    project_id: str = Field(..., description="全局唯一租戶識別符")
    display_name: str = Field(..., min_length=1, max_length=256)
    migration_mode: MigrationMode = MigrationMode.LEGACY
    budget_limit_usd: float | None = Field(default=None, ge=0)
    allowed_channels: list[ChannelType] = Field(default_factory=list)
    is_active: bool = True
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        # Must match _PROJECT_ID_RE: [a-z0-9] first, then 1-63 of [a-z0-9_-].
        if _PROJECT_ID_RE.match(v) is None:
            raise ValueError("project_id 只允許 a-z, 0-9, _, -,長度 2-64")
        return v

    @field_validator("allowed_channels")
    @classmethod
    def validate_unique_channels(cls, v: list[ChannelType]) -> list[ChannelType]:
        # A set collapses duplicates, so a shorter set means a repeated channel.
        distinct = set(v)
        if len(distinct) != len(v):
            raise ValueError("allowed_channels 不可包含重複項目")
        return v
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 2. Agent Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class ArtifactRef(BaseModel):
    """Artifact reference carrying a SHA-256 digest (ADR-112 artifact integrity)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    artifact_id: str
    sha256: str = Field(..., description="SHA-256 hex digest(64 位)")

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str) -> str:
        # The field is required, so the shared helper never sees None here.
        checked = _validate_sha256(v, "sha256")
        return checked  # type: ignore[return-value]
|
||||
|
||||
|
||||
class ToolRef(BaseModel):
    """Reference to a tool used by an agent."""

    # Extra keys are accepted on this model.
    model_config = {"extra": "allow"}

    tool_name: str
    mcp_gateway_id: str | None = None
    sha256: str | None = None

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str | None) -> str | None:
        # None passes through; anything else must be a 64-char hex digest.
        return _validate_sha256(v, field_name="tool sha256")
|
||||
|
||||
|
||||
class AgentContract(BaseModel):
    """Agent contract (ADR-112)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    agent_id: str = Field(..., description="Agent 識別符")
    agent_name: str = Field(..., min_length=1, max_length=256)
    model: str = Field(..., min_length=1, max_length=128)
    provider: Provider
    max_tokens: int | None = Field(default=None, ge=1, le=200000)
    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
    system_prompt_ref: ArtifactRef | None = None
    tools: list[ToolRef] = Field(default_factory=list)
    budget_limit_usd_per_run: float | None = Field(default=None, ge=0)
    require_approval: bool = False
    approval_timeout_seconds: int | None = Field(default=None, ge=60, le=86400)
    max_parallel_runs: int = Field(default=1, ge=1, le=100)
    tags: list[str] = Field(default_factory=list)

    @field_validator("agent_id")
    @classmethod
    def validate_agent_id(cls, v: str) -> str:
        # Must match _AGENT_ID_RE: [a-z0-9] first, then 1-127 of [a-z0-9_-].
        if _AGENT_ID_RE.match(v) is None:
            raise ValueError("agent_id 只允許 a-z, 0-9, _, -,長度 2-128")
        return v

    @model_validator(mode="after")
    def validate_approval_config(self) -> AgentContract:
        # When approval is required but no timeout was given, default to 300 seconds.
        timeout_missing = self.approval_timeout_seconds is None
        if self.require_approval and timeout_missing:
            self.approval_timeout_seconds = 300
        return self
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 3. MCP Gateway Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class ToolExposed(BaseModel):
    """Definition of a tool exposed by a gateway."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    tool_name: str
    description: str | None = None
    schema_sha256: str = Field(..., description="工具 input schema SHA-256")
    is_destructive: bool = False

    @field_validator("schema_sha256")
    @classmethod
    def validate_schema_sha256(cls, v: str) -> str:
        # The field is required, so the shared helper never sees None here.
        checked = _validate_sha256(v, "schema_sha256")
        return checked  # type: ignore[return-value]
|
||||
|
||||
|
||||
class MCPGatewayContract(BaseModel):
    """MCP Gateway contract (ADR-113)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    gateway_id: str
    gateway_name: str = Field(..., min_length=1, max_length=256)
    transport: Transport
    endpoint: str | None = None
    auth_scheme: AuthScheme = AuthScheme.NONE
    hmac_secret_ref: str | None = None
    tools_exposed: list[ToolExposed] = Field(default_factory=list)
    rate_limit_rpm: int | None = Field(None, ge=1)
    timeout_seconds: int = Field(30, ge=1, le=300)
    is_enabled: bool = True

    @model_validator(mode="after")
    def validate_http_endpoint(self) -> MCPGatewayContract:
        """Require a non-empty endpoint for the HTTP and SSE transports.

        Raises:
            ValueError: If transport is HTTP or SSE and endpoint is missing or empty.
        """
        if self.transport in (Transport.HTTP, Transport.SSE) and not self.endpoint:
            # Use .value explicitly: formatting a (str, Enum) member in an f-string
            # yields "Transport.HTTP" on Python >= 3.11 but "http" on older versions.
            raise ValueError(f"transport={self.transport.value} 時 endpoint 為必填")
        return self
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 4. Policy Routing Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TimeRange(BaseModel):
    # UTC time window expressed as two "HH:MM" strings. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    # Hours are limited to 00-23; the previous pattern "[0-2][0-9]" also accepted
    # impossible hours such as "24:00" or "29:59".
    start_utc: str = Field(..., pattern=r"^([01][0-9]|2[0-3]):[0-5][0-9]$")
    end_utc: str = Field(..., pattern=r"^([01][0-9]|2[0-3]):[0-5][0-9]$")
|
||||
|
||||
|
||||
class RoutingCondition(BaseModel):
    # Optional conditions attached to a RoutingRule. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    task_types: list[str] = Field(default_factory=list)
    max_prompt_tokens: int | None = Field(default=None, ge=1)
    time_range: TimeRange | None = None
|
||||
|
||||
|
||||
class RoutingRule(BaseModel):
    # One provider/model routing entry. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    rule_id: str
    priority: int = Field(..., ge=0, le=9999)
    provider: Provider
    model: str
    condition: RoutingCondition | None = None
    weight: int = Field(default=100, ge=1, le=100)
|
||||
|
||||
|
||||
class RetryPolicy(BaseModel):
    # Retry settings; every field has a default. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    max_retries: int = Field(default=3, ge=0, le=10)
    backoff_base_seconds: float = Field(default=1.0, ge=0.1, le=60)
    retry_on_provider_errors: bool = True
|
||||
|
||||
|
||||
class PolicyRoutingContract(BaseModel):
    """Routing / policy contract."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    policy_id: str
    policy_name: str = Field(..., min_length=1, max_length=256)
    # At least one routing rule is required.
    routing_rules: list[RoutingRule] = Field(..., min_length=1)
    fallback_provider: Provider | None = None
    fallback_model: str | None = None
    max_cost_per_run_usd: float | None = Field(default=None, ge=0)
    retry_policy: RetryPolicy = Field(default_factory=RetryPolicy)
    effective_from: datetime | None = None
    effective_to: datetime | None = None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 5. Runtime Run State Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class RunTrigger(BaseModel):
    # Describes what started a run. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    # trigger_type is restricted by pattern to the five listed literals.
    trigger_type: str = Field(..., pattern="^(channel_event|schedule|api|sub_agent|retry)$")
    channel_event_id: str | None = None
    schedule_id: str | None = None
    triggered_by: str | None = None
|
||||
|
||||
|
||||
class RuntimeRunStateContract(BaseModel):
    """Run state-machine contract (ADR-106 Phase 3)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    run_id: str = Field(..., description="UUID v7")
    project_id: str
    agent_id: str
    state: RunState
    trace_id: str | None = None
    parent_run_id: str | None = None
    trigger: RunTrigger | None = None
    input_sha256: str | None = None
    output_sha256: str | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None
    timeout_at: datetime | None = None
    error_code: str | None = None
    cost_usd: float | None = Field(default=None, ge=0)
    step_count: int = Field(default=0, ge=0)

    @field_validator("run_id", "parent_run_id")
    @classmethod
    def validate_uuid(cls, v: str | None) -> str | None:
        # None is allowed (optional parent); otherwise the value must match _UUID_RE.
        if v is not None and not _UUID_RE.match(v):
            raise ValueError("必須為標準 UUID 格式")
        return v

    @field_validator("input_sha256", "output_sha256")
    @classmethod
    def validate_sha256_fields(cls, v: str | None) -> str | None:
        # Delegates to the shared digest check with its default field name.
        return _validate_sha256(v)

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        # Same format rule as ProjectTenantContract.project_id.
        if _PROJECT_ID_RE.match(v) is None:
            raise ValueError("project_id 格式不合法")
        return v
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 6. Channel Event Contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class AttachmentRef(BaseModel):
    # Reference to an attachment of a channel event. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    # attachment_type is restricted by pattern to photo/document/audio/video.
    attachment_type: str = Field(..., pattern="^(photo|document|audio|video)$")
    file_id: str
    sha256: str | None = None

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str | None) -> str | None:
        # None passes through; anything else must be a 64-char hex digest.
        return _validate_sha256(v, field_name="attachment sha256")
|
||||
|
||||
|
||||
class ChannelEventContract(BaseModel):
    """Channel event contract (ADR-114 idempotent de-duplication)."""

    # Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    event_id: str = Field(..., description="Platform 生成的 UUID")
    project_id: str
    channel_type: ChannelType
    event_type: EventType
    provider_event_id: str | None = Field(default=None, max_length=256)
    user_id: str | None = None
    chat_id: str | None = None
    # payload must contain at least one key.
    payload: dict[str, Any] = Field(..., min_length=1)
    text: str | None = Field(default=None, max_length=4096)
    attachments: list[AttachmentRef] = Field(default_factory=list)
    run_id: str | None = None
    is_duplicate: bool = False
    received_at: datetime

    @field_validator("event_id", "run_id")
    @classmethod
    def validate_uuid(cls, v: str | None) -> str | None:
        # None is allowed (optional run_id); otherwise the value must match _UUID_RE.
        if v is not None and not _UUID_RE.match(v):
            raise ValueError("必須為標準 UUID 格式")
        return v

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        # Same format rule as ProjectTenantContract.project_id.
        if _PROJECT_ID_RE.match(v) is None:
            raise ValueError("project_id 格式不合法")
        return v
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Contract family dispatcher
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Maps each contract family name to the model class that validates its body.
CONTRACT_FAMILY_MODELS: dict[str, type[BaseModel]] = {
    "project_tenant": ProjectTenantContract,
    "agent": AgentContract,
    "mcp_gateway": MCPGatewayContract,
    "policy_routing": PolicyRoutingContract,
    "runtime_run_state": RuntimeRunStateContract,
    "channel_event": ChannelEventContract,
}

# Immutable set of the accepted family names (iterating a dict yields its keys).
VALID_CONTRACT_FAMILIES = frozenset(CONTRACT_FAMILY_MODELS)
|
||||
|
||||
|
||||
def validate_contract_body(family: str, body: dict[str, Any]) -> BaseModel:
    """
    Validate *body* against the model registered for *family*.

    Args:
        family: Contract family name; must be a key of CONTRACT_FAMILY_MODELS.
        body: Raw contract body to validate.

    Returns:
        The validated model instance.

    Raises:
        ValueError: If *family* is not a known contract family.
        pydantic.ValidationError: If *body* fails model validation.
    """
    if family not in CONTRACT_FAMILY_MODELS:
        raise ValueError(
            f"未知 contract_family: {family!r}。"
            f"合法值:{sorted(VALID_CONTRACT_FAMILIES)}"
        )
    contract_model = CONTRACT_FAMILY_MODELS[family]
    return contract_model.model_validate(body)
|
||||
@@ -29,7 +29,7 @@ from __future__ import annotations
|
||||
from prometheus_client import Histogram
|
||||
|
||||
# Buckets 對齊 NIM 實測分佈(2-27s),並覆蓋三段 timeout 30/20/15s 邊界
|
||||
# 低端(0.5-5s):快速路徑(Ollama 188 本地)
|
||||
# 低端(0.5-5s):快速路徑(Ollama provider pool)
|
||||
# 中端(5-20s):NIM + Gemini fallback
|
||||
# 高端(20-60s):超時 / 慢速 Provider
|
||||
_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0]
|
||||
|
||||
136
apps/api/src/plugins/mcp/credential_resolver.py
Normal file
136
apps/api/src/plugins/mcp/credential_resolver.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
MCP Credential Resolver — k8s Secret 參照解析
|
||||
=============================================
|
||||
AwoooP Phase 5.5: ADR-118 Credential Isolation
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
設計原則(2026-04-18 Secret Leak 事故教訓):
|
||||
- 明文 credential 絕不進入 audit log / LLM context
|
||||
- Gateway 只傳 k8s secret ref(格式:"namespace/secret-name#key")
|
||||
- 真實 secret value 在記憶體中短暫存在,使用後立刻清除
|
||||
- 回傳給 caller 時只提供「遮罩版」(前 4 字元 + *** + 後 4 字元)
|
||||
- sha256(actual_value) 記入 awooop_mcp_credential_refs.value_sha256(指紋,不可還原)
|
||||
|
||||
k8s secret ref 格式:
|
||||
"namespace/secret-name#key"
|
||||
例:"awoooi/telegram-bot#TELEGRAM_BOT_TOKEN"
|
||||
|
||||
解析方式(兩種,依環境):
|
||||
1. k8s in-cluster:使用 kubernetes asyncclient(prod)
|
||||
2. 本機開發 fallback:讀 AWOOOP_DEV_SECRETS_JSON 環境變數(dev only)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# k8s secret ref 格式正則(與 DB CHECK 一致)
|
||||
_K8S_REF_RE = re.compile(r"^([a-z0-9-]+)/([a-z0-9-]+)#([a-zA-Z0-9_-]+)$")
|
||||
|
||||
# dev fallback:JSON 格式 {"namespace/secret-name#key": "actual_value"}
|
||||
_DEV_SECRETS_ENV = "AWOOOP_DEV_SECRETS_JSON"
|
||||
|
||||
|
||||
class CredentialResolutionError(Exception):
    """Raised when a secret reference is malformed or cannot be resolved."""

    # Gateway error code attached to every credential resolution failure.
    error_code = "E-MCP-GATE-009"
|
||||
|
||||
|
||||
def _mask_secret(value: str) -> str:
    """Return a masked form of *value* that is safe for logs and responses.

    Long secrets are shown as first 4 chars + "***" + last 4 chars. Secrets
    shorter than 16 characters are fully masked as "***": revealing 8 characters
    of such a value would expose most of it (an 8-character secret would be
    shown in full).

    Args:
        value: The plaintext secret.

    Returns:
        The masked string; never the complete plaintext.
    """
    shown = 4
    # Reveal prefix/suffix only when at least as many characters stay hidden
    # as are shown (2 * shown visible, >= 2 * shown hidden).
    if len(value) < 4 * shown:
        return "***"
    return f"{value[:shown]}***{value[-shown:]}"
|
||||
|
||||
|
||||
def _sha256_secret(value: str) -> str:
    """Return the SHA-256 hex digest of *value* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(value.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
async def resolve_k8s_secret(ref: str) -> tuple[str, str, str]:
    """
    Resolve a k8s secret reference and return (actual_value, masked_value, sha256).

    actual_value: plaintext; the caller must discard it after use and never persist it.
    masked_value: masked form intended for logs / responses.
    sha256: hex fingerprint of the plaintext.

    Resolution order:
      1. If the AWOOOP_DEV_SECRETS_JSON environment variable is set, look the ref up
         in that JSON object (dev fallback).
      2. Otherwise read the Secret through the in-cluster Kubernetes API.

    Args:
        ref: Secret reference in the form "namespace/secret-name#key".

    Raises:
        CredentialResolutionError: The ref is malformed, the secret or key does not
            exist, the dev JSON cannot be used, or the Kubernetes lookup fails.
    """
    # fullmatch: `$` with match() would also accept a ref ending in "\n".
    m = _K8S_REF_RE.fullmatch(ref)
    if not m:
        raise CredentialResolutionError(
            f"k8s secret ref 格式錯誤(期望 'namespace/secret-name#key'):{ref!r}"
        )

    namespace, secret_name, key = m.group(1), m.group(2), m.group(3)

    # Dev fallback: secrets supplied through an environment variable.
    dev_json = os.environ.get(_DEV_SECRETS_ENV)
    if dev_json:
        try:
            import json
            dev_secrets: dict[str, str] = json.loads(dev_json)
            value = dev_secrets.get(ref)
            if value is None:
                raise CredentialResolutionError(
                    f"dev secrets 中找不到 ref={ref!r}"
                )
            logger.debug("credential_resolved_dev", ref=ref)
            return value, _mask_secret(value), _sha256_secret(value)
        except CredentialResolutionError:
            raise
        except Exception as exc:
            raise CredentialResolutionError(
                f"AWOOOP_DEV_SECRETS_JSON 解析失敗: {exc}"
            ) from exc

    # Production: in-cluster Kubernetes API.
    try:
        import base64

        from kubernetes_asyncio import client, config  # type: ignore[import]
        from kubernetes_asyncio.client import CoreV1Api  # type: ignore[import]

        # load_incluster_config() is a plain function in kubernetes_asyncio (only
        # load_kube_config is a coroutine); awaiting its None result raised TypeError
        # and made every production lookup fail.
        config.load_incluster_config()
        async with client.ApiClient() as api:
            v1 = CoreV1Api(api)
            secret = await v1.read_namespaced_secret(secret_name, namespace)

        if secret.data is None or key not in secret.data:
            raise CredentialResolutionError(
                f"k8s secret '{namespace}/{secret_name}' 中找不到 key='{key}'"
            )

        # Secret data values are base64-encoded.
        encoded = secret.data[key]
        value = base64.b64decode(encoded).decode()

        logger.info(
            "credential_resolved_k8s",
            namespace=namespace,
            secret_name=secret_name,
            key=key,
            masked=_mask_secret(value),
        )
        return value, _mask_secret(value), _sha256_secret(value)

    except CredentialResolutionError:
        raise
    except ImportError as exc:
        # Chain the cause so the missing-module detail is not lost.
        raise CredentialResolutionError(
            "kubernetes_asyncio 未安裝,且未設定 AWOOOP_DEV_SECRETS_JSON(dev fallback)"
        ) from exc
    except Exception as exc:
        logger.exception(
            "credential_resolution_k8s_failed",
            ref=ref,
            error=str(exc),
        )
        raise CredentialResolutionError(
            f"k8s secret 解析失敗({namespace}/{secret_name}#{key}): {exc}"
        ) from exc
|
||||
502
apps/api/src/plugins/mcp/gateway.py
Normal file
502
apps/api/src/plugins/mcp/gateway.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
MCP Gateway — 五閘門 Enforcement Service
|
||||
=========================================
|
||||
AwoooP Phase 5.2: ADR-116 五閘門強制執行
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
五閘門定義(依序,任一失敗即阻斷):
|
||||
Gate 1 — Project:project_id 在 awooop_projects 且 migration_mode != 'legacy_awoooi_default'
|
||||
Gate 2 — Agent:agent_id 在 awooop_agents 且 status = 'active'
|
||||
Gate 3 — Tool:tool_id 在 awooop_mcp_tool_registry 且 grant 存在且未到期
|
||||
Gate 4 — Environment:tool.environment_tags 與 run context 匹配(shadow mode 強制放行)
|
||||
Gate 5 — Approval:工具 scope 需要 approval 時,檢查 multi_sig 是否已核准
|
||||
|
||||
錯誤碼(E-MCP-GATE-XXX):
|
||||
E-MCP-GATE-001 Gate 1 project 不存在或 migration_mode 不符
|
||||
E-MCP-GATE-002 Gate 2 agent 不存在或未啟用
|
||||
E-MCP-GATE-003 Gate 3 tool 不在白名單或 grant 不存在/已到期/已撤銷
|
||||
E-MCP-GATE-004 Gate 4 environment 標籤不匹配(非 shadow mode)
|
||||
E-MCP-GATE-005 Gate 5 approval 尚未取得
|
||||
E-MCP-GATE-009 credential 解析失敗(k8s secret 取不到)
|
||||
|
||||
使用方式:
|
||||
from src.plugins.mcp.gateway import McpGateway, GatewayContext
|
||||
|
||||
ctx = GatewayContext(
|
||||
project_id="awoooi",
|
||||
agent_id="my-agent",
|
||||
tool_name="kubectl_get",
|
||||
run_id=run_id,
|
||||
trace_id=trace_id,
|
||||
is_shadow=True,
|
||||
)
|
||||
result = await McpGateway(db).call(ctx, parameters={"namespace": "default"})
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.awooop_models import (
|
||||
AwoooPActiveRevision,
|
||||
AwoooPMcpGatewayAudit,
|
||||
AwoooPMcpGrant,
|
||||
AwoooPMcpToolRegistry,
|
||||
AwoooPProject,
|
||||
)
|
||||
from src.plugins.mcp.interfaces import MCPToolResult
|
||||
from src.plugins.mcp.registry import get_provider_registry
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 錯誤定義
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class McpGatewayError(Exception):
    """Base class for every error raised when the gateway blocks a call.

    Attributes:
        error_code: Code string identifying the failure, e.g. ``"E-MCP-GATE-001"``.
        gate: Number of the gate reported as the blocker.
    """

    def __init__(self, error_code: str, message: str, gate: int) -> None:
        self.error_code = error_code
        self.gate = gate
        # The human-readable message becomes the exception's str() value.
        super().__init__(message)
|
||||
|
||||
|
||||
class GateProjectError(McpGatewayError):
    """Gate 1 rejection; carries error code ``E-MCP-GATE-001`` and ``gate=1``."""

    def __init__(self, msg: str = "project 不存在或 migration_mode 不符") -> None:
        super().__init__(error_code="E-MCP-GATE-001", message=msg, gate=1)
|
||||
|
||||
|
||||
class GateAgentError(McpGatewayError):
    """Gate 2 rejection; carries error code ``E-MCP-GATE-002`` and ``gate=2``."""

    def __init__(self, msg: str = "agent 不存在或未啟用") -> None:
        super().__init__(error_code="E-MCP-GATE-002", message=msg, gate=2)
|
||||
|
||||
|
||||
class GateToolError(McpGatewayError):
    """Gate 3 rejection; carries error code ``E-MCP-GATE-003`` and ``gate=3``."""

    def __init__(self, msg: str = "tool 不在白名單或 grant 失效") -> None:
        super().__init__(error_code="E-MCP-GATE-003", message=msg, gate=3)
|
||||
|
||||
|
||||
class GateEnvironmentError(McpGatewayError):
    """Gate 4 rejection; carries error code ``E-MCP-GATE-004`` and ``gate=4``."""

    def __init__(self, msg: str = "environment 標籤不匹配") -> None:
        super().__init__(error_code="E-MCP-GATE-004", message=msg, gate=4)
|
||||
|
||||
|
||||
class GateApprovalError(McpGatewayError):
    """Gate 5 rejection; carries error code ``E-MCP-GATE-005`` and ``gate=5``."""

    def __init__(self, msg: str = "approval 尚未取得") -> None:
        super().__init__(error_code="E-MCP-GATE-005", message=msg, gate=5)
|
||||
|
||||
|
||||
class CredentialResolutionError(McpGatewayError):
    """Credential resolution failure; error code ``E-MCP-GATE-009``.

    Reported with ``gate=0`` because it is not tied to one of the five gates.
    """

    def __init__(self, msg: str = "credential 解析失敗") -> None:
        super().__init__(error_code="E-MCP-GATE-009", message=msg, gate=0)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Gateway Context(每次 call 一個)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class GatewayContext:
|
||||
project_id: str
|
||||
agent_id: str
|
||||
tool_name: str
|
||||
run_id: UUID | None = None
|
||||
trace_id: str | None = None
|
||||
is_shadow: bool = True # shadow mode:Gate 4/5 放行,不執行 destructive
|
||||
environment: dict[str, str] = field(default_factory=dict) # e.g. {"env": "prod"}
|
||||
required_scope: str = "read" # "read" | "write" | "admin"
|
||||
|
||||
|
||||
@dataclass
class GateCheckResult:
    """Pass/fail flag for each of the five gates; every flag starts as False."""

    gate1_project: bool = False
    gate2_agent: bool = False
    gate3_tool: bool = False
    gate4_env: bool = False
    gate5_approval: bool = False

    def as_dict(self) -> dict[str, bool]:
        """Return the five flags keyed by field name, in gate order."""
        flag_names = (
            "gate1_project",
            "gate2_agent",
            "gate3_tool",
            "gate4_env",
            "gate5_approval",
        )
        return {flag: getattr(self, flag) for flag in flag_names}

    @property
    def all_passed(self) -> bool:
        """True only when every one of the five gate flags is truthy."""
        return all(self.as_dict().values())
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# McpGateway
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class McpGateway:
|
||||
"""
|
||||
MCP Gateway:五閘門 enforcement + audit log + credential isolation。
|
||||
|
||||
每個 gateway call 都寫一筆 awooop_mcp_gateway_audit。
|
||||
"""
|
||||
|
||||
def __init__(self, db: AsyncSession) -> None:
|
||||
self._db = db
|
||||
|
||||
async def call(
|
||||
self,
|
||||
ctx: GatewayContext,
|
||||
parameters: dict[str, Any],
|
||||
) -> MCPToolResult:
|
||||
"""
|
||||
執行五閘門檢查後呼叫底層 MCP provider。
|
||||
任一閘門失敗 → raise McpGatewayError + 寫 blocked audit。
|
||||
"""
|
||||
started = time.monotonic()
|
||||
gate_result = GateCheckResult()
|
||||
tool_row: AwoooPMcpToolRegistry | None = None
|
||||
grant_row: AwoooPMcpGrant | None = None
|
||||
|
||||
try:
|
||||
# Gate 1 — Project
|
||||
tool_row, grant_row = await self._gate1_project(ctx, gate_result)
|
||||
|
||||
# Gate 2 — Agent
|
||||
await self._gate2_agent(ctx, gate_result)
|
||||
|
||||
# Gate 3 — Tool + Grant
|
||||
tool_row, grant_row = await self._gate3_tool(ctx, gate_result)
|
||||
|
||||
# Gate 4 — Environment(shadow mode 直接放行)
|
||||
await self._gate4_environment(ctx, tool_row, gate_result)
|
||||
|
||||
# Gate 5 — Approval(shadow mode + scope=read 直接放行)
|
||||
await self._gate5_approval(ctx, grant_row, gate_result)
|
||||
|
||||
except McpGatewayError as exc:
|
||||
latency = int((time.monotonic() - started) * 1000)
|
||||
await self._write_audit(
|
||||
ctx=ctx,
|
||||
tool_row=tool_row,
|
||||
parameters=parameters,
|
||||
result=None,
|
||||
gate_result=gate_result,
|
||||
result_status="blocked",
|
||||
block_gate=exc.gate,
|
||||
block_reason=f"{exc.error_code}: {exc}",
|
||||
latency_ms=latency,
|
||||
)
|
||||
raise
|
||||
|
||||
# 五閘通過 → 執行 tool
|
||||
result: MCPToolResult | None = None
|
||||
result_status = "failed"
|
||||
try:
|
||||
result = await self._execute_tool(ctx, tool_row, parameters)
|
||||
result_status = "success" if result.success else "failed"
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception(
|
||||
"mcp_gateway_execution_error",
|
||||
project_id=ctx.project_id,
|
||||
tool_name=ctx.tool_name,
|
||||
error=str(exc),
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
latency = int((time.monotonic() - started) * 1000)
|
||||
await self._write_audit(
|
||||
ctx=ctx,
|
||||
tool_row=tool_row,
|
||||
parameters=parameters,
|
||||
result=result,
|
||||
gate_result=gate_result,
|
||||
result_status=result_status,
|
||||
block_gate=None,
|
||||
block_reason=None,
|
||||
latency_ms=latency,
|
||||
)
|
||||
|
||||
# ── 五閘門實作 ────────────────────────────────────────────────────────────
|
||||
|
||||
async def _gate1_project(
|
||||
self, ctx: GatewayContext, gate_result: GateCheckResult
|
||||
) -> tuple[AwoooPMcpToolRegistry | None, AwoooPMcpGrant | None]:
|
||||
"""Gate 1:project 必須存在且 migration_mode != 'legacy_awoooi_default'"""
|
||||
result = await self._db.execute(
|
||||
select(AwoooPProject).where(
|
||||
AwoooPProject.project_id == ctx.project_id,
|
||||
AwoooPProject.migration_mode != "legacy_awoooi_default",
|
||||
)
|
||||
)
|
||||
project = result.scalar_one_or_none()
|
||||
if project is None:
|
||||
raise GateProjectError(
|
||||
f"project '{ctx.project_id}' 不存在或 migration_mode=legacy_awoooi_default"
|
||||
)
|
||||
gate_result.gate1_project = True
|
||||
return None, None
|
||||
|
||||
async def _gate2_agent(
|
||||
self, ctx: GatewayContext, gate_result: GateCheckResult
|
||||
) -> None:
|
||||
"""Gate 2:agent 必須在 awooop_active_revisions 中有 active contract(family='agent')"""
|
||||
result = await self._db.execute(
|
||||
select(AwoooPActiveRevision).where(
|
||||
AwoooPActiveRevision.project_id == ctx.project_id,
|
||||
AwoooPActiveRevision.contract_family == "agent",
|
||||
AwoooPActiveRevision.contract_id == ctx.agent_id,
|
||||
)
|
||||
)
|
||||
active = result.scalar_one_or_none()
|
||||
if active is None:
|
||||
raise GateAgentError(
|
||||
f"agent '{ctx.agent_id}' 在 '{ctx.project_id}' 無 active contract"
|
||||
)
|
||||
gate_result.gate2_agent = True
|
||||
|
||||
async def _gate3_tool(
|
||||
self, ctx: GatewayContext, gate_result: GateCheckResult
|
||||
) -> tuple[AwoooPMcpToolRegistry, AwoooPMcpGrant]:
|
||||
"""Gate 3:tool 在白名單 + grant 有效(未到期、未撤銷)"""
|
||||
now = datetime.now(UTC)
|
||||
|
||||
# 查 tool registry
|
||||
tool_result = await self._db.execute(
|
||||
select(AwoooPMcpToolRegistry).where(
|
||||
AwoooPMcpToolRegistry.project_id == ctx.project_id,
|
||||
AwoooPMcpToolRegistry.tool_name == ctx.tool_name,
|
||||
AwoooPMcpToolRegistry.is_active.is_(True),
|
||||
)
|
||||
)
|
||||
tool_row = tool_result.scalar_one_or_none()
|
||||
if tool_row is None:
|
||||
raise GateToolError(f"tool '{ctx.tool_name}' 不在白名單")
|
||||
|
||||
# 查 grant(scope 必須包含 required_scope)
|
||||
grant_result = await self._db.execute(
|
||||
select(AwoooPMcpGrant).where(
|
||||
AwoooPMcpGrant.project_id == ctx.project_id,
|
||||
AwoooPMcpGrant.agent_id == ctx.agent_id,
|
||||
AwoooPMcpGrant.tool_id == tool_row.tool_id,
|
||||
AwoooPMcpGrant.is_revoked.is_(False),
|
||||
)
|
||||
)
|
||||
grant_row = grant_result.scalar_one_or_none()
|
||||
if grant_row is None:
|
||||
raise GateToolError(
|
||||
f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 無有效 grant"
|
||||
)
|
||||
if grant_row.expires_at is not None and grant_row.expires_at < now:
|
||||
raise GateToolError(
|
||||
f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 的 grant 已到期"
|
||||
)
|
||||
# scope 檢查:required_scope 必須在 granted_scopes 中
|
||||
granted_scopes: list[str] = grant_row.granted_scopes or []
|
||||
if ctx.required_scope not in granted_scopes:
|
||||
raise GateToolError(
|
||||
f"grant 未包含所需 scope '{ctx.required_scope}'(有:{granted_scopes})"
|
||||
)
|
||||
|
||||
gate_result.gate3_tool = True
|
||||
return tool_row, grant_row
|
||||
|
||||
async def _gate4_environment(
|
||||
self,
|
||||
ctx: GatewayContext,
|
||||
tool_row: AwoooPMcpToolRegistry | None,
|
||||
gate_result: GateCheckResult,
|
||||
) -> None:
|
||||
"""Gate 4:environment 標籤匹配(shadow mode 強制放行)"""
|
||||
if ctx.is_shadow:
|
||||
gate_result.gate4_env = True
|
||||
return
|
||||
|
||||
if tool_row is None:
|
||||
gate_result.gate4_env = True
|
||||
return
|
||||
|
||||
required_tags: dict[str, str] = tool_row.environment_tags or {}
|
||||
for k, v in required_tags.items():
|
||||
if ctx.environment.get(k) != v:
|
||||
raise GateEnvironmentError(
|
||||
f"environment tag '{k}' 期望 '{v}',實際 '{ctx.environment.get(k)}'"
|
||||
)
|
||||
gate_result.gate4_env = True
|
||||
|
||||
async def _gate5_approval(
|
||||
self,
|
||||
ctx: GatewayContext,
|
||||
grant_row: AwoooPMcpGrant | None,
|
||||
gate_result: GateCheckResult,
|
||||
) -> None:
|
||||
"""Gate 5:需要 approval 時,檢查 Redis multi_sig(shadow + read scope 直接放行)"""
|
||||
# shadow mode 或 read scope 不需 approval
|
||||
if ctx.is_shadow or ctx.required_scope == "read":
|
||||
gate_result.gate5_approval = True
|
||||
return
|
||||
|
||||
# write/admin scope 需要檢查 approval
|
||||
if ctx.run_id is None:
|
||||
raise GateApprovalError("write/admin 操作需要 run_id(approval 追蹤用)")
|
||||
|
||||
try:
|
||||
redis = get_redis()
|
||||
approval_key = f"mcp_approval:{ctx.project_id}:{ctx.agent_id}:{ctx.tool_name}:{ctx.run_id}"
|
||||
approved = await redis.get(approval_key)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"mcp_gate5_redis_error",
|
||||
project_id=ctx.project_id,
|
||||
tool_name=ctx.tool_name,
|
||||
error=str(exc),
|
||||
)
|
||||
# Redis 失敗時 fail-closed(不放行)
|
||||
raise GateApprovalError(f"approval Redis 查詢失敗: {exc}") from exc
|
||||
|
||||
if not approved:
|
||||
raise GateApprovalError(
|
||||
f"tool '{ctx.tool_name}' 需要 approval(key={approval_key})"
|
||||
)
|
||||
gate_result.gate5_approval = True
|
||||
|
||||
# ── 執行層 ───────────────────────────────────────────────────────────────
|
||||
|
||||
async def _execute_tool(
|
||||
self,
|
||||
ctx: GatewayContext,
|
||||
tool_row: AwoooPMcpToolRegistry | None,
|
||||
parameters: dict[str, Any],
|
||||
) -> MCPToolResult:
|
||||
"""呼叫底層 MCP provider 執行工具"""
|
||||
registry = get_provider_registry()
|
||||
provider = registry.get(ctx.tool_name) or registry.get(
|
||||
tool_row.tool_name if tool_row else ctx.tool_name
|
||||
)
|
||||
|
||||
# 找不到 provider → 回傳 shadow no-op
|
||||
if provider is None:
|
||||
logger.warning(
|
||||
"mcp_gateway_no_provider",
|
||||
tool_name=ctx.tool_name,
|
||||
is_shadow=ctx.is_shadow,
|
||||
)
|
||||
return MCPToolResult(
|
||||
success=True,
|
||||
execution_id=f"shadow-noop-{ctx.tool_name}",
|
||||
output={"shadow": True, "message": "no provider registered, shadow no-op"},
|
||||
)
|
||||
|
||||
audit_params = dict(parameters)
|
||||
audit_params["_mcp_audit"] = {
|
||||
"project_id": ctx.project_id,
|
||||
"agent_id": ctx.agent_id,
|
||||
"run_id": str(ctx.run_id) if ctx.run_id else None,
|
||||
"trace_id": ctx.trace_id,
|
||||
}
|
||||
return await provider.execute(ctx.tool_name, audit_params)
|
||||
|
||||
# ── Audit log ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def _write_audit(
|
||||
self,
|
||||
*,
|
||||
ctx: GatewayContext,
|
||||
tool_row: AwoooPMcpToolRegistry | None,
|
||||
parameters: dict[str, Any],
|
||||
result: MCPToolResult | None,
|
||||
gate_result: GateCheckResult,
|
||||
result_status: str,
|
||||
block_gate: int | None,
|
||||
block_reason: str | None,
|
||||
latency_ms: int,
|
||||
) -> None:
|
||||
"""寫 awooop_mcp_gateway_audit — 只寫 hash,不寫明文 input/output"""
|
||||
try:
|
||||
input_hash = hashlib.sha256(
|
||||
json.dumps(parameters, sort_keys=True, default=str).encode()
|
||||
).hexdigest()
|
||||
|
||||
output_hash: str | None = None
|
||||
if result is not None:
|
||||
output_hash = hashlib.sha256(
|
||||
json.dumps(result.output, sort_keys=True, default=str).encode()
|
||||
).hexdigest()
|
||||
|
||||
audit = AwoooPMcpGatewayAudit(
|
||||
project_id=ctx.project_id,
|
||||
run_id=ctx.run_id,
|
||||
trace_id=ctx.trace_id,
|
||||
agent_id=ctx.agent_id,
|
||||
tool_id=tool_row.tool_id if tool_row else None, # type: ignore[arg-type]
|
||||
tool_name=ctx.tool_name,
|
||||
input_hash=input_hash,
|
||||
output_hash=output_hash,
|
||||
gate_result=gate_result.as_dict(),
|
||||
result_status=result_status,
|
||||
block_gate=block_gate,
|
||||
block_reason=block_reason,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
self._db.add(audit)
|
||||
await self._db.flush()
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"mcp_gateway_audit_write_failed",
|
||||
project_id=ctx.project_id,
|
||||
tool_name=ctx.tool_name,
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 便捷函數
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def gateway_call(
    db: AsyncSession,
    *,
    project_id: str,
    agent_id: str,
    tool_name: str,
    parameters: dict[str, Any],
    run_id: UUID | None = None,
    trace_id: str | None = None,
    is_shadow: bool = True,
    required_scope: str = "read",
    environment: dict[str, str] | None = None,
) -> MCPToolResult:
    """Build a ``GatewayContext`` from keyword arguments and run one gateway call.

    A falsy ``environment`` (``None`` or empty) is replaced by a fresh empty
    dict. Returns whatever ``McpGateway(db).call`` returns and lets its
    exceptions propagate.
    """
    env_tags = environment or {}
    call_ctx = GatewayContext(
        project_id=project_id,
        agent_id=agent_id,
        tool_name=tool_name,
        run_id=run_id,
        trace_id=trace_id,
        is_shadow=is_shadow,
        required_scope=required_scope,
        environment=env_tags,
    )
    gateway = McpGateway(db)
    return await gateway.call(call_ctx, parameters)
|
||||
@@ -14,6 +14,7 @@ from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
@@ -29,7 +30,9 @@ class MCPTool:
|
||||
name: str
|
||||
description: str
|
||||
input_schema: dict[str, Any]
|
||||
server_name: str
|
||||
# 2026-05-06 Codex: 部分舊 provider 的 list_tools() 尚未傳 server_name。
|
||||
# 先給 DTO 預設值,registry 會以 provider.name 補正,避免啟動登記直接 crash。
|
||||
server_name: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -41,12 +44,21 @@ class MCPToolResult:
|
||||
"""
|
||||
|
||||
success: bool
|
||||
execution_id: str
|
||||
execution_id: str = ""
|
||||
output: Any | None = None
|
||||
# 2026-05-06 Codex: 舊 provider 曾使用 data=... 作為成功輸出欄位。
|
||||
# 保留 alias,避免 provider 成功路徑因 dataclass 參數不相容而 crash。
|
||||
data: Any | None = None
|
||||
error: str | None = None
|
||||
duration: float = 0.0
|
||||
timestamp: datetime = field(default_factory=now_taipei)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if not self.execution_id:
|
||||
self.execution_id = f"mcp-{uuid4()}"
|
||||
if self.output is None and self.data is not None:
|
||||
self.output = self.data
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"success": self.success,
|
||||
|
||||
@@ -23,6 +23,8 @@ from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from src.core.config import settings # P0-13: K8s namespace 由 settings.AWOOOI_K8S_NAMESPACE 提供
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -517,6 +519,13 @@ class MCPBridge:
|
||||
raise ValueError(f"Unknown MCP Server: {server_name}")
|
||||
|
||||
server = self._servers[server_name]
|
||||
parameters = with_mcp_audit_context(
|
||||
parameters,
|
||||
session_id=f"mcp_bridge:{execution_id}",
|
||||
flywheel_node="govern",
|
||||
agent_role="mcp_bridge",
|
||||
gateway_path="legacy_mcp_bridge",
|
||||
)
|
||||
result = await self._execute_tool(server, tool_name, parameters)
|
||||
|
||||
# ========================================
|
||||
@@ -589,7 +598,7 @@ class MCPBridge:
|
||||
|
||||
if tool_name == "kubectl_get":
|
||||
# 使用 kubectl 指令查詢
|
||||
namespace = parameters.get("namespace", "awoooi-prod")
|
||||
namespace = parameters.get("namespace", settings.AWOOOI_K8S_NAMESPACE)
|
||||
resource = parameters.get("resource", "pods")
|
||||
name = parameters.get("name", "")
|
||||
cmd = f"kubectl get {resource} {name} -n {namespace} -o json".strip()
|
||||
@@ -599,7 +608,7 @@ class MCPBridge:
|
||||
return {"error": result.error}
|
||||
|
||||
elif tool_name == "kubectl_delete":
|
||||
namespace = parameters.get("namespace", "awoooi-prod")
|
||||
namespace = parameters.get("namespace", settings.AWOOOI_K8S_NAMESPACE)
|
||||
resource = parameters.get("resource", "pod")
|
||||
name = parameters.get("name", "")
|
||||
if not name:
|
||||
@@ -628,7 +637,7 @@ class MCPBridge:
|
||||
}
|
||||
|
||||
elif tool_name == "kubectl_scale":
|
||||
namespace = parameters.get("namespace", "awoooi-prod")
|
||||
namespace = parameters.get("namespace", settings.AWOOOI_K8S_NAMESPACE)
|
||||
deployment = parameters.get("deployment", "")
|
||||
replicas = parameters.get("replicas", 1)
|
||||
if not deployment:
|
||||
@@ -644,7 +653,7 @@ class MCPBridge:
|
||||
}
|
||||
|
||||
elif tool_name == "kubectl_restart":
|
||||
namespace = parameters.get("namespace", "awoooi-prod")
|
||||
namespace = parameters.get("namespace", settings.AWOOOI_K8S_NAMESPACE)
|
||||
deployment = parameters.get("deployment", "")
|
||||
if not deployment:
|
||||
return {"error": "Missing 'deployment' parameter"}
|
||||
@@ -678,7 +687,7 @@ class MCPBridge:
|
||||
if not service_name:
|
||||
return {"error": "Missing 'service_name' parameter"}
|
||||
|
||||
namespace = parameters.get("namespace", "awoooi-prod")
|
||||
namespace = parameters.get("namespace", settings.AWOOOI_K8S_NAMESPACE)
|
||||
time_window = parameters.get("time_window_minutes", 10)
|
||||
|
||||
metrics = await signoz.get_gold_metrics(
|
||||
|
||||
@@ -41,6 +41,7 @@ SSH 連線:
|
||||
@see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2a
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
@@ -51,6 +52,7 @@ import structlog
|
||||
from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
_asyncssh_logger_configured = False
|
||||
|
||||
# =============================================================================
|
||||
# 安全常數
|
||||
@@ -58,6 +60,7 @@ logger = structlog.get_logger(__name__)
|
||||
|
||||
SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
|
||||
SSH_USER = "wooo"
|
||||
SSH_PORT = 22
|
||||
DEFAULT_HOST_USERS = {
|
||||
# AI/Web host is operated by the ollama account in the current topology.
|
||||
"192.168.0.188": "ollama",
|
||||
@@ -104,6 +107,45 @@ def _validate_param(key: str, value: str) -> str:
|
||||
# tail / port / lines 由呼叫方 int() 轉換,不需字串白名單
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_ssh_host(value: str) -> str:
|
||||
"""
|
||||
Normalize host labels before they enter asyncssh.
|
||||
|
||||
Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the
|
||||
exporter port, not SSH. The SSH provider must connect to the host on the
|
||||
platform SSH port, otherwise asyncssh can receive a stringly port from
|
||||
config/labels and fail with ``%d format`` before the tool even runs.
|
||||
"""
|
||||
host = (value or "").strip()
|
||||
if host.startswith("ssh://"):
|
||||
host = host.removeprefix("ssh://")
|
||||
if "@" in host:
|
||||
host = host.rsplit("@", 1)[1]
|
||||
if host.startswith("[") and "]" in host:
|
||||
return host[1:host.index("]")]
|
||||
if host.count(":") == 1:
|
||||
maybe_host, maybe_port = host.rsplit(":", 1)
|
||||
if maybe_port.isdigit():
|
||||
return maybe_host
|
||||
return host
|
||||
|
||||
|
||||
def _quiet_asyncssh_info_logs() -> None:
    """Raise the ``asyncssh`` logger to WARNING, once per process.

    Per the original author's note: some SSH servers send the exit status as a
    string, and asyncssh then emits an INFO record formatted with ``%d``,
    which produces noisy ``TypeError: %d format`` tracebacks even though the
    tool result is still available. Silencing INFO avoids that noise.

    The module-level ``_asyncssh_logger_configured`` flag makes repeat calls
    a no-op.
    """
    global _asyncssh_logger_configured
    if not _asyncssh_logger_configured:
        logging.getLogger("asyncssh").setLevel(logging.WARNING)
        _asyncssh_logger_configured = True
|
||||
|
||||
# 群組 A(只讀)
|
||||
GROUP_A_TOOLS = {
|
||||
"ssh_diagnose",
|
||||
@@ -375,7 +417,7 @@ class SSHProvider(MCPToolProvider):
|
||||
error=f"Unknown tool: {tool_name}",
|
||||
)
|
||||
|
||||
host = parameters.get("host", "")
|
||||
host = _normalize_ssh_host(str(parameters.get("host", "")))
|
||||
|
||||
# 守衛 2: 允許的 host
|
||||
if host not in self._allowed_hosts():
|
||||
@@ -604,7 +646,9 @@ class SSHProvider(MCPToolProvider):
|
||||
raise RuntimeError(
|
||||
"asyncssh is not installed. "
|
||||
"Add 'asyncssh' to pyproject.toml dependencies."
|
||||
)
|
||||
) from None
|
||||
|
||||
_quiet_asyncssh_info_logs()
|
||||
|
||||
import os
|
||||
if not os.path.exists(SSH_KEY_PATH):
|
||||
@@ -625,11 +669,13 @@ class SSHProvider(MCPToolProvider):
|
||||
|
||||
async with asyncssh.connect(
|
||||
host,
|
||||
port=SSH_PORT,
|
||||
username=username or SSH_USER,
|
||||
client_keys=[SSH_KEY_PATH],
|
||||
known_hosts=known_hosts_path, # None = 跳過驗證(內網),或指定文件路徑
|
||||
connect_timeout=timeout,
|
||||
config=None, # 禁止讀取使用者 ssh config,避免 Port 字串污染 asyncssh
|
||||
connect_timeout=float(timeout),
|
||||
) as conn:
|
||||
# Bug 根因:asyncssh 模組沒有頂層 run();應呼叫 conn.run()(2026-04-24 Claude Sonnet 4.6)
|
||||
result = await conn.run(cmd, timeout=timeout, check=False)
|
||||
result = await conn.run(cmd, timeout=float(timeout), check=False)
|
||||
return (result.stdout or ""), (result.stderr or "")
|
||||
|
||||
159
apps/api/src/plugins/mcp/redaction_middleware.py
Normal file
159
apps/api/src/plugins/mcp/redaction_middleware.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
MCP Redaction Middleware — 雙層 PII/Secret Redaction
|
||||
=====================================================
|
||||
AwoooP Phase 5.3: ADR-116 P1-04 + P1-09
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
MCP tool call 的 input/output 必須經過雙層 redaction:
|
||||
Layer 1(audit_sink)— 寫入 audit log 前的 sanitization(欄位黑名單 + pattern 攔截)
|
||||
Layer 2(本層) — MCP tool call input/output 專用:
|
||||
- 移除已知 secret 欄位(_mcp_audit 注入的 context)
|
||||
- 對 output 套用 audit_sink 的完整 redaction patterns
|
||||
- 限制 output 大小(防 prompt stuffing)
|
||||
|
||||
設計原則(ADR-118 credential isolation 延伸):
|
||||
- MCP tool 的 output 可能含 k8s secret 值 → 必須在 output 進入 LLM context 前 redact
|
||||
- 只有「安全的」output 才能被 platform_runtime.shadow_execute 使用
|
||||
- input credential 欄位(如 k8s_value)在送入 provider 前清除(credential isolation)
|
||||
|
||||
雙層保障的必要性:
|
||||
- audit_sink 保護的是 audit log DB
|
||||
- 本 middleware 保護的是 LLM context + gateway audit hash
|
||||
- 兩者防護對象不同,不可互相替代
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
from src.services.audit_sink import _BLOCKED_FIELD_NAMES, _REDACTION_PATTERNS, _redact_string
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# MCP output 進入 LLM context 的最大字元數(防 prompt stuffing)
|
||||
_MCP_OUTPUT_MAX_CHARS = 16_000
|
||||
|
||||
# MCP gateway 注入的 audit context key(送 provider 前移除)
|
||||
_MCP_AUDIT_KEY = "_mcp_audit"
|
||||
|
||||
# MCP credential 欄位名稱(Gate 5 credential isolation — 在 input 中清除)
|
||||
_MCP_CREDENTIAL_FIELDS = frozenset({
|
||||
"k8s_value", "secret_value", "credential", "credential_value",
|
||||
"token_value", "api_key_value", "private_key_value",
|
||||
})
|
||||
|
||||
|
||||
def redact_mcp_input(parameters: dict[str, Any]) -> dict[str, Any]:
    """Return a redacted copy of MCP tool-call input parameters.

    Applied to every dict, at any nesting depth:

    1. The ``_mcp_audit`` key (injected audit context) is dropped.
    2. Keys listed in ``_MCP_CREDENTIAL_FIELDS`` keep their key, but the value
       is replaced with ``"[REDACTED:CREDENTIAL_ISOLATION]"``.
    3. Keys listed in ``_BLOCKED_FIELD_NAMES`` keep their key, but the value is
       replaced with ``"[REDACTED:BLOCKED_FIELD]"``.
    4. Every remaining string is passed through ``_redact_string``; dicts and
       lists are walked recursively, including lists nested inside lists.

    The input mapping is not modified. Key matching is case-insensitive for
    string keys; non-string keys are compared as-is.
    """

    def _clean_value(value: Any) -> Any:
        # Recursive walk so that lists inside lists are redacted as well.
        if isinstance(value, str):
            return _redact_string(value)
        if isinstance(value, dict):
            return redact_mcp_input(value)
        if isinstance(value, list):
            return [_clean_value(item) for item in value]
        return value

    cleaned: dict[str, Any] = {}
    for key, value in parameters.items():
        # Drop the injected audit context.
        if key == _MCP_AUDIT_KEY:
            continue

        # Non-string keys have no .lower(); compare them unchanged.
        field_name = key.lower() if isinstance(key, str) else key

        # Credential isolation: never forward credential plaintext.
        if field_name in _MCP_CREDENTIAL_FIELDS:
            cleaned[key] = "[REDACTED:CREDENTIAL_ISOLATION]"
            continue

        # Field-name blocklist shared with audit_sink.
        if field_name in _BLOCKED_FIELD_NAMES:
            cleaned[key] = "[REDACTED:BLOCKED_FIELD]"
            continue

        cleaned[key] = _clean_value(value)

    return cleaned
|
||||
|
||||
|
||||
def redact_mcp_output(output: Any) -> Any:
    """Return a redacted, size-limited version of an MCP tool output.

    * ``str``: passed through ``_redact_string``; when the redacted text is
      longer than ``_MCP_OUTPUT_MAX_CHARS`` it is cut and a ``[TRUNCATED:…]``
      marker carrying the original length is appended.
    * ``dict``: delegated to ``_redact_output_dict``.
    * ``list``: items are redacted recursively; once the running total of their
      serialised sizes exceeds ``_MCP_OUTPUT_MAX_CHARS`` a marker is appended
      and the remaining items are dropped.
    * anything else (including ``None``): returned unchanged.
    """
    if isinstance(output, str):
        safe_text = _redact_string(output)
        if len(safe_text) <= _MCP_OUTPUT_MAX_CHARS:
            return safe_text
        return safe_text[:_MCP_OUTPUT_MAX_CHARS] + f"\n[TRUNCATED:{len(output)} chars]"

    if isinstance(output, dict):
        return _redact_output_dict(output)

    if not isinstance(output, list):
        return output

    kept: list[Any] = []
    chars_used = 0
    for element in output:
        # The budget is checked before each item, so the item that crosses the
        # limit is still kept and the marker replaces the next one.
        if chars_used > _MCP_OUTPUT_MAX_CHARS:
            kept.append(f"[TRUNCATED:{len(output)} items total]")
            break
        safe_element = redact_mcp_output(element)
        chars_used += len(json.dumps(safe_element, ensure_ascii=False, default=str))
        kept.append(safe_element)
    return kept
|
||||
|
||||
|
||||
def _redact_output_dict(d: dict[str, Any], depth: int = 0) -> dict[str, Any]:
    """Recursively redact an output dict.

    Keys listed in ``_BLOCKED_FIELD_NAMES`` (case-insensitive for string keys)
    have their value replaced with ``"[REDACTED:BLOCKED_FIELD]"``. Strings go
    through ``_redact_string``; nested dicts and lists are walked recursively,
    including lists nested inside lists.

    Args:
        d: The dict to redact; it is not modified.
        depth: Current nesting level. Beyond level 8 a dict is replaced by
            ``{"[MAX_DEPTH]": True}`` and a list by ``["[MAX_DEPTH]"]``.

    Returns:
        A new, redacted dict.
    """
    if depth > 8:
        return {"[MAX_DEPTH]": True}

    def _clean_value(value: Any, level: int) -> Any:
        if isinstance(value, str):
            return _redact_string(value)
        if isinstance(value, dict):
            return _redact_output_dict(value, level)
        if isinstance(value, list):
            if level > 8:
                return ["[MAX_DEPTH]"]
            # Dicts and strings directly inside a list stay at this level;
            # only a list nested in a list counts as one level deeper.
            return [
                _clean_value(item, level + 1 if isinstance(item, list) else level)
                for item in value
            ]
        return value

    result: dict[str, Any] = {}
    for key, value in d.items():
        # Non-string keys have no .lower(); compare them unchanged.
        field_name = key.lower() if isinstance(key, str) else key
        if field_name in _BLOCKED_FIELD_NAMES:
            result[key] = "[REDACTED:BLOCKED_FIELD]"
            continue
        result[key] = _clean_value(value, depth + 1)

    return result
|
||||
|
||||
|
||||
def compute_safe_hash(data: Any) -> str:
    """Return the hex SHA-256 digest of ``data`` serialised as JSON.

    Serialisation sorts keys, keeps non-ASCII characters as-is, and falls back
    to ``str()`` for values JSON cannot encode. No redaction happens here; the
    caller is expected to pass data that has already been redacted.
    """
    payload = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
    digest = hashlib.sha256()
    digest.update(payload.encode())
    return digest.hexdigest()
|
||||
@@ -21,18 +21,20 @@ class AuditedMCPToolProvider(MCPToolProvider):
|
||||
"""Provider wrapper that writes every MCP tool call to the audit subsystem."""
|
||||
|
||||
def __init__(self, provider: MCPToolProvider) -> None:
|
||||
self._provider = provider
|
||||
# __provider 使用 Python name mangling(_AuditedMCPToolProvider__provider)
|
||||
# 防止 caller 透過 wrapper._provider 直接存取 inner provider(ADR-116 封裝要求)
|
||||
self.__provider = provider # noqa: SLF001 — intentional name mangling
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return self._provider.name
|
||||
return self.__provider.name
|
||||
|
||||
@property
|
||||
def enabled(self) -> bool:
|
||||
return self._provider.enabled
|
||||
return self.__provider.enabled
|
||||
|
||||
async def list_tools(self) -> list[MCPTool]:
|
||||
return await self._provider.list_tools()
|
||||
return await self.__provider.list_tools()
|
||||
|
||||
async def execute(
|
||||
self,
|
||||
@@ -49,7 +51,7 @@ class AuditedMCPToolProvider(MCPToolProvider):
|
||||
started = monotonic_ms()
|
||||
result: MCPToolResult | None = None
|
||||
try:
|
||||
result = await self._provider.execute(tool_name, provider_parameters)
|
||||
result = await self.__provider.execute(tool_name, provider_parameters)
|
||||
return result
|
||||
finally:
|
||||
duration_ms = monotonic_ms() - started
|
||||
@@ -68,7 +70,7 @@ class AuditedMCPToolProvider(MCPToolProvider):
|
||||
)
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
return await self._provider.health_check()
|
||||
return await self.__provider.health_check()
|
||||
|
||||
|
||||
class ProviderRegistry:
|
||||
|
||||
261
apps/api/src/repositories/contract_repository.py
Normal file
261
apps/api/src/repositories/contract_repository.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Contract Repository
|
||||
===================
|
||||
AwoooP Phase 3: contract revision CRUD(append-only)
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(ADR-107/ADR-112)
|
||||
|
||||
設計原則:
|
||||
- append-only:已 published 的 revision 不可修改
|
||||
- active pointer 以 UPSERT 維護(awooop_active_revisions)
|
||||
- outbox 事件在同一 transaction 寫入(ADR-113)
|
||||
- RLS 透過 get_db_context() 自動套用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import select, update
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
|
||||
from src.db.awooop_models import (
|
||||
AwoooPActiveRevision,
|
||||
AwoooPContractOutbox,
|
||||
AwoooPContractRevision,
|
||||
)
|
||||
from src.db.base import get_db_context
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Read
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def get_revision(
    revision_id: UUID,
    project_id: str = "awoooi",
) -> AwoooPContractRevision | None:
    """Fetch a single contract revision by its ID.

    The query runs in a session obtained from ``get_db_context(project_id)``
    and additionally filters on ``project_id`` explicitly, so a revision
    belonging to another project is never returned.

    Args:
        revision_id: Primary identifier of the revision to load.
        project_id: Project the revision must belong to.

    Returns:
        The matching revision, or ``None`` when no row matches.
    """
    lookup = select(AwoooPContractRevision).where(
        AwoooPContractRevision.revision_id == revision_id,
        AwoooPContractRevision.project_id == project_id,
    )
    async with get_db_context(project_id) as db:
        rows = await db.execute(lookup)
        return rows.scalar_one_or_none()
|
||||
|
||||
|
||||
async def get_active_revision(
    project_id: str,
    contract_family: str,
    contract_id: str,
) -> AwoooPContractRevision | None:
    """Load the revision the active pointer currently refers to (runtime path).

    The active-pointer table is joined to the revision table, and the
    revision is returned only if its ``lifecycle_status`` is ``'active'``.

    Args:
        project_id: Project owning the contract.
        contract_family: Contract family of the pointer to resolve.
        contract_id: Contract identifier of the pointer to resolve.

    Returns:
        The active revision, or ``None`` when there is no pointer or the
        pointed-to revision is not in the ``'active'`` state.
    """
    pointer_matches_revision = (
        AwoooPActiveRevision.active_revision_id == AwoooPContractRevision.revision_id
    )
    filters = (
        AwoooPActiveRevision.project_id == project_id,
        AwoooPActiveRevision.contract_family == contract_family,
        AwoooPActiveRevision.contract_id == contract_id,
        AwoooPContractRevision.lifecycle_status == "active",
    )
    stmt = (
        select(AwoooPContractRevision)
        .join(AwoooPActiveRevision, pointer_matches_revision)
        .where(*filters)
    )
    async with get_db_context(project_id) as db:
        rows = await db.execute(stmt)
        return rows.scalar_one_or_none()
|
||||
|
||||
|
||||
async def list_revisions(
    project_id: str,
    contract_family: str,
    contract_id: str,
    lifecycle_status: str | None = None,
) -> list[AwoooPContractRevision]:
    """List the revisions of one contract, newest version first.

    Args:
        project_id: Project owning the contract.
        contract_family: Contract family to list.
        contract_id: Contract identifier to list.
        lifecycle_status: Optional status filter; a falsy value (``None`` or
            an empty string) disables the filter.

    Returns:
        Revisions ordered by ``version_major`` then ``version_minor``,
        both descending.
    """
    criteria = [
        AwoooPContractRevision.project_id == project_id,
        AwoooPContractRevision.contract_family == contract_family,
        AwoooPContractRevision.contract_id == contract_id,
    ]
    if lifecycle_status:
        criteria.append(AwoooPContractRevision.lifecycle_status == lifecycle_status)

    stmt = (
        select(AwoooPContractRevision)
        .where(*criteria)
        .order_by(
            AwoooPContractRevision.version_major.desc(),
            AwoooPContractRevision.version_minor.desc(),
        )
    )
    async with get_db_context(project_id) as db:
        rows = await db.execute(stmt)
        return list(rows.scalars().all())
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Write(append-only)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def create_draft(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    version_major: int,
    version_minor: int,
    body_json: dict[str, Any],
    body_hash: str,
    body_schema_version: str = "v1.0",
) -> AwoooPContractRevision:
    """Insert a new revision in the ``'draft'`` lifecycle state.

    The row is flushed and refreshed so database-generated columns (such as
    ``revision_id``) are populated on the returned object. ``body_hash`` is
    stored as supplied by the caller; it is not recomputed here.

    Returns:
        The newly created draft revision.
    """
    draft_fields: dict[str, Any] = {
        "project_id": project_id,
        "contract_family": contract_family,
        "contract_id": contract_id,
        "version_major": version_major,
        "version_minor": version_minor,
        "lifecycle_status": "draft",
        "body_json": body_json,
        "body_hash": body_hash,
        "body_schema_version": body_schema_version,
    }
    async with get_db_context(project_id) as db:
        draft = AwoooPContractRevision(**draft_fields)
        db.add(draft)
        # Flush to emit the INSERT, then refresh to read back generated values.
        await db.flush()
        await db.refresh(draft)

        logger.info(
            "contract_draft_created",
            revision_id=str(draft.revision_id),
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
        )
        return draft
|
||||
|
||||
|
||||
async def mark_published(
    *,
    revision_id: UUID,
    project_id: str,
    publisher_id: str,
    publish_signature: str,
    published_at: Any,  # datetime
) -> AwoooPContractRevision:
    """Transition a revision from ``'draft'`` to ``'published'``.

    Intended to be called by the service layer after the HMAC signature has
    been verified. The UPDATE is guarded on ``lifecycle_status == 'draft'``,
    so an already published/active/revoked revision is never rewritten.

    Args:
        revision_id: Revision to publish.
        project_id: Project the revision must belong to.
        publisher_id: Identifier recorded as the publisher.
        publish_signature: Signature recorded with the publication.
        published_at: Publication timestamp (a ``datetime``).

    Returns:
        The revision as stored after the transition.

    Raises:
        sqlalchemy.exc.NoResultFound: If no draft revision with this ID
            exists in ``project_id`` (unknown ID, wrong project, or the
            revision is no longer a draft).
    """
    # Local import keeps this block self-contained; sqlalchemy is already a
    # dependency of this module.
    from sqlalchemy.exc import NoResultFound

    async with get_db_context(project_id) as db:
        outcome = await db.execute(
            update(AwoooPContractRevision)
            .where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
                AwoooPContractRevision.lifecycle_status == "draft",
            )
            .values(
                lifecycle_status="published",
                publisher_id=publisher_id,
                publish_signature=publish_signature,
                published_at=published_at,
            )
        )
        # Zero matched rows means nothing was published. Previously the
        # function still returned the row and logged success in that case.
        # Only an explicit 0 is treated as failure so drivers that report an
        # unknown rowcount (-1) keep the old behaviour.
        if outcome.rowcount == 0:
            logger.warning(
                "contract_publish_rejected",
                revision_id=str(revision_id),
                project_id=project_id,
                reason="no draft revision matched",
            )
            raise NoResultFound(
                f"No draft revision {revision_id} in project {project_id!r} to publish"
            )

        result = await db.execute(
            select(AwoooPContractRevision).where(
                AwoooPContractRevision.revision_id == revision_id,
                # Scope the read-back to the project, like every other query here.
                AwoooPContractRevision.project_id == project_id,
            )
        )
        revision = result.scalar_one()
        logger.info(
            "contract_published",
            revision_id=str(revision_id),
            project_id=project_id,
            publisher_id=publisher_id,
        )
        return revision
|
||||
|
||||
|
||||
async def mark_active(
    *,
    revision_id: UUID,
    project_id: str,
    contract_family: str,
    contract_id: str,
    old_revision_id: UUID | None,
) -> AwoooPContractRevision:
    """Promote a published revision to ``'active'`` and repoint the contract.

    All writes go through one ``get_db_context(project_id)`` session
    (transactional outbox, ADR-113):

    1. flip the revision from ``'published'`` to ``'active'``;
    2. upsert the active pointer row for the contract;
    3. add a ``contract.activated`` outbox row;
    4. if ``old_revision_id`` is given and still active, mark it ``'revoked'``.

    Args:
        revision_id: Revision to activate; must currently be ``'published'``.
        project_id: Project owning the contract.
        contract_family: Contract family of the pointer to update.
        contract_id: Contract identifier of the pointer to update.
        old_revision_id: Previously active revision, as supplied by the
            caller; recorded in the outbox event and revoked. It is not
            cross-checked against the current pointer here.

    Returns:
        The newly active revision.

    Raises:
        sqlalchemy.exc.NoResultFound: If no published revision with this ID
            exists in ``project_id``. Raised before the pointer, the outbox
            or the old revision are touched.
    """
    # Local import keeps this block self-contained; sqlalchemy is already a
    # dependency of this module.
    from sqlalchemy.exc import NoResultFound

    async with get_db_context(project_id) as db:
        # 1. Update the revision's lifecycle_status.
        promoted = await db.execute(
            update(AwoooPContractRevision)
            .where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
                AwoooPContractRevision.lifecycle_status == "published",
            )
            .values(lifecycle_status="active")
        )
        # If nothing was promoted, continuing would point the contract at a
        # non-active revision and revoke the old one, leaving the contract
        # with no readable active revision. Only an explicit 0 aborts so
        # drivers reporting an unknown rowcount (-1) keep the old behaviour.
        if promoted.rowcount == 0:
            logger.warning(
                "contract_activate_rejected",
                revision_id=str(revision_id),
                project_id=project_id,
                reason="no published revision matched",
            )
            raise NoResultFound(
                f"No published revision {revision_id} in project {project_id!r} to activate"
            )

        # 2. UPSERT the active pointer.
        stmt = pg_insert(AwoooPActiveRevision).values(
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
            active_revision_id=revision_id,
        )
        stmt = stmt.on_conflict_do_update(
            constraint="uq_active_pointer",
            set_={
                "active_revision_id": revision_id,
            },
        )
        await db.execute(stmt)

        # 3. Write the outbox event (ADR-113).
        outbox_event = AwoooPContractOutbox(
            event_type="contract.activated",
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
            old_revision_id=old_revision_id,
            new_revision_id=revision_id,
        )
        db.add(outbox_event)

        # 4. Revoke the previously active revision, if any.
        if old_revision_id:
            await db.execute(
                update(AwoooPContractRevision)
                .where(
                    AwoooPContractRevision.revision_id == old_revision_id,
                    # Never revoke a revision that belongs to another project.
                    AwoooPContractRevision.project_id == project_id,
                    AwoooPContractRevision.lifecycle_status == "active",
                )
                .values(lifecycle_status="revoked")
            )

        result = await db.execute(
            select(AwoooPContractRevision).where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
            )
        )
        revision = result.scalar_one()

        logger.info(
            "contract_activated",
            revision_id=str(revision_id),
            old_revision_id=str(old_revision_id) if old_revision_id else None,
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
        )
        return revision
|
||||
@@ -63,6 +63,7 @@ def _incident_to_record_data(incident: Incident) -> dict[str, Any]:
|
||||
|
||||
return {
|
||||
"incident_id": incident.incident_id,
|
||||
"project_id": getattr(incident, "project_id", "awoooi"), # AwoooP Phase 2.3
|
||||
"status": incident.status.value,
|
||||
"severity": incident.severity.value,
|
||||
"signals": [
|
||||
|
||||
@@ -274,7 +274,7 @@ class IKnowledgeRepository(Protocol):
|
||||
...
|
||||
|
||||
async def save_embedding(self, entry_id: str, embedding: list[float]) -> bool:
|
||||
"""儲存向量 embedding (768 維, pgvector)"""
|
||||
"""儲存向量 embedding (1024 維, pgvector, bge-m3:latest)"""
|
||||
...
|
||||
|
||||
async def semantic_search(
|
||||
|
||||
@@ -60,13 +60,17 @@ class MetricsDBRepository(IMetricsRepository):
|
||||
cutoff = datetime.now(UTC) - timedelta(hours=hours)
|
||||
|
||||
# Query: 統計 executed vs total (approved + executed + execution_failed)
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# approval_records.status 目前實際寫入的是大寫 enum
|
||||
# (APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED)。舊查詢只看
|
||||
# lowercase executed,導致 AI Success 在報表層永遠趨近 0。
|
||||
query = text("""
|
||||
SELECT
|
||||
COUNT(CASE WHEN status = 'executed' THEN 1 END) as executed_count,
|
||||
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) as executed_count,
|
||||
COUNT(*) as total_count
|
||||
FROM approval_records
|
||||
WHERE created_at >= :cutoff
|
||||
AND status IN ('approved', 'executed', 'execution_failed')
|
||||
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
|
||||
""")
|
||||
|
||||
result = await session.execute(query, {"cutoff": cutoff})
|
||||
@@ -127,11 +131,11 @@ class MetricsDBRepository(IMetricsRepository):
|
||||
trend_query = text("""
|
||||
SELECT
|
||||
date_trunc('hour', created_at) as hour_bucket,
|
||||
COUNT(CASE WHEN status = 'executed' THEN 1 END) * 100.0 /
|
||||
COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) * 100.0 /
|
||||
NULLIF(COUNT(*), 0) as hourly_rate
|
||||
FROM approval_records
|
||||
WHERE created_at >= :cutoff
|
||||
AND status IN ('approved', 'executed', 'execution_failed')
|
||||
AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED')
|
||||
GROUP BY hour_bucket
|
||||
ORDER BY hour_bucket DESC
|
||||
LIMIT :limit
|
||||
|
||||
@@ -23,7 +23,7 @@ class PlaybookEmbeddingRepository:
|
||||
Playbook Embedding Repository
|
||||
|
||||
職責: playbook_embeddings 表 CRUD
|
||||
使用 pgvector 儲存 nomic-embed-text 768 維向量
|
||||
使用 pgvector 儲存 bge-m3:latest 1024 維向量(ADR-110 2026-05-04 升級自 768 維)
|
||||
|
||||
Args:
|
||||
db: SQLAlchemy AsyncSession (DI 注入)
|
||||
@@ -47,7 +47,7 @@ class PlaybookEmbeddingRepository:
|
||||
|
||||
Args:
|
||||
playbook_id: Playbook ID
|
||||
embedding: 768 維浮點向量 (list[float])
|
||||
embedding: 1024 維浮點向量 (list[float]),bge-m3:latest
|
||||
alert_names: 索引時的 alert_names 快照
|
||||
keywords: 索引時的 keywords 快照
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ import structlog
|
||||
from sqlalchemy import select
|
||||
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.base import get_session_factory
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import PlaybookRecord
|
||||
from src.models.playbook import (
|
||||
Playbook,
|
||||
@@ -255,8 +255,7 @@ class PlaybookRepository:
|
||||
Phase 3.5:改用 PG 查詢,效率更高,資料更完整
|
||||
"""
|
||||
try:
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
async with get_db_context() as session:
|
||||
stmt = select(PlaybookRecord)
|
||||
if status is not None:
|
||||
stmt = stmt.where(PlaybookRecord.status == status.value)
|
||||
@@ -356,8 +355,7 @@ class PlaybookRepository:
|
||||
"""
|
||||
try:
|
||||
# 使用 SELECT FOR UPDATE 確保並行 update_stats 不會 lost update
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
async with get_db_context() as session:
|
||||
async with session.begin():
|
||||
stmt = (
|
||||
select(PlaybookRecord)
|
||||
@@ -411,8 +409,7 @@ class PlaybookRepository:
|
||||
async def find_by_source_incident(self, incident_id: str) -> list[Playbook]:
|
||||
"""根據來源 Incident ID 找 Playbook(從 PG 查詢)"""
|
||||
try:
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
async with get_db_context() as session:
|
||||
# PG JSONB contains 查詢
|
||||
stmt = select(PlaybookRecord).where(
|
||||
PlaybookRecord.source_incident_ids.contains([incident_id])
|
||||
@@ -529,10 +526,12 @@ class PlaybookRepository:
|
||||
try:
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
async with get_db_context(
|
||||
getattr(playbook, "project_id", "awoooi")
|
||||
) as session:
|
||||
stmt = pg_insert(PlaybookRecord).values(
|
||||
playbook_id=playbook.playbook_id,
|
||||
project_id=getattr(playbook, "project_id", "awoooi"), # AwoooP Phase 2.3
|
||||
name=playbook.name,
|
||||
description=playbook.description,
|
||||
status=playbook.status.value,
|
||||
@@ -600,8 +599,7 @@ class PlaybookRepository:
|
||||
async def _pg_get(self, playbook_id: str) -> Playbook | None:
|
||||
"""從 PostgreSQL 載入 Playbook"""
|
||||
try:
|
||||
factory = get_session_factory()
|
||||
async with factory() as session:
|
||||
async with get_db_context() as session:
|
||||
result = await session.get(PlaybookRecord, playbook_id)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
@@ -104,7 +104,7 @@ async def get_agent_thinking(
|
||||
) -> StreamingResponse:
|
||||
"""
|
||||
OpenClaw 思考軌跡 (SSE 串流)
|
||||
Phase 1.2: 真實串接 Ollama at 192.168.0.188:11434
|
||||
Phase 1.2: 真實串接設定中的 Ollama provider pool
|
||||
"""
|
||||
|
||||
async def generate_thinking_stream():
|
||||
|
||||
@@ -130,30 +130,49 @@ async def check_redis() -> Literal["up", "down"]:
|
||||
return "down"
|
||||
|
||||
|
||||
async def check_ollama() -> Literal["up", "down"]:
|
||||
async def check_ollama() -> Literal["up", "down", "degraded"]:
|
||||
"""
|
||||
Check Ollama service via /api/tags endpoint
|
||||
Check Ollama 三層容災狀態(primary → secondary → tertiary)
|
||||
|
||||
統帥鐵律: 真實 HTTP 請求,禁止假數據
|
||||
2026-05-04 ogt: 改為檢查三台(OLLAMA_URL / SECONDARY / FALLBACK),
|
||||
只要有任一台 up → "up";primary down 但 fallback up → "degraded";
|
||||
全部 down → "down"。反映 K8s 實際可用的 Ollama 路由狀態。
|
||||
"""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=HEALTH_CHECK_TIMEOUT) as client:
|
||||
response = await client.get(f"{settings.OLLAMA_URL}/api/tags")
|
||||
if response.status_code == 200:
|
||||
logger.debug("health_check_ollama", status="up")
|
||||
return "up"
|
||||
else:
|
||||
logger.warning(
|
||||
"health_check_ollama",
|
||||
status="down",
|
||||
status_code=response.status_code,
|
||||
)
|
||||
return "down"
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("health_check_ollama", status="down", reason="timeout")
|
||||
return "down"
|
||||
except Exception as e:
|
||||
logger.warning("health_check_ollama", status="down", error=str(e))
|
||||
urls = [
|
||||
(settings.OLLAMA_URL, "primary"),
|
||||
(getattr(settings, "OLLAMA_SECONDARY_URL", ""), "secondary"),
|
||||
(getattr(settings, "OLLAMA_FALLBACK_URL", ""), "tertiary"),
|
||||
]
|
||||
any_up = False
|
||||
primary_up = False
|
||||
|
||||
async with httpx.AsyncClient(timeout=HEALTH_CHECK_TIMEOUT) as client:
|
||||
for i, (url, label) in enumerate(urls):
|
||||
if not url:
|
||||
continue
|
||||
try:
|
||||
response = await client.get(f"{url}/api/tags")
|
||||
if response.status_code == 200:
|
||||
any_up = True
|
||||
if i == 0:
|
||||
primary_up = True
|
||||
logger.debug("health_check_ollama", status="up", tier=label, url=url)
|
||||
break # 找到第一台可用就停
|
||||
else:
|
||||
logger.debug("health_check_ollama_tier", tier=label, status_code=response.status_code)
|
||||
except (httpx.TimeoutException, httpx.ConnectError, httpx.NetworkError):
|
||||
logger.debug("health_check_ollama_tier", tier=label, status="unreachable")
|
||||
except Exception as e:
|
||||
logger.warning("health_check_ollama_tier", tier=label, error=str(e))
|
||||
|
||||
if primary_up:
|
||||
return "up"
|
||||
elif any_up:
|
||||
logger.warning("health_check_ollama", status="degraded", reason="primary down, fallback active")
|
||||
return "degraded"
|
||||
else:
|
||||
logger.warning("health_check_ollama", status="down", reason="all tiers unreachable")
|
||||
return "down"
|
||||
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
@@ -63,11 +63,25 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
def _agent_debate_global_timeout_seconds() -> float:
    """Return the full Phase 2 debate timeout, in seconds.

    GCP Ollama incident analysis can legitimately take longer than the old
    90s guard. Keep a hard ceiling, but make it an explicit deployment knob.

    The value is read from ``AGENT_DEBATE_GLOBAL_TIMEOUT_SEC``. Unparseable
    or non-finite values (``nan``, ``inf``, ``-inf``) fall back to the 420s
    default, and the result is never lower than 90s.

    Returns:
        A finite timeout of at least 90.0 seconds.
    """
    import math  # local import: only needed for the finiteness check

    default_timeout = 420.0
    minimum_timeout = 90.0

    raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "420.0")
    try:
        timeout = float(raw)
    except (TypeError, ValueError):
        timeout = default_timeout

    # float() accepts "nan" and "inf". max(nan, 90.0) evaluates to nan, and
    # inf would disable the ceiling altogether, so treat both as invalid.
    if not math.isfinite(timeout):
        timeout = default_timeout

    return max(timeout, minimum_timeout)
|
||||
|
||||
|
||||
# 全局超時(所有 Agent 加起來)
|
||||
# 2026-04-16 Claude Sonnet 4.6: deepseek-r1:14b 實測 2.2-27.3s avg 10.6s
|
||||
# 原 30s 對 3 個序列 Agent 每個只剩 10s → 頻繁 timeout → confidence=20%
|
||||
# 調整: 每 Agent 25s, 3個序列+1組並行 = 最差 75s + buffer = 90s
|
||||
GLOBAL_TIMEOUT_SEC = 90.0
|
||||
# 2026-05-06 Codex: configurable for GCP-A/GCP-B/111 Ollama-first incident
|
||||
# diagnosis. The old 90s guard was cutting off valid deep diagnosis runs.
|
||||
GLOBAL_TIMEOUT_SEC = _agent_debate_global_timeout_seconds()
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC
|
||||
# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
"""
|
||||
Ollama Provider - Phase 24 ADR-052
|
||||
====================================
|
||||
本地 LLM 推理 (192.168.0.188 VMware VM, CPU-only)
|
||||
本地 / 私有 LLM 推理 Provider。
|
||||
|
||||
搬移自: openclaw.py _call_ollama (L349-409)
|
||||
特性: 免費、隱私安全 (local)、但 CPU 慢 (~97s/30tokens for qwen2.5:7b)
|
||||
特性: 免費、隱私安全 (local)、可依 ADR-110 指向 GCP-A/GCP-B/111。
|
||||
|
||||
2026-04-02 ogt: Phase 24-A 從 openclaw.py 抽出
|
||||
"""
|
||||
@@ -19,13 +19,72 @@ import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.plugins.mcp.interfaces import MCPTool
|
||||
from src.services.ai_providers.interfaces import AIProvider, AIResult, is_provider_enabled_by_env
|
||||
from src.services.ai_providers.interfaces import (
|
||||
AIResult,
|
||||
is_provider_enabled_by_env,
|
||||
)
|
||||
from src.services.ai_providers.tool_schema import openai_tools_for_agent
|
||||
from src.services.model_registry import get_model_registry
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
_GCP_LIGHTWEIGHT_MODELS = {
|
||||
"gemma3:4b",
|
||||
}
|
||||
|
||||
|
||||
def _normalized_url(value: str | None) -> str:
    """Return ``value`` without trailing slashes; ``None`` or empty gives ``""``."""
    if not value:
        return ""
    return value.rstrip("/")
|
||||
|
||||
|
||||
def _is_gcp_alert_lane(endpoint_url: str) -> bool:
    """Return true for the CPU-only GCP-A/B synchronous alert lane.

    An endpoint is in the lane when, ignoring trailing slashes, it equals
    ``settings.OLLAMA_URL`` or ``settings.OLLAMA_SECONDARY_URL``.

    An empty endpoint is never in the lane. Without this guard an empty
    ``endpoint_url`` would match the empty string produced by an
    unconfigured ``OLLAMA_URL`` / ``OLLAMA_SECONDARY_URL``.
    """
    endpoint = _normalized_url(endpoint_url)
    if not endpoint:
        return False

    gcp_lane_urls = {
        _normalized_url(getattr(settings, "OLLAMA_URL", "")),
        _normalized_url(getattr(settings, "OLLAMA_SECONDARY_URL", "")),
    }
    return endpoint in gcp_lane_urls
|
||||
|
||||
|
||||
def _resolve_model_for_endpoint(
    *,
    requested_model: str,
    endpoint_url: str,
    context: dict | None,
) -> str:
    """Pick the model to actually run against ``endpoint_url``.

    Keeps non-diagnosis calls from polluting the GCP diagnosis lane. The
    requested model (whitespace-stripped) is returned unchanged when any of
    the following holds:

    - the endpoint is not the GCP alert lane;
    - ``context["allow_gcp_heavy_model"]`` is truthy;
    - the task type (``context["task_type"]``, else ``context["intent_hint"]``,
      lower-cased) is ``diagnose``, ``alert_deep`` or ``incident_diagnosis``;
    - the model is already listed in ``_GCP_LIGHTWEIGHT_MODELS``.

    Otherwise the call is coerced to ``settings.OLLAMA_HEALTH_CHECK_MODEL``
    (``"gemma3:4b"`` when unset or blank) and a warning is logged.
    """
    model_name = requested_model.strip()
    ctx = context or {}
    heavy_allowed = bool(ctx.get("allow_gcp_heavy_model"))
    task_type = str(ctx.get("task_type") or ctx.get("intent_hint") or "").lower()
    deep_diagnosis = task_type in {"diagnose", "alert_deep", "incident_diagnosis"}

    # Guard clauses: each one lets the requested model through untouched.
    if not _is_gcp_alert_lane(endpoint_url):
        return model_name
    if heavy_allowed or deep_diagnosis:
        return model_name
    if model_name in _GCP_LIGHTWEIGHT_MODELS:
        return model_name

    configured = str(getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "gemma3:4b")).strip()
    fallback_model = configured or "gemma3:4b"
    logger.warning(
        "ollama_gcp_non_diagnosis_model_coerced",
        endpoint=endpoint_url,
        requested_model=model_name,
        safe_model=fallback_model,
        task_type=task_type,
    )
    return fallback_model
|
||||
|
||||
|
||||
class OllamaProvider:
|
||||
"""
|
||||
@@ -74,11 +133,17 @@ class OllamaProvider:
|
||||
client = await self._get_client()
|
||||
|
||||
registry = get_model_registry()
|
||||
model_name = registry.get_model("ollama", "rca")
|
||||
endpoint_url = self._endpoint_url()
|
||||
requested_model = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
|
||||
model_name = _resolve_model_for_endpoint(
|
||||
requested_model=requested_model,
|
||||
endpoint_url=endpoint_url,
|
||||
context=context,
|
||||
)
|
||||
options = registry.get_provider_options("ollama")
|
||||
|
||||
# P0 2026-04-04 Claude Code: per-task timeout(Option C 分情境)
|
||||
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS (200s,實測 ~173s)
|
||||
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
|
||||
# 其他 → OPENCLAW_TIMEOUT(既有設定)
|
||||
task_type = (context or {}).get("task_type", "")
|
||||
if task_type in ("diagnose", "force_local"):
|
||||
@@ -87,7 +152,7 @@ class OllamaProvider:
|
||||
read_timeout = float(settings.OPENCLAW_TIMEOUT)
|
||||
|
||||
response = await client.post(
|
||||
f"{settings.OLLAMA_URL}/api/generate",
|
||||
f"{endpoint_url}/api/generate",
|
||||
json={
|
||||
"model": model_name,
|
||||
"prompt": prompt,
|
||||
@@ -108,7 +173,13 @@ class OllamaProvider:
|
||||
tokens = data.get("eval_count", 0) + data.get("prompt_eval_count", 0)
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
logger.info("ollama_provider_success", response_length=len(result), tokens=tokens, latency_ms=round(latency, 1))
|
||||
logger.info(
|
||||
"ollama_provider_success",
|
||||
response_length=len(result),
|
||||
tokens=tokens,
|
||||
latency_ms=round(latency, 1),
|
||||
model=model_name,
|
||||
)
|
||||
return AIResult(
|
||||
raw_response=result,
|
||||
success=True,
|
||||
@@ -154,7 +225,7 @@ class OllamaProvider:
|
||||
total_tokens = 0
|
||||
messages: list[dict] = [{"role": "user", "content": prompt}]
|
||||
registry = get_model_registry()
|
||||
model_name = registry.get_model("ollama", "rca")
|
||||
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
|
||||
options = registry.get_provider_options("ollama")
|
||||
task_type = (context or {}).get("task_type", "")
|
||||
if task_type in ("diagnose", "force_local"):
|
||||
@@ -253,7 +324,7 @@ class OllamaProvider:
|
||||
async def health_check(self) -> bool:
|
||||
try:
|
||||
client = await self._get_client()
|
||||
resp = await client.get(f"{settings.OLLAMA_URL}/api/tags", timeout=5.0)
|
||||
resp = await client.get(f"{self._endpoint_url()}/api/tags", timeout=5.0)
|
||||
return resp.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
@@ -264,33 +335,27 @@ class OllamaProvider:
|
||||
self._http_client = None
|
||||
|
||||
|
||||
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — OLLAMA_188 provider 註冊
|
||||
class Ollama188Provider(OllamaProvider):
|
||||
# 2026-05-06 Codex — 188 不再作為 Ollama Provider;本地備援統一命名為 ollama_local。
|
||||
class OllamaLocalProvider(OllamaProvider):
|
||||
"""
|
||||
Ollama 188 CPU-only 備援 Provider
|
||||
Ollama Local fallback Provider
|
||||
|
||||
繼承 OllamaProvider,但使用 OLLAMA_FALLBACK_URL(192.168.0.188:11434)
|
||||
作為推理端點,模型預設 OLLAMA_HEALTH_CHECK_MODEL(qwen2.5:7b-instruct)。
|
||||
|
||||
B1 修復:原本 _init_registry 未登錄此 provider,導致
|
||||
executor.execute() 遇到 "ollama_188" → not_registered → 跳過,
|
||||
188 從未被打到。此類別補全登錄鏈路。
|
||||
|
||||
2026-04-26 Wave5 B1-fix by Claude Engineer-A4
|
||||
使用 OLLAMA_FALLBACK_URL 作為本地最後防線端點。
|
||||
ADR-110 目前設定為 110 nginx proxy → 111 Ollama;188 不得再作為 Ollama provider。
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "ollama_188"
|
||||
return "ollama_local"
|
||||
|
||||
@property
|
||||
def is_enabled(self) -> bool:
|
||||
import os
|
||||
# 優先查 ENABLE_OLLAMA_188;若未設定(預設 true)則看 OLLAMA_FALLBACK_URL 是否有值
|
||||
env_override = os.getenv("ENABLE_OLLAMA_188", "true").lower() == "true"
|
||||
# 優先查 ENABLE_OLLAMA_LOCAL;若未設定(預設 true)則看 OLLAMA_FALLBACK_URL 是否有值。
|
||||
env_override = os.getenv("ENABLE_OLLAMA_LOCAL", "true").lower() == "true"
|
||||
if not env_override:
|
||||
return False
|
||||
# OLLAMA_FALLBACK_URL 空字串 → 未設定 188 節點 → 停用
|
||||
# OLLAMA_FALLBACK_URL 空字串 → 未設定本地節點 → 停用。
|
||||
return bool(getattr(settings, "OLLAMA_FALLBACK_URL", ""))
|
||||
|
||||
def _endpoint_url(self) -> str:
|
||||
@@ -315,18 +380,18 @@ class Ollama188Provider(OllamaProvider):
|
||||
client = await self._get_client()
|
||||
|
||||
registry = get_model_registry()
|
||||
# 嘗試取 ollama_188 專屬設定,fallback 到 ollama 預設
|
||||
# 嘗試取本地 fallback 專屬設定,fallback 到 ollama 預設。
|
||||
try:
|
||||
model_name = registry.get_model("ollama_188", "rca")
|
||||
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama_local", "rca")).strip()
|
||||
except Exception:
|
||||
model_name = getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")
|
||||
model_name = str((context or {}).get("ollama_model") or getattr(settings, "OLLAMA_HEALTH_CHECK_MODEL", "qwen2.5:7b-instruct")).strip()
|
||||
|
||||
try:
|
||||
options = registry.get_provider_options("ollama_188")
|
||||
options = registry.get_provider_options("ollama_local")
|
||||
except Exception:
|
||||
options = registry.get_provider_options("ollama")
|
||||
|
||||
# CPU-only 備援:固定使用較長 timeout(CPU 推理慢)
|
||||
# 本地備援:固定使用較長 timeout,避免 111 模型載入時被過早判死。
|
||||
task_type = (context or {}).get("task_type", "")
|
||||
if task_type in ("diagnose", "force_local"):
|
||||
read_timeout = float(getattr(settings, "OLLAMA_DIAGNOSE_TIMEOUT_SECONDS", 200))
|
||||
@@ -355,11 +420,12 @@ class Ollama188Provider(OllamaProvider):
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
logger.info(
|
||||
"ollama_188_provider_success",
|
||||
"ollama_local_provider_success",
|
||||
response_length=len(result),
|
||||
tokens=tokens,
|
||||
latency_ms=round(latency, 1),
|
||||
endpoint=fallback_url,
|
||||
model=model_name,
|
||||
)
|
||||
return AIResult(
|
||||
raw_response=result,
|
||||
@@ -371,12 +437,12 @@ class Ollama188Provider(OllamaProvider):
|
||||
|
||||
except httpx.TimeoutException as e:
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
logger.warning("ollama_188_provider_timeout", error=str(e), latency_ms=round(latency, 1))
|
||||
logger.warning("ollama_local_provider_timeout", error=str(e), latency_ms=round(latency, 1))
|
||||
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=f"Timeout: {e}")
|
||||
|
||||
except Exception as e:
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
logger.warning("ollama_188_provider_failed", error=str(e), latency_ms=round(latency, 1))
|
||||
logger.warning("ollama_local_provider_failed", error=str(e), latency_ms=round(latency, 1))
|
||||
return AIResult(raw_response="", success=False, provider=self.name, latency_ms=latency, error=str(e))
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
@@ -389,3 +455,38 @@ class Ollama188Provider(OllamaProvider):
|
||||
return resp.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
class OllamaGcpBProvider(OllamaProvider):
    """
    GCP-B secondary Ollama provider.

    Inherits OllamaProvider but targets ``settings.OLLAMA_SECONDARY_URL``
    (documented as 34.21.145.224:11434 when this class was added).
    ADR-110 three-tier failover order: GCP-A -> GCP-B -> Local (111).
    This provider runs when OllamaFailoverManager returns
    provider_name="ollama_gcp_b".

    2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP-B failover completion.
    Root cause: AIProviderRegistry had no "ollama_gcp_b" entry, so the
    provider was reported as not_registered and routing jumped to Gemini.
    """

    @property
    def name(self) -> str:
        """Registry name of this provider."""
        return "ollama_gcp_b"

    @property
    def is_enabled(self) -> bool:
        """True when a secondary URL is configured (non-empty)."""
        secondary_url = getattr(settings, "OLLAMA_SECONDARY_URL", "")
        return bool(secondary_url)

    def _endpoint_url(self) -> str:
        """Return the configured secondary URL, or "" when unset."""
        return getattr(settings, "OLLAMA_SECONDARY_URL", "")

    async def health_check(self) -> bool:
        """Return True only if GET ``<secondary>/api/tags`` answers HTTP 200.

        An unconfigured URL or any exception during the request counts as
        unhealthy.
        """
        secondary_url = getattr(settings, "OLLAMA_SECONDARY_URL", "")
        if secondary_url:
            try:
                http = await self._get_client()
                tags = await http.get(f"{secondary_url}/api/tags", timeout=5.0)
                return tags.status_code == 200
            except Exception:
                return False
        return False
|
||||
|
||||
@@ -274,14 +274,13 @@ class AIRateLimiter:
|
||||
|
||||
try:
|
||||
from src.core.config import settings
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID
|
||||
if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id:
|
||||
logger.warning("telegram_not_configured_for_cost_alert")
|
||||
return
|
||||
|
||||
import httpx
|
||||
|
||||
message = (
|
||||
f"🚨🚨🚨 <b>AI 成本超限警報</b> 🚨🚨🚨\n\n"
|
||||
f"Provider: <code>{provider.upper()}</code>\n"
|
||||
@@ -292,15 +291,15 @@ class AIRateLimiter:
|
||||
f"<code>redis-cli DEL ai_rate:total_cost:{provider}</code>"
|
||||
)
|
||||
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
await client.post(
|
||||
f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
|
||||
json={
|
||||
"chat_id": target_chat_id,
|
||||
"text": message,
|
||||
"parse_mode": "HTML",
|
||||
},
|
||||
)
|
||||
gateway = get_telegram_gateway()
|
||||
await gateway._send_request(
|
||||
"sendMessage",
|
||||
{
|
||||
"chat_id": target_chat_id,
|
||||
"text": message,
|
||||
"parse_mode": "HTML",
|
||||
},
|
||||
)
|
||||
|
||||
logger.error(
|
||||
"ai_cost_alert_sent",
|
||||
@@ -327,13 +326,12 @@ class AIRateLimiter:
|
||||
|
||||
try:
|
||||
from src.core.config import settings
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID
|
||||
if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id:
|
||||
return
|
||||
|
||||
import httpx
|
||||
|
||||
limit = COST_LIMITS[provider]["total_cost_usd"]
|
||||
remaining = limit - current_cost
|
||||
|
||||
@@ -345,15 +343,15 @@ class AIRateLimiter:
|
||||
f"接近上限,請注意監控!"
|
||||
)
|
||||
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
await client.post(
|
||||
f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
|
||||
json={
|
||||
"chat_id": target_chat_id,
|
||||
"text": message,
|
||||
"parse_mode": "HTML",
|
||||
},
|
||||
)
|
||||
gateway = get_telegram_gateway()
|
||||
await gateway._send_request(
|
||||
"sendMessage",
|
||||
{
|
||||
"chat_id": target_chat_id,
|
||||
"text": message,
|
||||
"parse_mode": "HTML",
|
||||
},
|
||||
)
|
||||
|
||||
logger.warning(
|
||||
"ai_cost_warning_sent",
|
||||
|
||||
@@ -73,10 +73,12 @@ class AIProviderEnum(str, Enum):
|
||||
"""AI 提供者"""
|
||||
|
||||
OLLAMA = "ollama"
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2
|
||||
# P1.1b OllamaFailoverManager 使用 provider_name="ollama_188",
|
||||
# 但 AIProviderEnum 沒有此值 → P1.2 整合時 lookup 失敗
|
||||
OLLAMA_188 = "ollama_188" # 188 CPU-only 備援節點(P1.1b)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災
|
||||
# OllamaFailoverManager 回傳 provider_name="ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"
|
||||
# 缺少 enum 值 → AIProviderEnum(primary_str) 拋 ValueError → fallback chain 清空 → 直跳 Gemini
|
||||
OLLAMA_GCP_A = "ollama_gcp_a" # GCP-A 34.143.170.20 Primary
|
||||
OLLAMA_GCP_B = "ollama_gcp_b" # GCP-B 34.21.145.224 Secondary
|
||||
OLLAMA_LOCAL = "ollama_local" # 192.168.0.111 Local Fallback
|
||||
GEMINI = "gemini"
|
||||
CLAUDE = "claude"
|
||||
# 2026-04-02 ogt: C1 修復 — 對齊 Registry 實際名稱
|
||||
@@ -90,8 +92,10 @@ class AIProviderEnum(str, Enum):
|
||||
# Provider 對應延遲預算 (ms)
|
||||
PROVIDER_LATENCY_BUDGET: dict[AIProviderEnum, int] = {
|
||||
AIProviderEnum.OLLAMA: 60000, # 本地,允許較長處理時間
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2 — 188 CPU-only 推理較慢
|
||||
AIProviderEnum.OLLAMA_188: 120000, # 120s budget for CPU inference
|
||||
# 2026-05-04 ogt: ADR-110 GCP 三層容災 — GCP NVMe SSD 推理快,60s 足夠
|
||||
AIProviderEnum.OLLAMA_GCP_A: 60000,
|
||||
AIProviderEnum.OLLAMA_GCP_B: 60000,
|
||||
AIProviderEnum.OLLAMA_LOCAL: 90000, # 111 本地 HDD 稍慢
|
||||
AIProviderEnum.GEMINI: 30000, # 雲端,較低延遲
|
||||
AIProviderEnum.CLAUDE: 30000, # 雲端,較低延遲
|
||||
# 2026-04-02 ogt: C1 修復 — 對齊 Registry 名稱
|
||||
@@ -422,7 +426,7 @@ class AIRouter:
|
||||
model = failover_result.primary.model
|
||||
reason = f"{reason} [failover→{primary_str}]"
|
||||
except ValueError:
|
||||
# provider_name 無法對應已知 enum(理論上不應發生,OLLAMA_188 已加)
|
||||
# provider_name 無法對應已知 enum;避免未知 provider 靜默進入執行層。
|
||||
logger.warning(
|
||||
"ai_router_unknown_failover_provider",
|
||||
provider=primary_str,
|
||||
@@ -1068,11 +1072,51 @@ class AIRouterExecutor:
|
||||
cached = await redis.get(cache_key)
|
||||
if cached:
|
||||
data = _json.loads(cached)
|
||||
cached_provider = data.get("provider", "cache")
|
||||
provider_allowed = cached_provider in provider_order
|
||||
ollama_first_required = (
|
||||
bool(context)
|
||||
and any(
|
||||
key in context
|
||||
for key in (
|
||||
"alert_type",
|
||||
"alertname",
|
||||
"alert_name",
|
||||
"fingerprint",
|
||||
"incident_id",
|
||||
"severity",
|
||||
"target_resource",
|
||||
)
|
||||
)
|
||||
and bool(provider_order)
|
||||
and provider_order[0].startswith("ollama")
|
||||
) or (
|
||||
bool(context)
|
||||
and bool(context.get("enforce_ollama_first"))
|
||||
and bool(provider_order)
|
||||
and provider_order[0].startswith("ollama")
|
||||
)
|
||||
if (
|
||||
cached_provider == "ollama"
|
||||
and any(provider.startswith("ollama") for provider in provider_order)
|
||||
):
|
||||
provider_allowed = True
|
||||
if ollama_first_required and not cached_provider.startswith("ollama"):
|
||||
provider_allowed = False
|
||||
if not provider_allowed:
|
||||
logger.info(
|
||||
"ai_router_cache_provider_mismatch_skip",
|
||||
cache_key=cache_key[:30],
|
||||
cached_provider=cached_provider,
|
||||
provider_order=provider_order,
|
||||
ollama_first_required=ollama_first_required,
|
||||
)
|
||||
raise ValueError("cached provider not allowed by current provider_order")
|
||||
logger.info("ai_router_cache_hit", cache_key=cache_key[:30])
|
||||
return AIResult(
|
||||
raw_response=data.get("response", ""),
|
||||
success=True,
|
||||
provider=data.get("provider", "cache"),
|
||||
provider=cached_provider,
|
||||
from_cache=True,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -1097,6 +1141,10 @@ class AIRouterExecutor:
|
||||
_lf_trace_ctx = None
|
||||
|
||||
errors: list[str] = []
|
||||
attempted_providers: set[str] = set()
|
||||
alert_requires_ollama_before_cloud = bool(
|
||||
(context or {}).get("alert_requires_ollama_before_cloud")
|
||||
)
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback metric 追蹤
|
||||
# 透過 context.get("intent_hint") 判斷是否為 DIAGNOSE,避免改動 execute() 簽名
|
||||
@@ -1146,13 +1194,31 @@ class AIRouterExecutor:
|
||||
errors.append(f"{provider_name}: privacy_skip(non_local)")
|
||||
continue
|
||||
|
||||
if alert_requires_ollama_before_cloud and provider.privacy_level == "cloud":
|
||||
if "ollama_local" not in attempted_providers:
|
||||
errors.append(f"{provider_name}: blocked_until_ollama_local_attempted")
|
||||
logger.warning(
|
||||
"ai_router_cloud_blocked_until_ollama_local_attempted",
|
||||
provider=provider_name,
|
||||
provider_order=provider_order,
|
||||
attempted_providers=sorted(attempted_providers),
|
||||
)
|
||||
continue
|
||||
|
||||
# 閘門 1: Circuit Breaker (per-provider, C2 修復)
|
||||
cb = self._get_circuit_breaker(provider_name)
|
||||
if cb.is_open():
|
||||
errors.append(f"{provider_name}: circuit_open")
|
||||
logger.warning("ai_router_circuit_open", provider=provider_name)
|
||||
# 2026-04-27 Claude Sonnet 4.6: F6 — circuit_open 不設 _last_attempted_provider(未嘗試)
|
||||
continue
|
||||
if alert_requires_ollama_before_cloud and provider_name.startswith("ollama"):
|
||||
logger.warning(
|
||||
"ai_router_alert_ollama_circuit_bypassed",
|
||||
provider=provider_name,
|
||||
reason="alert_requires_ollama_before_cloud",
|
||||
)
|
||||
else:
|
||||
errors.append(f"{provider_name}: circuit_open")
|
||||
logger.warning("ai_router_circuit_open", provider=provider_name)
|
||||
# 2026-04-27 Claude Sonnet 4.6: F6 — circuit_open 不設 _last_attempted_provider(未嘗試)
|
||||
continue
|
||||
|
||||
# 閘門 2: Rate Limiter
|
||||
# 2026-04-02 Claude Code: Phase 24 B3 + C1 修復 — Rate Limiter (含 openclaw_nemo)
|
||||
@@ -1172,6 +1238,7 @@ class AIRouterExecutor:
|
||||
sem = self._get_semaphore(provider_name)
|
||||
async with sem:
|
||||
try:
|
||||
attempted_providers.add(provider_name)
|
||||
result = await provider.analyze(prompt, context)
|
||||
|
||||
if result.success:
|
||||
@@ -1294,13 +1361,21 @@ _executor: AIRouterExecutor | None = None
|
||||
|
||||
def _init_registry() -> AIProviderRegistry:
|
||||
"""初始化 Provider Registry (首次呼叫時自動註冊所有 Provider)"""
|
||||
from src.services.ai_providers.ollama import OllamaProvider, Ollama188Provider # 2026-04-26 Wave5 B1-fix by Claude Engineer-A4
|
||||
from src.services.ai_providers.ollama import (
|
||||
OllamaProvider,
|
||||
OllamaLocalProvider,
|
||||
OllamaGcpBProvider, # 2026-05-04 ADR-110 GCP-B
|
||||
)
|
||||
from src.services.ai_providers.gemini import GeminiProvider
|
||||
from src.services.ai_providers.claude import ClaudeProvider
|
||||
from src.services.ai_providers.openclaw_nemo import OpenClawNemoProvider
|
||||
|
||||
registry = AIProviderRegistry()
|
||||
registry.register(OllamaProvider())
|
||||
|
||||
# GCP-A Primary(name="ollama",OLLAMA_URL)
|
||||
ollama_gcp_a = OllamaProvider()
|
||||
registry.register(ollama_gcp_a)
|
||||
|
||||
registry.register(GeminiProvider())
|
||||
registry.register(ClaudeProvider())
|
||||
registry.register(OpenClawNemoProvider())
|
||||
@@ -1309,10 +1384,21 @@ def _init_registry() -> AIProviderRegistry:
|
||||
from src.services.ai_providers.nemotron import NemotronProvider
|
||||
registry.register(NemotronProvider())
|
||||
|
||||
# 2026-04-26 Wave5 B1-fix by Claude Engineer-A4 — 補登 OLLAMA_188 備援 provider
|
||||
# 修復:原本 failover_manager 決策返回 "ollama_188",但 executor 查不到 → not_registered
|
||||
# → 188 從未被打到。必須明確 register 才能讓 executor.execute() 路由到 188。
|
||||
registry.register(Ollama188Provider())
|
||||
# 2026-05-06 Codex: 188 不再作為 Ollama provider。
|
||||
# Local fallback 統一命名為 ollama_local,端點由 OLLAMA_FALLBACK_URL 指向 111/110 proxy。
|
||||
ollama_local = OllamaLocalProvider()
|
||||
registry.register(ollama_local)
|
||||
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP 三層容災修復
|
||||
# 根因:OllamaFailoverManager 回傳 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"
|
||||
# 但 registry 無這些名稱 → not_registered → 整條 Ollama 鏈跳過 → 直接跳 Gemini
|
||||
# 修復:
|
||||
# "ollama_gcp_a" alias → 同 OllamaProvider(OLLAMA_URL = GCP-A)
|
||||
# "ollama_gcp_b" → 新 OllamaGcpBProvider(OLLAMA_SECONDARY_URL = GCP-B)
|
||||
# "ollama_local" → OllamaLocalProvider(OLLAMA_FALLBACK_URL = 111 / 110:11437)
|
||||
registry._providers["ollama_gcp_a"] = ollama_gcp_a
|
||||
registry.register(OllamaGcpBProvider())
|
||||
registry._providers["ollama_local"] = ollama_local
|
||||
|
||||
return registry
|
||||
|
||||
|
||||
151
apps/api/src/services/alert_approval_guard.py
Normal file
151
apps/api/src/services/alert_approval_guard.py
Normal file
@@ -0,0 +1,151 @@
|
||||
"""Alert approval guardrails for AI-generated remediation actions.
|
||||
|
||||
This service runs before an Alertmanager-derived action becomes an
|
||||
ApprovalRecord. It prevents a known failure mode: an LLM invents a kubectl
|
||||
target that does not belong to the current alert domain, then the approval
|
||||
pipeline faithfully executes or displays that bad command.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import structlog
|
||||
|
||||
from src.services.action_parser import ActionKind, parse_kubectl_action
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
_ALLOWED_K8S_NAMESPACES = frozenset({"awoooi-prod", "observability", "signoz", "langfuse"})
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ApprovalActionGuardResult:
|
||||
"""Guarded action payload returned to approval creation."""
|
||||
|
||||
action: str
|
||||
blocked: bool = False
|
||||
reason: str | None = None
|
||||
metadata: dict[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
async def guard_alert_approval_action(
    *,
    action: str,
    alert_namespace: str | None,
    alertname: str,
    alert_category: str,
) -> ApprovalActionGuardResult:
    """Validate an AI/rule action before it is persisted as an approval.

    Non-kubectl actions are intentionally left to their domain-specific gates.
    Kubectl actions must satisfy the structured parser and must not jump to an
    unrelated namespace such as ``default`` or ``production`` when the alert
    came from AWOOOI's production namespace.

    Args:
        action: Proposed action text (AI- or rule-generated).
        alert_namespace: Namespace carried by the alert; blank/None falls back
            to ``"awoooi-prod"``.
        alertname: Alert name, used only for logging.
        alert_category: Alert category, used only for logging.

    Returns:
        A result carrying the original ``action`` when it passes (optionally
        with a warning in ``metadata``), or a blocked result built by
        ``_blocked`` when the command is invalid, targets a namespace that is
        not allowed / does not match the alert, or names a resource the
        resolver cannot find.
    """
    raw_action = (action or "").strip()
    if not raw_action.lower().startswith("kubectl"):
        return ApprovalActionGuardResult(action=action)

    parsed = parse_kubectl_action(raw_action)
    if not parsed.ok:
        return _blocked(raw_action, f"invalid_kubectl:{parsed.reason}", alertname)

    requested_namespace = parsed.namespace
    expected_namespace = (alert_namespace or "awoooi-prod").strip() or "awoooi-prod"

    # Gate 1: an explicitly requested namespace must be on the allow-list.
    if requested_namespace and requested_namespace not in _ALLOWED_K8S_NAMESPACES:
        return _blocked(
            raw_action,
            f"namespace_not_allowed:{requested_namespace}",
            alertname,
            expected_namespace=expected_namespace,
        )

    # Gate 2: an explicitly requested namespace must equal the alert's own
    # namespace; "observability" is the one cross-namespace target let through.
    if (
        requested_namespace
        and expected_namespace in _ALLOWED_K8S_NAMESPACES
        and requested_namespace != expected_namespace
        and requested_namespace != "observability"
    ):
        return _blocked(
            raw_action,
            f"namespace_mismatch:{requested_namespace}!={expected_namespace}",
            alertname,
            expected_namespace=expected_namespace,
        )

    # Read-only commands are safe enough to display once the namespace is sane.
    # Mutating commands still need resource existence checks to avoid executing
    # hallucinated deployments like "flywheelexecutionratemissing".
    if parsed.kind == ActionKind.READONLY and parsed.verb in {"get", "version"}:
        return ApprovalActionGuardResult(action=action)

    if parsed.resource_name and parsed.resource_type in {
        "deployment",
        "statefulset",
        "daemonset",
        "pod",
        "service",
    }:
        try:
            from src.services.resource_resolver import get_resource_resolver

            resolver = get_resource_resolver()
            resolved = await resolver.resolve(
                raw_resource=parsed.resource_name,
                namespace=requested_namespace or expected_namespace,
                resource_kind=parsed.resource_type,
            )
            if not resolved.success:
                return _blocked(
                    raw_action,
                    f"k8s_resource_not_found:{parsed.resource_type}/{parsed.resource_name}",
                    alertname,
                    expected_namespace=expected_namespace,
                    candidates=resolved.candidates,
                )
        except Exception as exc:
            # Deliberate fail-open: if the resolver cannot be imported or
            # called, the action is passed through with a warning marker in
            # metadata rather than blocked.
            logger.warning(
                "approval_action_resource_guard_unavailable",
                alertname=alertname,
                alert_category=alert_category,
                action=raw_action[:160],
                error=str(exc),
            )
            return ApprovalActionGuardResult(
                action=action,
                metadata={"action_guard_warning": "resource_guard_unavailable"},
            )

    return ApprovalActionGuardResult(action=action)
|
||||
|
||||
|
||||
def _blocked(
    raw_action: str,
    reason: str,
    alertname: str,
    *,
    expected_namespace: str | None = None,
    candidates: list[str] | None = None,
) -> ApprovalActionGuardResult:
    """Log a rejected action and build the corresponding blocked result.

    Args:
        raw_action: The stripped action text that was rejected.
        reason: Machine-readable rejection code (e.g. ``namespace_mismatch:...``).
        alertname: Alert name, used only for logging.
        expected_namespace: Namespace the guard expected, when known.
        candidates: Alternative resource names supplied by the caller, if any.

    Returns:
        A result with ``blocked=True`` whose ``action`` is a
        ``NO_ACTION - INVALID_TARGET`` marker embedding the reason and a
        truncated copy of the original command.
    """
    candidate_list = candidates or []

    logger.warning(
        "approval_action_blocked_before_persist",
        alertname=alertname,
        reason=reason,
        action=raw_action[:160],
        expected_namespace=expected_namespace,
        candidates=candidate_list,
    )

    # The original command is truncated (180 chars in the action text, 300 in
    # metadata) so an oversized LLM output cannot bloat the stored record.
    return ApprovalActionGuardResult(
        action=f"NO_ACTION - INVALID_TARGET: {reason}; original={raw_action[:180]}",
        blocked=True,
        reason=reason,
        metadata={
            "action_guard": "blocked_before_persist",
            "blocked_action": raw_action[:300],
            "blocked_reason": reason,
            "expected_namespace": expected_namespace,
            "candidates": candidate_list,
        },
    )
|
||||
@@ -36,6 +36,17 @@ if TYPE_CHECKING:
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
def _decode_redis_member(value: object, fallback: str) -> str:
    """Normalize a Redis member to ``str`` for DB / log use.

    A Redis client may hand back ``bytes`` or ``str`` depending on how it is
    configured, so both are accepted.

    Args:
        value: Raw member returned by Redis (may be ``None``).
        fallback: String returned when ``value`` is ``None``.

    Returns:
        ``value`` decoded as UTF-8 (undecodable bytes become U+FFFD) when it
        is ``bytes``, ``value`` itself when it is already ``str``,
        ``fallback`` when it is ``None``, and ``str(value)`` otherwise.
    """
    if value is None:
        return fallback
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        # errors="replace" keeps a corrupt member from raising mid-pipeline.
        return value.decode("utf-8", errors="replace")
    return str(value)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Data Types
|
||||
# =============================================================================
|
||||
@@ -83,8 +94,9 @@ class AlertGroupingService:
|
||||
# 5 分鐘滑動視窗
|
||||
WINDOW_SECONDS: int = 300
|
||||
|
||||
# 觸發聚合的閾值(同一分組 5 分鐘內超過此數量才聚合)
|
||||
GROUP_THRESHOLD: int = 3
|
||||
# 觸發聚合的閾值:保留第一張主卡,第二個同組告警開始收斂。
|
||||
# 2026-05-07 Codex — Telegram 群組噪音治理:舊值 3 會讓前兩張同類告警仍進 AI/Telegram。
|
||||
GROUP_THRESHOLD: int = 2
|
||||
|
||||
# Redis Key 前綴
|
||||
PREFIX_WINDOW = "alert_group:window:"
|
||||
@@ -188,7 +200,10 @@ class AlertGroupingService:
|
||||
|
||||
count = results[2]
|
||||
first_members = results[3]
|
||||
parent_fingerprint = first_members[0] if first_members else fingerprint
|
||||
parent_fingerprint = _decode_redis_member(
|
||||
first_members[0] if first_members else None,
|
||||
fallback=fingerprint,
|
||||
)
|
||||
|
||||
# 是否為父告警(第一個)
|
||||
is_parent = parent_fingerprint == fingerprint or count == 1
|
||||
|
||||
@@ -7,7 +7,7 @@ OpenClaw 告警規則匹配引擎
|
||||
- 規則在 YAML 定義,不需要改 Python 代碼
|
||||
- 匹配邏輯: alertname 完全匹配 > alert_type 部分匹配 > message 關鍵字
|
||||
- priority 越小越優先,999 = 通用兜底
|
||||
- 變數替換: {target} {host} {container} {instance} {job} {namespace}
|
||||
- 變數替換: {target} {host} {container} {instance} {job} {namespace} {unit}
|
||||
|
||||
自動規則生成:
|
||||
- 只有 generic_fallback 觸發時才生成(具體規則不觸發)
|
||||
@@ -185,6 +185,7 @@ def _extract_vars(alert_context: dict) -> dict[str, str]:
|
||||
host = instance.split(":")[0] if ":" in instance else instance
|
||||
job = labels.get("job", "exporter")
|
||||
namespace = alert_context.get("namespace", "awoooi-prod")
|
||||
unit = labels.get("unit", "")
|
||||
|
||||
# GAP-A4: 多層 label 查找,由最權威到最弱
|
||||
target = ""
|
||||
@@ -245,6 +246,7 @@ def _extract_vars(alert_context: dict) -> dict[str, str]:
|
||||
"instance": instance,
|
||||
"job": job,
|
||||
"namespace": namespace,
|
||||
"unit": unit,
|
||||
}
|
||||
|
||||
|
||||
@@ -397,8 +399,15 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
# GAP-A4 (2026-04-14 Claude Sonnet 4.6): 後置驗證 — 垃圾 target 丟棄 command
|
||||
# 避免 `kubectl rollout restart deployment unknown/HostHighCpuLoad/...` 這類無效指令
|
||||
# 清空 kubectl_command 讓 decision_manager 降級給 LLM 處理
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: SSH 指令豁免 bad_target 驗證
|
||||
# 根因:host_resource_alert 規則的 kubectl_command 以 "ssh {host} '...'" 組成,
|
||||
# {host} 由 instance label 派生(_extract_vars 第 184-185 行),與 {target} 無關。
|
||||
# 但 host 告警缺少 K8s deployment label → target="unknown" → _is_bad_target=True
|
||||
# → kubectl_command 被清空 → auto_approve 以 no_executable_action 拒絕 → 人工攔截。
|
||||
# 修復:SSH 指令不依賴 target,跳過 bad_target 驗證,保留指令讓自動診斷路徑通行。
|
||||
_invalid_target = False
|
||||
if kubectl_command and _is_bad_target(vars["target"], alertname):
|
||||
_is_ssh_command = kubectl_command.startswith("ssh ")
|
||||
if kubectl_command and not _is_ssh_command and _is_bad_target(vars["target"], alertname):
|
||||
logger.warning(
|
||||
"rule_kubectl_command_discarded_bad_target",
|
||||
rule_id=matched_rule["id"],
|
||||
|
||||
@@ -115,8 +115,20 @@ class AnomalyCounter:
|
||||
# TTL 設定 (35 天,比清理週期長一點)
|
||||
TTL_SECONDS = 35 * 24 * 3600
|
||||
|
||||
def __init__(self, redis_client: redis.Redis) -> None:
|
||||
def __init__(self, redis_client: redis.Redis, project_id: str = "awoooi") -> None:
|
||||
self.redis = redis_client
|
||||
self.project_id = project_id
|
||||
|
||||
def _pkey(self, prefix: str, key: str) -> str:
    """Build the project-scoped Redis key (Phase A multi-tenancy).

    New key format: ``{project_id}:{prefix}{key}``.
    """
    return f"{self.project_id}:{prefix}{key}"
|
||||
|
||||
async def _redis_get_with_fallback(self, prefix: str, key: str) -> bytes | None:
|
||||
"""Phase A: 讀新 key,fallback 到舊 key。"""
|
||||
val = await self.redis.get(self._pkey(prefix, key))
|
||||
if val is None:
|
||||
val = await self.redis.get(f"{prefix}{key}")
|
||||
return val
|
||||
|
||||
@staticmethod
|
||||
def derive_key_from_incident(incident) -> str | None:
|
||||
@@ -217,7 +229,7 @@ class AnomalyCounter:
|
||||
) -> AnomalyFrequency:
|
||||
"""實際的異常記錄邏輯(可能拋出 Redis 異常)"""
|
||||
timestamp = now.timestamp()
|
||||
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
||||
timeline_key = self._pkey(self.PREFIX_TIMELINE, anomaly_key)
|
||||
|
||||
# 1. 添加到 Sorted Set (score = timestamp, member = timestamp string)
|
||||
await self.redis.zadd(timeline_key, {str(timestamp): timestamp})
|
||||
@@ -270,27 +282,22 @@ class AnomalyCounter:
|
||||
else now
|
||||
)
|
||||
|
||||
# 6. 讀取修復統計
|
||||
repair_count_str = await self.redis.get(
|
||||
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
)
|
||||
# 6. 讀取修復統計(Phase A: 讀新 key,fallback 到舊 key)
|
||||
repair_count_str = await self._redis_get_with_fallback(self.PREFIX_REPAIR_COUNT, anomaly_key)
|
||||
auto_repair_count = int(repair_count_str) if repair_count_str else 0
|
||||
|
||||
permanent_fix_str = await self.redis.get(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
|
||||
)
|
||||
permanent_fix = permanent_fix_str == "1"
|
||||
permanent_fix_str = await self._redis_get_with_fallback(self.PREFIX_PERMANENT_FIX, anomaly_key)
|
||||
permanent_fix = permanent_fix_str == b"1" or permanent_fix_str == "1"
|
||||
|
||||
# 7. 儲存 metadata (首次記錄時)
|
||||
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
if not await self.redis.exists(metadata_key):
|
||||
await self.redis.hset(
|
||||
metadata_key,
|
||||
mapping={
|
||||
"signature": json.dumps(anomaly_signature),
|
||||
"first_seen": now.isoformat(),
|
||||
},
|
||||
)
|
||||
metadata_key = self._pkey(self.PREFIX_METADATA, anomaly_key)
|
||||
legacy_metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
if not await self.redis.exists(metadata_key) and not await self.redis.exists(legacy_metadata_key):
|
||||
metadata_payload = {
|
||||
"signature": json.dumps(anomaly_signature),
|
||||
"first_seen": now.isoformat(),
|
||||
}
|
||||
await self.redis.hset(metadata_key, mapping=metadata_payload)
|
||||
await self.redis.expire(metadata_key, self.TTL_SECONDS)
|
||||
|
||||
# 8. 判斷升級等級
|
||||
@@ -353,14 +360,14 @@ class AnomalyCounter:
|
||||
success: 是否成功
|
||||
"""
|
||||
try:
|
||||
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
repair_key = self._pkey(self.PREFIX_REPAIR_COUNT, anomaly_key)
|
||||
|
||||
# 遞增修復嘗試次數
|
||||
await self.redis.incr(repair_key)
|
||||
await self.redis.expire(repair_key, self.TTL_SECONDS)
|
||||
|
||||
# 記錄修復歷史 (用於學習)
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, anomaly_key)
|
||||
await self.redis.lpush(
|
||||
history_key,
|
||||
json.dumps(
|
||||
@@ -411,7 +418,7 @@ class AnomalyCounter:
|
||||
return
|
||||
|
||||
try:
|
||||
key = f"{self.PREFIX_DISPOSITION}{anomaly_key}"
|
||||
key = self._pkey(self.PREFIX_DISPOSITION, anomaly_key)
|
||||
await self.redis.hincrby(key, disposition_type, 1)
|
||||
await self.redis.hincrby(key, "total", 1)
|
||||
await self.redis.expire(key, self.TTL_SECONDS)
|
||||
@@ -434,8 +441,11 @@ class AnomalyCounter:
|
||||
"cold_start_trust": N, "total": N}
|
||||
"""
|
||||
try:
|
||||
key = f"{self.PREFIX_DISPOSITION}{anomaly_key}"
|
||||
key = self._pkey(self.PREFIX_DISPOSITION, anomaly_key)
|
||||
raw = await self.redis.hgetall(key)
|
||||
if not raw:
|
||||
# Phase A: fallback 到舊 key
|
||||
raw = await self.redis.hgetall(f"{self.PREFIX_DISPOSITION}{anomaly_key}")
|
||||
return {
|
||||
"auto_repair": int(raw.get(b"auto_repair", raw.get("auto_repair", 0))),
|
||||
"human_approved": int(raw.get(b"human_approved", raw.get("human_approved", 0))),
|
||||
@@ -471,11 +481,25 @@ class AnomalyCounter:
|
||||
|
||||
try:
|
||||
# S2 Fix: 使用 Pipeline 批次查詢,消除 N+1 問題
|
||||
pattern = f"{self.PREFIX_DISPOSITION}*"
|
||||
# Phase A: 先掃新前綴,若無資料 fallback 到舊前綴
|
||||
new_pattern = f"{self.project_id}:{self.PREFIX_DISPOSITION}*"
|
||||
new_strip = f"{self.project_id}:{self.PREFIX_DISPOSITION}"
|
||||
legacy_pattern = f"{self.PREFIX_DISPOSITION}*"
|
||||
legacy_strip = self.PREFIX_DISPOSITION
|
||||
|
||||
keys: list = []
|
||||
async for key in self.redis.scan_iter(match=pattern, count=100):
|
||||
async for key in self.redis.scan_iter(match=new_pattern, count=100):
|
||||
keys.append(key)
|
||||
|
||||
if keys:
|
||||
strip_prefix = new_strip
|
||||
meta_prefix = f"{self.project_id}:{self.PREFIX_METADATA}"
|
||||
else:
|
||||
async for key in self.redis.scan_iter(match=legacy_pattern, count=100):
|
||||
keys.append(key)
|
||||
strip_prefix = legacy_strip
|
||||
meta_prefix = self.PREFIX_METADATA
|
||||
|
||||
if not keys:
|
||||
return total_summary, by_anomaly
|
||||
|
||||
@@ -489,11 +513,11 @@ class AnomalyCounter:
|
||||
anomaly_keys_str = []
|
||||
for key in keys:
|
||||
key_str = key.decode() if isinstance(key, bytes) else key
|
||||
anomaly_keys_str.append(key_str.replace(self.PREFIX_DISPOSITION, ""))
|
||||
anomaly_keys_str.append(key_str.replace(strip_prefix, ""))
|
||||
|
||||
meta_pipe = self.redis.pipeline(transaction=False)
|
||||
for ak in anomaly_keys_str:
|
||||
meta_pipe.hget(f"{self.PREFIX_METADATA}{ak}", "signature")
|
||||
meta_pipe.hget(f"{meta_prefix}{ak}", "signature")
|
||||
meta_results = await meta_pipe.execute()
|
||||
|
||||
for i, raw in enumerate(results):
|
||||
@@ -547,13 +571,13 @@ class AnomalyCounter:
|
||||
"""
|
||||
try:
|
||||
await self.redis.set(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}",
|
||||
self._pkey(self.PREFIX_PERMANENT_FIX, anomaly_key),
|
||||
"1",
|
||||
ex=90 * 24 * 3600, # 90 天
|
||||
)
|
||||
|
||||
# 記錄修復詳情
|
||||
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
metadata_key = self._pkey(self.PREFIX_METADATA, anomaly_key)
|
||||
await self.redis.hset(
|
||||
metadata_key,
|
||||
mapping={
|
||||
@@ -588,8 +612,11 @@ class AnomalyCounter:
|
||||
}
|
||||
"""
|
||||
try:
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, anomaly_key)
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
if not history:
|
||||
# Phase A: fallback 到舊 key
|
||||
history = await self.redis.lrange(f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}", 0, -1)
|
||||
|
||||
total = 0
|
||||
success_count = 0
|
||||
@@ -627,8 +654,11 @@ class AnomalyCounter:
|
||||
}
|
||||
"""
|
||||
try:
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, anomaly_key)
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
if not history:
|
||||
# Phase A: fallback 到舊 key
|
||||
history = await self.redis.lrange(f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}", 0, -1)
|
||||
|
||||
stats: dict[str, dict] = {}
|
||||
|
||||
@@ -666,11 +696,14 @@ class AnomalyCounter:
|
||||
AnomalyFrequency 或 None (若無記錄 或 Redis 重連失敗)
|
||||
"""
|
||||
try:
|
||||
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
||||
timeline_key = self._pkey(self.PREFIX_TIMELINE, anomaly_key)
|
||||
legacy_timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
||||
|
||||
# 檢查是否有記錄
|
||||
# Phase A: 若新 key 無資料,改用舊 key
|
||||
if not await self.redis.exists(timeline_key):
|
||||
return None
|
||||
if not await self.redis.exists(legacy_timeline_key):
|
||||
return None
|
||||
timeline_key = legacy_timeline_key
|
||||
|
||||
now = datetime.now()
|
||||
cutoff_30d = (now - timedelta(days=30)).timestamp()
|
||||
@@ -716,16 +749,12 @@ class AnomalyCounter:
|
||||
else now
|
||||
)
|
||||
|
||||
# 讀取修復統計
|
||||
repair_count_str = await self.redis.get(
|
||||
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
)
|
||||
# 讀取修復統計(Phase A: 讀新 key,fallback 到舊 key)
|
||||
repair_count_str = await self._redis_get_with_fallback(self.PREFIX_REPAIR_COUNT, anomaly_key)
|
||||
auto_repair_count = int(repair_count_str) if repair_count_str else 0
|
||||
|
||||
permanent_fix_str = await self.redis.get(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
|
||||
)
|
||||
permanent_fix = permanent_fix_str == "1"
|
||||
permanent_fix_str = await self._redis_get_with_fallback(self.PREFIX_PERMANENT_FIX, anomaly_key)
|
||||
permanent_fix = permanent_fix_str in (b"1", "1")
|
||||
|
||||
escalation_level = self._get_escalation_level(count_24h)
|
||||
|
||||
@@ -797,7 +826,7 @@ def get_anomaly_counter() -> AnomalyCounter:
|
||||
if _anomaly_counter is None:
|
||||
from src.core.redis_client import get_redis
|
||||
|
||||
_anomaly_counter = AnomalyCounter(get_redis())
|
||||
_anomaly_counter = AnomalyCounter(get_redis(), project_id="awoooi")
|
||||
return _anomaly_counter
|
||||
|
||||
|
||||
|
||||
@@ -685,10 +685,36 @@ class ApprovalExecutionService:
|
||||
)
|
||||
|
||||
# 呼叫 SSH MCP Provider
|
||||
# 2026-05-06 Codex: approved execution 是高風險「實際執行」路徑。
|
||||
# 在 AwoooP MCP Gateway 完全接管前,至少必須經過 AuditedMCPToolProvider
|
||||
# 寫入 durable mcp_audit_log,並標記這仍是 legacy direct provider path。
|
||||
from src.plugins.mcp.providers.ssh_provider import SSHProvider
|
||||
provider = SSHProvider()
|
||||
from src.plugins.mcp.registry import AuditedMCPToolProvider
|
||||
|
||||
provider = AuditedMCPToolProvider(SSHProvider())
|
||||
params_with_audit = {
|
||||
**params,
|
||||
"_mcp_audit": {
|
||||
"session_id": f"approval:{approval.id}",
|
||||
"incident_id": approval.incident_id,
|
||||
"agent_role": "approval_executor",
|
||||
"flywheel_node": "execute",
|
||||
"gateway_path": "legacy_direct_provider",
|
||||
},
|
||||
}
|
||||
try:
|
||||
mcp_result = await provider.execute(tool_name=tool_name, parameters=params)
|
||||
logger.warning(
|
||||
"mcp_gateway_legacy_direct_provider_path",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=approval.incident_id,
|
||||
tool=tool_name,
|
||||
host=host,
|
||||
reason="awooop_gateway_not_enforced_for_legacy_approval_execution",
|
||||
)
|
||||
mcp_result = await provider.execute(
|
||||
tool_name=tool_name,
|
||||
parameters=params_with_audit,
|
||||
)
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
success = bool(mcp_result.success)
|
||||
return ExecutionResult(
|
||||
@@ -812,9 +838,9 @@ class ApprovalExecutionService:
|
||||
f"{km_info}"
|
||||
)
|
||||
|
||||
await gateway._http_client.post(
|
||||
f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
|
||||
json={
|
||||
await gateway._send_request(
|
||||
"sendMessage",
|
||||
{
|
||||
"chat_id": target_chat_id,
|
||||
"text": text,
|
||||
"parse_mode": "HTML",
|
||||
|
||||
227
apps/api/src/services/audit_sink.py
Normal file
227
apps/api/src/services/audit_sink.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
Audit Sink with PII/Secret Redaction
|
||||
======================================
|
||||
AwoooP Phase 4.4: Audit log 寫入前的 sanitization pipeline(ADR-116)
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
設計原則:
|
||||
- audit log 不記錄 raw LLM input/output,只記 hash + schema validation result
|
||||
- PII / secret pattern 硬攔(不可被 caller 繞過)
|
||||
- 攔截清單:GCP IP、PostgreSQL password、Telegram token、SSH key、Bearer token 等
|
||||
- redaction 後原值不可還原(替換為 [REDACTED:<type>])
|
||||
- 所有 audit 寫入透過此 sink(禁止其他 service 直接 INSERT audit_logs)
|
||||
|
||||
使用:
|
||||
from src.services.audit_sink import write_audit
|
||||
|
||||
await write_audit(
|
||||
project_id="awoooi",
|
||||
action="run.completed",
|
||||
resource_type="run",
|
||||
resource_id=str(run_id),
|
||||
details={"trace_id": trace_id, "cost_usd": 0.012},
|
||||
)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Redaction patterns(ADR-116 P1-08)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# 每個 pattern: (compiled_re, replacement_tag)
|
||||
_REDACTION_PATTERNS: list[tuple[re.Pattern[str], str]] = [
|
||||
# Telegram bot token(數字:英數字母混合 32~64 字元)
|
||||
(re.compile(r"\d{8,12}:[A-Za-z0-9_-]{32,64}"), "TELEGRAM_TOKEN"),
|
||||
|
||||
# PostgreSQL connection string
|
||||
(re.compile(r"postgresql(?:\+asyncpg)?://[^:]+:[^@]+@[^/\s]+"), "PG_DSN"),
|
||||
|
||||
# Generic password in URL / config
|
||||
(re.compile(r"(?i)(?:password|passwd|pwd)\s*[:=]\s*\S+"), "PASSWORD"),
|
||||
|
||||
# Bearer / Authorization header value
|
||||
(re.compile(r"(?i)(?:bearer|token)\s+[A-Za-z0-9\-._~+/]+=*"), "BEARER_TOKEN"),
|
||||
|
||||
# AWS / GCP / NVIDIA API key patterns
|
||||
(re.compile(r"(?i)(?:api[_-]?key|apikey)\s*[:=]\s*[A-Za-z0-9\-._]{20,}"), "API_KEY"),
|
||||
|
||||
# Private GCP internal IPs(ADR-116 禁止 GCP 內網 IP 進 log)
|
||||
(re.compile(r"\b10\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),
|
||||
(re.compile(r"\b172\.(?:1[6-9]|2\d|3[0-1])\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),
|
||||
(re.compile(r"\b192\.168\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),
|
||||
|
||||
# SSH private key
|
||||
(re.compile(r"-----BEGIN (?:RSA|EC|OPENSSH) PRIVATE KEY-----[\s\S]*?-----END [A-Z ]+ PRIVATE KEY-----"), "SSH_PRIVATE_KEY"),
|
||||
|
||||
# JWT(三段 base64 以 . 分隔)
|
||||
(re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "JWT_TOKEN"),
|
||||
|
||||
# Hex secret >= 32 位(可能是 HMAC key / session token)
|
||||
(re.compile(r"\b[0-9a-f]{64}\b"), "HEX_SECRET_64"),
|
||||
]
|
||||
|
||||
# 欄位名稱黑名單:這些 key 的 value 直接替換(不做 pattern 掃描)
|
||||
_BLOCKED_FIELD_NAMES = frozenset({
|
||||
"password", "passwd", "pwd", "secret", "token", "api_key", "apikey",
|
||||
"private_key", "private_key_pem", "bot_token", "telegram_token",
|
||||
"hmac_key", "jwt", "authorization", "cookie", "session",
|
||||
})
|
||||
|
||||
# LLM raw input/output 欄位名稱(只記 hash)
|
||||
_LLM_RAW_FIELDS = frozenset({
|
||||
"raw_input", "raw_output", "llm_input", "llm_output",
|
||||
"prompt", "completion", "system_prompt",
|
||||
})
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Sanitization pipeline
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _redact_string(value: str) -> str:
|
||||
"""對字串套用所有 redaction patterns"""
|
||||
for pattern, tag in _REDACTION_PATTERNS:
|
||||
value = pattern.sub(f"[REDACTED:{tag}]", value)
|
||||
return value
|
||||
|
||||
|
||||
def sanitize(details: dict[str, Any]) -> dict[str, Any]:
    """
    Recursively sanitize a ``details`` dict by applying all redaction rules.

    Rule precedence (implemented by ``_sanitize_value`` / ``_sanitize_dict_entry``):
      1. key in _BLOCKED_FIELD_NAMES -> value replaced by [REDACTED:BLOCKED_FIELD]
      2. key in _LLM_RAW_FIELDS      -> value replaced by a truncated SHA-256
                                        marker (only the hash is recorded)
      3. string value                -> pattern-based redaction
      4. nested dict / list          -> processed recursively
    """
    return _sanitize_value(details, depth=0)
|
||||
|
||||
|
||||
def _sanitize_value(value: Any, depth: int = 0) -> Any:
    """Return a sanitized copy of *value*.

    - dict: each entry goes through ``_sanitize_dict_entry`` (key-based rules).
    - list / tuple: each item is sanitized one level deeper; the container
      type is preserved.
    - str: pattern redaction via ``_redact_string``.
    - anything else is returned unchanged.

    Once *depth* exceeds 10 the whole value is replaced by
    ``"[REDACTED:MAX_DEPTH]"`` so that pathological nesting cannot recurse
    without bound.
    """
    if depth > 10:
        return "[REDACTED:MAX_DEPTH]"

    if isinstance(value, dict):
        return {k: _sanitize_dict_entry(k, v, depth) for k, v in value.items()}

    # Tuples are serialized by json.dumps exactly like lists, so they must be
    # scanned too; otherwise secrets inside a tuple would bypass redaction.
    if isinstance(value, (list, tuple)):
        cleaned = [_sanitize_value(item, depth + 1) for item in value]
        return cleaned if isinstance(value, list) else tuple(cleaned)

    if isinstance(value, str):
        return _redact_string(value)

    return value
|
||||
|
||||
|
||||
def _sanitize_dict_entry(key: str, value: Any, depth: int) -> Any:
    """Sanitize one dict entry, applying key-based rules before value rules.

    Args:
        key: The entry's key. Non-string keys are compared via ``str(key)``
            so they cannot crash the sanitizer.
        value: The entry's value.
        depth: Nesting depth of the dict that owns this entry.

    Returns:
        ``"[REDACTED:BLOCKED_FIELD]"`` for blocked keys, an
        ``"[LLM_RAW_HASH:<16 hex chars>]"`` marker for raw LLM fields, and
        otherwise the recursively sanitized value.
    """
    # str() guards against int/None/etc. keys, which have no .lower().
    key_lower = str(key).lower()

    if key_lower in _BLOCKED_FIELD_NAMES:
        return "[REDACTED:BLOCKED_FIELD]"

    if key_lower in _LLM_RAW_FIELDS:
        # Record only a truncated SHA-256 of the content, never the content.
        if isinstance(value, str):
            raw_str = value
        else:
            # default=str keeps non-JSON-native payloads (datetime, UUID, ...)
            # hashable instead of aborting the whole audit record.
            raw_str = json.dumps(value, ensure_ascii=False, default=str)
        # surrogatepass: lone surrogates must not raise UnicodeEncodeError;
        # the digest of any valid string is unchanged.
        digest = hashlib.sha256(raw_str.encode("utf-8", "surrogatepass")).hexdigest()
        return f"[LLM_RAW_HASH:{digest[:16]}]"

    return _sanitize_value(value, depth + 1)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Audit write
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Strong references to in-flight audit-write tasks. The event loop keeps only
# weak references to tasks, so a fire-and-forget task with no other owner can
# be garbage-collected before it finishes.
_PENDING_AUDIT_TASKS: set[object] = set()


async def write_audit(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any] | None = None,
    run_id: str | None = None,
    trace_id: str | None = None,
) -> None:
    """
    Single entry point for audit-log writes (per the original note, all
    services from Phase 4 onward are expected to go through this function).

    The work is delegated to ``_write_audit_impl`` in a background task so the
    caller is not blocked:
      1. sanitize ``details`` (PII / secret redaction)
      2. attach ``run_id`` / ``trace_id`` for observability
      3. INSERT into ``audit_logs``

    Returns immediately after scheduling; failures are handled (logged)
    inside the background task and never reach the caller.
    """
    import asyncio

    task = asyncio.create_task(
        _write_audit_impl(
            project_id=project_id,
            action=action,
            resource_type=resource_type,
            resource_id=resource_id,
            details=details,
            run_id=run_id,
            trace_id=trace_id,
        ),
        name="audit_sink_write",
    )
    # Hold a reference until the task completes, then drop it.
    _PENDING_AUDIT_TASKS.add(task)
    task.add_done_callback(_PENDING_AUDIT_TASKS.discard)
|
||||
|
||||
|
||||
async def _write_audit_impl(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any] | None,
    run_id: str | None,
    trace_id: str | None,
) -> None:
    """Sanitize *details* and insert one row into ``audit_logs``.

    ``run_id`` / ``trace_id`` are stored inside the details JSON under the
    ``_run_id`` / ``_trace_id`` keys when provided.

    Best-effort: every exception is caught and logged as
    ``audit_sink_write_failed``; nothing is raised to the caller.

    NOTE(review): there is no explicit commit here; presumably
    ``get_db_context`` commits on exit — confirm.
    """
    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        clean_details: dict[str, Any] = sanitize(details or {})
        if run_id:
            clean_details["_run_id"] = run_id
        if trace_id:
            clean_details["_trace_id"] = trace_id

        # Values json cannot encode natively (datetime, UUID, Decimal, ...)
        # are stringified instead of dropping the whole record; the string
        # form is passed through redaction because it was never scanned.
        details_json = json.dumps(
            clean_details,
            default=lambda obj: _redact_string(str(obj)),
        )

        async with get_db_context(project_id) as db:
            await db.execute(
                # CAST(... AS jsonb) instead of ":details::jsonb": the "::"
                # cast directly after a named parameter collides with
                # text()'s ":name" bind-parameter parsing.
                sa_text("""
                    INSERT INTO audit_logs
                        (project_id, action, resource_type, resource_id, details)
                    VALUES
                        (:project_id, :action, :resource_type, :resource_id,
                         CAST(:details AS jsonb))
                """),
                {
                    "project_id": project_id,
                    "action": action,
                    "resource_type": resource_type,
                    "resource_id": resource_id,
                    "details": details_json,
                },
            )
    except Exception as exc:
        logger.warning(
            "audit_sink_write_failed",
            action=action,
            resource_id=resource_id,
            error=str(exc),
        )
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Convenience:可在測試中驗證 sanitization 結果
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def sanitize_for_test(details: dict[str, Any]) -> dict[str, Any]:
    """Synchronous alias of ``sanitize`` for use in tests."""
    return sanitize(details)
|
||||
@@ -457,6 +457,8 @@ class AutoRepairService:
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
self._record_auto_repair_metric(playbook, success=True)
|
||||
|
||||
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
|
||||
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
|
||||
try:
|
||||
@@ -630,6 +632,8 @@ class AutoRepairService:
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
self._record_auto_repair_metric(playbook, success=False)
|
||||
|
||||
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
|
||||
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
|
||||
try:
|
||||
@@ -700,6 +704,35 @@ class AutoRepairService:
|
||||
|
||||
return max_risk
|
||||
|
||||
def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
    """Report one auto-repair execution to the Prometheus metrics helper.

    Per the original note (2026-05-06): executions were already stored in the
    ``auto_repair_executions`` table, but ``core.metrics.record_auto_repair()``
    had no callers, so Prometheus-based governance/heartbeat views looked
    idle. The metric label uses the first step's ``action_type`` rather than
    ``playbook_id`` to avoid high label cardinality.

    Never raises: any failure is logged as ``auto_repair_metric_record_failed``.
    """
    try:
        from src.core.metrics import record_auto_repair

        steps = playbook.repair_steps
        leading_step = steps[0] if steps else None
        if leading_step:
            action_label = leading_step.action_type.value
        else:
            action_label = "unknown"

        # Map the playbook's highest risk level to a numeric tier; 0 = unknown.
        tier_by_risk = {
            RiskLevel.LOW: 1,
            RiskLevel.MEDIUM: 2,
            RiskLevel.HIGH: 3,
            RiskLevel.CRITICAL: 4,
        }
        highest_risk = self._get_max_risk_level(playbook)
        tier_value = tier_by_risk.get(highest_risk, 0)

        record_auto_repair(action=action_label, tier=tier_value, success=success)
    except Exception as e:
        logger.warning(
            "auto_repair_metric_record_failed",
            playbook_id=playbook.playbook_id,
            success=success,
            error=str(e),
        )
|
||||
|
||||
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
|
||||
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""
|
||||
|
||||
|
||||
349
apps/api/src/services/awooop_approval_token.py
Normal file
349
apps/api/src/services/awooop_approval_token.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
AwoooP Approval Token — HS256 簽核令牌 + Multi-sig + Suggest Mode
|
||||
==================================================================
|
||||
AwoooP Phase 8: ADR-116 Gate 5 approval flow
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
功能:
|
||||
1. HS256 Approval Token(自製,不依賴 PyJWT):
|
||||
- issue_approval_token() → signed token(3 段 base64url)
|
||||
- verify_approval_token() → payload(含 jti/exp/sub/approver)
|
||||
- jti 存 Redis NX(TTL = exp - now)防 token replay
|
||||
- TTL = 15 分鐘(APPROVAL_TOKEN_TTL = 900s)
|
||||
|
||||
2. Multi-sig quorum:
|
||||
- record_approval() → 驗 token + NX jti + SADD approver_id → 目前簽核數
|
||||
- check_approval_quorum(required_count=1) → bool | raise QuorumNotMetError
|
||||
- Redis Set TTL = 1h
|
||||
|
||||
3. Suggest Mode(AWOOOP_SUGGEST_MODE feature flag):
|
||||
- is_suggest_mode_enabled() → bool
|
||||
- build_suggest_action(action_type, target) → SuggestedAction(dry-run)
|
||||
- 支援 3 個 SRE flow:rollback / scale / restart
|
||||
|
||||
Redis key 前綴(與 legacy multi_sig_redis.py 不衝突):
|
||||
awooop_appr:jti:{jti} — NX token replay 防護
|
||||
awooop_appr:sigs:{project_id}:{run_id}:{tool_name} — 簽核人 Set
|
||||
|
||||
錯誤碼:
|
||||
E-APPR-001 token 無效或已過期
|
||||
E-APPR-002 jti 已使用(replay attack)
|
||||
E-APPR-003 quorum 未達
|
||||
E-APPR-004 approver 重複簽核
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac as _hmac_module
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.redis_client import get_redis
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 常數
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
APPROVAL_TOKEN_TTL = 900 # 15 分鐘
|
||||
_JTI_KEY_PREFIX = "awooop_appr:jti:"
|
||||
_SIG_SET_PREFIX = "awooop_appr:sigs:"
|
||||
_SIG_TTL_SECONDS = 3600 # 簽核 Set 1h TTL
|
||||
_SUGGEST_MODE_ENV = "AWOOOP_SUGGEST_MODE"
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 錯誤定義
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class InvalidApprovalTokenError(Exception):
    """E-APPR-001: the approval token is invalid or has expired."""

    error_code = "E-APPR-001"
|
||||
|
||||
class TokenReplayError(Exception):
    """E-APPR-002: the token's jti was already used (replay attempt)."""

    error_code = "E-APPR-002"
|
||||
|
||||
class QuorumNotMetError(Exception):
    """E-APPR-003: the required number of approvals has not been reached."""

    error_code = "E-APPR-003"
|
||||
|
||||
class DuplicateApproverError(Exception):
    """E-APPR-004: the same approver tried to approve more than once."""

    error_code = "E-APPR-004"
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# HS256 Token 實作
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _b64url_encode(data: bytes) -> str:
    """Encode *data* as URL-safe Base64 text with the ``=`` padding removed."""
    padded_text = base64.urlsafe_b64encode(data).decode("ascii")
    return padded_text.rstrip("=")
|
||||
|
||||
|
||||
def _b64url_decode(s: str) -> bytes:
    """Decode unpadded URL-safe Base64 text, restoring ``=`` padding first."""
    # Number of "=" needed to reach a multiple of four (0 when already aligned).
    missing = -len(s) % 4
    return base64.urlsafe_b64decode(s + "=" * missing)
|
||||
|
||||
|
||||
def _get_hmac_key() -> bytes:
    """Resolve the HMAC signing key and return it as bytes.

    Lookup order:
      1. ``settings.APPROVAL_HMAC_KEY`` (any error while importing or reading
         settings is treated as "not configured")
      2. the ``APPROVAL_HMAC_KEY`` environment variable
      3. a hard-coded development fallback, after logging a warning

    NOTE(review): the fallback constant is visible in source, so tokens signed
    with it are forgeable by anyone who can read the code; consider failing
    closed outside development.
    """
    try:
        from src.core.config import settings

        configured = getattr(settings, "APPROVAL_HMAC_KEY", None) or ""
    except Exception:
        configured = ""

    secret = configured if configured else os.environ.get("APPROVAL_HMAC_KEY", "")
    if secret:
        return secret.encode()

    logger.warning("approval_hmac_key_not_set_using_dev_fallback")
    return "dev-awooop-approval-hmac-fallback".encode()
|
||||
|
||||
|
||||
def issue_approval_token(
    *,
    project_id: str,
    run_id: str,
    tool_name: str,
    approver_id: str,
    ttl_seconds: int = APPROVAL_TOKEN_TTL,
) -> str:
    """
    Create a signed HS256 approval token (three Base64url segments).

    Payload claims:
        jti      = uuid4().hex (unique token id, used for replay protection)
        iss      = "awooop-approval"
        sub      = "{project_id}:{run_id}:{tool_name}"
        approver = approver_id
        iat      = issue time, Unix seconds
        exp      = iat + ttl_seconds

    Returns:
        ``"<header>.<payload>.<signature>"``.
    """
    issued_at = int(time.time())
    claims = {
        "jti": uuid.uuid4().hex,
        "iss": "awooop-approval",
        "sub": f"{project_id}:{run_id}:{tool_name}",
        "approver": approver_id,
        "iat": issued_at,
        "exp": issued_at + ttl_seconds,
    }

    def _segment(obj: dict[str, Any]) -> str:
        # Compact JSON (no spaces) -> Base64url without padding.
        return _b64url_encode(json.dumps(obj, separators=(",", ":")).encode())

    unsigned = ".".join((_segment({"alg": "HS256", "typ": "JWT"}), _segment(claims)))
    mac = _hmac_module.new(_get_hmac_key(), unsigned.encode(), hashlib.sha256)
    return f"{unsigned}.{_b64url_encode(mac.digest())}"
|
||||
|
||||
|
||||
def verify_approval_token(token: str) -> dict[str, Any]:
    """
    Check the signature and expiry of an HS256 token and return its payload.

    Raises:
        InvalidApprovalTokenError: the token is malformed, its signature does
            not match, it has expired, or it could not be parsed at all.
    """
    try:
        segments = token.split(".")
        if len(segments) != 3:
            raise InvalidApprovalTokenError("token 非 3 段格式")

        header_b64, payload_b64, presented_sig = segments
        mac = _hmac_module.new(
            _get_hmac_key(),
            f"{header_b64}.{payload_b64}".encode(),
            hashlib.sha256,
        )
        # compare_digest avoids leaking the mismatch position through timing.
        signature_ok = _hmac_module.compare_digest(
            presented_sig, _b64url_encode(mac.digest())
        )
        if not signature_ok:
            raise InvalidApprovalTokenError("token 簽名無效")

        # The payload is decoded only after the signature has been accepted.
        claims = json.loads(_b64url_decode(payload_b64))

        now = int(time.time())
        if now > claims.get("exp", 0):
            raise InvalidApprovalTokenError("token 已過期")

        return claims
    except InvalidApprovalTokenError:
        raise
    except Exception as exc:
        raise InvalidApprovalTokenError(f"token 解析失敗: {exc}") from exc
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Multi-sig Redis approval
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def record_approval(
    *,
    project_id: str,
    run_id: str,
    tool_name: str,
    approver_id: str,
    token: str,
) -> int:
    """
    Record one approval signature.

    Steps:
      1. verify_approval_token (HS256 signature + exp)
      2. the token's ``sub`` must equal "{project_id}:{run_id}:{tool_name}"
      3. the token's ``approver`` must equal *approver_id*
      4. Redis SET NX on the jti (replay protection, TTL = remaining lifetime)
      5. Redis SADD of the approver (rejects duplicate approvers)
      6. return the current number of distinct approvers

    Returns:
        Number of approvers recorded for this (project, run, tool).

    Raises:
        InvalidApprovalTokenError: bad token, sub/approver mismatch, or an
            unexpected Redis failure (wrapped).
        TokenReplayError: the jti was already consumed.
        DuplicateApproverError: this approver has already signed.
    """
    payload = verify_approval_token(token)

    expected_sub = f"{project_id}:{run_id}:{tool_name}"
    if payload.get("sub") != expected_sub:
        raise InvalidApprovalTokenError(
            f"token sub 不符(期望 '{expected_sub}',實際 '{payload.get('sub')}')"
        )

    # The token is issued for one specific approver. Without this check a
    # token issued to A could be redeemed as approver B, letting one party
    # fabricate several "distinct" signatures and defeat the quorum.
    token_approver = payload.get("approver")
    if token_approver != approver_id:
        raise InvalidApprovalTokenError(
            f"token approver 不符(期望 '{approver_id}',實際 '{token_approver}')"
        )

    jti = payload["jti"]
    exp = payload["exp"]

    try:
        redis = get_redis()

        # Consume the jti; keep the marker only as long as the token could
        # still be valid (at least one second).
        jti_key = f"{_JTI_KEY_PREFIX}{jti}"
        ttl_remaining = max(exp - int(time.time()), 1)
        ok = await redis.set(jti_key, "1", nx=True, ex=ttl_remaining)
        if not ok:
            raise TokenReplayError(f"jti={jti!r} 已使用")

        # SADD returns 0 when the member was already in the set.
        sig_key = f"{_SIG_SET_PREFIX}{project_id}:{run_id}:{tool_name}"
        added = await redis.sadd(sig_key, approver_id)
        if added == 0:
            raise DuplicateApproverError(f"approver '{approver_id}' 已簽核")

        await redis.expire(sig_key, _SIG_TTL_SECONDS)
        count = int(await redis.scard(sig_key))

        logger.info(
            "awooop_approval_recorded",
            project_id=project_id,
            run_id=run_id,
            tool_name=tool_name,
            approver_id=approver_id,
            count=count,
        )
        return count

    except (InvalidApprovalTokenError, TokenReplayError, DuplicateApproverError):
        raise
    except Exception as exc:
        logger.exception("awooop_approval_redis_error", error=str(exc))
        raise InvalidApprovalTokenError(f"Redis 錯誤: {exc}") from exc
|
||||
|
||||
|
||||
async def check_approval_quorum(
    *,
    project_id: str,
    run_id: str,
    tool_name: str,
    required_count: int = 1,
) -> bool:
    """
    Verify that enough distinct approvers have signed.

    Returns:
        ``True`` when the approver set holds at least *required_count* members.

    Raises:
        QuorumNotMetError: too few approvals, or the Redis lookup failed
            (fail-closed).
    """
    try:
        client = get_redis()
        approvals = int(
            await client.scard(f"{_SIG_SET_PREFIX}{project_id}:{run_id}:{tool_name}")
        )

        if not (approvals < required_count):
            return True

        raise QuorumNotMetError(f"簽核數不足({approvals}/{required_count})")
    except QuorumNotMetError:
        raise
    except Exception as exc:
        raise QuorumNotMetError(f"Redis 查詢失敗: {exc}") from exc
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Suggest Mode
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class SuggestedAction:
    """Result of a suggest-mode dry run (nothing is actually executed)."""

    action_type: str  # 'rollback' | 'scale' | 'restart'
    target: str  # name interpolated into the suggested kubectl command
    suggested_command: str  # command text proposed to the operator
    # Notes / verification hints to review before acting on the suggestion.
    rollback_evidence: dict[str, Any] = field(default_factory=dict)
    dry_run: bool = True
    approval_required: bool = True
|
||||
|
||||
|
||||
def is_suggest_mode_enabled() -> bool:
    """Report whether the suggest-mode feature flag is on.

    The flag is read from the environment variable named by
    ``_SUGGEST_MODE_ENV``; "true", "1" and "yes" (case-insensitive) enable it.
    """
    flag_value = os.environ.get(_SUGGEST_MODE_ENV, "")
    return flag_value.lower() in ("true", "1", "yes")
|
||||
|
||||
|
||||
async def build_suggest_action(
    action_type: str,
    *,
    target: str,
    run_id: str,
    project_id: str,
) -> SuggestedAction:
    """
    Suggest mode: build a dry-run suggestion without executing anything.

    Supports three SRE flows: ``rollback``, ``scale`` and ``restart``.

    Raises:
        ValueError: *action_type* is not one of the supported flows.

    NOTE(review): *target* is interpolated verbatim into the command text;
    validate or quote it before the command is ever run by a shell.
    """
    supported_flows = ("rollback", "scale", "restart")
    if action_type not in supported_flows:
        raise ValueError(f"不支援的 action_type: {action_type!r}")

    evidence: dict[str, Any]
    if action_type == "restart":
        command = f"kubectl rollout restart deployment/{target}"
        evidence = {
            "note": f"需確認 deployment/{target} 當前 pod 狀態",
            "suggested_verification": f"kubectl get pods -l app={target}",
        }
    elif action_type == "scale":
        command = f"kubectl scale deployment/{target} --replicas=<N>"
        evidence = {
            "note": f"需確認 deployment/{target} 當前 replicas 數量",
            "suggested_verification": f"kubectl get deployment/{target} -o json | jq .spec.replicas",
        }
    else:  # rollback
        command = f"kubectl rollout undo deployment/{target}"
        evidence = {
            "note": f"需確認 deployment/{target} 當前 image 與 rollout history",
            "suggested_verification": f"kubectl rollout history deployment/{target}",
        }

    logger.info(
        "suggest_action_built",
        project_id=project_id,
        run_id=run_id,
        action_type=action_type,
        target=target,
    )

    suggestion = SuggestedAction(
        action_type=action_type,
        target=target,
        suggested_command=command,
        rollback_evidence=evidence,
    )
    return suggestion
|
||||
378
apps/api/src/services/budget_service.py
Normal file
378
apps/api/src/services/budget_service.py
Normal file
@@ -0,0 +1,378 @@
|
||||
"""AwoooP Token Budget Hard Kill Service
|
||||
=======================================
|
||||
ADR-120: 三層 Hard Kill 防護架構
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(Phase 2.6)
|
||||
|
||||
防線:
|
||||
1. Pre-call check(呼叫前)— Layer 1 Tenant + Layer 2 Platform + Layer 3 Emergency Kill
|
||||
2. Post-call accounting(呼叫後)— 寫 budget_ledger + 更新 Redis cache
|
||||
3. 告警閾值通知(80% / 95% Telegram 告警)
|
||||
|
||||
注意:Layer 0 Run budget 需要 awooop_run_state(Phase 3 SAGA 實作後補加)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from decimal import Decimal
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 告警閾值(ADR-120 D4)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
BUDGET_ALERT_THRESHOLDS = {
|
||||
"warn": Decimal("0.80"),
|
||||
"critical": Decimal("0.95"),
|
||||
"hard_kill": Decimal("1.00"),
|
||||
}
|
||||
|
||||
# Redis key 前綴
|
||||
_EMERGENCY_KILL_KEY = "platform:budget:emergency_kill"
|
||||
_TENANT_BUDGET_KEY_PREFIX = "budget:tenant:" # {project_id}:daily_used_usd
|
||||
_PLATFORM_BUDGET_KEY = "budget:platform:daily_used_usd"
|
||||
_BUDGET_CACHE_TTL = 300 # 5 分鐘,每次寫入後 refresh
|
||||
|
||||
|
||||
class BudgetExhaustedError(Exception):
    """Raised when an LLM call is blocked by a budget hard-kill check.

    ``error_code`` carries the machine-readable code; ``str(exc)`` renders as
    ``"[<error_code>] <message>"``.
    """

    def __init__(self, error_code: str, message: str) -> None:
        rendered = f"[{error_code}] {message}"
        super().__init__(rendered)
        self.error_code = error_code
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 費用計算(按模型定價估算)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Price table in USD per 1M tokens, one (prompt, completion) pair per model.
_COST_PER_MILLION_TOKENS: dict[str, tuple[float, float]] = {
    # (prompt_per_M, completion_per_M)
    "claude-opus-4-7": (15.0, 75.0),
    "claude-sonnet-4-6": (3.0, 15.0),
    "claude-haiku-4-5": (0.8, 4.0),
    "gpt-4o": (5.0, 15.0),
    "gpt-4o-mini": (0.15, 0.6),
    "gemini-2.0-flash": (0.075, 0.3),
    "deepseek-r1:14b": (0.0, 0.0),  # local Ollama - no cost
    "qwen3:8b": (0.0, 0.0),  # local Ollama - no cost
}
# Rates used for models missing from the table (same as claude-sonnet above).
_DEFAULT_COST_PER_M = (3.0, 15.0)  # fallback → claude-sonnet
|
||||
|
||||
|
||||
def estimate_cost(
    prompt_tokens: int,
    completion_tokens: int,
    model: str,
) -> Decimal:
    """Estimate the USD cost of one LLM call.

    Rates come from ``_COST_PER_MILLION_TOKENS``; unknown models fall back to
    ``_DEFAULT_COST_PER_M``. The result is rounded to six decimal places.
    """
    rates = _COST_PER_MILLION_TOKENS.get(model, _DEFAULT_COST_PER_M)
    prompt_rate, completion_rate = rates

    prompt_cost = prompt_tokens / 1_000_000 * prompt_rate
    completion_cost = completion_tokens / 1_000_000 * completion_rate
    total = prompt_cost + completion_cost

    # Go through str() so the Decimal holds the rounded decimal text rather
    # than the float's exact binary expansion.
    return Decimal(str(round(total, 6)))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Pre-call Budget Check(ADR-120 D2 防線 1)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def check_budget_before_llm_call(
    project_id: str,
    model: str,
    estimated_prompt_tokens: int = 4000,
    *,
    agent_id: str | None = None,
) -> None:
    """
    Budget gate to run before an LLM call.

    Checks, in order: the emergency kill switch, the tenant budget, then the
    platform budget. Exceeding any of them raises ``BudgetExhaustedError``,
    which is meant to stop the API call. The individual checks fail open: when
    Redis is unavailable they log a warning and let the call proceed.

    Args:
        project_id: Tenant id.
        model: Model name, used for the cost estimate.
        estimated_prompt_tokens: Estimated prompt token count. Per the
            original note, the caller is expected to have applied any
            conservative margin (x1.5) already.
        agent_id: Accepted but not used by this function.

    NOTE(review): "Layer 1/2" numbering for tenant vs. platform is not
    consistent across the comments in this file, so the checks are named
    here rather than numbered.
    """
    # Emergency kill switch has top priority and applies to every model.
    await check_emergency_kill()

    # Local Ollama models cost nothing, so the spend-based checks are skipped.
    is_free_local_model = (
        model in {"deepseek-r1:14b", "qwen3:8b"} or model.startswith("ollama/")
    )
    if is_free_local_model:
        return

    # Only prompt tokens are known before the call; completion counts as 0.
    projected_cost = estimate_cost(estimated_prompt_tokens, 0, model)

    await _check_tenant_budget(project_id, projected_cost)
    await _check_platform_budget(projected_cost)
|
||||
|
||||
|
||||
async def check_emergency_kill() -> None:
    """Emergency kill switch check, keyed on ``_EMERGENCY_KILL_KEY`` in Redis.

    Raises:
        BudgetExhaustedError: (E-BUDGET-004) the kill-switch key exists.

    Any other failure (e.g. Redis unreachable) is logged and ignored, so the
    check fails open.
    """
    try:
        from src.core.redis_client import get_redis

        kill_switch_on = await get_redis().exists(_EMERGENCY_KILL_KEY)
    except Exception as exc:
        logger.warning("budget_emergency_kill_check_failed", error=str(exc))
        return

    if kill_switch_on:
        raise BudgetExhaustedError(
            "E-BUDGET-004",
            "Emergency kill switch activated — contact platform admin",
        )
|
||||
|
||||
|
||||
async def _check_tenant_budget(project_id: str, estimated_cost: Decimal) -> None:
    """Tenant budget check (Redis running total vs. the tenant's limit).

    Reads the tenant's accumulated spend from Redis and its limit via
    ``_get_tenant_budget_limit``. A ``None`` limit means "no cap".

    Raises:
        BudgetExhaustedError: (E-BUDGET-002) used + estimated cost would
            exceed the tenant limit.

    Emits ``budget_tenant_critical`` / ``budget_tenant_warn`` when projected
    usage crosses the thresholds in ``BUDGET_ALERT_THRESHOLDS``. Any other
    error is logged as ``budget_tenant_check_failed`` and ignored (fail-open).
    """
    try:
        from src.core.redis_client import get_redis

        redis = get_redis()

        # Accumulated spend; a missing key counts as zero.
        cache_key = f"{_TENANT_BUDGET_KEY_PREFIX}{project_id}"
        used_raw = await redis.get(cache_key)
        used_usd = Decimal(used_raw.decode() if isinstance(used_raw, bytes) else used_raw or "0")

        limit_usd = await _get_tenant_budget_limit(project_id)
        if limit_usd is None:
            return  # no cap configured

        projected_usd = used_usd + estimated_cost
        if projected_usd > limit_usd:
            raise BudgetExhaustedError(
                "E-BUDGET-002",
                f"Tenant {project_id} budget exhausted: "
                f"used ${used_usd:.4f} / ${limit_usd:.4f}",
            )

        # A zero limit with zero projected spend has no meaningful ratio.
        # Dividing would raise and be reported as a spurious check failure.
        if limit_usd <= 0:
            return

        usage_pct = projected_usd / limit_usd
        if usage_pct >= BUDGET_ALERT_THRESHOLDS["critical"]:
            alert_event = "budget_tenant_critical"
        elif usage_pct >= BUDGET_ALERT_THRESHOLDS["warn"]:
            alert_event = "budget_tenant_warn"
        else:
            alert_event = None

        if alert_event is not None:
            logger.warning(
                alert_event,
                project_id=project_id,
                usage_pct=float(usage_pct),
                used_usd=float(used_usd),
                limit_usd=float(limit_usd),
            )

    except BudgetExhaustedError:
        raise
    except Exception as exc:
        logger.warning("budget_tenant_check_failed", project_id=project_id, error=str(exc))
|
||||
|
||||
|
||||
async def _check_platform_budget(estimated_cost: Decimal) -> None:
    """Platform budget check (static limit from settings vs. Redis total).

    Does nothing when ``settings.PLATFORM_DAILY_BUDGET_USD`` is missing or
    falsy.

    Raises:
        BudgetExhaustedError: (E-BUDGET-003) used + estimated cost would
            exceed the platform limit.

    Any other error is logged as ``budget_platform_check_failed`` and ignored
    (fail-open).
    """
    configured_limit = getattr(settings, "PLATFORM_DAILY_BUDGET_USD", None)
    if not configured_limit:
        return  # not configured: allow

    try:
        from src.core.redis_client import get_redis

        raw_total = await get_redis().get(_PLATFORM_BUDGET_KEY)
        if isinstance(raw_total, bytes):
            total_text = raw_total.decode()
        else:
            total_text = raw_total or "0"
        used_usd = Decimal(total_text)
        limit_usd = Decimal(str(configured_limit))

        within_budget = not (used_usd + estimated_cost > limit_usd)
        if within_budget:
            return

        raise BudgetExhaustedError(
            "E-BUDGET-003",
            f"Platform budget exhausted: used ${used_usd:.4f} / ${limit_usd:.4f} — "
            "all LLM calls suspended",
        )
    except BudgetExhaustedError:
        raise
    except Exception as exc:
        logger.warning("budget_platform_check_failed", error=str(exc))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Post-call Accounting(ADR-120 D2 防線 2)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Strong references to in-flight accounting tasks. The event loop keeps only
# weak references, so an un-owned fire-and-forget task may be
# garbage-collected before it completes.
_PENDING_BUDGET_TASKS: set[object] = set()


async def record_token_usage(
    *,
    project_id: str,
    model: str,
    provider: str,
    prompt_tokens: int,
    completion_tokens: int,
    agent_id: str | None = None,
    run_id: str | None = None,
) -> Decimal:
    """
    Account for a finished LLM call.

    1. Compute the cost from the token counts.
    2. Schedule the ``budget_ledger`` INSERT (background task).
    3. Schedule the Redis budget-cache update (background task).

    Threshold notifications are not triggered from this function.

    A malformed *run_id* is logged and recorded as NULL instead of raising,
    so a bad identifier cannot prevent the spend from being accounted.

    Returns:
        The computed cost in USD.
    """
    import asyncio
    from uuid import UUID

    actual_cost = estimate_cost(prompt_tokens, completion_tokens, model)

    run_uuid = None
    if run_id:
        try:
            run_uuid = run_id if isinstance(run_id, UUID) else UUID(str(run_id))
        except ValueError:
            logger.warning(
                "budget_run_id_invalid",
                project_id=project_id,
                run_id=str(run_id),
            )

    def _spawn(coro, task_name: str) -> None:
        # Keep a reference until the task is done, then release it.
        task = asyncio.create_task(coro, name=task_name)
        _PENDING_BUDGET_TASKS.add(task)
        task.add_done_callback(_PENDING_BUDGET_TASKS.discard)

    # Ledger write (non-blocking).
    _spawn(
        _write_budget_ledger(
            project_id=project_id,
            agent_id=agent_id,
            run_id=run_uuid,
            model=model,
            provider=provider,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            cost_usd=actual_cost,
        ),
        "budget_ledger_write",
    )

    # Redis cache update (non-blocking).
    _spawn(_update_budget_cache(project_id, actual_cost), "budget_cache_update")

    logger.info(
        "token_usage_recorded",
        project_id=project_id,
        model=model,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        cost_usd=float(actual_cost),
    )
    return actual_cost
|
||||
|
||||
|
||||
async def _write_budget_ledger(
    *,
    project_id: str,
    agent_id: str | None,
    run_id,  # UUID | None
    model: str,
    provider: str,
    prompt_tokens: int,
    completion_tokens: int,
    cost_usd: Decimal,
) -> None:
    """Insert one row into ``budget_ledger``.

    Per the original note ("leWOOOgo"), DB writes belong in the service layer
    rather than the router. Best-effort: any failure is logged as
    ``budget_ledger_write_failed`` and swallowed.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        insert_stmt = text("""
            INSERT INTO budget_ledger
                (project_id, agent_id, run_id, model, provider,
                 prompt_tokens, completion_tokens, cost_usd)
            VALUES
                (:project_id, :agent_id, :run_id, :model, :provider,
                 :prompt_tokens, :completion_tokens, :cost_usd)
        """)
        row_values = {
            "project_id": project_id,
            "agent_id": agent_id,
            "run_id": run_id,
            "model": model,
            "provider": provider,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "cost_usd": cost_usd,
        }

        async with get_db_context(project_id) as db:
            await db.execute(insert_stmt, row_values)
    except Exception as exc:
        logger.warning("budget_ledger_write_failed", project_id=project_id, error=str(exc))
|
||||
|
||||
|
||||
async def _update_budget_cache(project_id: str, cost: Decimal) -> None:
    """Add *cost* to the tenant and platform spend counters in Redis.

    Each counter is bumped with INCRBYFLOAT. Its 24h expiry is set only when
    the key has no TTL yet, so the counter resets 24 hours after the first
    spend of the window. Re-arming the TTL on every write would keep a busy
    counter alive forever, and the "daily" total would never reset.

    Best-effort: any failure is logged as ``budget_cache_update_failed``.
    """
    try:
        from src.core.redis_client import get_redis

        redis = get_redis()
        cost_f = float(cost)
        window_seconds = 86400  # 24h accounting window

        tenant_key = f"{_TENANT_BUDGET_KEY_PREFIX}{project_id}"
        for counter_key in (tenant_key, _PLATFORM_BUDGET_KEY):
            await redis.incrbyfloat(counter_key, cost_f)
            # TTL is negative when the key has no expiry (or does not exist).
            if await redis.ttl(counter_key) < 0:
                await redis.expire(counter_key, window_seconds)

    except Exception as exc:
        logger.warning("budget_cache_update_failed", project_id=project_id, error=str(exc))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Helper:從 DB 讀取 Tenant budget limit
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _get_tenant_budget_limit(project_id: str) -> Decimal | None:
    """Read the tenant's limit from ``awooop_projects.budget_limit_usd``.

    Returns:
        The limit as a ``Decimal``, or ``None`` when the column is NULL, no
        row matches, or the lookup fails (all three mean "no cap" to callers).

    NOTE(review): ``get_db_context()`` is called without ``project_id`` here,
    unlike the other DB helpers in this module — confirm that is intended.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        query = text("SELECT budget_limit_usd FROM awooop_projects WHERE project_id = :pid")
        async with get_db_context() as db:
            cursor = await db.execute(query, {"pid": project_id})
            stored_limit = cursor.scalar_one_or_none()
            if stored_limit is None:
                return None
            return Decimal(str(stored_limit))
    except Exception as exc:
        logger.warning("get_tenant_budget_limit_failed", project_id=project_id, error=str(exc))
        return None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Emergency Kill Switch 管理(Admin 工具)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def activate_emergency_kill(reason: str = "") -> None:
    """Turn on the emergency stop by setting ``_EMERGENCY_KILL_KEY`` in Redis.

    The key stores *reason* (or "activated" when empty) and expires after
    seven days. Redis errors propagate to the caller.
    """
    from src.core.redis_client import get_redis

    marker_value = reason or "activated"
    seven_days = 86400 * 7
    client = get_redis()
    await client.set(_EMERGENCY_KILL_KEY, marker_value, ex=seven_days)
    logger.warning("budget_emergency_kill_activated", reason=reason)
|
||||
|
||||
|
||||
async def deactivate_emergency_kill() -> None:
    """Lift the emergency stop by deleting the kill-switch key from Redis.

    Redis errors propagate to the caller.
    """
    from src.core.redis_client import get_redis
    redis = get_redis()
    await redis.delete(_EMERGENCY_KILL_KEY)
    logger.info("budget_emergency_kill_deactivated")
|
||||
|
||||
|
||||
async def is_emergency_kill_active() -> bool:
    """Report whether the emergency kill-switch key currently exists.

    Returns:
        ``True`` when the key exists; ``False`` when it does not or when the
        lookup fails. A failed lookup is logged so that an unreachable Redis
        is not silently reported as "inactive".
    """
    try:
        from src.core.redis_client import get_redis

        redis = get_redis()
        return bool(await redis.exists(_EMERGENCY_KILL_KEY))
    except Exception as exc:
        logger.warning("budget_emergency_kill_status_check_failed", error=str(exc))
        return False
|
||||
@@ -280,6 +280,7 @@ async def dispatch_action(
|
||||
|
||||
# MCP registry dispatch
|
||||
from src.plugins.mcp.registry import get_provider
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
provider_name = _resolve_provider_name(spec.mcp_provider)
|
||||
provider = get_provider(provider_name)
|
||||
if not provider:
|
||||
@@ -293,8 +294,16 @@ async def dispatch_action(
|
||||
)
|
||||
|
||||
# 執行 MCP tool with timeout
|
||||
audited_params = with_mcp_audit_context(
|
||||
resolved_params,
|
||||
session_id=f"callback:{incident_id}:{action_name}",
|
||||
incident_id=incident_id,
|
||||
flywheel_node="operate",
|
||||
agent_role="telegram_callback_dispatcher",
|
||||
operator_user_id=user_id,
|
||||
)
|
||||
mcp_result = await asyncio.wait_for(
|
||||
provider.execute(spec.mcp_tool, resolved_params),
|
||||
provider.execute(spec.mcp_tool, audited_params),
|
||||
timeout=float(spec.timeout_sec),
|
||||
)
|
||||
|
||||
|
||||
746
apps/api/src/services/channel_hub.py
Normal file
746
apps/api/src/services/channel_hub.py
Normal file
@@ -0,0 +1,746 @@
|
||||
"""
|
||||
Channel Hub — AwoooP 入站事件統一路由 + Progressive Feedback Policy
|
||||
====================================================================
|
||||
AwoooP Phase 7: ADR-106(channel_event family)
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
功能:
|
||||
1. Telegram 入站事件鏡像(記錄到 awooop_conversation_event)
|
||||
2. 建立 platform run(呼叫 platform_runtime.create_run)
|
||||
3. Progressive Feedback Policy:
|
||||
- run 進入 WAITING_TOOL 狀態 → 30 秒後若未 complete → 發 interim Telegram 訊息
|
||||
- 訊息記錄到 awooop_outbound_message
|
||||
4. Shadow Mode:不發任何 Telegram 訊息(只記錄到 outbound_message, status='shadow')
|
||||
|
||||
Progressive Feedback Policy 設計(ADR-106 P2-03):
|
||||
- 用 asyncio.create_task 啟動 30s 計時器
|
||||
- 30s 後查詢 run state:若仍在 WAITING_TOOL → 發 interim 訊息
|
||||
- interim 訊息:「AI 正在分析中,請稍候...」(不洩漏 run 細節)
|
||||
- Final reply 由 shadow_execute() 完成後觸發(Phase 8 實作)
|
||||
|
||||
與 legacy telegram_gateway.py 的關係:
|
||||
- 完全獨立,不修改 legacy gateway
|
||||
- legacy 繼續處理 legacy flow(signal_worker 觸發的 approval/notification)
|
||||
- AwoooP run 只走本模組
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import html
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
from uuid import NAMESPACE_URL, UUID, uuid5
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.db.awooop_models import AwoooPRunState
|
||||
from src.services.audit_sink import _redact_string
|
||||
from src.services.platform_runtime import create_run
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# Progressive Feedback Policy:等待超過此秒數才發 interim 訊息
|
||||
_INTERIM_WAIT_SECONDS = 30
|
||||
|
||||
|
||||
def _input_sha256(input_payload: dict[str, Any] | None) -> str | None:
|
||||
"""計算 Run input 的穩定 hash,讓 mirror run 也能保留最小完整性證據。"""
|
||||
if not input_payload:
|
||||
return None
|
||||
canonical = json.dumps(
|
||||
input_payload,
|
||||
sort_keys=True,
|
||||
separators=(",", ":"),
|
||||
ensure_ascii=False,
|
||||
)
|
||||
return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
async def ensure_completed_shadow_run(
    db: AsyncSession,
    *,
    project_id: str,
    run_id: UUID,
    agent_id: str,
    trigger_type: str,
    trigger_ref: str | None,
    input_payload: dict[str, Any] | None = None,
) -> bool:
    """Insert an already-completed shadow run row for mirrored legacy data.

    Per the original author's note: during the strangler phase AwoooP mirrors
    legacy Telegram / alert-grouping data. Those events must not re-trigger
    the runtime, but the Console needs a run_state row as its aggregation
    anchor, so the row is written as a completed shadow run that workers will
    not pick up.

    The insert is idempotent: an existing ``run_id`` is left untouched.

    Args:
        db: Session used to execute the insert.
        project_id: Project the run belongs to.
        run_id: Primary key of the run row.
        agent_id: Agent recorded on the run.
        trigger_type: Trigger type recorded on the run.
        trigger_ref: Optional trigger reference recorded on the run.
        input_payload: Optional payload; only its SHA-256 digest is stored.

    Returns:
        True when a new row was inserted, False when the run already existed.
    """
    insert_stmt = text("""
        INSERT INTO awooop_run_state (
            run_id, project_id, agent_id, state,
            trigger_type, trigger_ref, is_shadow,
            input_sha256, created_at, completed_at, timeout_at
        ) VALUES (
            :run_id, :project_id, :agent_id, 'completed',
            :trigger_type, :trigger_ref, TRUE,
            :input_sha256, NOW(), NOW(), NOW()
        )
        ON CONFLICT (run_id) DO NOTHING
        RETURNING run_id
    """)
    bind_params = {
        "run_id": run_id,
        "project_id": project_id,
        "agent_id": agent_id,
        "trigger_type": trigger_type,
        "trigger_ref": trigger_ref,
        "input_sha256": _input_sha256(input_payload),
    }

    outcome = await db.execute(insert_stmt, bind_params)
    # RETURNING yields a row only when the insert actually happened.
    if outcome.fetchone() is None:
        return False

    logger.info(
        "completed_shadow_run_created",
        project_id=project_id,
        run_id=str(run_id),
        agent_id=agent_id,
        trigger_type=trigger_type,
    )
    return True
|
||||
|
||||
|
||||
def build_grouped_alert_run_id(project_id: str, provider_event_id: str) -> UUID:
    """Derive a stable run_id for a grouped child alert.

    The same (project_id, provider_event_id) pair always maps to the same
    UUIDv5, so the run can be looked up again later.
    """
    seed = f"awooop:grouped-alert:{project_id}:{provider_event_id}"
    return uuid5(NAMESPACE_URL, seed)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 入站事件記錄
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def mirror_inbound_event(
    db: AsyncSession,
    *,
    project_id: str,
    channel_type: str,
    provider_event_id: str,
    platform_subject_id: str | None = None,
    channel_user_id: str | None = None,
    channel_chat_id: str | None = None,
    content_type: str = "text",
    raw_content: str | None = None,
    attachment_sha256: str | None = None,
    provider_ts: datetime | None = None,
    run_id: UUID | None = None,
    is_duplicate: bool = False,
) -> UUID:
    """Record an inbound channel event in awooop_conversation_event.

    ``raw_content`` is never stored in clear text: only its SHA-256 digest and
    a redacted preview of at most 256 characters are written.

    The statement is an upsert on (project_id, channel_type,
    provider_event_id). On conflict the existing row is flagged as a duplicate
    and its run_id is replaced only when a non-null run_id is supplied.

    Returns:
        The event_id of the inserted or updated row.
    """
    if raw_content is None:
        content_hash: str | None = None
        content_preview: str | None = None
    else:
        content_hash = hashlib.sha256(raw_content.encode()).hexdigest()
        # Redact first, then keep at most the first 256 characters.
        content_preview = _redact_string(raw_content)[:256]

    upsert_stmt = text("""
        INSERT INTO awooop_conversation_event (
            project_id, channel_type, provider_event_id,
            platform_subject_id, channel_user_id, channel_chat_id,
            run_id, content_type, content_hash, content_preview,
            attachment_sha256, is_duplicate, provider_ts, received_at
        ) VALUES (
            :project_id, :channel_type, :provider_event_id,
            :platform_subject_id, :channel_user_id, :channel_chat_id,
            :run_id, :content_type, :content_hash, :content_preview,
            :attachment_sha256, :is_duplicate, :provider_ts, NOW()
        )
        ON CONFLICT (project_id, channel_type, provider_event_id) DO UPDATE SET
            is_duplicate = TRUE,
            run_id = COALESCE(EXCLUDED.run_id, awooop_conversation_event.run_id)
        RETURNING event_id
    """)
    bind_params = {
        "project_id": project_id,
        "channel_type": channel_type,
        "provider_event_id": provider_event_id,
        "platform_subject_id": platform_subject_id,
        "channel_user_id": channel_user_id,
        "channel_chat_id": channel_chat_id,
        "run_id": run_id,
        "content_type": content_type,
        "content_hash": content_hash,
        "content_preview": content_preview,
        "attachment_sha256": attachment_sha256,
        "is_duplicate": is_duplicate,
        "provider_ts": provider_ts,
    }

    returned = (await db.execute(upsert_stmt, bind_params)).fetchone()
    event_id: UUID = returned[0]

    # NOTE: the logged flag is the caller-supplied value, not the upsert outcome.
    logger.info(
        "channel_event_mirrored",
        project_id=project_id,
        channel_type=channel_type,
        event_id=str(event_id),
        is_duplicate=is_duplicate,
    )
    return event_id
|
||||
|
||||
|
||||
def build_grouped_alert_provider_event_id(alert_id: str, fingerprint: str) -> str:
    """Build the idempotent provider_event_id of a grouped child alert.

    Both parts are stringified and stripped. A blank alert id becomes
    ``"unknown"``; the fingerprint is cut to 32 characters and a blank one
    becomes ``"no-fingerprint"``.
    """
    alert_part = str(alert_id).strip()
    if not alert_part:
        alert_part = "unknown"

    fingerprint_part = str(fingerprint).strip()[:32]
    if not fingerprint_part:
        fingerprint_part = "no-fingerprint"

    return ":".join(("alert-group", alert_part, fingerprint_part))
|
||||
|
||||
|
||||
def format_grouped_alert_event_content(
|
||||
*,
|
||||
alert_id: str,
|
||||
alertname: str,
|
||||
severity: str,
|
||||
namespace: str,
|
||||
target_resource: str,
|
||||
group_key: str,
|
||||
count: int,
|
||||
parent_fingerprint: str | None,
|
||||
fingerprint: str,
|
||||
) -> str:
|
||||
"""格式化只落 AwoooP、不發 Telegram 的告警收斂事件摘要。"""
|
||||
parent = parent_fingerprint or "-"
|
||||
target = target_resource or "-"
|
||||
ns = namespace or "default"
|
||||
return "\n".join(
|
||||
[
|
||||
"告警已收斂,不發 Telegram",
|
||||
f"Alert ID: {alert_id}",
|
||||
f"Alert: {alertname}",
|
||||
f"Severity: {severity}",
|
||||
f"Namespace: {ns}",
|
||||
f"Target: {target}",
|
||||
f"Group: {group_key}",
|
||||
f"Group Count: {count}",
|
||||
f"Parent Fingerprint: {parent}",
|
||||
f"Child Fingerprint: {fingerprint}",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def format_grouped_alert_digest_text(
    *,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    group_key: str,
    count: int,
) -> str:
    """Format the short HTML digest that is replied onto the parent alert card.

    Every text field is HTML-escaped. Falsy values fall back to ``"default"``
    for the namespace and ``"unknown"`` for all other fields.
    """

    def _code(value: str, fallback: str) -> str:
        # Wrap the escaped value (or its fallback) in a <code> element.
        return f"<code>{html.escape(value or fallback)}</code>"

    rows = [
        "🧩 <b>告警已收斂到父卡</b>",
        f"├ 類型:{_code(alertname, 'unknown')}",
        f"├ 等級:{_code(severity, 'unknown')}",
        f"├ 範圍:{_code(namespace, 'default')}",
        f"├ 最新目標:{_code(target_resource, 'unknown')}",
        f"├ 群組:{_code(group_key, 'unknown')}",
        f"└ 目前視窗:<b>{count}</b> 筆同組告警",
        "",
        "完整子告警請看 AwoooP Run 監控,不再逐筆發 Telegram。",
    ]
    return "\n".join(rows)
|
||||
|
||||
|
||||
async def maybe_send_grouped_alert_digest(
    *,
    project_id: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    group_key: str,
    count: int,
    parent_fingerprint: str | None,
) -> bool:
    """Reply a digest onto the parent alert card when that card exists.

    Looks up the newest ApprovalRecord carrying ``parent_fingerprint`` and a
    non-null incident_id, then asks the Telegram gateway to append the digest
    to that incident. Degrades quietly: any failure is logged and reported as
    False.

    Returns:
        The gateway's result when a digest was attempted; False when there is
        no parent fingerprint, no parent card yet, or an error occurred.
    """
    if not parent_fingerprint:
        return False

    try:
        from sqlalchemy import select

        from src.db.base import get_db_context
        from src.db.models import ApprovalRecord
        from src.services.telegram_gateway import get_telegram_gateway

        parent_lookup = (
            select(ApprovalRecord.incident_id)
            .where(ApprovalRecord.fingerprint == parent_fingerprint)
            .where(ApprovalRecord.incident_id.is_not(None))
            .order_by(ApprovalRecord.created_at.desc())
            .limit(1)
        )
        async with get_db_context(project_id) as db:
            incident_id = (await db.execute(parent_lookup)).scalar_one_or_none()

        if not incident_id:
            # The parent card has not been created yet; nothing to reply to.
            logger.info(
                "grouped_alert_digest_parent_not_ready",
                project_id=project_id,
                group_key=group_key,
                parent_fingerprint=parent_fingerprint,
            )
            return False

        gateway = get_telegram_gateway()
        sent = await gateway.append_grouped_alert_digest(
            incident_id=str(incident_id),
            group_key=group_key,
            digest_text=format_grouped_alert_digest_text(
                alertname=alertname,
                severity=severity,
                namespace=namespace,
                target_resource=target_resource,
                group_key=group_key,
                count=count,
            ),
        )
        logger.info(
            "grouped_alert_digest_result",
            project_id=project_id,
            incident_id=str(incident_id),
            group_key=group_key,
            count=count,
            sent=sent,
        )
        return sent
    except Exception as exc:
        # Best effort by design: a digest failure must not surface to the caller.
        logger.warning(
            "grouped_alert_digest_failed",
            project_id=project_id,
            group_key=group_key,
            parent_fingerprint=parent_fingerprint,
            error=str(exc),
        )
        return False
|
||||
|
||||
|
||||
async def record_grouped_alert_event(
    *,
    project_id: str,
    alert_id: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    group_key: str,
    count: int,
    parent_fingerprint: str | None,
    fingerprint: str,
) -> UUID | None:
    """Persist a grouped (suppressed) child alert as an AwoooP conversation event.

    Per the original author's note, this path deliberately sends no per-alert
    Telegram message and keeps only operator-facing context:
    - the chat group is not flooded
    - the Console can still see that alerts of the group keep firing
    - failures are fail-open so the Alertmanager webhook ACK is unaffected

    A completed shadow run is ensured first so the event has a run to attach
    to; afterwards a digest onto the parent card is attempted.

    Returns:
        The event_id of the mirrored event, or None when anything failed.
    """
    try:
        from src.db.base import get_db_context

        provider_event_id = build_grouped_alert_provider_event_id(alert_id, fingerprint)
        summary = format_grouped_alert_event_content(
            alert_id=alert_id,
            alertname=alertname,
            severity=severity,
            namespace=namespace,
            target_resource=target_resource,
            group_key=group_key,
            count=count,
            parent_fingerprint=parent_fingerprint,
            fingerprint=fingerprint,
        )
        run_payload = {
            "alert_id": alert_id,
            "alertname": alertname,
            "severity": severity,
            "group_key": group_key,
            "fingerprint": fingerprint,
        }

        async with get_db_context(project_id) as db:
            run_id = build_grouped_alert_run_id(project_id, provider_event_id)
            await ensure_completed_shadow_run(
                db,
                project_id=project_id,
                run_id=run_id,
                agent_id="legacy-alert-grouping",
                trigger_type="grouped_alert_event",
                trigger_ref=provider_event_id,
                input_payload=run_payload,
            )
            event_id = await mirror_inbound_event(
                db,
                project_id=project_id,
                channel_type="internal",
                provider_event_id=provider_event_id,
                platform_subject_id="alertmanager",
                channel_user_id="alertmanager",
                channel_chat_id=f"alert-group:{group_key}",
                content_type="text",
                raw_content=summary,
                provider_ts=datetime.now(UTC),
                run_id=run_id,
            )

        logger.info(
            "grouped_alert_event_recorded",
            project_id=project_id,
            alert_id=alert_id,
            event_id=str(event_id),
            group_key=group_key,
            count=count,
        )
        await maybe_send_grouped_alert_digest(
            project_id=project_id,
            alertname=alertname,
            severity=severity,
            namespace=namespace,
            target_resource=target_resource,
            group_key=group_key,
            count=count,
            parent_fingerprint=parent_fingerprint,
        )
        return event_id
    except Exception as exc:
        # Fail open: recording problems must never break the webhook caller.
        logger.warning(
            "grouped_alert_event_record_failed",
            project_id=project_id,
            alert_id=alert_id,
            group_key=group_key,
            error=str(exc),
        )
        return None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 出站訊息記錄
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def record_outbound_message(
    db: AsyncSession,
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    message_type: str,  # 'interim' | 'final' | 'error' | 'approval_request'
    content: str | None = None,
    provider_message_id: str | None = None,
    send_status: str = "pending",
    conversation_event_id: UUID | None = None,
    triggered_by_state: str | None = None,
    waiting_since: datetime | None = None,
    is_shadow: bool = True,
) -> UUID:
    """Record an outbound message in awooop_outbound_message.

    Only the SHA-256 digest of ``content`` and a redacted preview of at most
    256 characters are stored. When ``is_shadow`` is true the stored status is
    forced to ``'shadow'`` (recorded only, not actually sent); otherwise
    ``send_status`` is stored as given.

    Before inserting, a completed shadow run is ensured for ``run_id`` so the
    message always has a run row to reference.

    Returns:
        The message_id of the inserted row.
    """
    if content is None:
        content_hash: str | None = None
        content_preview: str | None = None
    else:
        content_hash = hashlib.sha256(content.encode()).hexdigest()
        content_preview = _redact_string(content)[:256]

    actual_status = send_status if not is_shadow else "shadow"

    await ensure_completed_shadow_run(
        db,
        project_id=project_id,
        run_id=run_id,
        agent_id="legacy-telegram-gateway",
        trigger_type="legacy_outbound",
        trigger_ref=provider_message_id,
        input_payload={
            "channel_type": channel_type,
            "channel_chat_id": channel_chat_id,
            "message_type": message_type,
            "send_status": actual_status,
            "triggered_by_state": triggered_by_state,
        },
    )

    insert_stmt = text("""
        INSERT INTO awooop_outbound_message (
            project_id, run_id, conversation_event_id,
            channel_type, channel_chat_id, message_type,
            content_hash, content_preview, provider_message_id,
            send_status, queued_at,
            triggered_by_state, waiting_since
        ) VALUES (
            :project_id, :run_id, :conversation_event_id,
            :channel_type, :channel_chat_id, :message_type,
            :content_hash, :content_preview, :provider_message_id,
            :send_status, NOW(),
            :triggered_by_state, :waiting_since
        )
        RETURNING message_id
    """)
    bind_params = {
        "project_id": project_id,
        "run_id": run_id,
        "conversation_event_id": conversation_event_id,
        "channel_type": channel_type,
        "channel_chat_id": channel_chat_id,
        "message_type": message_type,
        "content_hash": content_hash,
        "content_preview": content_preview,
        "provider_message_id": provider_message_id,
        "send_status": actual_status,
        "triggered_by_state": triggered_by_state,
        "waiting_since": waiting_since,
    }

    returned = (await db.execute(insert_stmt, bind_params)).fetchone()
    message_id: UUID = returned[0]

    logger.info(
        "outbound_message_recorded",
        project_id=project_id,
        run_id=str(run_id),
        message_type=message_type,
        send_status=actual_status,
        message_id=str(message_id),
    )
    return message_id
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Progressive Feedback Policy
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Strong references to in-flight interim-feedback tasks. The event loop only
# keeps weak references to tasks, so a fire-and-forget task could otherwise be
# garbage-collected before its timer elapses.
_INTERIM_FEEDBACK_TASKS: set[asyncio.Task[None]] = set()


async def schedule_interim_feedback(
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    conversation_event_id: UUID | None = None,
    is_shadow: bool = True,
    wait_seconds: int = _INTERIM_WAIT_SECONDS,
) -> None:
    """Schedule the Progressive Feedback timer for a run.

    Starts a background task that waits ``wait_seconds`` and then emits an
    interim message if the run is still in the waiting_tool state. In shadow
    mode the message is only recorded (status 'shadow') and no Telegram
    message is sent.

    This coroutine returns as soon as the task is scheduled; it does not wait
    for the timer.

    Args:
        project_id: Project the run belongs to.
        run_id: Run to check after the wait.
        channel_type: Channel the interim message is recorded for.
        channel_chat_id: Chat the interim message targets.
        conversation_event_id: Optional inbound event the message relates to.
        is_shadow: When true, record only and never send.
        wait_seconds: Delay before the run state is checked.
    """
    task = asyncio.create_task(
        _interim_feedback_task(
            project_id=project_id,
            run_id=run_id,
            channel_type=channel_type,
            channel_chat_id=channel_chat_id,
            conversation_event_id=conversation_event_id,
            is_shadow=is_shadow,
            wait_seconds=wait_seconds,
        ),
        name=f"interim_feedback_{str(run_id)[:8]}",
    )
    # Hold the task until it finishes, then drop the reference automatically.
    _INTERIM_FEEDBACK_TASKS.add(task)
    task.add_done_callback(_INTERIM_FEEDBACK_TASKS.discard)
|
||||
|
||||
|
||||
async def _interim_feedback_task(
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    conversation_event_id: UUID | None,
    is_shadow: bool,
    wait_seconds: int,
) -> None:
    """Wait, then emit an interim message only if the run is still waiting_tool.

    After sleeping ``wait_seconds`` the run's state is read. A missing run is
    logged and ignored; a run in any state other than ``"waiting_tool"`` needs
    no interim message. Otherwise the message is recorded, and it is actually
    sent to Telegram only when neither the caller nor the run is in shadow
    mode. Errors after the sleep are logged and swallowed.
    """
    await asyncio.sleep(wait_seconds)

    try:
        from src.db.base import get_db_context

        state_query = select(AwoooPRunState.state, AwoooPRunState.is_shadow).where(
            AwoooPRunState.run_id == run_id,
            AwoooPRunState.project_id == project_id,
        )

        async with get_db_context(project_id) as db:
            row = (await db.execute(state_query)).first()

            if row is None:
                logger.warning(
                    "interim_feedback_run_not_found",
                    run_id=str(run_id),
                )
                return

            state, run_is_shadow = row
            if state != "waiting_tool":
                # The run already moved on (completed, failed, ...).
                return

            # Shadow mode applies when either the caller or the run requests it.
            effective_shadow = is_shadow or run_is_shadow
            interim_content = "AI 正在分析中,請稍候... ⏳"

            await record_outbound_message(
                db,
                project_id=project_id,
                run_id=run_id,
                channel_type=channel_type,
                channel_chat_id=channel_chat_id,
                message_type="interim",
                content=interim_content,
                send_status="pending",
                conversation_event_id=conversation_event_id,
                triggered_by_state="waiting_tool",
                waiting_since=datetime.now(UTC),
                is_shadow=effective_shadow,
            )

            if not effective_shadow:
                # Non-shadow: really deliver the Telegram message.
                await _send_telegram_interim(
                    channel_chat_id=channel_chat_id,
                    content=interim_content,
                    run_id=run_id,
                )

            logger.info(
                "interim_feedback_sent",
                project_id=project_id,
                run_id=str(run_id),
                is_shadow=effective_shadow,
            )

    except Exception as exc:
        logger.exception(
            "interim_feedback_task_error",
            run_id=str(run_id),
            error=str(exc),
        )
|
||||
|
||||
|
||||
async def _send_telegram_interim(
    *,
    channel_chat_id: str,
    content: str,
    run_id: UUID,
) -> None:
    """Send the interim message to Telegram (non-shadow runs only).

    Best effort: a missing ``TELEGRAM_BOT_TOKEN``, a non-success HTTP status
    and any exception are logged as warnings and never raised to the caller.

    Args:
        channel_chat_id: Telegram chat to post into.
        content: Message text, sent with ``parse_mode`` HTML.
        run_id: Run the message belongs to; used for log correlation only.
    """
    bot_token: str | None = None
    try:
        import os

        import httpx

        bot_token = os.environ.get("TELEGRAM_BOT_TOKEN")
        if not bot_token:
            logger.warning("interim_telegram_no_token", run_id=str(run_id))
            return

        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(
                f"https://api.telegram.org/bot{bot_token}/sendMessage",
                json={
                    "chat_id": channel_chat_id,
                    "text": content,
                    "parse_mode": "HTML",
                },
            )

        # A completed request is not a delivered message: surface rejections
        # (bad chat id, revoked token, rate limiting) instead of dropping them.
        if response.status_code >= 400:
            logger.warning(
                "interim_telegram_send_failed",
                run_id=str(run_id),
                error=f"HTTP {response.status_code}",
            )
    except Exception as exc:
        error_text = str(exc)
        if bot_token:
            # The token is part of the request URL, which may appear in the
            # exception text; keep it out of the logs.
            error_text = error_text.replace(bot_token, "***")
        logger.warning(
            "interim_telegram_send_failed",
            run_id=str(run_id),
            error=error_text,
        )
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Channel Hub 主入口(Telegram inbound)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def handle_telegram_inbound(
    db: AsyncSession,
    *,
    project_id: str,
    agent_id: str,
    message_id: str,
    user_id: str,
    chat_id: str,
    text: str | None = None,
    is_shadow: bool = True,
) -> dict[str, Any]:
    """Single entry point for an inbound Telegram message.

    Steps, in execution order:
    1. create_run — create the platform run (idempotent on the message).
    2. mirror_inbound_event — record the event, linked to that run.
    3. schedule_interim_feedback — start the Progressive Feedback timer,
       skipped for duplicates.

    Note: the ``text`` parameter shadows the module-level SQL ``text`` helper
    inside this function; the body does not use that helper.

    Returns:
        A dict with ``event_id`` and ``run_id`` as strings and the
        ``is_duplicate`` flag reported by create_run.
    """
    # Step 1: create the run first so the mirrored event can reference it.
    run_id, is_duplicate = await create_run(
        project_id=project_id,
        agent_id=agent_id,
        trigger_type="channel_event",
        trigger_ref=f"telegram:{message_id}",
        input_payload={"chat_id": chat_id, "user_id": user_id},
        channel_type="telegram",
        provider_event_id=message_id,
    )

    # Step 2: mirror the event; a message without text counts as a callback query.
    if text:
        inbound_content_type = "text"
    else:
        inbound_content_type = "callback_query"

    event_id = await mirror_inbound_event(
        db,
        project_id=project_id,
        channel_type="telegram",
        provider_event_id=message_id,
        channel_user_id=user_id,
        channel_chat_id=chat_id,
        content_type=inbound_content_type,
        raw_content=text,
        run_id=run_id,
        is_duplicate=is_duplicate,
    )

    # Step 3: only a first delivery starts the interim-feedback timer.
    if not is_duplicate:
        await schedule_interim_feedback(
            project_id=project_id,
            run_id=run_id,
            channel_type="telegram",
            channel_chat_id=chat_id,
            conversation_event_id=event_id,
            is_shadow=is_shadow,
        )

    summary: dict[str, Any] = {
        "event_id": str(event_id),
        "run_id": str(run_id),
        "is_duplicate": is_duplicate,
    }
    return summary
|
||||
@@ -5,23 +5,28 @@ Phase 21.5 初版: 2026-03-31 ogt
|
||||
Phase 22.6 重寫: 2026-04-03 ogt (老闆需求: 雙 AI 互動對話)
|
||||
Phase 22.7 更新: 2026-04-03 ogt (老闆指示: OpenClaw→Gemini, NemoClaw→Ollama llama3.2:3b)
|
||||
Phase 22.8 更新: 2026-04-09 ogt (老闆指示: NemoClaw→Ollama 111 deepseek-r1:14b,SRE 推理更強)
|
||||
Phase 33 更新: 2026-05-05 ogt (ADR-110: OpenClaw chat 改走 GCP-A Ollama interactive lane)
|
||||
|
||||
架構:
|
||||
- OpenClaw (Gemini API): SRE 首席顧問,精準分析
|
||||
- NemoClaw (Ollama 192.168.0.111 deepseek-r1:14b): 戰術參謀,深度推理
|
||||
- OpenClaw (Ollama GCP-A interactive lane): SRE 首席顧問,精準分析
|
||||
- NemoClaw (Ollama interactive lane deepseek-r1:14b): 戰術參謀,深度推理
|
||||
|
||||
費用控管:
|
||||
- Gemini Flash: Input $0.075/1M tokens, Output $0.30/1M tokens
|
||||
- NemoClaw: 免費 (本地 Ollama)
|
||||
- 每次回覆顯示 token 用量與費用
|
||||
- 月上限 $10 USD (由 ai_rate_limiter 控管)
|
||||
- OpenClaw/NemoClaw chat 預設免費 Ollama;Gemini 不再作為 ChatManager 預設路徑
|
||||
- 每次回覆顯示 token 用量
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
from src.utils.timezone import now_taipei
|
||||
from src.repositories.k8s_repository import get_k8s_repository
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.repositories.incident_repository import get_incident_repository
|
||||
from src.repositories.k8s_repository import get_k8s_repository
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -81,73 +86,49 @@ class ChatManager:
|
||||
|
||||
async def _call_openclaw(self, system_prompt: str, user_message: str) -> str | None:
|
||||
"""
|
||||
呼叫 OpenClaw 對話 — Gemini Flash API
|
||||
|
||||
2026-04-03 ogt: 老闆指示改用 Gemini,費用控管月上限 $10 USD
|
||||
每次回覆附帶 token 用量與費用統計
|
||||
呼叫 OpenClaw 對話 — Ollama interactive lane
|
||||
|
||||
2026-04-10 Claude Code: 強制合併 OPENCLAW_PERSONA,確保字數限制與格式規範
|
||||
2026-05-05 Codex: 改走 ADR-110 GCP-A/GCP-B/111 Ollama topology,避免個人聊天直打 Gemini
|
||||
"""
|
||||
# 強制在 system_prompt 前置 persona,確保 LLM 遵守字數與格式
|
||||
system_prompt = f"{OPENCLAW_PERSONA}\n{system_prompt}"
|
||||
import httpx
|
||||
from src.core.config import get_settings
|
||||
settings = get_settings()
|
||||
|
||||
api_key = settings.GEMINI_API_KEY
|
||||
if not api_key:
|
||||
logger.warning("openclaw_chat_failed", error="GEMINI_API_KEY not configured")
|
||||
return None
|
||||
|
||||
# 月費用上限檢查 ($10 USD)
|
||||
MONTHLY_LIMIT_USD = 10.0
|
||||
from src.core.redis_client import get_redis
|
||||
from src.utils.timezone import now_taipei
|
||||
redis = get_redis()
|
||||
month_key = f"gemini_cost:{now_taipei().strftime('%Y-%m')}"
|
||||
model = settings.OPENCLAW_DEFAULT_MODEL
|
||||
ollama_url = resolve_ollama_endpoint("interactive")
|
||||
try:
|
||||
current_cost = float(await redis.get(month_key) or 0)
|
||||
except Exception:
|
||||
current_cost = 0.0
|
||||
|
||||
if current_cost >= MONTHLY_LIMIT_USD:
|
||||
logger.warning("openclaw_gemini_monthly_limit_reached", current_usd=current_cost, limit_usd=MONTHLY_LIMIT_USD)
|
||||
return f"🔴 OpenClaw 本月 Gemini 用量已達上限 ${MONTHLY_LIMIT_USD} USD(已用 ${current_cost:.4f})"
|
||||
|
||||
# Gemini 2.0 Flash-Lite: 最便宜 (2026-04-03 老闆指示)
|
||||
model = "gemini-2.0-flash-lite"
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
async with httpx.AsyncClient(timeout=40.0) as client:
|
||||
resp = await client.post(
|
||||
f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent",
|
||||
headers={"x-goog-api-key": api_key},
|
||||
f"{ollama_url}/api/chat",
|
||||
json={
|
||||
"system_instruction": {"parts": [{"text": system_prompt}]},
|
||||
"contents": [{"parts": [{"text": user_message}]}],
|
||||
"generationConfig": {"maxOutputTokens": 300, "temperature": 0.7},
|
||||
"model": model,
|
||||
"stream": False,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_message},
|
||||
],
|
||||
"options": {"num_predict": 900, "temperature": 0.2},
|
||||
},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
text = data["candidates"][0]["content"]["parts"][0]["text"].strip()
|
||||
raw = data.get("message", {}).get("content", "").strip()
|
||||
text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip() or raw
|
||||
|
||||
# Token/費用統計 + 累計到 Redis
|
||||
usage = data.get("usageMetadata", {})
|
||||
in_tok = usage.get("promptTokenCount", 0)
|
||||
out_tok = usage.get("candidatesTokenCount", 0)
|
||||
# Gemini 2.0 Flash-Lite: Input $0.075/1M, Output $0.30/1M
|
||||
cost = (in_tok * 0.000000075) + (out_tok * 0.0000003)
|
||||
new_total = current_cost + cost
|
||||
eval_count = data.get("eval_count", 0)
|
||||
prompt_eval_count = data.get("prompt_eval_count", 0)
|
||||
total_tokens = eval_count + prompt_eval_count
|
||||
|
||||
try:
|
||||
await redis.set(month_key, str(round(new_total, 6)), ex=40 * 24 * 3600) # 40天 TTL
|
||||
except Exception:
|
||||
pass
|
||||
logger.info(
|
||||
"openclaw_ollama_chat_usage",
|
||||
model=model,
|
||||
endpoint=ollama_url,
|
||||
prompt_tokens=prompt_eval_count,
|
||||
output_tokens=eval_count,
|
||||
)
|
||||
|
||||
logger.info("openclaw_gemini_usage", in_tokens=in_tok, out_tokens=out_tok,
|
||||
cost_usd=round(cost, 6), monthly_total_usd=round(new_total, 4))
|
||||
|
||||
return f"{text}\n\n<i>📊 {in_tok+out_tok} tokens | ${cost:.4f} | 本月累計 ${new_total:.4f}</i>"
|
||||
return f"{text}\n\n<i>🦙 {model} | {total_tokens} tokens | 免費</i>"
|
||||
except Exception as e:
|
||||
logger.warning("openclaw_chat_failed", error=str(e))
|
||||
return None
|
||||
@@ -164,12 +145,8 @@ class ChatManager:
|
||||
# 強制在 system_prompt 前置 persona
|
||||
system_prompt = f"{NEMOCLAW_PERSONA}\n{system_prompt}"
|
||||
|
||||
import httpx
|
||||
import re
|
||||
from src.core.config import get_settings as _get_settings
|
||||
|
||||
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111
|
||||
OLLAMA_URL = _get_settings().OLLAMA_URL
|
||||
# 2026-05-05 Codex: ADR-110 interactive lane,由 resolver 管理 GCP-A/GCP-B/111 拓撲
|
||||
OLLAMA_URL = resolve_ollama_endpoint("interactive")
|
||||
MODEL = "deepseek-r1:14b"
|
||||
|
||||
try:
|
||||
@@ -250,14 +227,14 @@ class ChatManager:
|
||||
# 2026-04-03 ogt: 移除 asyncio.shield — shield 會在超時後讓 task 繼續跑但無人等待,造成 silent leak
|
||||
try:
|
||||
openclaw_raw = await asyncio.wait_for(openclaw_task, timeout=40.0)
|
||||
except asyncio.TimeoutError:
|
||||
except TimeoutError:
|
||||
openclaw_raw = None
|
||||
|
||||
openclaw_block = f"🦞 <b>OpenClaw:</b>\n{openclaw_raw or '🔴 無響應'}"
|
||||
|
||||
try:
|
||||
nemo_raw = await asyncio.wait_for(nemo_task, timeout=60.0)
|
||||
except asyncio.TimeoutError:
|
||||
except TimeoutError:
|
||||
nemo_raw = None
|
||||
|
||||
if nemo_raw:
|
||||
|
||||
@@ -337,10 +337,36 @@ class PerformanceAgent(ExpertAgent):
|
||||
# Consensus Engine
|
||||
# =============================================================================
|
||||
|
||||
CONSENSUS_PREFIX = "consensus:"
|
||||
# P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6:
|
||||
# 舊格式(無 project 前綴):consensus:{consensus_id}
|
||||
# 新格式(含 project 前綴):{project_id}:consensus:{consensus_id}
|
||||
# 遷移策略:Phase A 雙寫 + fallback 讀舊 key,待全部遷移後移除 fallback
|
||||
CONSENSUS_PREFIX = "consensus:" # 舊格式前綴(讀 fallback 用)
|
||||
PLATFORM_INTERNAL = "__platform__" # project_id 不可得時的 sentinel namespace
|
||||
CONSENSUS_TTL = 3600 # 1 小時
|
||||
|
||||
|
||||
def _consensus_key(consensus_id: str, project_id: str | None) -> str:
|
||||
"""
|
||||
建構 consensus Redis key(含 project_id namespace)
|
||||
|
||||
Args:
|
||||
consensus_id: 共識 ID(如 CON-20260504-ABCD1234)
|
||||
project_id: 租戶 project ID;若為 None 則使用 __platform__ sentinel
|
||||
|
||||
Returns:
|
||||
新格式 key:{project_id}:consensus:{consensus_id}
|
||||
或 fallback:__platform__:consensus:{consensus_id}
|
||||
"""
|
||||
ns = project_id if project_id else PLATFORM_INTERNAL
|
||||
return f"{ns}:consensus:{consensus_id}"
|
||||
|
||||
|
||||
def _consensus_legacy_key(consensus_id: str) -> str:
    """Build the legacy, un-namespaced Redis key (kept for fallback reads)."""
    return CONSENSUS_PREFIX + f"{consensus_id}"
|
||||
|
||||
|
||||
class ConsensusEngine:
|
||||
"""
|
||||
共識引擎 - Phase 9.4 核心
|
||||
@@ -526,6 +552,7 @@ class ConsensusEngine:
|
||||
consensus_score: float,
|
||||
recommended_action_type: str,
|
||||
dissenting: list[str],
|
||||
project_id: str | None = None,
|
||||
) -> ConsensusResult:
|
||||
"""
|
||||
產生最終決策
|
||||
@@ -578,8 +605,8 @@ class ConsensusEngine:
|
||||
dissenting_opinions=dissenting,
|
||||
)
|
||||
|
||||
# 儲存到 Redis
|
||||
await self._save_consensus(result)
|
||||
# 儲存到 Redis(含 project_id namespace)
|
||||
await self._save_consensus(result, project_id=project_id)
|
||||
|
||||
logger.info(
|
||||
"consensus_generated",
|
||||
@@ -595,6 +622,7 @@ class ConsensusEngine:
|
||||
self,
|
||||
incident: Incident,
|
||||
timeout_sec: float = 30.0,
|
||||
project_id: str | None = None,
|
||||
) -> ConsensusResult:
|
||||
"""
|
||||
執行完整的共識流程
|
||||
@@ -603,6 +631,11 @@ class ConsensusEngine:
|
||||
1. 收集意見
|
||||
2. 計算共識
|
||||
3. 產生決策
|
||||
|
||||
Args:
|
||||
incident: 要分析的事件
|
||||
timeout_sec: 超時秒數
|
||||
project_id: 租戶 project ID,用於 Redis key namespace 隔離(P0-12)
|
||||
"""
|
||||
# Step 1: 收集意見
|
||||
opinions = await self.gather_opinions(incident, timeout_sec)
|
||||
@@ -610,32 +643,43 @@ class ConsensusEngine:
|
||||
# Step 2: 計算共識
|
||||
consensus_score, recommended_action, dissenting = self.calculate_consensus(opinions)
|
||||
|
||||
# Step 3: 產生決策
|
||||
# Step 3: 產生決策(傳入 project_id 供 Redis key namespace 隔離)
|
||||
result = await self.generate_final_decision(
|
||||
incident=incident,
|
||||
opinions=opinions,
|
||||
consensus_score=consensus_score,
|
||||
recommended_action_type=recommended_action,
|
||||
dissenting=dissenting,
|
||||
project_id=project_id,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def _save_consensus(self, result: ConsensusResult) -> None:
|
||||
async def _save_consensus(
|
||||
self,
|
||||
result: ConsensusResult,
|
||||
project_id: str | None = None,
|
||||
) -> None:
|
||||
"""儲存共識結果到 Redis(熱快取)+ PG(永久記錄)
|
||||
|
||||
2026-04-26 P2-DB-Fix by Claude — db-expert P0 三修(P0.2):
|
||||
補 PG 寫入 agent_sessions,符合 ADR-085 鐵律
|
||||
Redis TTL 到期不再造成共識記憶消失
|
||||
|
||||
P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6:
|
||||
Phase A 雙寫:新 key(含 project_id 前綴)+ 舊 key(向後相容)
|
||||
待全部遷移完成後移除舊 key 寫入
|
||||
"""
|
||||
# 1. 既有 Redis 寫(熱快取,保留)
|
||||
redis_client = get_redis()
|
||||
key = f"{CONSENSUS_PREFIX}{result.consensus_id}"
|
||||
await redis_client.set(
|
||||
key,
|
||||
json.dumps(result.to_dict()),
|
||||
ex=CONSENSUS_TTL,
|
||||
)
|
||||
payload = json.dumps(result.to_dict())
|
||||
|
||||
# 1a. 新 key(含 project namespace)— Phase A 主要 key
|
||||
new_key = _consensus_key(result.consensus_id, project_id)
|
||||
await redis_client.set(new_key, payload, ex=CONSENSUS_TTL)
|
||||
|
||||
# 1b. 舊 key(無 project 前綴)— Phase A fallback,向後相容
|
||||
legacy_key = _consensus_legacy_key(result.consensus_id)
|
||||
await redis_client.set(legacy_key, payload, ex=CONSENSUS_TTL)
|
||||
|
||||
# 2. 補 PG 永久寫入(ADR-085 鐵律 — 失敗不阻斷主流程)
|
||||
try:
|
||||
@@ -691,12 +735,34 @@ class ConsensusEngine:
|
||||
consensus_id=result.consensus_id,
|
||||
)
|
||||
|
||||
async def get_consensus(self, consensus_id: str) -> ConsensusResult | None:
|
||||
"""取得共識結果"""
|
||||
redis_client = get_redis()
|
||||
key = f"{CONSENSUS_PREFIX}{consensus_id}"
|
||||
async def get_consensus(
|
||||
self,
|
||||
consensus_id: str,
|
||||
project_id: str | None = None,
|
||||
) -> ConsensusResult | None:
|
||||
"""取得共識結果
|
||||
|
||||
P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6:
|
||||
Phase A 雙讀:先讀新 key(含 project 前綴),若 miss 再 fallback 舊 key
|
||||
"""
|
||||
redis_client = get_redis()
|
||||
|
||||
# 先嘗試新格式 key(含 project namespace)
|
||||
new_key = _consensus_key(consensus_id, project_id)
|
||||
data = await redis_client.get(new_key)
|
||||
|
||||
if not data:
|
||||
# Phase A fallback:讀舊格式 key(無 project 前綴)
|
||||
legacy_key = _consensus_legacy_key(consensus_id)
|
||||
data = await redis_client.get(legacy_key)
|
||||
if data:
|
||||
logger.info(
|
||||
"consensus_legacy_key_hit",
|
||||
consensus_id=consensus_id,
|
||||
project_id=project_id,
|
||||
note="Phase A fallback 命中,建議觸發資料遷移",
|
||||
)
|
||||
|
||||
data = await redis_client.get(key)
|
||||
if data:
|
||||
return ConsensusResult.from_dict(json.loads(data))
|
||||
return None
|
||||
|
||||
449
apps/api/src/services/contract_service.py
Normal file
449
apps/api/src/services/contract_service.py
Normal file
@@ -0,0 +1,449 @@
|
||||
"""
|
||||
Contract Lifecycle Service
|
||||
===========================
|
||||
AwoooP Phase 3: 合約生命週期管理(ADR-107/ADR-112)
|
||||
2026-05-04 ogt + Claude Sonnet 4.6
|
||||
|
||||
生命週期狀態機:
|
||||
draft → published → active → revoked
|
||||
↑ ↓(新 active 把舊的設為 revoked)
|
||||
|
||||
操作:
|
||||
draft() — 建立 draft revision(schema 驗證 + body_hash)
|
||||
publish() — HMAC 簽章驗證後 draft → published
|
||||
activate() — approval 確認後 published → active + outbox
|
||||
get_active() — runtime 唯一讀取路徑(只返回 active revision)
|
||||
|
||||
安全機制:
|
||||
- body_hash = sha256(canonical JSON)(ADR-112)
|
||||
- publish() 需 HMAC 簽章(settings.CONTRACT_HMAC_KEY)
|
||||
- activate() 需 Redis multi_sig 確認(ADR-112 approval workflow)
|
||||
- 所有操作寫入 audit_log
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
import structlog
|
||||
from pydantic import ValidationError
|
||||
|
||||
from src.core.config import settings
|
||||
from src.db.awooop_models import AwoooPContractRevision
|
||||
from src.models.awooop_contracts import validate_contract_body
|
||||
from src.repositories import contract_repository
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 錯誤定義
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
class ContractError(Exception):
    """Base error for every contract lifecycle operation.

    The machine-readable code is kept on ``error_code``; the exception text
    is rendered as ``[<error_code>] <message>``.
    """

    def __init__(self, error_code: str, message: str) -> None:
        rendered = f"[{error_code}] {message}"
        super().__init__(rendered)
        self.error_code = error_code
|
||||
|
||||
|
||||
class ContractSchemaError(ContractError):
    """Raised when ``body_json`` does not satisfy the contract family schema."""

    def __init__(self, family: str, details: str) -> None:
        message = f"Contract family={family} schema 驗證失敗: {details}"
        super().__init__("E-CONTRACT-001", message)
|
||||
|
||||
|
||||
class ContractSignatureError(ContractError):
    """Raised when the publish HMAC signature does not verify."""

    def __init__(self) -> None:
        message = "Contract publish 簽章驗證失敗"
        super().__init__("E-CONTRACT-002", message)
|
||||
|
||||
|
||||
class ContractStateError(ContractError):
    """Raised on an illegal lifecycle state transition."""

    def __init__(self, from_state: str, to_state: str) -> None:
        message = f"非法狀態轉換 {from_state!r} → {to_state!r}"
        super().__init__("E-CONTRACT-003", message)
|
||||
|
||||
|
||||
class ContractApprovalError(ContractError):
    """Raised when a revision lacks the required activation approval."""

    def __init__(self, revision_id: str) -> None:
        message = f"revision {revision_id} 尚未取得足夠的 approval 簽核"
        super().__init__("E-CONTRACT-004", message)
|
||||
|
||||
|
||||
class ContractNotFoundError(ContractError):
    """Raised when a revision does not exist (or is not visible to the caller)."""

    def __init__(self, revision_id: str) -> None:
        message = f"Revision {revision_id!r} 不存在或無權限存取"
        super().__init__("E-CONTRACT-005", message)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Body hash(ADR-112 artifact integrity)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _compute_body_hash(body_json: dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of ``body_json``.

    The body is serialised as canonical JSON (keys sorted, no whitespace
    between tokens, non-ASCII characters kept verbatim) and encoded as UTF-8
    so that equal bodies always hash to the same value.
    """
    encoded = json.dumps(
        body_json,
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
    ).encode("utf-8")
    hasher = hashlib.sha256()
    hasher.update(encoded)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def _verify_publish_signature(
    revision_id: str,
    body_hash: str,
    publisher_id: str,
    signature: str,
) -> bool:
    """Verify the HMAC-SHA256 signature supplied with a publish request.

    The signed message is ``"{revision_id}:{body_hash}:{publisher_id}"`` and
    the key is ``settings.CONTRACT_HMAC_KEY`` used as-is (its UTF-8 bytes; no
    base64/hex decoding is performed here).

    Args:
        revision_id: Revision identifier, as a string.
        body_hash: Stored body hash of the revision.
        publisher_id: Identity of the publisher.
        signature: Hex digest supplied by the caller.

    Returns:
        True when the signature matches, or when no HMAC key is configured
        (verification is then skipped and a warning is logged); False otherwise,
        including for a malformed (non-string / non-ASCII) signature.
    """
    secret = getattr(settings, "CONTRACT_HMAC_KEY", "")
    if not secret:
        # No HMAC key configured -> let it through for development, but warn.
        # NOTE(review): this is fail-open; confirm production always sets the key.
        logger.warning(
            "contract_hmac_key_not_set",
            warning="CONTRACT_HMAC_KEY 未設定,publish 簽章驗證跳過(非 production 行為)",
        )
        return True

    if not isinstance(signature, str):
        # A missing or wrongly typed signature can never match.
        return False

    message = f"{revision_id}:{body_hash}:{publisher_id}".encode("utf-8")
    expected = hmac.new(secret.encode("utf-8"), message, hashlib.sha256).hexdigest()
    # Compare as bytes: compare_digest() on str raises TypeError for non-ASCII
    # input, which would turn a bad signature into an unhandled exception.
    return hmac.compare_digest(
        expected.encode("ascii"),
        signature.encode("utf-8", "replace"),
    )
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Multi-sig approval(ADR-112 activation approval)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
_APPROVAL_KEY_PREFIX = "contract:approval:"
|
||||
_APPROVAL_REQUIRED = 1 # Phase 3:1 人核准即可;Phase 5+ 升為 2
|
||||
|
||||
|
||||
async def _check_activation_approval(revision_id: str, project_id: str) -> bool:
    """Check whether Redis holds enough activation approvals for a revision.

    The approvals live under ``contract:approval:{project_id}:{revision_id}``
    as a JSON list of approver IDs.

    Args:
        revision_id: Revision identifier, as a string.
        project_id: Tenant project ID used in the key.

    Returns:
        True when the number of distinct, non-empty string approver IDs reaches
        ``_APPROVAL_REQUIRED``. Any Redis/JSON failure, a missing key, or a
        payload that is not a JSON list yields False (fail-closed).
    """
    try:
        from src.core.redis_client import get_redis

        redis = get_redis()
        key = f"{_APPROVAL_KEY_PREFIX}{project_id}:{revision_id}"
        raw = await redis.get(key)
        if not raw:
            return False
        approvers = json.loads(raw.decode() if isinstance(raw, bytes) else raw)
    except Exception as exc:
        logger.warning("contract_approval_check_failed", revision_id=revision_id, error=str(exc))
        return False

    if not isinstance(approvers, list):
        # len() of a JSON string or object must never count as approvals.
        logger.warning(
            "contract_approval_check_failed",
            revision_id=revision_id,
            error="approval payload is not a JSON list",
        )
        return False

    # Count each approver once, ignoring malformed entries.
    distinct_approvers = {item for item in approvers if isinstance(item, str) and item}
    return len(distinct_approvers) >= _APPROVAL_REQUIRED
|
||||
|
||||
|
||||
async def record_activation_approval(
    revision_id: str,
    project_id: str,
    approver_id: str,
) -> int:
    """Record one approver's sign-off for activating a revision.

    The approver list is read from Redis, extended if ``approver_id`` is not
    yet present, and written back with a 24-hour expiry.

    Returns:
        The number of approvals stored after this call.
    """
    from src.core.redis_client import get_redis

    client = get_redis()
    approval_key = f"{_APPROVAL_KEY_PREFIX}{project_id}:{revision_id}"
    stored = await client.get(approval_key)

    # bytes are decoded; any other falsy value (e.g. a missing key) means "no approvals yet".
    if isinstance(stored, bytes):
        serialized = stored.decode()
    else:
        serialized = stored or "[]"
    approvers: list[str] = json.loads(serialized)

    already_recorded = approver_id in approvers
    if not already_recorded:
        approvers.append(approver_id)

    await client.set(approval_key, json.dumps(approvers), ex=86400)  # 24h TTL
    total = len(approvers)
    logger.info(
        "contract_approval_recorded",
        revision_id=revision_id,
        approver_id=approver_id,
        total_approvals=total,
    )
    return total
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Core lifecycle operations
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def draft(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    version_major: int,
    version_minor: int,
    body_json: dict[str, Any],
    body_schema_version: str = "v1.0",
) -> AwoooPContractRevision:
    """Step 1: create a draft revision.

    - validates ``body_json`` with ``validate_contract_body`` for the family
    - computes ``body_hash`` over the canonical JSON body
    - stores the revision through ``contract_repository.create_draft``
    - writes a ``contract.drafted`` audit entry

    Raises:
        ContractSchemaError: when validation raises ``ValidationError`` or
            ``ValueError``.
    """
    # Schema validation: both failure flavours surface as ContractSchemaError.
    try:
        validate_contract_body(contract_family, body_json)
    except ValidationError as exc:
        raise ContractSchemaError(contract_family, exc.json(indent=0)) from exc
    except ValueError as exc:
        raise ContractSchemaError(contract_family, str(exc)) from exc

    digest = _compute_body_hash(body_json)

    created = await contract_repository.create_draft(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
        version_major=version_major,
        version_minor=version_minor,
        body_json=body_json,
        body_hash=digest,
        body_schema_version=body_schema_version,
    )

    audit_details = {
        "contract_family": contract_family,
        "contract_id": contract_id,
        "version": f"{version_major}.{version_minor}",
        "body_hash": digest,
    }
    await _write_audit(
        project_id=project_id,
        action="contract.drafted",
        resource_type="contract_revision",
        resource_id=str(created.revision_id),
        details=audit_details,
    )
    return created
|
||||
|
||||
|
||||
async def publish(
    *,
    revision_id: UUID,
    project_id: str,
    publisher_id: str,
    signature: str,
) -> AwoooPContractRevision:
    """Step 2: move a revision from ``draft`` to ``published``.

    - loads the revision and requires ``lifecycle_status == "draft"``
    - verifies the publish signature over revision ID, body hash and publisher
    - marks the revision published with the current UTC time
    - writes a ``contract.published`` audit entry

    Raises:
        ContractNotFoundError: the revision could not be loaded.
        ContractStateError: the revision is not in ``draft``.
        ContractSignatureError: the signature did not verify.
    """
    current = await contract_repository.get_revision(revision_id, project_id)
    if current is None:
        raise ContractNotFoundError(str(revision_id))
    if current.lifecycle_status != "draft":
        raise ContractStateError(current.lifecycle_status, "published")

    signature_ok = _verify_publish_signature(
        str(revision_id), current.body_hash, publisher_id, signature
    )
    if not signature_ok:
        raise ContractSignatureError()

    now_utc = datetime.now(timezone.utc)
    updated = await contract_repository.mark_published(
        revision_id=revision_id,
        project_id=project_id,
        publisher_id=publisher_id,
        publish_signature=signature,
        published_at=now_utc,
    )

    # The audit entry reports the hash carried by the updated revision.
    audit_details = {
        "publisher_id": publisher_id,
        "published_at": now_utc.isoformat(),
        "body_hash": updated.body_hash,
    }
    await _write_audit(
        project_id=project_id,
        action="contract.published",
        resource_type="contract_revision",
        resource_id=str(revision_id),
        details=audit_details,
    )
    return updated
|
||||
|
||||
|
||||
async def activate(
    *,
    revision_id: UUID,
    project_id: str,
    activator_id: str,
    bypass_approval: bool = False,
) -> AwoooPContractRevision:
    """Step 3: move a revision from ``published`` to ``active``.

    - loads the revision and requires ``lifecycle_status == "published"``
    - requires a recorded Redis approval unless ``bypass_approval`` is True;
      a bypass is logged and recorded in the audit entry so it leaves a trace
    - looks up the currently active revision of the same contract (if any) and
      hands its ID to ``contract_repository.mark_active``
    - writes a ``contract.activated`` audit entry

    NOTE(review): revoking the old revision and emitting the outbox event are
    presumably done inside ``mark_active`` — not visible from this function.

    Raises:
        ContractNotFoundError: the revision could not be loaded.
        ContractStateError: the revision is not in ``published``.
        ContractApprovalError: approval is required but not present.
    """
    revision = await contract_repository.get_revision(revision_id, project_id)
    if revision is None:
        raise ContractNotFoundError(str(revision_id))
    if revision.lifecycle_status != "published":
        raise ContractStateError(revision.lifecycle_status, "active")

    if bypass_approval:
        # Skipping the approval gate is security-relevant: make it visible.
        logger.warning(
            "contract_activation_approval_bypassed",
            revision_id=str(revision_id),
            project_id=project_id,
            activator_id=activator_id,
        )
    else:
        approved = await _check_activation_approval(str(revision_id), project_id)
        if not approved:
            raise ContractApprovalError(str(revision_id))

    # Find the previously active revision, if there is one.
    old_revision = await contract_repository.get_active_revision(
        project_id=project_id,
        contract_family=revision.contract_family,
        contract_id=revision.contract_id,
    )
    old_revision_id = old_revision.revision_id if old_revision else None

    revision = await contract_repository.mark_active(
        revision_id=revision_id,
        project_id=project_id,
        contract_family=revision.contract_family,
        contract_id=revision.contract_id,
        old_revision_id=old_revision_id,
    )

    await _write_audit(
        project_id=project_id,
        action="contract.activated",
        resource_type="contract_revision",
        resource_id=str(revision_id),
        details={
            "activator_id": activator_id,
            "old_revision_id": str(old_revision_id) if old_revision_id else None,
            "contract_family": revision.contract_family,
            "contract_id": revision.contract_id,
            "bypass_approval": bypass_approval,
        },
    )
    return revision
|
||||
|
||||
|
||||
async def get_active(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    verify_hash: bool = True,
) -> AwoooPContractRevision | None:
    """Runtime read path: return the active revision of a contract, if any.

    With ``verify_hash=True`` (the default) the body hash is recomputed from
    the loaded ``body_json`` and compared with the stored ``body_hash``.

    Returns:
        The active revision, or None when there is none.

    Raises:
        ContractError: code ``E-CONTRACT-006`` when the recomputed hash does
            not match the stored one.
    """
    active = await contract_repository.get_active_revision(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
    )
    # Nothing to check when there is no revision or verification is disabled.
    if active is None or not verify_hash:
        return active

    recomputed = _compute_body_hash(active.body_json)
    if recomputed == active.body_hash:
        return active

    logger.error(
        "contract_hash_mismatch",
        revision_id=str(active.revision_id),
        expected=active.body_hash,
        computed=recomputed,
    )
    raise ContractError(
        "E-CONTRACT-006",
        f"revision {active.revision_id} body_hash 不符(資料可能被竄改)",
    )
|
||||
|
||||
|
||||
async def get_active_body(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
) -> dict[str, Any] | None:
    """Convenience wrapper: return the active revision's ``body_json``.

    Delegates to ``get_active`` with its default hash verification.

    Returns:
        The body dict, or None when ``get_active`` yields no revision.
    """
    active = await get_active(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
    )
    if active:
        return active.body_json
    return None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Audit log helper
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _write_audit(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any],
) -> None:
    """Insert one row into ``audit_logs`` (best effort).

    Any failure is logged as a warning and never propagates, so an audit
    problem cannot break the lifecycle operation that triggered it.

    Args:
        project_id: Tenant project ID; also passed to ``get_db_context``.
        action: Audit action name, e.g. ``contract.drafted``.
        resource_type: Type of the audited resource.
        resource_id: Identifier of the audited resource.
        details: JSON-serialisable payload stored in the ``details`` column.

    NOTE(review): no explicit commit is issued here — presumably
    ``get_db_context`` commits on exit; confirm.
    """
    try:
        from sqlalchemy import text as sa_text

        from src.db.base import get_db_context

        async with get_db_context(project_id) as db:
            await db.execute(
                # CAST(... AS jsonb) instead of ":details::jsonb": text() does not
                # treat a name followed by ':' as a bind parameter, so the
                # PostgreSQL "::" shorthand breaks the :details placeholder.
                sa_text("""
                    INSERT INTO audit_logs
                        (project_id, action, resource_type, resource_id, details)
                    VALUES
                        (:project_id, :action, :resource_type, :resource_id,
                         CAST(:details AS jsonb))
                """),
                {
                    "project_id": project_id,
                    "action": action,
                    "resource_type": resource_type,
                    "resource_id": resource_id,
                    "details": json.dumps(details),
                },
            )
    except Exception as exc:
        logger.warning(
            "contract_audit_write_failed",
            action=action,
            resource_id=resource_id,
            error=str(exc),
        )
|
||||
@@ -26,7 +26,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import httpx
|
||||
@@ -125,7 +125,7 @@ class DecisionFusionAdapter:
|
||||
# Public API
|
||||
# =========================================================================
|
||||
|
||||
async def fuse_decision(self, event: "AiGovernanceEvent") -> FusedDecision:
|
||||
async def fuse_decision(self, event: AiGovernanceEvent) -> FusedDecision:
|
||||
"""三維融合:LLM × Playbook × MCP → FusedDecision。
|
||||
|
||||
三個維度並行評估(asyncio.gather),任一失敗靜默降為 0.5。
|
||||
@@ -226,7 +226,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_llm(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, str, dict[str, Any]]:
|
||||
"""Ollama LLM 推理:治理事件情境 → 建議動作 + 信心度。
|
||||
|
||||
@@ -254,7 +254,9 @@ class DecisionFusionAdapter:
|
||||
"只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。"
|
||||
)
|
||||
|
||||
ollama_url = getattr(self._settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
ollama_url = resolve_ollama_endpoint("deep_rca")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
@@ -320,7 +322,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_playbook(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, str | None, float | None]:
|
||||
"""Playbook 相似度比對 → 取最高 trust_score。
|
||||
|
||||
@@ -373,7 +375,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_mcp(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, dict[str, Any]]:
|
||||
"""Prometheus 情報採集 → MCP 感官品質分數。
|
||||
|
||||
@@ -392,6 +394,7 @@ class DecisionFusionAdapter:
|
||||
|
||||
snapshot: dict[str, Any] = {}
|
||||
success_count = 0
|
||||
no_data_count = 0 # Prometheus 正常但指標尚未建立(SLI recording rules 未生效)
|
||||
total_count = len(queries)
|
||||
|
||||
if total_count == 0:
|
||||
@@ -413,22 +416,29 @@ class DecisionFusionAdapter:
|
||||
snapshot[metric_name] = round(value, 4)
|
||||
success_count += 1
|
||||
else:
|
||||
snapshot[metric_name] = None # 有回應但無資料
|
||||
# 2026-05-04 ogt: 指標尚未建立 ≠ MCP 失敗
|
||||
# SLI recording rules 初期可能無資料,給予 0.5 中性貢獻
|
||||
snapshot[metric_name] = "no_data"
|
||||
no_data_count += 1
|
||||
except Exception as exc:
|
||||
snapshot[metric_name] = f"error:{exc!s:.60}"
|
||||
except Exception as exc:
|
||||
logger.warning("fusion_mcp_prometheus_failed", event_id=event.id, error=str(exc))
|
||||
return 0.5, {"error": str(exc)}
|
||||
|
||||
# 品質分數:成功取得資料的指標比例(映射到 [0.2, 0.9])
|
||||
# 2026-05-04 ogt: 品質分數修正
|
||||
# success=完整貢獻(1.0), no_data=半貢獻(0.5,指標未建立非 MCP 故障), error=0
|
||||
# 最終映射到 [0.2, 0.9]
|
||||
if total_count > 0:
|
||||
ratio = success_count / total_count
|
||||
weighted = success_count + 0.5 * no_data_count
|
||||
ratio = weighted / total_count
|
||||
mcp_score = 0.2 + 0.7 * ratio
|
||||
else:
|
||||
mcp_score = 0.5
|
||||
|
||||
snapshot["_meta"] = {
|
||||
"success_count": success_count,
|
||||
"no_data_count": no_data_count,
|
||||
"total_queries": total_count,
|
||||
"quality_score": round(mcp_score, 4),
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ Decision Manager - Phase 6.5 非同步決策狀態機
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from enum import Enum
|
||||
@@ -37,6 +38,7 @@ from src.services.action_parser import parse_kubectl_action
|
||||
from src.services.auto_approve import get_auto_approve_policy
|
||||
from src.services.openclaw import get_openclaw
|
||||
from src.services.playbook_service import get_playbook_service
|
||||
from src.services.telegram_gateway import SILENCE_KEY_PREFIX # P1-24: 統一常數,禁止重複定義
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -54,6 +56,20 @@ def _fire_and_forget(coro) -> asyncio.Task:
|
||||
return task
|
||||
|
||||
|
||||
def _incident_alertname_for_dedup(incident: Incident) -> str:
    """Pick a stable alert name for Telegram fingerprint dedup.

    Only the first signal is consulted. Candidates are tried in order and the
    first truthy one wins: the ``alertname`` label, the signal's ``alert_name``,
    the ``summary`` annotation, the ``description`` annotation. The incident ID
    is the fallback, and is also used when the incident has no signals.
    """
    if not incident.signals:
        return incident.incident_id

    first_signal = incident.signals[0]

    # Each candidate is read only if every earlier one was falsy.
    candidate = first_signal.labels.get("alertname")
    if candidate:
        return candidate
    candidate = first_signal.alert_name
    if candidate:
        return candidate
    candidate = first_signal.annotations.get("summary")
    if candidate:
        return candidate
    candidate = first_signal.annotations.get("description")
    if candidate:
        return candidate
    return incident.incident_id
|
||||
|
||||
|
||||
def _phase2_fallback_reason(package: Any) -> str | None:
|
||||
"""Return why a Phase 2 package should continue to Playbook/LLM fallback.
|
||||
|
||||
@@ -74,6 +90,22 @@ def _phase2_fallback_reason(package: Any) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def _incident_llm_timeout_seconds() -> float:
    """Return the outer timeout, in seconds, for incident LLM proposals.

    The provider layer already applies per-provider timeouts, so this outer
    guard must not be shorter than the GCP Ollama lane; otherwise alert
    diagnosis is cut off before the free/local-first route can answer.

    ``settings.INCIDENT_LLM_TIMEOUT_SECONDS`` is used when it converts to a
    float (240.0 otherwise), and the result is never below
    ``settings.OPENCLAW_TIMEOUT`` (default 30).
    """
    raw_value = getattr(settings, "INCIDENT_LLM_TIMEOUT_SECONDS", None)
    try:
        requested = float(raw_value)
    except (TypeError, ValueError):
        # Unset or unparsable setting: fall back to the built-in default.
        requested = 240.0

    floor = float(getattr(settings, "OPENCLAW_TIMEOUT", 30))
    return max(requested, floor)
|
||||
|
||||
|
||||
def _should_escalate_auto_approve_rejection(reason: Any) -> bool:
|
||||
"""Return True for manual gates that mean the automation path went blind."""
|
||||
|
||||
@@ -211,7 +243,7 @@ async def _push_decision_to_telegram(
|
||||
# 改成 alertname+target 構造的 fingerprint key + TTL 86400s,同症狀共用 dedup。
|
||||
# Incident 真正 RESOLVED/CLOSED 時走 line 220-226 的 status check 提早 return,不影響復發偵測。
|
||||
redis = get_redis()
|
||||
_alertname_fp = (incident.title or "unknown").strip().lower().replace(" ", "_")[:60]
|
||||
_alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60]
|
||||
_target_fp = (
|
||||
incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
).lower()[:40]
|
||||
@@ -237,7 +269,7 @@ async def _push_decision_to_telegram(
|
||||
|
||||
# 🔴 靜默檢查:此資源是否被靜默 (2026-03-27 P1 優化)
|
||||
target = incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
silence_key = f"telegram_silence:{target}"
|
||||
silence_key = f"{SILENCE_KEY_PREFIX}{target}"
|
||||
if await redis.exists(silence_key):
|
||||
logger.info(
|
||||
"telegram_push_silenced",
|
||||
@@ -545,6 +577,7 @@ async def _push_decision_to_telegram(
|
||||
alert_category=_alert_category,
|
||||
notification_type=_notification_type,
|
||||
playbook_name=_playbook_name,
|
||||
automation_state=proposal_data.get("automation_state", ""),
|
||||
)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
|
||||
@@ -606,7 +639,7 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) -
|
||||
"""
|
||||
MCP Phase 4a: NemoClaw second opinion — 信心 < 0.7 時觸發
|
||||
============================================================
|
||||
用 deepseek-r1:14b (Ollama 188) 對同一份資料做獨立推理,
|
||||
用 deepseek-r1:14b (設定的 Ollama primary) 對同一份資料做獨立推理,
|
||||
輸出純文字 advisory_note,不執行任何操作。
|
||||
|
||||
2026-04-11 Claude Sonnet 4.6 Asia/Taipei
|
||||
@@ -665,7 +698,7 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
|
||||
MCP Phase 4c: Playbook 無命中時,自動生成 AI 草稿 Playbook 寫入 KM
|
||||
=====================================================================
|
||||
- 僅在 KM 中不存在同 alertname 的 Playbook 時觸發(避免重複)
|
||||
- 用 qwen2.5:7b-instruct (Ollama 188) 生成結構化 Playbook 草稿
|
||||
- 用 qwen2.5:7b-instruct (設定的 Ollama primary) 生成結構化 Playbook 草稿
|
||||
- 寫入 KnowledgeEntry,status=DRAFT,需人工審核後升為 APPROVED
|
||||
- 寫入 AlertOperationLog PLAYBOOK_DRAFT_CREATED 事件
|
||||
|
||||
@@ -826,7 +859,6 @@ async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str
|
||||
reason="alertname 有對應但 keywords=[],走 fallback 取第一個非 infra pod",
|
||||
)
|
||||
|
||||
import re as _re
|
||||
for line in pod_lines:
|
||||
pod = line.removeprefix("pod/").strip()
|
||||
if not pod:
|
||||
@@ -976,6 +1008,58 @@ def _format_metrics_delta(before: dict, after: dict) -> str:
|
||||
return " | ".join(parts)
|
||||
|
||||
|
||||
def _clip_telegram_field(value: str | None, limit: int) -> str:
|
||||
"""Normalize a short Telegram field without leaking multiline command noise."""
|
||||
text = " ".join(str(value or "").split())
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return f"{text[: max(0, limit - 3)]}..."
|
||||
|
||||
|
||||
def _format_auto_repair_status_line(
    *,
    incident_id: str,
    target: str,
    action: str,
    success: bool,
    error: str = "",
    metrics_delta_text: str = "",
) -> str:
    """Render an auto-repair result as a scannable Telegram operation card.

    Every dynamic field is whitespace-normalized, clipped and HTML-escaped
    before it is placed inside ``<code>`` tags. A successful repair yields the
    "AUTO RESOLVED" card (plus a metrics line when ``metrics_delta_text`` is
    given); a failed one yields the "HANDOFF REQUIRED" card with the error.
    """

    def _field(raw: str, limit: int, fallback: str = "") -> str:
        # Clip first, then substitute the fallback for empty text, then escape.
        return html.escape(_clip_telegram_field(raw, limit) or fallback)

    safe_incident = _field(incident_id, 40)
    safe_target = _field(target, 80, "unknown")
    safe_action = _field(action, 160, "已執行")
    safe_error = _field(error, 180, "未回傳錯誤")

    if not success:
        return (
            "🧑🔧 <b>HANDOFF REQUIRED|AI 自動修復失敗,已轉人工</b>\n"
            "──────────────────────\n"
            f"├ 事件:<code>{safe_incident}</code>\n"
            f"├ 對象:<code>{safe_target}</code>\n"
            f"├ 嘗試:<code>{safe_action}</code>\n"
            f"├ 原因:<code>{safe_error}</code>\n"
            "├ 狀態:自動化已停止,不再重試\n"
            "└ 下一步:請 SRE 依 AwoooP Run / 原告警卡處理"
        )

    if metrics_delta_text:
        delta_line = (
            f"\n├ 指標:<code>{html.escape(_clip_telegram_field(metrics_delta_text, 120))}</code>"
        )
    else:
        delta_line = ""

    return (
        "✅ <b>AUTO RESOLVED|AI 自動修復完成</b>\n"
        "──────────────────────\n"
        f"├ 事件:<code>{safe_incident}</code>\n"
        f"├ 對象:<code>{safe_target}</code>\n"
        f"├ 執行:<code>{safe_action}</code>\n"
        "├ 狀態:自動化已完成,等待後驗證觀察\n"
        "├ Actor:leWOOOgo autonomous"
        f"{delta_line}"
    )
|
||||
|
||||
|
||||
async def _push_auto_repair_result(
|
||||
incident: Incident,
|
||||
action: str,
|
||||
@@ -1051,24 +1135,16 @@ async def _push_auto_repair_result(
|
||||
except Exception as _k8s_err:
|
||||
logger.debug("k8s_state_after_failed", incident_id=inc_id, error=str(_k8s_err))
|
||||
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 強制標記 [AUTO],避免事後抵賴
|
||||
# 統帥要求「就算是自動化處理,也要發告警訊息出來」—— 所有自治動作必須留痕,
|
||||
# 且 Telegram 上能明顯與人工點擊區隔。
|
||||
if success:
|
||||
delta_line = f"\n├ 指標: <code>{metrics_delta_text}</code>" if metrics_delta_text else ""
|
||||
status_line = (
|
||||
f"🤖 <b>[AUTO] AI 自動修復完成</b>\n"
|
||||
f"├ 動作: <code>{action[:100] if action else '已執行'}</code>\n"
|
||||
f"├ Actor: leWOOOgo (autonomous)"
|
||||
f"{delta_line}"
|
||||
)
|
||||
else:
|
||||
status_line = (
|
||||
f"🤖❌ <b>[AUTO] AI 自動修復失敗,已升級人工介入</b>\n"
|
||||
f"├ 動作: <code>{action[:80] if action else '未知'}</code>\n"
|
||||
f"├ Actor: leWOOOgo (autonomous)\n"
|
||||
f"└ 錯誤: {error[:100] if error else '未知錯誤'}"
|
||||
)
|
||||
# 2026-05-07 Codex: 自動化結果必須讓 SRE 一眼分辨「已自動解決」或
|
||||
# 「已停止並轉人工」,不能再用 raw command / exception 片段洗版。
|
||||
status_line = _format_auto_repair_status_line(
|
||||
incident_id=inc_id,
|
||||
target=target,
|
||||
action=action,
|
||||
success=success,
|
||||
error=error,
|
||||
metrics_delta_text=metrics_delta_text,
|
||||
)
|
||||
|
||||
# BUG-006 修復 2026-04-11: outcome + verification_result 全為 null
|
||||
# 原因:_push_auto_repair_result 只送 Telegram,沒寫 DB
|
||||
@@ -2693,9 +2769,10 @@ class DecisionManager:
|
||||
if context_parts:
|
||||
llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
|
||||
|
||||
# GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout,
|
||||
# 比外層 decide() 30s wait_for 更嚴格,留 5s 給 YAML risk override + NemoClaw second opinion
|
||||
# Timeout → 明確 llm_timeout_fallback 日誌,返回 expert_result 而非等外層觸發
|
||||
# 2026-05-06 Codex: The alert goal is resolution quality, not a
|
||||
# fast-but-paid card. The outer guard is configurable and must allow
|
||||
# the GCP-A → GCP-B → 111 Ollama lane to finish before cloud backup.
|
||||
llm_timeout_seconds = _incident_llm_timeout_seconds()
|
||||
llm_result, provider, success = await asyncio.wait_for(
|
||||
self._openclaw.generate_incident_proposal_with_tools(
|
||||
incident_id=incident.incident_id,
|
||||
@@ -2704,7 +2781,7 @@ class DecisionManager:
|
||||
affected_services=incident.affected_services,
|
||||
expert_context=llm_expert_context if llm_expert_context else None,
|
||||
),
|
||||
timeout=25.0,
|
||||
timeout=llm_timeout_seconds,
|
||||
)
|
||||
|
||||
if success and llm_result:
|
||||
@@ -2771,7 +2848,7 @@ class DecisionManager:
|
||||
logger.warning(
|
||||
"llm_timeout_fallback",
|
||||
incident_id=incident.incident_id,
|
||||
timeout_sec=25.0,
|
||||
timeout_sec=llm_timeout_seconds,
|
||||
action="降級 Expert System",
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -2922,6 +2999,52 @@ class DecisionManager:
|
||||
|
||||
return None
|
||||
|
||||
async def _find_existing_tokens_for_incidents(
    self,
    incident_ids: list[str],
) -> dict[str, DecisionToken]:
    """
    Look up existing decision tokens for many incidents in one pass.

    2026-05-06 Codex: GET /api/v1/incidents is the frontend polling path, so
    it must not scan ``decision:*`` once per incident. The Redis keyspace is
    walked a single time here to avoid O(N×M) latency (and a frozen console)
    when there are 200+ incidents.

    Args:
        incident_ids: Incident ids whose tokens are wanted; duplicates are
            collapsed.

    Returns:
        Mapping of incident id to the first matching token encountered.
        Incidents with no token are simply absent from the mapping.
    """
    import json

    # Ids still waiting for a token; shrinking this set is equivalent to
    # checking "wanted and not yet found".
    remaining = set(incident_ids)
    if not remaining:
        return {}

    client = get_redis()
    tokens: dict[str, DecisionToken] = {}
    key_pattern = f"{DECISION_TOKEN_PREFIX}*"

    cursor = 0
    scanning = True
    while scanning:
        cursor, page = await client.scan(
            cursor=cursor,
            match=key_pattern,
            count=500,
        )

        for redis_key in page:
            try:
                raw = await client.get(redis_key)
                if raw:
                    payload = json.loads(raw)
                    owner = payload.get("incident_id")
                    if owner in remaining:
                        tokens[owner] = DecisionToken.from_dict(payload)
                        remaining.discard(owner)
                        # Every requested incident is resolved: stop scanning early.
                        if not remaining:
                            return tokens
            except Exception:
                # Best effort: an unreadable or malformed entry is skipped.
                continue

        # SCAN signals the end of the iteration by returning cursor 0.
        scanning = cursor != 0

    return tokens
|
||||
|
||||
async def _persist_decision_to_db(
|
||||
self, incident_id: str, proposal_data: dict
|
||||
) -> None:
|
||||
@@ -3235,7 +3358,7 @@ class DecisionManager:
|
||||
# 與 line 217-218 同邏輯,避免 pod restart resend 路徑繞過 fingerprint dedup。
|
||||
# 原本 telegram_sent:{incident_id} TTL 600s 早就過期 → 重啟必重發;
|
||||
# 改 fingerprint + 24h TTL → 同症狀 24h 內任何 INC ID 都不會重推。
|
||||
_alertname_fp = (getattr(incident, "title", None) or "unknown").strip().lower().replace(" ", "_")[:60]
|
||||
_alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60]
|
||||
_affected = getattr(incident, "affected_services", None) or []
|
||||
_target_fp = (_affected[0] if _affected else "unknown").lower()[:40]
|
||||
dedup_key = f"telegram_sent:fp:{_alertname_fp}:{_target_fp}"
|
||||
@@ -3442,6 +3565,8 @@ class DecisionManager:
|
||||
token.proposal_data["decision_state"] = DecisionState.READY.value
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["mcp_all_failed"] = True
|
||||
if _tool == "ssh_diagnose":
|
||||
token.proposal_data["automation_state"] = "diagnosis_failed_manual_required"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(
|
||||
_escalate_decision_auto_repair_unavailable(
|
||||
@@ -3451,14 +3576,15 @@ class DecisionManager:
|
||||
attempted_actions=f"decision_manager._ssh_execute -> {_tool}",
|
||||
)
|
||||
)
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(
|
||||
incident,
|
||||
action,
|
||||
success=False,
|
||||
error=token.error,
|
||||
if _tool != "ssh_diagnose":
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(
|
||||
incident,
|
||||
action,
|
||||
success=False,
|
||||
error=token.error,
|
||||
)
|
||||
)
|
||||
)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
@@ -3468,6 +3594,7 @@ class DecisionManager:
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["ssh_diagnosis_collected"] = True
|
||||
token.proposal_data["ssh_diagnosis_preview"] = output_preview
|
||||
token.proposal_data["automation_state"] = "diagnosis_collected_manual_required"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(
|
||||
_escalate_decision_auto_repair_unavailable(
|
||||
|
||||
@@ -17,11 +17,12 @@ Drift Interpreter - Phase 25 P2 Config Drift Detection
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
|
||||
from src.models.drift import DriftIntent, DriftInterpretation, DriftItem
|
||||
from src.models.drift import DriftIntent, DriftInterpretation
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.models.drift import DriftReport
|
||||
@@ -52,6 +53,58 @@ _INTENT_PROMPT_TEMPLATE = """你是 AWOOOI GitOps 守門員,請分析以下 K8
|
||||
"""
|
||||
|
||||
|
||||
def _strip_think_blocks(text: str) -> str:
|
||||
"""移除 qwen/deepseek 類模型常見的 <think> 推理段。"""
|
||||
return re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
|
||||
|
||||
|
||||
def _extract_first_json_object(text: str) -> dict | None:
|
||||
"""
|
||||
從 LLM 回應中擷取第一個 JSON object。
|
||||
|
||||
Ollama qwen3/deepseek 常會在 JSON 前後加 `<think>` 或短句;這些文字不應
|
||||
讓 drift intent 直接降級成 UNKNOWN。
|
||||
"""
|
||||
cleaned = _strip_think_blocks(text)
|
||||
|
||||
candidates = [cleaned]
|
||||
candidates.extend(match.group(1).strip() for match in re.finditer(r"```(?:json)?\s*([\s\S]+?)```", cleaned))
|
||||
|
||||
start = cleaned.find("{")
|
||||
if start >= 0:
|
||||
in_string = False
|
||||
escaped = False
|
||||
depth = 0
|
||||
for idx, ch in enumerate(cleaned[start:], start=start):
|
||||
if escaped:
|
||||
escaped = False
|
||||
continue
|
||||
if ch == "\\":
|
||||
escaped = True
|
||||
continue
|
||||
if ch == '"':
|
||||
in_string = not in_string
|
||||
continue
|
||||
if in_string:
|
||||
continue
|
||||
if ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
candidates.append(cleaned[start : idx + 1])
|
||||
break
|
||||
|
||||
for candidate in candidates:
|
||||
try:
|
||||
data = json.loads(candidate)
|
||||
except Exception:
|
||||
continue
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
return None
|
||||
|
||||
|
||||
class NemotronDriftInterpreter:
|
||||
"""
|
||||
使用 Nemotron 分析漂移意圖
|
||||
@@ -62,7 +115,7 @@ class NemotronDriftInterpreter:
|
||||
❌ 不直接呼叫 kubectl 或 git
|
||||
"""
|
||||
|
||||
async def analyze(self, report: "DriftReport") -> DriftInterpretation:
|
||||
async def analyze(self, report: DriftReport) -> DriftInterpretation:
|
||||
"""
|
||||
分析漂移意圖
|
||||
|
||||
@@ -85,7 +138,7 @@ class NemotronDriftInterpreter:
|
||||
result = await self._call_nemotron(prompt)
|
||||
return result
|
||||
|
||||
def _format_diff_for_prompt(self, report: "DriftReport") -> str:
|
||||
def _format_diff_for_prompt(self, report: DriftReport) -> str:
|
||||
"""格式化 diff 給 Nemotron 分析用"""
|
||||
lines = []
|
||||
for item in report.items[:10]: # 最多 10 項避免 token 過多
|
||||
@@ -111,7 +164,17 @@ class NemotronDriftInterpreter:
|
||||
try:
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
response_text, _provider, success = await openclaw.call(prompt)
|
||||
response_text, _provider, success = await openclaw.call(
|
||||
prompt,
|
||||
alert_context={
|
||||
"intent_hint": "config",
|
||||
"task_type": "diagnose",
|
||||
"enforce_ollama_first": True,
|
||||
"allow_gcp_heavy_model": True,
|
||||
"target_resource": "config-drift",
|
||||
"alert_type": "ConfigDriftInternalScan",
|
||||
},
|
||||
)
|
||||
|
||||
if not success or not response_text:
|
||||
logger.warning("drift_interpreter_openclaw_failed", provider=_provider)
|
||||
@@ -125,19 +188,9 @@ class NemotronDriftInterpreter:
|
||||
|
||||
def _parse_response(self, text: str) -> DriftInterpretation:
|
||||
"""解析 Nemotron JSON 回應"""
|
||||
try:
|
||||
# 嘗試直接解析
|
||||
data = json.loads(text)
|
||||
except Exception:
|
||||
try:
|
||||
import re
|
||||
match = re.search(r"```(?:json)?\s*([\s\S]+?)```", text)
|
||||
if match:
|
||||
data = json.loads(match.group(1))
|
||||
else:
|
||||
return self._unknown_result("無法解析 JSON")
|
||||
except Exception:
|
||||
return self._unknown_result("JSON 解析失敗")
|
||||
data = _extract_first_json_object(text)
|
||||
if data is None:
|
||||
return self._unknown_result("無法解析 JSON")
|
||||
|
||||
try:
|
||||
intent_str = data.get("intent", "unknown")
|
||||
|
||||
@@ -33,10 +33,11 @@ logger = structlog.get_logger(__name__)
|
||||
# ============================================================
|
||||
# 設定
|
||||
# ============================================================
|
||||
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111
|
||||
# 2026-05-05 Codex: 重摘要走 111 lane,避免污染 GCP alert-fast lane
|
||||
def _get_ollama_url() -> str:
|
||||
from src.core.config import get_settings
|
||||
return get_settings().OLLAMA_URL
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
return resolve_ollama_endpoint("deep_rca")
|
||||
# D1 集中化 2026-04-11: 從 models.json providers.ollama.models.drift_summary 讀取
|
||||
NARRATOR_MODEL = get_model("ollama", "drift_summary")
|
||||
NARRATOR_TIMEOUT = 90.0 # seconds
|
||||
@@ -120,8 +121,8 @@ class DriftNarratorService:
|
||||
|
||||
async def narrate_and_notify(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None" = None,
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
生成人話摘要並推送 Telegram
|
||||
@@ -166,7 +167,7 @@ class DriftNarratorService:
|
||||
medium=report.medium_count,
|
||||
)
|
||||
|
||||
def _should_narrate(self, report: "DriftReport") -> bool:
|
||||
def _should_narrate(self, report: DriftReport) -> bool:
|
||||
"""觸發條件:high >= 1 or medium >= 3"""
|
||||
# 過濾 HPA 白名單後重算
|
||||
non_hpa_items = [
|
||||
@@ -180,8 +181,8 @@ class DriftNarratorService:
|
||||
|
||||
async def _generate_narrative_and_items(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> tuple[str, list[dict], dict]:
|
||||
"""
|
||||
2026-04-18 ogt + Claude Opus 4.7: B 方案 — LLM 產生 narrative + 結構化 items
|
||||
@@ -354,8 +355,8 @@ class DriftNarratorService:
|
||||
|
||||
def _fallback_recommendation(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> dict:
|
||||
"""
|
||||
2026-04-20 P0.2 ogt + Claude Opus 4.7: LLM 沒給 recommendation 時的 Python fallback
|
||||
@@ -397,7 +398,7 @@ class DriftNarratorService:
|
||||
|
||||
async def _log_ai_action_to_db(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
prompt: str,
|
||||
raw_response: str | None,
|
||||
narrative: str,
|
||||
@@ -416,7 +417,9 @@ class DriftNarratorService:
|
||||
- 若能找到該 drift 的 incident 關聯,設 parent_op_id
|
||||
"""
|
||||
import json as _json
|
||||
|
||||
from sqlalchemy import text as _sql
|
||||
|
||||
from src.db.base import get_db_context
|
||||
|
||||
input_json = _json.dumps({
|
||||
@@ -511,7 +514,7 @@ class DriftNarratorService:
|
||||
items_count=len(items),
|
||||
)
|
||||
|
||||
def _format_drift_for_llm(self, report: "DriftReport") -> str:
|
||||
def _format_drift_for_llm(self, report: DriftReport) -> str:
|
||||
"""
|
||||
2026-04-18 ogt + Claude Opus 4.7: B 方案 — 餵 LLM 用的 JSON 序列化
|
||||
保留更多原始 context 給 LLM 推理,不做 30 字元暴力截斷
|
||||
@@ -582,7 +585,7 @@ class DriftNarratorService:
|
||||
# 一般變化
|
||||
return f"{from_val} → {to_val}"
|
||||
|
||||
def _fallback_items(self, report: "DriftReport") -> list[dict]:
|
||||
def _fallback_items(self, report: DriftReport) -> list[dict]:
|
||||
"""
|
||||
LLM 失敗時的 Python 智能摘要 (取代舊 str()[:30])
|
||||
- 過濾白名單
|
||||
@@ -605,7 +608,7 @@ class DriftNarratorService:
|
||||
})
|
||||
return items
|
||||
|
||||
def _format_intent_summary(self, interpretation: "DriftInterpretation | None") -> str:
|
||||
def _format_intent_summary(self, interpretation: DriftInterpretation | None) -> str:
|
||||
if not interpretation:
|
||||
return "無意圖分析"
|
||||
return (
|
||||
@@ -616,8 +619,8 @@ class DriftNarratorService:
|
||||
|
||||
def _fallback_narrative(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> str:
|
||||
"""LLM 失敗時的結構化 fallback"""
|
||||
resources = list({
|
||||
@@ -636,7 +639,7 @@ class DriftNarratorService:
|
||||
|
||||
async def _send_telegram(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
narrative: str,
|
||||
items: list[dict],
|
||||
recommendation: dict | None = None,
|
||||
@@ -667,7 +670,7 @@ class DriftNarratorService:
|
||||
except Exception as e:
|
||||
logger.warning("drift_narrator_telegram_error", error=str(e))
|
||||
|
||||
def _count_nontrivial_drift(self, report: "DriftReport") -> int:
|
||||
def _count_nontrivial_drift(self, report: DriftReport) -> int:
|
||||
"""
|
||||
計算非白名單、非 trivial (K8s 自動補齊) 的 drift 數
|
||||
用於 Telegram 底部「還有 N 項」顯示實際可操作數量
|
||||
@@ -704,7 +707,7 @@ class DriftNarratorService:
|
||||
|
||||
def _render_telegram_body(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
narrative: str,
|
||||
items: list[dict],
|
||||
recommendation: dict | None = None,
|
||||
|
||||
@@ -71,7 +71,7 @@ class BaselineState:
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d: dict[str, Any]) -> "BaselineState":
|
||||
def from_dict(cls, d: dict[str, Any]) -> BaselineState:
|
||||
return cls(
|
||||
metric_name=d["metric_name"],
|
||||
mean=d["mean"],
|
||||
@@ -250,6 +250,7 @@ class DynamicBaselineService:
|
||||
) -> list[MetricDatapoint]:
|
||||
"""從 Prometheus query_range API 抓取歷史資料(1h 步進)。"""
|
||||
import httpx
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
end_ts = now_taipei().timestamp()
|
||||
@@ -314,7 +315,7 @@ class DynamicBaselineService:
|
||||
seasonal="add" if len(arr) >= seasonal_periods * 2 else None,
|
||||
seasonal_periods=seasonal_periods,
|
||||
initialization_method="estimated",
|
||||
).fit(optimized=True, disp=False)
|
||||
).fit(optimized=True)
|
||||
|
||||
fitted = model.fittedvalues
|
||||
residuals = arr - fitted
|
||||
@@ -447,6 +448,7 @@ class DynamicBaselineService:
|
||||
"""從 PostgreSQL 載入最新一筆基線記錄"""
|
||||
try:
|
||||
from sqlalchemy import select
|
||||
|
||||
from src.db.base import get_session_factory
|
||||
from src.db.models import DynamicBaselineRecord
|
||||
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
"""
|
||||
Embedding Service - Ollama BGE-M3 替代方案
|
||||
==========================================
|
||||
Embedding Service - Ollama bge-m3:latest 專用向量化
|
||||
===================================================
|
||||
|
||||
使用 Ollama qwen2.5:7b-instruct 提供文本向量化功能。
|
||||
雖非專用 embedding 模型,但支援多語言 (繁中/英文)。
|
||||
使用 Ollama bge-m3:latest 提供文本向量化功能(1024 維)。
|
||||
bge-m3 為專用多語言 embedding 模型,支援繁中/英文語義搜尋。
|
||||
|
||||
Phase 13.2 #84 - RAG Tool 基礎設施
|
||||
ADR-110 2026-05-04: GCP-A Primary 升級 bge-m3(768→1024 維遷移)
|
||||
|
||||
版本: v1.1
|
||||
版本: v1.2
|
||||
建立日期: 2026-03-26 20:30 (台北時區)
|
||||
更新日期: 2026-03-29 20:50 (台北時區)
|
||||
更新日期: 2026-05-04 (台北時區) — ADR-110 bge-m3 升級
|
||||
建立者: Claude Code
|
||||
更新者: Claude Code (P1 修復: 維度配置化)
|
||||
更新者: ogt + Claude Sonnet 4.6 (ADR-110 GCP-A Primary)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -20,8 +21,8 @@ from typing import Protocol
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
from src.services.model_registry import get_model as _get_model
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -58,7 +59,7 @@ class OllamaEmbeddingService:
|
||||
Ollama Embedding Service
|
||||
|
||||
使用 Ollama API 進行文本向量化。
|
||||
預設使用 qwen2.5:7b-instruct (3584 維向量)。
|
||||
預設使用 bge-m3:latest (1024 維向量),來自 GCP-A (34.143.170.20)。
|
||||
|
||||
Usage:
|
||||
service = OllamaEmbeddingService()
|
||||
@@ -71,12 +72,16 @@ class OllamaEmbeddingService:
|
||||
"qwen2.5:3b-instruct": 2048,
|
||||
"llama3.2:3b": 3072,
|
||||
"nomic-embed-text": 768,
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP-A Primary — bge-m3 專用 embedding 模型
|
||||
# bge-m3 產生 1024 維向量;pgvector schema 已遷移至 vector(1024)(見 embedding_bge_m3_1024.sql)
|
||||
"bge-m3:latest": 1024,
|
||||
"bge-m3": 1024,
|
||||
}
|
||||
DEFAULT_DIMENSION = 3584 # 未知模型的預設值
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str = "qwen2.5:7b-instruct",
|
||||
model: str = "bge-m3:latest",
|
||||
ollama_url: str | None = None,
|
||||
timeout: float = 30.0,
|
||||
default_dimension: int | None = None,
|
||||
@@ -93,7 +98,7 @@ class OllamaEmbeddingService:
|
||||
P1 修復 (2026-03-29): 維度配置化,支援更多模型
|
||||
"""
|
||||
self._model = model
|
||||
self._ollama_url = ollama_url or settings.OLLAMA_URL
|
||||
self._ollama_url = ollama_url or resolve_ollama_endpoint("embedding")
|
||||
self._timeout = timeout
|
||||
self._default_dimension = default_dimension or self.MODEL_DIMENSIONS.get(
|
||||
model, self.DEFAULT_DIMENSION
|
||||
|
||||
@@ -12,16 +12,19 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
DEDUP_TTL_SEC = 600 # 10 min
|
||||
DEDUP_TTL_SEC = 600 # 10 min(故障切換用)
|
||||
RECOVERY_DEDUP_TTL_SEC = 3600 # 1h — GCP 健康閃爍時 1 小時內不重複告警
|
||||
QUOTA_DEDUP_TTL_SEC = 86400 # 24h(每日 quota 告警只發一次)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
|
||||
|
||||
|
||||
class FailoverAlerter:
|
||||
@@ -69,11 +72,16 @@ class FailoverAlerter:
|
||||
logger.info("failover_alert_sent", to_provider=to_provider)
|
||||
|
||||
async def alert_recovery(self, event: dict[str, Any]) -> None:
|
||||
"""Ollama 自動恢復告警 — 10min dedup
|
||||
"""Ollama 自動恢復告警 — 1h dedup per host
|
||||
# 2026-05-03 ogt: ADR-110 三層容災,恢復主機從 event["recovered_host"] 動態讀取
|
||||
# 2026-05-04 ogt: dedup key 加 recovered_host + 改 1h TTL
|
||||
# 原 key 固定 "alert:recovery" → GCP-A 每 10min 閃爍就重發
|
||||
"""
|
||||
dedup_key = "alert:recovery"
|
||||
if not await self._check_dedup(dedup_key, ttl=DEDUP_TTL_SEC):
|
||||
recovered_host = event.get("recovered_host", event.get("to_provider", "ollama"))
|
||||
# sanitize host → 只留 IP/hostname 部分,拿掉 http:// 前綴與 port
|
||||
safe_host = str(recovered_host).replace("http://", "").replace("/", "_").replace(":", "_")
|
||||
dedup_key = f"alert:recovery:{safe_host}"
|
||||
if not await self._check_dedup(dedup_key, ttl=RECOVERY_DEDUP_TTL_SEC):
|
||||
logger.debug("recovery_alert_dedup_skipped")
|
||||
return
|
||||
|
||||
@@ -82,8 +90,6 @@ class FailoverAlerter:
|
||||
from_provider = event.get("from_provider", event.get("from", "?"))
|
||||
to_provider = event.get("to_provider", event.get("to", "ollama"))
|
||||
recovery_time = event.get("recovery_time", datetime.now(TAIPEI_TZ).isoformat())
|
||||
# 2026-05-03 ogt: ADR-110 — 恢復主機動態,不再硬編碼 111
|
||||
recovered_host = event.get("recovered_host", to_provider)
|
||||
|
||||
msg = (
|
||||
f"*Ollama 自動恢復*\n\n"
|
||||
@@ -128,46 +134,7 @@ class FailoverAlerter:
|
||||
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
|
||||
return
|
||||
|
||||
status = _escape_md(str(payload.get("status", "warning")))
|
||||
impact = _as_dict(payload.get("impact"))
|
||||
remediation = _as_dict(payload.get("remediation"))
|
||||
actionable = _as_dict(payload.get("actionable"))
|
||||
|
||||
impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
|
||||
remediation_lines = _lines_from_list(remediation.get("items"))
|
||||
remediation_next_action = remediation.get("next_action")
|
||||
remediation_hint = remediation.get("hint")
|
||||
actionable_lines = _lines_from_list(actionable.get("items"))
|
||||
|
||||
next_action_line = ""
|
||||
if remediation_next_action:
|
||||
next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
|
||||
if remediation_hint:
|
||||
next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
|
||||
|
||||
sections: list[str] = [
|
||||
"⚠️ *AI 治理警報*",
|
||||
f"\n類型:{_escape_md(event_type)}",
|
||||
f"狀態:{status}",
|
||||
]
|
||||
if impact_lines:
|
||||
sections.append(f"\n*影響*\n{impact_lines}")
|
||||
if remediation_lines or next_action_line:
|
||||
sections.append(f"\n*修復方向*")
|
||||
if remediation_lines:
|
||||
sections.append(remediation_lines)
|
||||
if next_action_line:
|
||||
sections.append(next_action_line)
|
||||
if actionable_lines:
|
||||
sections.append(f"\n*可直接自動化*\n{actionable_lines}")
|
||||
|
||||
fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
|
||||
if fallback_items:
|
||||
sections.append(
|
||||
"\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
|
||||
)
|
||||
|
||||
msg = "\n".join(sections)
|
||||
msg = format_governance_alert_card(event_type, payload)
|
||||
await self._send(msg)
|
||||
logger.info("governance_alert_sent", event_type=event_type)
|
||||
|
||||
@@ -281,8 +248,8 @@ class FailoverAlerter:
|
||||
2026-04-25 P1.5 by Claude Engineer-D — 告警失敗不能阻斷主流程
|
||||
"""
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
from src.core.config import get_settings
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
settings = get_settings()
|
||||
chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) or getattr(settings, "OPENCLAW_TG_CHAT_ID", None)
|
||||
@@ -295,7 +262,13 @@ class FailoverAlerter:
|
||||
logger.info("telegram_failover_alert_sent", message_len=len(message))
|
||||
except Exception as e:
|
||||
# 不 raise — 告警失敗不該阻斷主流程(鐵律)
|
||||
logger.exception("telegram_failover_send_failed", error=str(e))
|
||||
# 2026-05-06 Codex: Telegram/httpx exception 字串可能包含 bot token URL,
|
||||
# 禁止用 logger.exception 輸出 chained traceback。
|
||||
logger.warning(
|
||||
"telegram_failover_send_failed",
|
||||
error=_sanitize_telegram_error(str(e)),
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
@@ -315,10 +288,189 @@ def _escape_md(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def _sanitize_telegram_error(text: str) -> str:
    """Mask the token inside any Telegram Bot API URL so it never reaches the logs."""
    redacted = _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
    return redacted
|
||||
|
||||
|
||||
def _as_dict(value: Any) -> dict[str, Any]:
|
||||
return value if isinstance(value, dict) else {}
|
||||
|
||||
|
||||
_EVENT_DISPLAY_NAMES = {
|
||||
"trust_drift": "信任漂移",
|
||||
"knowledge_degradation": "知識庫劣化",
|
||||
"governance_slo_data_gap": "SLO 資料缺口",
|
||||
"governance_self_failure": "治理自檢失敗",
|
||||
"llm_hallucination": "LLM 驗證失敗",
|
||||
"execution_blast_radius": "執行風險擴大",
|
||||
}
|
||||
|
||||
_STATUS_BADGES = {
|
||||
"critical": "🔴 critical",
|
||||
"error": "🔴 error",
|
||||
"violation": "🔴 violation",
|
||||
"warning": "🟡 warning",
|
||||
"degraded": "🟠 degraded",
|
||||
"ok": "🟢 ok",
|
||||
}
|
||||
|
||||
_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
|
||||
"trust_drift": [
|
||||
("drifted_count", "漂移 Playbook"),
|
||||
("total_playbooks", "總 Playbook"),
|
||||
("drift_ratio", "漂移比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
("auto_deprecated_count", "自動停用"),
|
||||
],
|
||||
"knowledge_degradation": [
|
||||
("stale_count", "陳舊 KM"),
|
||||
("total_count", "總 KM"),
|
||||
("stale_ratio", "陳舊比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
("stale_days", "陳舊天數"),
|
||||
],
|
||||
"governance_slo_data_gap": [
|
||||
("reason", "缺口原因"),
|
||||
("skipped_count", "略過指標"),
|
||||
("all_slo_metrics_not_emitted", "SLO 指標缺失"),
|
||||
],
|
||||
"governance_self_failure": [
|
||||
("failed_checks", "失敗檢查"),
|
||||
("total_checks", "總檢查"),
|
||||
("failure_rate", "失敗比例"),
|
||||
],
|
||||
"execution_blast_radius": [
|
||||
("affected_services", "受影響服務"),
|
||||
("blast_radius", "爆炸半徑"),
|
||||
("threshold", "警戒門檻"),
|
||||
],
|
||||
"llm_hallucination": [
|
||||
("failed", "驗證失敗"),
|
||||
("rate", "失敗比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _event_display_name(event_type: str) -> str:
|
||||
if event_type in _EVENT_DISPLAY_NAMES:
|
||||
return _EVENT_DISPLAY_NAMES[event_type]
|
||||
if event_type.startswith("slo_"):
|
||||
return "SLO 違反"
|
||||
return event_type.replace("_", " ").strip().title()
|
||||
|
||||
|
||||
def _status_badge(status: Any) -> str:
|
||||
status_text = str(status or "warning")
|
||||
return _STATUS_BADGES.get(status_text.lower(), status_text)
|
||||
|
||||
|
||||
def _format_metric_value(key: str, value: Any) -> str:
|
||||
if isinstance(value, bool):
|
||||
return "是" if value else "否"
|
||||
if isinstance(value, (float, int)) and (
|
||||
key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
|
||||
):
|
||||
return f"{float(value) * 100:.1f}%"
|
||||
if isinstance(value, list):
|
||||
if not value:
|
||||
return "0"
|
||||
shown = ", ".join(str(item) for item in value[:3])
|
||||
if len(value) > 3:
|
||||
shown += f"…(共 {len(value)})"
|
||||
return shown
|
||||
return str(value)
|
||||
|
||||
|
||||
def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
|
||||
if not data:
|
||||
return []
|
||||
|
||||
used: set[str] = set()
|
||||
rows: list[str] = []
|
||||
for key, label in _IMPACT_PROFILES.get(event_type, []):
|
||||
if key in data:
|
||||
rows.append(f"{label}:{_format_metric_value(key, data[key])}")
|
||||
used.add(key)
|
||||
|
||||
for key in sorted(data.keys()):
|
||||
if len(rows) >= max_rows:
|
||||
break
|
||||
if key in used:
|
||||
continue
|
||||
rows.append(f"{key}:{_format_metric_value(key, data[key])}")
|
||||
|
||||
if len(data) > len(used) + max(0, max_rows - len(rows)):
|
||||
rows.append("更多欄位已收斂至 AwoooP 稽核資料")
|
||||
return rows[:max_rows]
|
||||
|
||||
|
||||
def _tree_lines(rows: list[str]) -> str:
    """Render rows as a tree: "├" before every row except the last, which gets "└".

    Each row is passed through ``_escape_md`` before it is emitted. An empty
    list yields an empty string.
    """
    if not rows:
        return ""

    last_index = len(rows) - 1
    lines: list[str] = []
    for position, row in enumerate(rows):
        if position == last_index:
            marker = "└"
        else:
            marker = "├"
        lines.append(f"{marker} {_escape_md(str(row))}")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    """Return the impact summary as tree-formatted text (profiled rows, then tree rendering)."""
    return _tree_lines(_profiled_rows(event_type, impact))
|
||||
|
||||
|
||||
def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """Format the AI-governance Telegram card.

    2026-05-07 Codex — the governance payload itself is left intact; only at
    the Telegram boundary are the raw key/value pairs turned into a scannable
    card, so that long plain-text field dumps stop flooding the chat.

    Args:
        event_type: Governance event type; shown both as a display title and
            as the raw type.
        payload: Event payload. The ``status``, ``impact``, ``remediation``
            and ``actionable`` entries get dedicated sections; a non-dict
            payload is treated as empty.

    Returns:
        The card text, sections joined by newlines.
    """
    if not isinstance(payload, dict):
        payload = {}

    impact = _as_dict(payload.get("impact"))
    remediation = _as_dict(payload.get("remediation"))
    actionable = _as_dict(payload.get("actionable"))
    status = payload.get("status", "warning")

    # Header: title, rule, raw type and status badge.
    card: list[str] = [
        f"⚠️ *AI 治理警報|{_escape_md(_event_display_name(event_type))}*",
        "──────────────────────",
        f"類型:{_escape_md(event_type)}",
        f"狀態:{_escape_md(_status_badge(status))}",
    ]

    # Impact summary.
    impact_block = _governance_summary_lines(event_type, impact)
    if impact_block:
        card += ["", "🧭 *影響摘要*", impact_block]

    # Remediation: item list, next action and hint are each optional.
    fix_items = _lines_from_list(remediation.get("items"))
    next_step = remediation.get("next_action")
    hint = remediation.get("hint")
    if fix_items or next_step or hint:
        card += ["", "🛠️ *修復方向*"]
        if fix_items:
            card.append(fix_items)
        if next_step:
            card.append(f"▶️ 下一步:{_escape_md(str(next_step))}")
        if hint:
            card.append(f"💡 提示:{_escape_md(str(hint))}")

    # Work that can be automated.
    auto_items = _lines_from_list(actionable.get("items"))
    if auto_items:
        card += ["", "🤖 *可自動化工作*", auto_items]

    # Remaining top-level fields, capped; the keys handled above are passed as `keep`.
    extra_rows = _fallback_pairs(
        payload,
        keep={"status", "impact", "remediation", "actionable"},
        max_items=4,
    )
    if extra_rows:
        card += ["", "📎 *補充欄位*", "\n".join(extra_rows)]

    return "\n".join(card)
|
||||
|
||||
|
||||
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
|
||||
if not data:
|
||||
return ""
|
||||
@@ -330,7 +482,7 @@ def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool =
|
||||
rows.append(f"{_escape_md(str(k))}:{_escape_md(str(data.get(k)))}")
|
||||
idx += 1
|
||||
if compact and len(rows) >= max_items:
|
||||
rows.append("...(更多欄位略)")
|
||||
rows.append(_escape_md("...(更多欄位略)"))
|
||||
return "\n".join(f" {line}" for line in rows)
|
||||
|
||||
|
||||
@@ -338,12 +490,17 @@ def _lines_from_list(value: Any) -> str:
|
||||
if not isinstance(value, list):
|
||||
return ""
|
||||
return "\n".join(
|
||||
f" {idx + 1}. {_escape_md(str(item))}"
|
||||
f" {_escape_md(str(idx + 1))}\\. {_escape_md(str(item))}"
|
||||
for idx, item in enumerate(value)
|
||||
)
|
||||
|
||||
|
||||
def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
|
||||
def _fallback_pairs(
|
||||
payload: dict[str, Any],
|
||||
keep: set[str] | None = None,
|
||||
*,
|
||||
max_items: int | None = None,
|
||||
) -> list[str]:
|
||||
if not isinstance(payload, dict):
|
||||
return []
|
||||
keep = set(keep or set())
|
||||
@@ -351,6 +508,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> li
|
||||
for key in sorted(payload.keys()):
|
||||
if key in keep:
|
||||
continue
|
||||
if max_items is not None and len(rows) >= max_items:
|
||||
rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
|
||||
break
|
||||
rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}")
|
||||
return rows
|
||||
|
||||
|
||||
@@ -237,6 +237,31 @@ class FlywheelStatsService:
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
continue
|
||||
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# 執行成功率的 source of truth 是 auto_repair_executions。
|
||||
# Redis playbook success_count/failure_count 會因回寫鏈路中斷而落後,
|
||||
# 造成 governance / heartbeat 判定「飛輪沒有執行」。
|
||||
try:
|
||||
async with get_db_context() as db:
|
||||
row = await db.execute(
|
||||
text("""
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
|
||||
COUNT(*) AS total
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
""")
|
||||
)
|
||||
repair_stats = row.one()
|
||||
db_total_exec = int(repair_stats.total or 0)
|
||||
if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
|
||||
db_total_success = int(repair_stats.success or 0)
|
||||
return count, db_total_success / db_total_exec
|
||||
if db_total_exec > 0:
|
||||
return count, None
|
||||
except Exception:
|
||||
logger.warning("flywheel_stats_auto_repair_execution_query_failed")
|
||||
|
||||
if total_exec < FLYWHEEL_MIN_SAMPLE:
|
||||
# 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷
|
||||
return count, None
|
||||
|
||||
@@ -72,13 +72,15 @@ class GovernanceAgent:
|
||||
# 1. Playbook 信任度漂移
|
||||
# =========================================================================
|
||||
|
||||
async def check_trust_drift(self) -> dict[str, Any]:
|
||||
async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
|
||||
"""Playbook trust_score < 0.2 → 告警建議廢棄;30 天沒用過的直接 auto-deprecate
|
||||
|
||||
2026-04-26 P2.2 by Claude
|
||||
2026-05-02 ogt + Claude Sonnet 4.6: 加 auto_deprecate_low_trust_unused 自治路徑
|
||||
守衛條件:trust < 0.2 AND (last_used_at < 30 天前 OR 從未使用且創建超過 30 天)
|
||||
→ status 改 'deprecated',alert 改報「N 個告警 + M 個 auto-deprecated」
|
||||
2026-05-05 Codex: emit_alert=False 供 W-6 watchdog 查詢統計,維持
|
||||
governance_agent 單一入口,但避免與 hourly self-check 發出雙重 Telegram。
|
||||
"""
|
||||
async with get_db_context() as db:
|
||||
result = await db.execute(
|
||||
@@ -118,7 +120,7 @@ class GovernanceAgent:
|
||||
ids=auto_deprecated_ids[:10],
|
||||
)
|
||||
|
||||
if drifted:
|
||||
if drifted and emit_alert:
|
||||
drift_ratio = len(drifted) / total if total > 0 else 0.0
|
||||
await self._alert(
|
||||
"trust_drift",
|
||||
@@ -163,9 +165,11 @@ class GovernanceAgent:
|
||||
auto_deprecated=len(auto_deprecated_ids),
|
||||
kept=len(kept_ids),
|
||||
)
|
||||
drift_ratio = len(drifted) / total if total > 0 else 0.0
|
||||
return {
|
||||
"checked": total,
|
||||
"drifted": len(drifted),
|
||||
"drift_ratio": drift_ratio,
|
||||
"auto_deprecated": len(auto_deprecated_ids),
|
||||
"kept": len(kept_ids),
|
||||
}
|
||||
|
||||
@@ -16,15 +16,19 @@ Tier 3 鐵線(絕不觸碰):
|
||||
- 本模組透過 DecisionFusionAdapter(wrapper)間接使用這些能力
|
||||
|
||||
2026-05-03 ogt + Claude Sonnet 4.6(亞太): GovernanceDispatcher Wave 2E 實作
|
||||
2026-05-04 ogt + Claude Sonnet 4.6(亞太): skip 路徑無限迴圈修復
|
||||
- skip 決策後設 Redis 90min 冷卻,避免重複 LLM 呼叫
|
||||
- 超過 2 小時的 stale skip 事件標記 resolved=True(新事件若問題持續會重新產生)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import select, update
|
||||
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import AiGovernanceEvent
|
||||
@@ -46,6 +50,14 @@ logger = structlog.get_logger(__name__)
|
||||
# TODO: 移到 settings,允許運維不重啟調整 poll 間隔
|
||||
_DISPATCHER_INTERVAL_SEC: int = 30
|
||||
|
||||
# Skip 冷卻時間(秒):skip 決策後 90 分鐘內不重新評估同一事件
|
||||
# 原因:skip = 信心度不足,短期內 playbook trust / MCP 指標不會驟變
|
||||
_SKIP_COOLDOWN_SEC: int = 5400 # 90 分鐘
|
||||
|
||||
# Stale 事件閾值(秒):超過此時間的 skip 事件直接標 resolved
|
||||
# 原因:持久問題會由 governance_agent 重新產生新事件;舊事件繼續留著只是積壓
|
||||
_STALE_EVENT_SEC: int = 7200 # 2 小時
|
||||
|
||||
# 每輪最多處理幾個事件(避免單輪阻塞過長)
|
||||
_MAX_EVENTS_PER_CYCLE: int = 10
|
||||
|
||||
@@ -59,6 +71,54 @@ _DISPATCHABLE_EVENT_TYPES: frozenset[str] = frozenset({
|
||||
})
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Redis 冷卻 helpers(防止 skip 事件無限重評迴圈)
|
||||
# =============================================================================
|
||||
|
||||
async def _is_skip_cooldown(event_id: str) -> bool:
    """Return whether the event is still inside its skip cooldown window.

    The cooldown is represented by the Redis key
    ``governance:skip:<event_id>``; the key existing means a skip decision
    was recorded for this event and the cooldown has not expired yet.

    Args:
        event_id: Identifier of the governance event to check.

    Returns:
        True if the cooldown key exists. False if it does not exist, or if
        the Redis client could not be obtained / the lookup raised (fail-open:
        the caller then proceeds as if no cooldown were active).
    """
    try:
        # Imported lazily inside the function, mirroring _set_skip_cooldown.
        from src.core.redis_client import get_redis

        redis = get_redis()
        return bool(await redis.exists(f"governance:skip:{event_id}"))
    except Exception as exc:
        # Still fail-open, but no longer silent: when this check fails the
        # cooldown is effectively disabled, so the failure must be visible
        # in logs (same event/field style as _set_skip_cooldown).
        logger.warning(
            "governance_skip_cooldown_check_failed",
            event_id=event_id,
            error=str(exc),
        )
        return False
|
||||
|
||||
|
||||
async def _set_skip_cooldown(event_id: str) -> None:
    """Start the skip cooldown for an event to avoid repeated LLM calls.

    Writes the Redis key ``governance:skip:<event_id>`` with value ``"1"``
    and a TTL of ``_SKIP_COOLDOWN_SEC`` seconds.

    Best-effort: any exception is logged as a warning and swallowed, so this
    function never raises to the caller.

    Args:
        event_id: Identifier of the governance event that was skipped.
    """
    try:
        from src.core.redis_client import get_redis

        client = get_redis()
        cooldown_key = f"governance:skip:{event_id}"
        await client.setex(cooldown_key, _SKIP_COOLDOWN_SEC, "1")
    except Exception as err:
        logger.warning(
            "governance_skip_cooldown_set_failed",
            event_id=event_id,
            error=str(err),
        )
|
||||
|
||||
|
||||
async def _mark_event_resolved(event_id: str, reason: str) -> None:
    """Mark a stale skip event as resolved.

    Persistent problems are expected to be re-raised as new events by
    governance_agent, so keeping the old event open only adds backlog.

    Aligned with the model design: ``resolved=True`` is "back-filled on the
    next computation"; a dispatcher skip means the system judged that it
    cannot auto-repair right now, which counts as one completed computation.

    Best-effort: any exception is logged as a warning and swallowed.

    Args:
        event_id: Identifier of the governance event to resolve.
        reason: Free-form reason string; only used in the info log.
    """
    try:
        # NOTE(review): presumably returns the current time in the Taipei
        # timezone — confirm against src.utils.timezone.
        from src.utils.timezone import now_taipei
        async with get_db_context() as db:
            # Only rows that are still unresolved are updated, so repeating
            # the call for an already-resolved event changes nothing.
            # NOTE(review): no explicit commit here — presumably
            # get_db_context commits on exit; confirm.
            await db.execute(
                update(AiGovernanceEvent)
                .where(AiGovernanceEvent.id == event_id)
                .where(AiGovernanceEvent.resolved.is_(False))
                .values(resolved=True, resolved_at=now_taipei())
            )
        # Emitted without checking how many rows the UPDATE matched.
        logger.info(
            "governance_event_stale_resolved",
            event_id=event_id,
            reason=reason,
        )
    except Exception as exc:
        logger.warning("governance_event_resolve_failed", event_id=event_id, error=str(exc))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 核心函數
|
||||
# =============================================================================
|
||||
@@ -75,6 +135,15 @@ async def dispatch_governance_event(event: AiGovernanceEvent) -> str | None:
|
||||
event_id = event.id
|
||||
event_type = event.event_type
|
||||
|
||||
# Step 0: Redis skip 冷卻檢查(防止 skip 事件每 30s 重新做 LLM 呼叫)
|
||||
if await _is_skip_cooldown(event_id):
|
||||
logger.debug(
|
||||
"governance_dispatch_skip_cooldown",
|
||||
event_id=event_id,
|
||||
event_type=event_type,
|
||||
)
|
||||
return None
|
||||
|
||||
# Step 1: 檢查是否已有活躍 dispatch(冪等保護)
|
||||
existing = await get_active_for_event(event_id)
|
||||
if existing is not None:
|
||||
@@ -108,12 +177,31 @@ async def dispatch_governance_event(event: AiGovernanceEvent) -> str | None:
|
||||
|
||||
# Step 3: 依 decision_path 決定要不要寫 dispatch
|
||||
if decision.decision_path == "skip":
|
||||
# 2026-05-04 ogt: 修復無限迴圈根因
|
||||
# skip 決策後設 90min Redis 冷卻,避免每 30s 重新呼叫 LLM
|
||||
# 超過 2h 的 stale 事件直接標 resolved(持久問題由 governance_agent 重新產生新事件)
|
||||
await _set_skip_cooldown(event_id)
|
||||
|
||||
triggered_at_aware = event.triggered_at
|
||||
if triggered_at_aware is not None and triggered_at_aware.tzinfo is None:
|
||||
triggered_at_aware = triggered_at_aware.replace(tzinfo=timezone.utc)
|
||||
event_age_sec = (
|
||||
(datetime.now(timezone.utc) - triggered_at_aware).total_seconds()
|
||||
if triggered_at_aware is not None else 0
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"governance_dispatch_path_skip",
|
||||
event_id=event_id,
|
||||
event_type=event_type,
|
||||
confidence=round(decision.confidence, 4),
|
||||
event_age_sec=int(event_age_sec),
|
||||
stale=event_age_sec > _STALE_EVENT_SEC,
|
||||
)
|
||||
|
||||
if event_age_sec > _STALE_EVENT_SEC:
|
||||
await _mark_event_resolved(event_id, reason=f"skip_stale_{int(event_age_sec)}s")
|
||||
|
||||
return None
|
||||
|
||||
# Step 4: 決定 executor_type 與 dispatch_status
|
||||
|
||||
@@ -15,7 +15,7 @@ HeartbeatReportService — ADR-073 心跳監控重構
|
||||
import asyncio
|
||||
import html
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
@@ -108,6 +108,7 @@ class HeartbeatReport:
|
||||
timestamp: datetime
|
||||
ai_services: dict[str, ProbeResult] = field(default_factory=dict)
|
||||
ollama_models: dict[str, bool] = field(default_factory=dict)
|
||||
ollama_endpoints: dict[str, ProbeResult] = field(default_factory=dict)
|
||||
mcp_providers: dict[str, ProbeResult] = field(default_factory=dict)
|
||||
flywheel: FlywheelStats = field(default_factory=FlywheelStats)
|
||||
infra: dict[str, ProbeResult] = field(default_factory=dict)
|
||||
@@ -181,6 +182,7 @@ class HeartbeatReportService:
|
||||
ollama_data = collected["_ollama"] or {}
|
||||
report.ai_services["ollama"] = ollama_data.get("probe", ProbeResult(False, "❌ 無回應"))
|
||||
report.ollama_models = ollama_data.get("models", {})
|
||||
report.ollama_endpoints = ollama_data.get("endpoints", {})
|
||||
report.ai_services["nemotron"] = collected["_nemotron"] or ProbeResult(False, "❌ 無回應")
|
||||
report.ai_services["gemini"] = collected["_gemini"] or ProbeResult(False, "❌ 無回應")
|
||||
report.ai_services["claude"] = collected["_claude"] or ProbeResult(False, "❌ 無回應")
|
||||
@@ -224,37 +226,62 @@ class HeartbeatReportService:
|
||||
|
||||
async def _probe_ollama(self) -> dict:
|
||||
"""探測 Ollama 服務 + 逐一確認所需模型"""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
|
||||
endpoints = [
|
||||
("GCP-A", settings.OLLAMA_URL),
|
||||
("GCP-B", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
|
||||
("111", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
|
||||
]
|
||||
|
||||
async def _probe_endpoint(
|
||||
client: httpx.AsyncClient,
|
||||
label: str,
|
||||
url: str,
|
||||
) -> tuple[str, ProbeResult, set[str]]:
|
||||
if not url:
|
||||
return label, ProbeResult(False, "⚠️ 未設定"), set()
|
||||
try:
|
||||
t0 = asyncio.get_event_loop().time()
|
||||
resp = await client.get(f"{settings.OLLAMA_URL}/api/tags")
|
||||
resp = await client.get(f"{url}/api/tags")
|
||||
latency = (asyncio.get_event_loop().time() - t0) * 1000
|
||||
if resp.status_code != 200:
|
||||
return label, ProbeResult(False, f"❌ HTTP {resp.status_code}", latency), set()
|
||||
available = {m["name"] for m in resp.json().get("models", [])}
|
||||
return label, ProbeResult(True, "✅ 正常", round(latency, 1)), available
|
||||
except Exception as e:
|
||||
return label, ProbeResult(False, f"❌ {str(e)[:60]}"), set()
|
||||
|
||||
if resp.status_code != 200:
|
||||
return {
|
||||
"probe": ProbeResult(False, f"❌ HTTP {resp.status_code}", latency),
|
||||
"models": {},
|
||||
}
|
||||
async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
|
||||
results = await asyncio.gather(
|
||||
*[_probe_endpoint(client, label, url) for label, url in endpoints],
|
||||
)
|
||||
|
||||
available = {m["name"] for m in resp.json().get("models", [])}
|
||||
endpoint_status = {label: probe for label, probe, _available in results}
|
||||
primary_probe = endpoint_status.get("GCP-A", ProbeResult(False, "❌ 無回應"))
|
||||
primary_available = next(
|
||||
(available for label, _probe, available in results if label == "GCP-A"),
|
||||
set(),
|
||||
)
|
||||
|
||||
if primary_probe.ok:
|
||||
# 也把 short name(無 :tag)加進去方便匹配
|
||||
available_short = {n.split(":")[0] for n in available}
|
||||
available_short = {n.split(":")[0] for n in primary_available}
|
||||
|
||||
model_status: dict[str, bool] = {}
|
||||
for required in settings.OLLAMA_REQUIRED_MODELS:
|
||||
req_short = required.split(":")[0]
|
||||
ok = required in available or req_short in available_short
|
||||
ok = required in primary_available or req_short in available_short
|
||||
model_status[required] = ok
|
||||
|
||||
return {
|
||||
"probe": ProbeResult(True, "✅ 正常", round(latency, 1)),
|
||||
"probe": primary_probe,
|
||||
"models": model_status,
|
||||
"endpoints": endpoint_status,
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"probe": ProbeResult(False, f"❌ {str(e)[:60]}"),
|
||||
"models": {},
|
||||
}
|
||||
|
||||
return {
|
||||
"probe": primary_probe,
|
||||
"models": {},
|
||||
"endpoints": endpoint_status,
|
||||
}
|
||||
|
||||
async def _probe_nemotron(self) -> ProbeResult:
|
||||
"""探測 Nemotron NIM API"""
|
||||
@@ -317,11 +344,20 @@ class HeartbeatReportService:
|
||||
"""K8s MCP:確認 kubectl 能連到 K3s"""
|
||||
try:
|
||||
from src.plugins.mcp.providers.k8s_provider import K8sProvider
|
||||
provider = K8sProvider()
|
||||
from src.plugins.mcp.registry import AuditedMCPToolProvider
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
provider = AuditedMCPToolProvider(K8sProvider())
|
||||
if not provider.enabled:
|
||||
return ProbeResult(False, "⚠️ K8s MCP 未啟用")
|
||||
params = with_mcp_audit_context(
|
||||
{"resource_type": "nodes"},
|
||||
session_id="heartbeat:mcp_k8s",
|
||||
flywheel_node="govern",
|
||||
agent_role="heartbeat_report_service",
|
||||
gateway_path="legacy_heartbeat_provider",
|
||||
)
|
||||
result = await asyncio.wait_for(
|
||||
provider.execute("kubectl_get", {"resource_type": "nodes"}),
|
||||
provider.execute("kubectl_get", params),
|
||||
timeout=_PROBE_TIMEOUT,
|
||||
)
|
||||
if result.success:
|
||||
@@ -389,14 +425,23 @@ class HeartbeatReportService:
|
||||
"""Velero 備份:確認最後一次備份是否在 26 小時內"""
|
||||
try:
|
||||
from src.plugins.mcp.providers.k8s_provider import K8sProvider
|
||||
provider = K8sProvider()
|
||||
from src.plugins.mcp.registry import AuditedMCPToolProvider
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
provider = AuditedMCPToolProvider(K8sProvider())
|
||||
if not provider.enabled:
|
||||
return ProbeResult(False, "⚠️ K8s MCP 未啟用,無法查 Velero")
|
||||
result = await asyncio.wait_for(
|
||||
provider.execute("kubectl_get", {
|
||||
params = with_mcp_audit_context(
|
||||
{
|
||||
"resource_type": "backups.velero.io",
|
||||
"namespace": "velero",
|
||||
}),
|
||||
},
|
||||
session_id="heartbeat:velero",
|
||||
flywheel_node="govern",
|
||||
agent_role="heartbeat_report_service",
|
||||
gateway_path="legacy_heartbeat_provider",
|
||||
)
|
||||
result = await asyncio.wait_for(
|
||||
provider.execute("kubectl_get", params),
|
||||
timeout=_PROBE_TIMEOUT,
|
||||
)
|
||||
if not result.success:
|
||||
@@ -419,9 +464,11 @@ class HeartbeatReportService:
|
||||
|
||||
try:
|
||||
# KM 向量化率(DB 查詢)
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord, KnowledgeEntryRecord
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy import text as sa_text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import KnowledgeEntryRecord
|
||||
async with get_db_context() as db:
|
||||
# KM 總數
|
||||
km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord))
|
||||
@@ -436,20 +483,22 @@ class HeartbeatReportService:
|
||||
stats.km_vectorized = vec_result.scalar() or 0
|
||||
|
||||
# 24h 修復統計
|
||||
since = datetime.utcnow() - timedelta(hours=24)
|
||||
outcomes = await db.execute(
|
||||
select(IncidentRecord.outcome).where(
|
||||
IncidentRecord.created_at >= since,
|
||||
IncidentRecord.outcome.isnot(None),
|
||||
)
|
||||
)
|
||||
outcome_list = [r[0] for r in outcomes.all() if r[0]]
|
||||
stats.attempt_24h = len(outcome_list)
|
||||
stats.success_24h = sum(
|
||||
1 for o in outcome_list
|
||||
if isinstance(o, dict) and o.get("execution_success")
|
||||
or isinstance(o, str) and "success" in o.lower()
|
||||
# 2026-05-06 ogt + Codex:
|
||||
# incidents.outcome 已不是自動修復 source of truth。實際執行紀錄
|
||||
# 寫在 auto_repair_executions;舊查詢會讓心跳報告顯示 0/15,
|
||||
# 造成「全系統正常」但飛輪 KPI 失真的假象。
|
||||
repair_result = await db.execute(
|
||||
sa_text("""
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE success IS TRUE) AS success,
|
||||
COUNT(*) AS total
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
""")
|
||||
)
|
||||
repair_row = repair_result.one()
|
||||
stats.success_24h = int(repair_row.success or 0)
|
||||
stats.attempt_24h = int(repair_row.total or 0)
|
||||
|
||||
# 最後學習活動
|
||||
last_km = await db.scalar(
|
||||
@@ -470,8 +519,9 @@ class HeartbeatReportService:
|
||||
"""查 24h 告警流水線統計(approval_records)"""
|
||||
stats = AlertPipelineStats()
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from sqlalchemy import text as sa_text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
async with get_db_context() as db:
|
||||
r = await db.execute(sa_text("""
|
||||
SELECT
|
||||
@@ -497,8 +547,9 @@ class HeartbeatReportService:
|
||||
"""探測 PostgreSQL 與 Redis 連線健康"""
|
||||
s = DbRedisStats()
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from sqlalchemy import text as sa_text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
async with get_db_context() as db:
|
||||
await db.execute(sa_text("SELECT 1"))
|
||||
s.db_ok = True
|
||||
@@ -632,8 +683,9 @@ class HeartbeatReportService:
|
||||
logger.debug("heartbeat_automation_redis_failed", error=str(e))
|
||||
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from sqlalchemy import text as sa_text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
async with get_db_context() as db:
|
||||
# 今日新增 KM(timestamptz 直接比較,不需 AT TIME ZONE)
|
||||
km_today = await db.scalar(sa_text(
|
||||
@@ -666,6 +718,10 @@ class HeartbeatReportService:
|
||||
if not loaded:
|
||||
warnings.append(f"{model} 未載入,相關功能失效")
|
||||
|
||||
for name, probe in report.ollama_endpoints.items():
|
||||
if not probe.ok and not probe.status.startswith("⚠️ 未設定"):
|
||||
warnings.append(f"Ollama {name} 異常: {probe.status}")
|
||||
|
||||
# AI 服務異常
|
||||
for name, probe in report.ai_services.items():
|
||||
if not probe.ok and not probe.status.startswith("⚠️"):
|
||||
@@ -796,6 +852,12 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
|
||||
lines.append("🤖 <b>AI 服務</b>")
|
||||
lines.append(f"├─ Ollama: {ollama.status}{ollama_lat} <code>{html.escape(models_str)}</code>")
|
||||
if report.ollama_endpoints:
|
||||
endpoint_items = list(report.ollama_endpoints.items())
|
||||
for idx, (name, probe) in enumerate(endpoint_items):
|
||||
branch = "└" if idx == len(endpoint_items) - 1 else "├"
|
||||
latency = f" {probe.latency_ms:.0f}ms" if probe.latency_ms else ""
|
||||
lines.append(f"│ {branch}─ {html.escape(name)}: {probe.status}{latency}")
|
||||
lines.append(f"├─ Nemotron NIM: {nem.status}" + (f" {nem.latency_ms:.0f}ms" if nem.latency_ms else ""))
|
||||
lines.append(f"├─ Gemini API: {gem.status}" + (f" {gem.latency_ms:.0f}ms" if gem.latency_ms else ""))
|
||||
lines.append(f"└─ Claude API: {cla.status}" + (f" {cla.latency_ms:.0f}ms" if cla.latency_ms else ""))
|
||||
@@ -865,9 +927,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
lines.append("☸️ <b>Kubernetes Pods</b>")
|
||||
for i, pod in enumerate(report.pods):
|
||||
prefix = "└─" if i == len(report.pods) - 1 else "├─"
|
||||
ready_icon = "✅" if pod.ready else "❌"
|
||||
ready_icon = "✅" if pod.ready or pod.status in ("Succeeded", "Completed") else "❌"
|
||||
restart_str = f" (重啟×{pod.restarts})" if pod.restarts > 0 else ""
|
||||
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}")
|
||||
status_str = "" if pod.ready else f" <code>{html.escape(pod.status)}</code>"
|
||||
lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}{status_str}")
|
||||
|
||||
# --- Scanner 狀態 ---
|
||||
if report.scanners.last_runs:
|
||||
|
||||
@@ -7,7 +7,7 @@ Hosts:
|
||||
- 192.168.0.110: DevOps 金庫 (Harbor, GH Runner)
|
||||
- 192.168.0.112: Kali Security (Scanner API)
|
||||
- 192.168.0.120: K3s Master (awoooi-prod namespace)
|
||||
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, Ollama, OpenClaw, SigNoz)
|
||||
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, OpenClaw, SigNoz)
|
||||
|
||||
Features:
|
||||
- asyncio.gather for parallel fetching
|
||||
|
||||
@@ -21,7 +21,6 @@ AWOOOI — Image Analysis Service (Phase 34, ADR-067)
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -29,14 +28,13 @@ from typing import TYPE_CHECKING
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.services.model_registry import get_model
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
# D1 集中化 2026-04-11: 從 models.json providers.ollama.models.image_analysis 讀取
|
||||
_MODEL = get_model("ollama", "image_analysis")
|
||||
@@ -124,7 +122,7 @@ class ImageAnalysisService:
|
||||
image_b64 = base64.b64encode(image_path.read_bytes()).decode()
|
||||
http = await self._get_http()
|
||||
resp = await http.post(
|
||||
f"{settings.OLLAMA_URL}/api/generate",
|
||||
f"{resolve_ollama_endpoint('image_analysis')}/api/generate",
|
||||
json={
|
||||
"model": _MODEL,
|
||||
"prompt": question,
|
||||
|
||||
@@ -116,7 +116,7 @@ def classify_alert_early(
|
||||
1. ConfigurationDrift / KubeConfigDrift → TYPE-4D (Config Drift 卡片)
|
||||
2. severity=info/none → TYPE-1 (純資訊,無按鈕)
|
||||
3. backup/heartbeat 關鍵字 → TYPE-1(但 backup failure age > 24h → TYPE-3,見下)
|
||||
4. Docker/Host 前綴 → infrastructure TYPE-3
|
||||
4. Docker/Host/Systemd runner 前綴 → infrastructure/host_resource TYPE-3
|
||||
5. Kube/Pod/Deploy/Node/Velero/ArgoCD 前綴 → kubernetes TYPE-3
|
||||
6. Postgres/Redis 前綴 → database TYPE-3
|
||||
7. 預設 → general TYPE-3
|
||||
@@ -183,6 +183,12 @@ def classify_alert_early(
|
||||
if severity in ("info", "none"):
|
||||
return "info", "TYPE-1"
|
||||
|
||||
# 2026-05-05 ogt + Codex: self-hosted runners are host-level systemd services.
|
||||
# Must run before the generic "watchdog" heartbeat rule because
|
||||
# SystemdRunnerWatchdogEnabled contains "Watchdog" but is not a heartbeat.
|
||||
if alertname.startswith("SystemdRunner"):
|
||||
return "host_resource", "TYPE-3"
|
||||
|
||||
# 5. Backup / Heartbeat — 純資訊,不進 LLM
|
||||
# HostBackupFailed 必須在 Host prefix 前攔截,否則被歸 host_resource/TYPE-3
|
||||
# 2026-04-12 ogt: 只針對已知主機備份監控 alertname,不用寬泛關鍵字
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user