diff --git a/.gitea/workflows/cd-dev.yaml b/.gitea/workflows/cd-dev.yaml index 6f2bf5e7..df1bf900 100644 --- a/.gitea/workflows/cd-dev.yaml +++ b/.gitea/workflows/cd-dev.yaml @@ -145,9 +145,12 @@ jobs: mkdir -p ~/.ssh write_deploy_key + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${HOME}/.ssh/known_hosts" 2>/dev/null + test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } + SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -i ~/.ssh/deploy_key" # 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a # worker and its local kubeconfig points at 127.0.0.1:6443. - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS + ssh $SSH_OPTS wooo@192.168.0.120 << SECRETS set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml @@ -174,11 +177,14 @@ jobs: # 部署到 awoooi-dev - name: Deploy to Dev K8s run: | + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${HOME}/.ssh/known_hosts" 2>/dev/null + test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } + SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -i ~/.ssh/deploy_key" cat k8s/awoooi-dev/02-configmap.yaml | \ - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \ + ssh $SSH_OPTS wooo@192.168.0.120 \ "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY' + ssh $SSH_OPTS wooo@192.168.0.120 << 'DEPLOY' set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 1e285700..c3e6fcba 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -602,7 +602,7 @@ jobs: AWOOOI_SECRET_SRE_GROUP_CHAT_ID )" - # S1/S2: 統一命名 deploy_key,改用 ssh-keyscan(比 StrictHostKeyChecking=no 更安全) + # S1/S2: 統一命名 deploy_key,改用 
ssh-keyscan 與強制 host key 驗證。 write_deploy_key # 2026-05-13 Codex: keyscan must include ED25519 explicitly. Some # OpenSSH builds otherwise record only RSA/ECDSA, then strict deploy @@ -788,7 +788,7 @@ jobs: fi # 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1) - # 替換 StrictHostKeyChecking=no,讓 SSH 修復路徑使用已知主機指紋 + # 替換關閉 host key 驗證的舊做法,讓 SSH 修復路徑使用已知主機指紋。 # asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty # OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and # CLI diagnostics can trust the same secret. diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b3b33062..a03a6695 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,30 @@ +## 2026-06-11|IwoooS 即時資安危害優先序與 Wave 0 修補 + +**背景**:使用者調整 IwoooS 推進方向:不再把「完全不影響現有服務」放在所有工作之前,而是優先處理有即時性資訊安全危害的部分;處理過程仍需主動同步其他專案並避免未協調的服務中斷。本階段先確認全域資安範圍,並處理 source-control 層級可立即收斂的 P0。 + +**完成內容:** +- 新增 `docs/security/IWOOOS-REALTIME-SECURITY-SCOPE-AND-WAVE0.md`,固定 IwoooS 資安範圍:主機、K3s / ArgoCD、Nginx / 網路入口、服務、套件與供應鏈、網站前後台、AI / Agent、備份復原、VibeWork、agent-bounty-protocol 與人工治理。 +- 將優先序調整為:P0 secret / token / 密碼外洩、遠端執行與 agent / payout 風險、公開入口 / TLS / CORS / host key 風險;P1 才是主機更新、Kali 112 大量套件、hardening、備份與告警;P2 為可視化與 UX。 +- 移除 `k8s/monitoring/prometheus-config-phase-o.yaml` 內的 MinIO Prometheus bearer token 明文,改用 `bearer_token_file` reference,並移除註解中的 MinIO admin secret。 +- 移除 `k8s/velero/01-credentials.yaml` 內的 MinIO access / secret 可用值,改成 placeholder。 +- 移除 `docs/reference/SERVICE-ENDPOINTS.md` 內的 ArgoCD admin 密碼,改為受控密碼庫 / SSO / break-glass 說明。 +- `.gitea/workflows/cd-dev.yaml` 改用 `ssh-keyscan`、`StrictHostKeyChecking=yes` 與指定 `known_hosts`;`infra/ansible/inventory/group_vars/all.yml`、`scripts/health_check_session.sh`、`scripts/ops/deploy-docker-health-monitor.sh` 移除無條件關閉 host key 驗證的做法。 +- 清理 repair template 與正式 CD 註解中的危險參數逐字值,讓後續掃描不再把舊風險當可複製範例。 + +**本地驗證:** +- `node scripts/ci/check-gitea-step-env-secrets.js` 通過。 +- `python3 scripts/security/security-mirror-progress-guard.py 
--root .` 通過。 +- `python3 scripts/security/source-control-owner-response-guard.py --root .` 通過。 +- `git diff --check` 通過。 +- 高危字串掃描通過:ArgoCD admin 舊密碼值、MinIO / Velero 舊 secret 值、MinIO JWT 前綴、`StrictHostKeyChecking=no`、`bearer_token:` 均未再命中 `.gitea`、`k8s`、`infra`、`scripts`、`docs/reference`、`docs/security`(掃描用的實際字串不在此逐字記錄,以免 LOGBOOK 本身重新洩漏)。 + +**完成度與邊界:** +- 即時資安 scope 重新確認:`100%`。 +- Wave 0 source-control 低風險修補:`100%`。 +- live token / password 輪替:`0%`,需 owner 確認與跨專案通知;不得在對話、文件或 commit 中收明文。 +- 主機更新、restart、hardening、active scan、Kali `/execute`、kubectl、ArgoCD sync、Nginx reload、Prometheus reload、MinIO / ArgoCD / Velero live credential rotation:本階段仍未執行。 +- IwoooS 整體仍維持 `64%`;active runtime gate 仍為 `0`;owner response received / accepted 仍為 `0 / false`。 + ## 2026-06-11|IwoooS 部署風險只讀卡與防誤讀邊界 **背景**:上一輪 `agent-bounty-protocol` 納管後,CD `2635` 的 API health、Playwright smoke 與 rollout 成功,但仍記錄 `AWOOOI_ROLLOUT_RISK=1`,原因為 ArgoCD health `Degraded` 且部分資源 `OutOfSync`。本階段只把風險放進 IwoooS 只讀態勢,不執行 ArgoCD sync、kubectl、主機重啟、修復或部署操作。 diff --git a/docs/reference/SERVICE-ENDPOINTS.md b/docs/reference/SERVICE-ENDPOINTS.md index 1df2e97a..26ceebfd 100644 --- a/docs/reference/SERVICE-ENDPOINTS.md +++ b/docs/reference/SERVICE-ENDPOINTS.md @@ -84,7 +84,7 @@ | 服務 | 端點 | 說明 | |------|------|------| -| **ArgoCD** | `192.168.0.125:30443` | GitOps UI (admin / fSCLMBhtpRxhbRxw) | +| **ArgoCD** | `192.168.0.125:30443` | GitOps UI;帳號與密碼不得寫入文件,請查受控密碼庫或 SSO / break-glass 流程 | ### 備份 (192.168.0.188) diff --git a/docs/security/IWOOOS-REALTIME-SECURITY-SCOPE-AND-WAVE0.md b/docs/security/IWOOOS-REALTIME-SECURITY-SCOPE-AND-WAVE0.md new file mode 100644 index 00000000..58063c74 --- /dev/null +++ b/docs/security/IWOOOS-REALTIME-SECURITY-SCOPE-AND-WAVE0.md @@ -0,0 +1,60 @@ +# IwoooS 即時資安工作範圍與 Wave 0 處置 + +| 項目 | 內容 | +|------|------| +| 日期 | 2026-06-11 | +| 模式 | 即時危害優先,但避免未協調的服務中斷 | +| 主控範圍 | AWOOOI / IwoooS 全產品資安視野 | +| runtime gate | `0`,除非另有明確維護窗口與 rollback owner | + +## 1. 工作範圍 + +IwoooS 資訊安全範圍涵蓋下列面向: + +1. 
主機:`192.168.0.110`、`192.168.0.111`、`192.168.0.112`、`192.168.0.120`、`192.168.0.121`、`192.168.0.168`、`192.168.0.188`,以及已納入文件的 GCP / fallback 節點。 +2. K3s / ArgoCD / Kubernetes:namespace、Deployment、ConfigMap、Secret metadata、RBAC、NetworkPolicy、CronJob、HPA / VPA、Velero、Node Problem Detector、kured、descheduler。 +3. 服務:AWOOOI API / Web / Worker / auto-repair canary、AwoooP、OpenClaw、Ollama、PostgreSQL、Redis、Harbor、Gitea、Gitea Actions runner、Sentry、Langfuse、SignOz、ClickHouse、Prometheus、Alertmanager、MinIO、Kali Scanner。 +4. 網路與入口:Nginx、public domains、TLS / certbot、NodePort、VIP、WireGuard、內網 IP 暴露、CORS、public / internal route 邊界。 +5. 套件與供應鏈:Python、Node / pnpm、Docker image、Harbor image scan、Gitea / GitHub workflow、runner labels、deploy keys、secret name parity。 +6. 網站與後台:AWOOOI 前台、AwoooP、Governance、Code Review、安全合規、授權、告警、操作日誌、admin / treasury 類頁面。 +7. 新納管產品:VibeWork、agent-bounty-protocol,以及後續由 owner 指定的新 repo / product / surface。 +8. AI / Agent:模型 provider、NemoTron / Hermes / OpenClaw / ElephantAlpha、MCP / A2A、cron / daemon、auto claim / submit、外部訊息與 payout / withdrawal 邊界。 +9. 備份與復原:restic、offsite escrow、Gitea / DB / MinIO / Sentry / SignOz / Config backup、cold-start / DR runbook。 +10. 人工治理:owner response、security acceptance record、AwoooP approval 邊界、redacted evidence、quarantine、runtime follow-up gate。 + +## 2. 新優先序 + +使用者已調整方向:即時性資訊安全危害優先。後續排序如下: + +1. P0:已提交或可能外洩的 secret、token、密碼、private key、cookie、session、webhook secret。 +2. P0:可被濫用的遠端執行、掃描、agent 自主行為、payout / withdrawal、未授權 deploy / workflow / runner 修改。 +3. P0:公開入口、Nginx、TLS、CORS、內網 IP 暴露、未驗證 SSH host key、明顯 MITM 或橫向移動風險。 +4. P1:高風險過期套件、Kali 112 大量待更新、`networking.service` failed、服務 hardening 缺口。 +5. P1:備份 / restore / offsite escrow、Harbor image scan、可觀測性告警鏈路與失敗限定通知。 +6. P2:前台可視化、進度卡、UX、長期 GitHub primary readiness 與文檔整理。 + +## 3. 
Wave 0 已處置項 + +本波只做 source-control 層級的低風險修補,不登入主機、不重啟服務、不跑 active scan。 + +| 風險 | 處置 | 後續 | +|------|------|------| +| MinIO Prometheus bearer token 寫在 repo | 移除明文 token,改成 `bearer_token_file` reference | 需要 owner 安排 live token 輪替與主機 secret file 注入 | +| MinIO admin secret 出現在操作註解 | 改成不含實際值的帳號 / 密碼 placeholder | live 操作須走密碼庫,不在文件或對話貼值 | +| Velero MinIO credential sample 寫入可用值 | 改成不含實際值的 access key / secret key placeholder | 若此值曾被使用,需輪替 MinIO / Velero credential | +| ArgoCD admin 密碼寫在端點文件 | 改成受控密碼庫 / SSO / break-glass 流程說明 | 需要 owner 確認該密碼是否仍有效,若有效需輪替 | +| dev CD SSH 關閉 host key 驗證 | 改成 `ssh-keyscan` + 強制 host key 驗證 + 指定 known_hosts | 注意:每次執行都即時 keyscan 並覆寫 known_hosts,仍屬每次重新信任,無法自動擋下被替換的 host key;若 120 host key 變更,需人工確認 fingerprint,後續應升級為 pinned known_hosts | +| Ansible / session health check 關閉 host key 驗證 | 改成 `accept-new`,避免無條件信任變更後 host key | 後續可升級為 pinned known_hosts evidence | +| docker-health-monitor deploy script 關閉 host key 驗證 | 改成 `ssh-keyscan` + `accept-new` + 指定 known_hosts | 後續可升級為 pinned host fingerprint | + +## 4. 仍需跨專案同步 + +1. AwoooP:同步即時風險優先序,避免把 UI 可見當批准。 +2. VibeWork:維持獨立產品邊界,只同步資安範圍與 owner response,不改其 repo / production。 +3. agent-bounty-protocol:優先確認 external agent、MCP / A2A、cron / daemon、webhook、treasury / payout 邊界;不讀 `.env`,不啟用 runtime。 +4. StockPlatform、Tsenyang、Bitan 等既有產品:後續納入全產品入口 / Nginx / TLS / secret / backup / admin surface 檢查,不與 AWOOOI runtime 混用權限。 +5. 主機維護:Kali 112、111、168、188、110、120、121 任何更新、restart、hardening、firewall、scan、reboot 都需維護窗口、rollback owner、post-check owner 與跨專案通知。 + +## 5. 
邊界 + +本波不代表 active scan、credentialed scan、Kali `/execute`、ArgoCD sync、kubectl、SSH 主機修改、secret value collection、repo / refs mutation、GitHub primary switch、deploy、restart、payout、withdrawal 或外部 agent action 已授權。 diff --git a/infra/ansible/inventory/group_vars/all.yml b/infra/ansible/inventory/group_vars/all.yml index 31d763f0..5a4f46b1 100644 --- a/infra/ansible/inventory/group_vars/all.yml +++ b/infra/ansible/inventory/group_vars/all.yml @@ -6,7 +6,7 @@ timezone: "Asia/Taipei" # 共用 SSH 用戶 -ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o ConnectTimeout=10" +ansible_ssh_common_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" # sudoers NOPASSWD (CD 用,see ADR-034) sudo_password: "{{ vault_sudo_password }}" diff --git a/k8s/awoooi-prod/04-repair-known-hosts-template.yaml b/k8s/awoooi-prod/04-repair-known-hosts-template.yaml index ea76d042..ea60e9c6 100644 --- a/k8s/awoooi-prod/04-repair-known-hosts-template.yaml +++ b/k8s/awoooi-prod/04-repair-known-hosts-template.yaml @@ -15,7 +15,7 @@ # 安全說明: # - known_hosts 存 K8s Secret,掛載至 /etc/repair-ssh/known_hosts # - SSH 命令使用 -o UserKnownHostsFile=/etc/repair-ssh/known_hosts -# - 移除 -o StrictHostKeyChecking=no (安全漏洞) +# - 移除關閉 SSH host key 驗證的危險參數 apiVersion: v1 kind: Secret metadata: diff --git a/k8s/awoooi-prod/04-repair-ssh-key-template.yaml b/k8s/awoooi-prod/04-repair-ssh-key-template.yaml index d231d0df..f7ec4ca1 100644 --- a/k8s/awoooi-prod/04-repair-ssh-key-template.yaml +++ b/k8s/awoooi-prod/04-repair-ssh-key-template.yaml @@ -19,9 +19,9 @@ # - repair-bot-110.sh / repair-bot-188.sh 只允許 docker compose up -d 或 systemctl restart # # 驗證方式: -# ssh -i /tmp/awoooi_repair_bot -o StrictHostKeyChecking=no wooo@192.168.0.110 health +# ssh -i /tmp/awoooi_repair_bot -o UserKnownHostsFile=/etc/repair-ssh/known_hosts wooo@192.168.0.110 health # → 預期: REPAIR_BOT_HEALTHY:110 -# ssh -i /tmp/awoooi_repair_bot -o StrictHostKeyChecking=no ollama@192.168.0.188 health +# ssh -i /tmp/awoooi_repair_bot -o 
UserKnownHostsFile=/etc/repair-ssh/known_hosts ollama@192.168.0.188 health # → 預期: REPAIR_BOT_HEALTHY:188 apiVersion: v1 diff --git a/k8s/monitoring/prometheus-config-phase-o.yaml b/k8s/monitoring/prometheus-config-phase-o.yaml index f3b2f819..8a1b6935 100644 --- a/k8s/monitoring/prometheus-config-phase-o.yaml +++ b/k8s/monitoring/prometheus-config-phase-o.yaml @@ -17,14 +17,14 @@ # MinIO 此版本 (RELEASE.2024-03-26) 不支援 MINIO_PROMETHEUS_AUTH_TYPE=public # 必須使用 Bearer Token 認證 # Token 產生: docker exec minio mc admin prometheus generate local/ -# Token 有效期: ~2031 (exp: 4928730704) +# Token 不得提交到 Git;正式值應由主機本機 secret file 或 secret manager 注入。 # ============================================================================= # ===== MinIO 監控 (O-1.3) ===== # 前置條件: Bearer Token 由 mc admin prometheus generate 產生 # # 重新產生 Token: -# docker exec minio mc alias set local http://localhost:9000 minio_admin 'Minio_Velero_2026!' +# docker exec minio mc alias set local http://localhost:9000 '' # docker exec minio mc admin prometheus generate local/ # 驗證: # curl -H "Authorization: Bearer " http://192.168.0.188:9000/minio/v2/metrics/cluster | head -5 @@ -35,10 +35,9 @@ scrape_timeout: 10s metrics_path: /minio/v2/metrics/cluster scheme: http - # ⚠️ Bearer Token 認證 (2026-04-02 部署時由 mc admin prometheus generate 產生) - # Token 已直接寫入 .188:/home/ollama/momo-pro/monitoring/prometheus.yml - # 如需輪換: docker exec minio mc admin prometheus generate local/ - bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJwcm9tZXRoZXVzIiwic3ViIjoibWluaW9fYWRtaW4iLCJleHAiOjQ5Mjg3MzA3MDR9.s5WpFkluoicR_JXi_1l6dYVygkNV9G42s6c3NkSrenALWKZM78h-grj8vcqDeJaGR2eX4Ib4hPlcMqpM2yXjoQ + # ⚠️ Bearer Token 認證必須走檔案或 secret manager,不得把 token value 寫入 Git。 + # 即時資安處置:已移除 repo 內明文 token;live 主機上若仍使用舊 token,需安排輪替。 + bearer_token_file: /etc/prometheus/secrets/minio-prometheus-bearer-token static_configs: - targets: - 192.168.0.188:9000 diff --git a/k8s/velero/01-credentials.yaml b/k8s/velero/01-credentials.yaml index 
52bc586a..fe7508b9 100644 --- a/k8s/velero/01-credentials.yaml +++ b/k8s/velero/01-credentials.yaml @@ -10,5 +10,5 @@ type: Opaque stringData: cloud: | [default] - aws_access_key_id=minio_admin - aws_secret_access_key=Minio_Velero_2026! + aws_access_key_id= + aws_secret_access_key= diff --git a/scripts/health_check_session.sh b/scripts/health_check_session.sh index d98a2f69..768ef77f 100755 --- a/scripts/health_check_session.sh +++ b/scripts/health_check_session.sh @@ -37,7 +37,7 @@ check_url() { check_ssh() { local name=$1 host=$2 - if ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no \ + if ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=accept-new \ "$host" "echo ok" 2>/dev/null | grep -q ok; then ok "SSH $name ($host)" return 0 diff --git a/scripts/ops/deploy-docker-health-monitor.sh b/scripts/ops/deploy-docker-health-monitor.sh index e33f6ce5..a7222ad8 100755 --- a/scripts/ops/deploy-docker-health-monitor.sh +++ b/scripts/ops/deploy-docker-health-monitor.sh @@ -25,6 +25,7 @@ MONITOR_SCRIPT="${REPO_ROOT}/scripts/ops/docker-health-monitor.sh" TARGET="${1:-all}" SSH_KEY="${HOME}/.ssh/id_rsa" +KNOWN_HOSTS_FILE="${HOME}/.ssh/known_hosts" # 110 用 wooo,188 用 ollama ssh_user() { @@ -40,7 +41,9 @@ ssh_cmd() { shift local user user=$(ssh_user "$host") - ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "${user}@${host}" "$@" + mkdir -p "${HOME}/.ssh" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "$host" >> "$KNOWN_HOSTS_FILE" 2>/dev/null || true + ssh -i "$SSH_KEY" -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile="$KNOWN_HOSTS_FILE" "${user}@${host}" "$@" } scp_cmd() { @@ -49,7 +52,9 @@ scp_cmd() { local dst="$3" local user user=$(ssh_user "$host") - scp -i "$SSH_KEY" -o StrictHostKeyChecking=no "$src" "${user}@${host}:${dst}" + mkdir -p "${HOME}/.ssh" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "$host" >> "$KNOWN_HOSTS_FILE" 2>/dev/null || true + scp -i "$SSH_KEY" -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o 
UserKnownHostsFile="$KNOWN_HOSTS_FILE" "$src" "${user}@${host}:${dst}" } deploy_to_host() {