Files
ewoooc/services/db_backup_service.py
ogt 0099543c05
Some checks failed
CD Pipeline / deploy (push) Failing after 5m18s
fix(security): 全域健檢 — 40 項安全/Bug/品質修復
🔴 Critical
- auto_heal_service: 補 import re + sqlalchemy.text + 修正 orchestrator 變數名
  + autoheal_playbook→playbooks 表名 + _alert_and_store cooldown 修復
- aider_heal_executor: shell injection 改 shell=False + list 參數
- docker-compose: DISABLE_LOGIN 改 env var + 移除密碼 fallback + POSTGRES_HOST 修正
- app.py: /api/backup /api/run_task 等 6 個管理 API 加 @login_required
- config.py + pg_sync + e2e_test: 移除 wooo_pg_2026 hardcoded 密碼 fallback
- pg_backup.sh: 移除 TELEGRAM_TOKEN= 中間變數,直接用 $TELEGRAM_BOT_TOKEN
- migration 014: trigger_pattern→match_pattern + 補 error_type NOT NULL 欄位

🟡 High
- telegram_bot_service: str(e) 改通用訊息 + session try/finally + 移除 pa:/pr: 舊 callback
- run_scheduler: ElephantAlpha thread 死亡監控 + 自動重啟 + Telegram 告警
  + agent_context 03:30 TTL 定時清理任務
- openclaw_learning_service: build_rag_context 兩路徑加 .limit(200)
- hooks: commit-quality + momo-prod-guard 空 catch 改 stderr+exit(1)
- scripts/code_review: auto_yes 預設改 false
- db_backup_service: PGPASSWORD 透過 env dict 傳遞

📦 Migrations
- 013_autoheal: 修正建表順序 playbooks→incidents(外鍵前向引用)
- 018_add_missing_indexes: heal_logs/incidents 外鍵索引 + cleanup_expired_agent_context()

🟢 Infrastructure
- requirements.txt: 加版本下界 Flask>=2.3 SQLAlchemy>=1.4 等
- cd.yaml: 新增 run_scheduler.py + run_telegram_bot.py 監聽路徑
- .gitignore: insert_playbook_local.py 加入忽略

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 01:12:23 +08:00

207 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
DB Backup Service — EwoooC V10.3
負責執行 pg_dump 備份、保留策略、以及備份狀態寫入 backup_log
"""
import os
import subprocess
import logging
import glob
from datetime import datetime, timedelta, timezone
# Fixed UTC+8 offset; run_backup uses it to timestamp backup filenames.
TAIPEI_TZ = timezone(timedelta(hours=8))

logger = logging.getLogger(__name__)

# Backup directory (mount point inside the container).
BACKUP_DIR = os.getenv("BACKUP_DIR", "/app/data/db_backups")

# Name of the database container (originally for running pg_dump via
# `docker exec`).
# NOTE(review): run_backup in this module connects over the network using
# POSTGRES_HOST and does not reference DB_CONTAINER -- confirm it is still
# needed by other callers before removing it.
DB_CONTAINER = os.getenv("DB_CONTAINER", "momo-db")
DB_USER = os.getenv("POSTGRES_USER", "momo")
DB_NAME = os.getenv("POSTGRES_DB", "momo_analytics")

# Number of days a backup file is kept before cleanup_old_backups deletes it.
RETENTION_DAYS = int(os.getenv("BACKUP_RETENTION_DAYS", "7"))
def _ensure_backup_dir():
    """Create BACKUP_DIR (and any missing parents) if it does not exist yet."""
    target_dir = BACKUP_DIR
    os.makedirs(target_dir, exist_ok=True)
def _log_backup(filename, file_size, duration, status, error=None, storage_path=None):
    """Insert one row describing a backup attempt into ``backup_log``.

    This is best effort: any exception (import failure, connection error,
    SQL error) is logged as a warning and swallowed so the backup itself is
    never blocked by bookkeeping.

    Args:
        filename: Name of the backup file.
        file_size: Size in bytes stored in ``file_size_bytes``.
        duration: Elapsed seconds stored in ``duration_seconds``.
        status: Status label stored as-is (callers in this module pass
            "success" or "failed").
        error: Optional error text stored in ``error_message``.
        storage_path: Where the backup lives; BACKUP_DIR is used when falsy.
    """
    # Kept flush-left so the SQL text sent to the server is unchanged.
    insert_sql = """
INSERT INTO backup_log
(filename, file_size_bytes, duration_seconds, status, error_message,
host, storage_path, completed_at)
VALUES
(:filename, :size, :dur, :status, :error,
:host, :path, CURRENT_TIMESTAMP)
"""
    try:
        # Project/third-party imports stay local so a missing dependency is
        # caught by the handler below instead of breaking module import.
        from database.manager import DatabaseManager
        manager = DatabaseManager()
        with manager.get_session() as session:
            from sqlalchemy import text
            statement = text(insert_sql)
            # os.uname is not available on every platform.
            if hasattr(os, 'uname'):
                host_name = os.uname().nodename
            else:
                host_name = "unknown"
            params = {
                "filename": filename,
                "size": file_size,
                "dur": duration,
                "status": status,
                "error": error,
                "host": host_name,
                "path": storage_path or BACKUP_DIR,
            }
            session.execute(statement, params)
            session.commit()
    except Exception as e:
        logger.warning(f"[Backup] backup_log 寫入失敗(不影響備份本體): {e}")
def _ensure_pg_dump():
    """Best-effort install of postgresql-client when pg_dump is not on PATH.

    Never raises: if the install cannot run or fails, a warning is logged and
    the later attempt to start pg_dump reports the real error.
    """
    import shutil

    # Popen resolves "pg_dump" through PATH, so check the same way instead of
    # assuming it lives in /usr/bin.
    if shutil.which("pg_dump") is not None:
        return
    logger.info("[Backup] pg_dump 不存在,嘗試安裝 postgresql-client...")
    try:
        install = subprocess.run(
            ["apt-get", "install", "-y", "-qq", "postgresql-client"],
            capture_output=True,
        )
    except OSError as e:
        logger.warning(f"[Backup] 無法執行 apt-get: {e}")
        return
    if install.returncode != 0:
        detail = install.stderr.decode(errors="replace").strip()
        logger.warning(f"[Backup] postgresql-client 安裝失敗: {detail}")


def _stop_process(proc):
    """Kill *proc* if it is still running, reap it, and close its pipes.

    Accepts ``None`` so callers can pass a handle that was never started.
    """
    if proc is None:
        return
    if proc.poll() is None:
        proc.kill()
    try:
        # Reap the child so it does not linger as a zombie.
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        logger.warning(f"[Backup] 子程序 pid={proc.pid} 未能在 5 秒內結束")
    for stream in (proc.stdout, proc.stderr):
        if stream is not None and not stream.closed:
            stream.close()


def _discard_partial_backup(filepath):
    """Delete an incomplete dump so it is never mistaken for a usable backup."""
    try:
        os.remove(filepath)
    except FileNotFoundError:
        pass
    except OSError as e:
        logger.warning(f"[Backup] 刪除不完整備份檔失敗 {filepath}: {e}")


def run_backup(*, timeout: int = 300) -> dict:
    """Run ``pg_dump | gzip`` and write the archive into BACKUP_DIR.

    pg_dump connects over the network to POSTGRES_HOST:POSTGRES_PORT
    (defaults "momo-db" / "5432"); the password, if any, is handed over via
    the PGPASSWORD environment variable of the child process only.

    Every outcome is recorded through _log_backup. On any failure the
    partially written archive is removed, so a file matching
    ``momo_analytics_*.sql.gz`` in BACKUP_DIR always comes from a run that
    exited successfully.

    Args:
        timeout: Overall time budget in seconds for the dump pipeline.

    Returns:
        dict with keys ``success`` (bool), ``filename`` (str),
        ``file_size`` (bytes, 0 on failure), ``duration`` (seconds) and
        ``error`` (message or None).
    """
    import tempfile
    import time

    _ensure_backup_dir()
    now = datetime.now(TAIPEI_TZ)
    filename = f"momo_analytics_{now.strftime('%Y%m%d_%H%M%S')}.sql.gz"
    filepath = os.path.join(BACKUP_DIR, filename)
    # Monotonic clock: the measured duration is immune to wall-clock jumps.
    started = time.monotonic()
    db_host = os.environ.get("POSTGRES_HOST", "momo-db")
    db_port = os.environ.get("POSTGRES_PORT", "5432")

    _ensure_pg_dump()

    pg_env = dict(os.environ)
    pg_password = os.environ.get("POSTGRES_PASSWORD")
    if pg_password:
        pg_env["PGPASSWORD"] = pg_password

    logger.info(f"[Backup] 開始備份 → {filepath}")
    result = {"success": False, "filename": filename, "file_size": 0, "duration": 0, "error": None}
    error_msg = None
    pg_dump_proc = None
    gzip_proc = None
    try:
        # pg_dump's stderr goes to a temp file rather than a pipe: a pipe that
        # nobody drains while we wait could fill up and stall pg_dump.
        with open(filepath, "wb") as out_f, tempfile.TemporaryFile() as dump_err:
            deadline = time.monotonic() + timeout
            pg_dump_proc = subprocess.Popen(
                ["pg_dump", "-h", db_host, "-p", db_port, "-U", DB_USER, "-d", DB_NAME,
                 "--no-password", "-Fp"],
                stdout=subprocess.PIPE, stderr=dump_err,
                env=pg_env
            )
            gzip_proc = subprocess.Popen(
                ["gzip"],
                stdin=pg_dump_proc.stdout, stdout=out_f, stderr=subprocess.PIPE
            )
            # Drop our copy of the pipe so gzip is its only reader.
            pg_dump_proc.stdout.close()
            gzip_stderr = gzip_proc.communicate(timeout=timeout)[1]
            # Both waits share one deadline so the total never exceeds `timeout`.
            pg_dump_proc.wait(timeout=max(deadline - time.monotonic(), 0.1))
            dump_err.seek(0)
            pg_dump_stderr = dump_err.read().decode(errors="replace").strip()

        # A pg_dump failure takes precedence; otherwise report gzip's status.
        returncode = pg_dump_proc.returncode or gzip_proc.returncode
        if returncode != 0:
            error_msg = (
                pg_dump_stderr
                or gzip_stderr.decode(errors="replace").strip()
                or "pg_dump 非零退出碼"
            )
            logger.error(f"[Backup] 備份失敗: {error_msg}")
    except subprocess.TimeoutExpired:
        error_msg = f"pg_dump 超時{timeout}s"
        logger.error(f"[Backup] {error_msg}")
    except Exception as e:
        error_msg = str(e)
        logger.error(f"[Backup] 備份異常: {e}")
    finally:
        # communicate()/wait() do not kill the child on timeout; make sure
        # neither process outlives this call.
        _stop_process(pg_dump_proc)
        _stop_process(gzip_proc)

    duration = time.monotonic() - started
    result["duration"] = duration

    if error_msg is not None:
        _discard_partial_backup(filepath)
        result["error"] = error_msg
        _log_backup(filename, 0, duration, "failed", error=error_msg)
        return result

    file_size = os.path.getsize(filepath) if os.path.exists(filepath) else 0
    logger.info(f"[Backup] 備份成功 | 大小={file_size//1024}KB | 耗時={duration:.1f}s")
    result.update({"success": True, "file_size": file_size})
    _log_backup(filename, file_size, duration, "success", storage_path=filepath)
    return result
def cleanup_old_backups() -> int:
    """Delete backup archives older than RETENTION_DAYS.

    Only files in BACKUP_DIR matching ``momo_analytics_*.sql.gz`` are
    considered, and age is judged by file modification time. A failure on
    one file is logged and does not stop the sweep.

    Returns:
        The number of files that were removed.
    """
    _ensure_backup_dir()
    expiry = datetime.now() - timedelta(days=RETENTION_DAYS)
    pattern = os.path.join(BACKUP_DIR, "momo_analytics_*.sql.gz")
    removed = 0
    for path in glob.glob(pattern):
        try:
            modified_at = datetime.fromtimestamp(os.path.getmtime(path))
            if modified_at >= expiry:
                # Still inside the retention window.
                continue
            os.remove(path)
            removed += 1
            logger.info(f"[Backup] 已刪除舊備份: {os.path.basename(path)}")
        except Exception as e:
            logger.warning(f"[Backup] 刪除舊備份失敗 {path}: {e}")
    return removed
def get_latest_backup_info() -> dict:
    """Return information about the most recent backup (for monitoring).

    The newest ``backup_log`` row is preferred. If the database cannot be
    read, or the table is empty, the newest ``momo_analytics_*.sql.gz`` file
    in BACKUP_DIR is reported instead.

    Returns:
        dict that always has the keys ``filename``, ``file_size``,
        ``duration``, ``status``, ``created_at``, ``error`` and ``source``.
        ``source`` is "db", "filesystem" or "none"; when it is "none" the
        status is "no_backup" and the remaining values are None.
        NOTE(review): in the filesystem fallback ``status`` is reported as
        "success" purely because the file exists -- the archive content is
        not verified here.
    """
    try:
        from database.manager import DatabaseManager
        db = DatabaseManager()
        with db.get_session() as session:
            from sqlalchemy import text
            row = session.execute(text("""
SELECT filename, file_size_bytes, duration_seconds, status, created_at, error_message
FROM backup_log
ORDER BY created_at DESC
LIMIT 1
""")).fetchone()
            if row:
                return {
                    "filename": row[0],
                    "file_size": row[1],
                    "duration": row[2],
                    "status": row[3],
                    "created_at": row[4],
                    "error": row[5],
                    "source": "db",
                }
    except Exception as e:
        logger.warning(f"[Backup] 無法從 DB 讀取最新備份資訊: {e}")

    # Fallback: scan the backup directory for the newest archive.
    _ensure_backup_dir()
    newest = None  # (mtime, size, path) of the most recently modified file
    for path in glob.glob(os.path.join(BACKUP_DIR, "momo_analytics_*.sql.gz")):
        try:
            info = os.stat(path)
        except OSError:
            # The file vanished between glob() and stat(), e.g. removed by a
            # concurrent cleanup; skip it instead of failing the whole check.
            continue
        if newest is None or info.st_mtime > newest[0]:
            newest = (info.st_mtime, info.st_size, path)

    if newest is not None:
        mtime, size, path = newest
        return {
            "filename": os.path.basename(path),
            "file_size": size,
            "duration": None,
            "status": "success",
            "created_at": datetime.fromtimestamp(mtime),
            "error": None,
            "source": "filesystem",
        }

    # Same key set as the other branches so callers can index unconditionally.
    return {
        "filename": None,
        "file_size": None,
        "duration": None,
        "status": "no_backup",
        "created_at": None,
        "error": None,
        "source": "none",
    }