Files
ewoooc/services/import_service.py
OoO 55e14c0332
All checks were successful
CD Pipeline / deploy (push) Successful in 9m17s
V10.605 修復當日業績匯入與資料庫備份
2026-06-15 14:52:33 +08:00

1200 lines
48 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
自動匯入服務
負責從 Google Drive 自動下載、匯入、刪除檔案
"""
import logging
import json
import os
from datetime import date, datetime
from typing import Any, Dict, Optional
import pandas as pd
import pytz
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
# 台北時區
TAIPEI_TZ = pytz.timezone('Asia/Taipei')
DAILY_SALES_REQUIRED_COLUMN_GROUPS = {
"商品名稱類": ["商品名稱", "品名", "Product", "Name"],
"業績金額類": ["銷售金額", "總業績", "業績", "金額", "Amount", "Sales", "Total"],
}
DAILY_SALES_DATE_ALIASES = ["日期", "訂單日期", "交易日期", "Date"]
DAILY_SALES_DETAIL_SHEET_HINTS = ("即時業績明細", "業績明細", "明細", "detail", "daily")
DAILY_SALES_HEADER_SCAN_ROWS = 15
from services.google_drive_service import drive_service
from database.import_models import ImportJob, ImportConfig, Base
from database.manager import ensure_metadata_initialized
# 設定日誌
logger = logging.getLogger(__name__)
def _build_in_clause(prefix: str, values) -> tuple:
"""Build a SQLAlchemy-safe IN clause placeholder list and params."""
params = {}
placeholders = []
for idx, value in enumerate(values):
key = f"{prefix}_{idx}"
placeholders.append(f":{key}")
params[key] = value
return ", ".join(placeholders), params
def _db_dialect_name() -> str:
try:
return engine.dialect.name
except Exception:
return ""
def _date_filter_expr(column_name: str) -> str:
"""Return a DB-specific date expression for legacy text date columns."""
if _db_dialect_name() == "postgresql":
return f"{column_name}::date"
return f"date({column_name})"
def _normalise_date_values_for_sql(values):
"""Keep PostgreSQL params as date objects; use ISO strings for SQLite/text comparisons."""
normalised = []
use_native_date = _db_dialect_name() == "postgresql"
for value in values:
if value is None:
continue
if isinstance(value, datetime):
value = value.date()
elif hasattr(value, "date") and not isinstance(value, date):
try:
value = value.date()
except Exception:
pass
if use_native_date:
if isinstance(value, date):
normalised.append(value)
else:
parsed = pd.to_datetime(value, errors="coerce")
if pd.notna(parsed):
normalised.append(parsed.date())
else:
if isinstance(value, date):
normalised.append(value.isoformat())
else:
parsed = pd.to_datetime(value, errors="coerce")
normalised.append(parsed.strftime("%Y-%m-%d") if pd.notna(parsed) else str(value))
return normalised
def _normalise_column_text(value) -> str:
if pd.isna(value):
return ""
return str(value).strip().replace("\n", "").replace("\r", "").replace(" ", "").lower()
def _stringify_column(value) -> str:
if pd.isna(value):
return ""
return str(value).strip()
def _dedupe_columns(columns) -> list[str]:
seen = {}
result = []
for idx, column in enumerate(columns):
base = _stringify_column(column) or f"unnamed_{idx + 1}"
count = seen.get(base, 0) + 1
seen[base] = count
result.append(base if count == 1 else f"{base}_{count}")
return result
def _columns_have_any(columns, keywords) -> bool:
normalised_columns = [_normalise_column_text(column) for column in columns]
normalised_keywords = [_normalise_column_text(keyword) for keyword in keywords]
return any(
keyword and column and keyword in column
for column in normalised_columns
for keyword in normalised_keywords
)
def _missing_daily_sales_column_groups(columns) -> list[str]:
return [
label
for label, keywords in DAILY_SALES_REQUIRED_COLUMN_GROUPS.items()
if not _columns_have_any(columns, keywords)
]
def _find_daily_sales_date_column(columns) -> Optional[str]:
normalised_aliases = [_normalise_column_text(alias) for alias in DAILY_SALES_DATE_ALIASES]
for column in columns:
normalised_column = _normalise_column_text(column)
if normalised_column in normalised_aliases:
return column
for column in columns:
normalised_column = _normalise_column_text(column)
if normalised_column and any(alias and alias in normalised_column for alias in normalised_aliases):
return column
return None
def _score_daily_sales_candidate(sheet_name: str, header_row: int, columns) -> int:
missing = _missing_daily_sales_column_groups(columns)
if missing:
return 0
normalised_sheet_name = _normalise_column_text(sheet_name)
score = 1000
if _find_daily_sales_date_column(columns):
score += 250
if any(_normalise_column_text(hint) in normalised_sheet_name for hint in DAILY_SALES_DETAIL_SHEET_HINTS):
score += 150
score += min(len([column for column in columns if _stringify_column(column)]), 80)
score += max(0, DAILY_SALES_HEADER_SCAN_ROWS - header_row)
return score
def _clean_daily_sales_dataframe(df: pd.DataFrame) -> pd.DataFrame:
cleaned = df.dropna(how="all").dropna(axis=1, how="all").copy()
cleaned.columns = _dedupe_columns(cleaned.columns)
return cleaned
def _format_daily_sales_diagnostics(candidates: list[dict]) -> str:
if not candidates:
return "未找到可辨識的工作表表頭"
parts = []
for candidate in sorted(candidates, key=lambda item: item["score"], reverse=True)[:8]:
columns_sample = [
column for column in candidate["columns_sample"]
if column and not str(column).lower().startswith("nan")
][:8]
missing = candidate.get("missing") or []
parts.append(
f"{candidate['sheet_name']}!第{candidate['header_row'] + 1}"
f"缺少={missing or ''} 欄位={columns_sample}"
)
return "".join(parts)
def _read_daily_sales_excel(file_path: str) -> tuple[pd.DataFrame, dict]:
"""
讀取當日業績 Excel支援多工作表與非第一列表頭。
Google Drive 匯出的檔案常把樞紐彙總表放在第一張 sheet
真正的匯入明細在「即時業績明細」。此處先掃描所有 sheet 的前幾列,
再選出同時含商品名稱、業績金額且最好有日期欄位的明細 sheet。
"""
candidates: list[dict] = []
with pd.ExcelFile(file_path, engine="openpyxl") as excel:
for sheet_name in excel.sheet_names:
try:
preview = pd.read_excel(
excel,
sheet_name=sheet_name,
header=None,
nrows=DAILY_SALES_HEADER_SCAN_ROWS,
dtype=str,
)
except Exception as exc:
candidates.append({
"sheet_name": sheet_name,
"header_row": 0,
"columns_sample": [f"讀取預覽失敗: {exc}"],
"missing": list(DAILY_SALES_REQUIRED_COLUMN_GROUPS.keys()),
"score": 0,
})
continue
for header_row, row in preview.iterrows():
columns = [_stringify_column(value) for value in row.tolist()]
if not any(columns):
continue
missing = _missing_daily_sales_column_groups(columns)
score = _score_daily_sales_candidate(sheet_name, int(header_row), columns)
candidates.append({
"sheet_name": sheet_name,
"header_row": int(header_row),
"columns_sample": columns[:30],
"missing": missing,
"score": score,
})
valid_candidates = [candidate for candidate in candidates if not candidate["missing"]]
if not valid_candidates:
diagnostics = _format_daily_sales_diagnostics(candidates)
raise ValueError(
"Excel 欄位防禦失敗:找不到可匯入的當日業績明細工作表。"
f"已檢查:{diagnostics}"
)
best = max(valid_candidates, key=lambda candidate: candidate["score"])
df = pd.read_excel(
excel,
sheet_name=best["sheet_name"],
header=best["header_row"],
dtype=str,
)
df = _clean_daily_sales_dataframe(df)
if df.empty:
raise ValueError("Excel 檔案為空")
missing = _missing_daily_sales_column_groups(df.columns)
if missing:
raise ValueError(
f"Excel 欄位防禦失敗:缺少必要欄位分類 {missing}"
f"現有欄位:{list(df.columns)[:30]}"
)
metadata = {
"sheet_name": best["sheet_name"],
"header_row": best["header_row"] + 1,
"date_col": _find_daily_sales_date_column(df.columns),
"candidate_score": best["score"],
}
return df, metadata
def _daily_sales_staleness_guard_enabled() -> bool:
return os.getenv("DAILY_SALES_IMPORT_STALENESS_GUARD_ENABLED", "true").lower() in {
"1",
"true",
"yes",
"on",
}
def _daily_sales_max_stale_days() -> int:
try:
return max(0, int(os.getenv("DAILY_SALES_IMPORT_MAX_STALE_DAYS", "3")))
except ValueError:
return 3
def _should_quarantine_failed_import(error_message: str) -> bool:
if not error_message:
return False
permanent_error_markers = [
"Excel 欄位防禦失敗",
"Excel 日期防禦失敗",
"Excel 檔案為空",
"找不到可匯入",
"欄位驗證失敗",
"日期驗證失敗",
]
return any(marker in error_message for marker in permanent_error_markers)
# 資料庫設定 - 使用 config.py 中的設定,支援 PostgreSQL 和 SQLite
def _create_engine_with_pool(db_path):
"""建立帶有連線池配置的資料庫引擎"""
if db_path.startswith('postgresql://'):
return create_engine(
db_path,
pool_pre_ping=True,
pool_size=5,
max_overflow=10,
pool_recycle=1800,
pool_timeout=30,
connect_args={
'connect_timeout': 10,
'options': '-c statement_timeout=120000' # 匯入需要更長的超時
}
)
elif db_path.startswith('sqlite://'):
return create_engine(db_path)
else:
return create_engine(f'sqlite:///{db_path}')
try:
from config import DATABASE_PATH as CONFIG_DATABASE_PATH
engine = _create_engine_with_pool(CONFIG_DATABASE_PATH)
logger.info(f"使用資料庫: {CONFIG_DATABASE_PATH.split('@')[-1] if '@' in CONFIG_DATABASE_PATH else CONFIG_DATABASE_PATH}")
except ImportError:
# 備援方案:使用環境變數或預設值
DATABASE_PATH = os.getenv('DATABASE_PATH', 'data/momo_database.db')
engine = _create_engine_with_pool(DATABASE_PATH)
logger.warning(f"無法匯入 config使用備援資料庫路徑: {DATABASE_PATH}")
Session = sessionmaker(bind=engine)
class ImportService:
"""匯入服務類別"""
def __init__(self):
"""初始化匯入服務"""
self._init_database()
def _init_database(self):
"""初始化資料庫表"""
try:
ensure_metadata_initialized(engine, use_postgres_lock=str(engine.url).startswith('postgresql'))
logger.info("匯入追蹤表已初始化")
except Exception as e:
logger.error(f"初始化資料庫表失敗: {str(e)}")
def get_config(self, key: str, default: str = None) -> Optional[str]:
"""
取得配置值
Args:
key: 配置鍵
default: 預設值
Returns:
Optional[str]: 配置值
"""
session = Session()
try:
config = session.query(ImportConfig).filter_by(config_key=key).first()
if config:
return config.config_value
return default
finally:
session.close()
def set_config(self, key: str, value: str, config_type: str = 'string', description: str = None):
"""
設定配置值
Args:
key: 配置鍵
value: 配置值
config_type: 配置類型
description: 配置說明
"""
session = Session()
try:
config = session.query(ImportConfig).filter_by(config_key=key).first()
if config:
config.config_value = value
config.config_type = config_type
if description:
config.description = description
config.updated_at = datetime.now(TAIPEI_TZ).replace(tzinfo=None)
else:
config = ImportConfig(
config_key=key,
config_value=value,
config_type=config_type,
description=description
)
session.add(config)
session.commit()
logger.info(f"配置已更新: {key} = {value}")
except Exception as e:
session.rollback()
logger.error(f"設定配置失敗: {str(e)}")
finally:
session.close()
def create_import_job(self, job_type: str, drive_file_id: str, drive_file_name: str,
drive_file_size: int = None) -> Optional[int]:
"""
建立匯入任務
Args:
job_type: 任務類型daily_sales 或 vendor_stockout
drive_file_id: Google Drive 檔案 ID
drive_file_name: 檔案名稱
drive_file_size: 檔案大小
Returns:
Optional[int]: 任務 ID
"""
session = Session()
try:
job = ImportJob(
job_type=job_type,
status='pending',
drive_file_id=drive_file_id,
drive_file_name=drive_file_name,
drive_file_size=drive_file_size,
progress_percent=0.0,
current_step='等待開始...'
)
session.add(job)
session.commit()
job_id = job.id
logger.info(f"已建立匯入任務: ID={job_id}, 檔案={drive_file_name}")
return job_id
except Exception as e:
session.rollback()
logger.error(f"建立匯入任務失敗: {str(e)}")
return None
finally:
session.close()
def update_job_status(self, job_id: int, status: str, progress: float = None,
current_step: str = None, error_message: str = None):
"""
更新任務狀態
Args:
job_id: 任務 ID
status: 狀態
progress: 進度百分比
current_step: 當前步驟
error_message: 錯誤訊息
"""
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if not job:
logger.warning(f"找不到任務: ID={job_id}")
return
job.status = status
if progress is not None:
job.progress_percent = progress
if current_step:
job.current_step = current_step
if error_message:
job.error_message = error_message
# 更新時間戳 (2026-01-30 修正:使用台北時區)
if status == 'downloading' or status == 'importing':
if not job.started_at:
job.started_at = datetime.now(TAIPEI_TZ).replace(tzinfo=None)
elif status in ['completed', 'failed']:
job.completed_at = datetime.now(TAIPEI_TZ).replace(tzinfo=None)
session.commit()
logger.info(f"任務 {job_id} 狀態已更新: {status} ({progress}%)")
except Exception as e:
session.rollback()
logger.error(f"更新任務狀態失敗: {str(e)}")
finally:
session.close()
def update_job_progress(self, job_id: int, total_rows: int = None, processed_rows: int = None,
success_rows: int = None, error_rows: int = None):
"""
更新任務進度
Args:
job_id: 任務 ID
total_rows: 總行數
processed_rows: 已處理行數
success_rows: 成功行數
error_rows: 錯誤行數
"""
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if not job:
return
if total_rows is not None:
job.total_rows = total_rows
if processed_rows is not None:
job.processed_rows = processed_rows
if success_rows is not None:
job.success_rows = success_rows
if error_rows is not None:
job.error_rows = error_rows
# 計算進度百分比
if job.total_rows and job.processed_rows:
job.progress_percent = (job.processed_rows / job.total_rows) * 100
session.commit()
except Exception as e:
session.rollback()
logger.error(f"更新任務進度失敗: {str(e)}")
finally:
session.close()
def get_job_status(self, job_id: int) -> Optional[Dict[str, Any]]:
"""
取得任務狀態
Args:
job_id: 任務 ID
Returns:
Optional[Dict]: 任務資訊
"""
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if job:
return job.to_dict()
return None
finally:
session.close()
def get_recent_jobs(self, limit: int = 20) -> list:
"""
取得最近的任務清單
Args:
limit: 返回數量
Returns:
list: 任務清單
"""
session = Session()
try:
jobs = session.query(ImportJob).order_by(
ImportJob.created_at.desc()
).limit(limit).all()
return [job.to_dict() for job in jobs]
finally:
session.close()
def process_daily_sales_import(self, job_id: int, file_path: str) -> bool:
"""
處理當日業績匯入
Args:
job_id: 任務 ID
file_path: Excel 檔案路徑
Returns:
bool: 是否成功
"""
try:
self.update_job_status(job_id, 'importing', 50, '正在匯入資料...')
# 讀取 Excel 檔案
logger.info(f"開始讀取 Excel 檔案: {file_path}")
try:
df, excel_metadata = _read_daily_sales_excel(file_path)
except ValueError as exc:
error_msg = str(exc)
step = '欄位驗證失敗' if '欄位' in error_msg else '匯入失敗'
logger.error(error_msg)
self.update_job_status(job_id, 'failed', 50, step, error_msg)
return False
logger.info(
"任務 %s 選用 Excel 工作表: %s | 表頭列=%s | 日期欄=%s | 欄位數=%s | 資料列=%s",
job_id,
excel_metadata.get("sheet_name"),
excel_metadata.get("header_row"),
excel_metadata.get("date_col") or "未提供",
len(df.columns),
len(df),
)
# 匯入到資料庫
table_name = 'daily_sales_snapshot'
# 找到日期欄位
date_col = excel_metadata.get("date_col") or _find_daily_sales_date_column(df.columns)
if date_col:
# 解析日期
parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
if parsed_dates.dropna().empty:
error_msg = f"Excel 日期防禦失敗:日期欄位「{date_col}」無法解析出有效日期"
logger.error(error_msg)
self.update_job_status(job_id, 'failed', 50, '日期驗證失敗', error_msg)
return False
df['snapshot_date'] = parsed_dates.dt.date
logger.info(f"使用日期欄位: {date_col}")
if _daily_sales_staleness_guard_enabled():
max_snapshot_date = df['snapshot_date'].dropna().max()
today = datetime.now(TAIPEI_TZ).date()
lag_days = (today - max_snapshot_date).days
max_stale_days = _daily_sales_max_stale_days()
if lag_days > max_stale_days:
error_msg = (
f"Excel 日期防禦失敗:明細最大日期 {max_snapshot_date}"
f"已落後 {lag_days} 天(上限 {max_stale_days} 天)。"
"請上傳最新當日業績檔,或將舊檔移至已匯入/匯入失敗資料夾。"
)
logger.error(error_msg)
self.update_job_status(job_id, 'failed', 50, '日期驗證失敗', error_msg)
return False
else:
# 使用當前日期
df['snapshot_date'] = datetime.now(TAIPEI_TZ).date()
logger.info("未找到日期欄位,使用當前日期(台北時區)")
# 寫入資料庫 - 使用全域的 engine支援 PostgreSQL 和 SQLite
# 使用模組頂部定義的 engine確保連接到正確的資料庫
# 更新進度
total_rows = len(df)
self.update_job_progress(job_id, total_rows=total_rows, processed_rows=0)
# 取得此次匯入的日期範圍
import_dates = df['snapshot_date'].unique()
logger.info(f"本次匯入包含 {len(import_dates)} 個日期的資料")
# 刪除資料庫中相同日期的舊資料(覆蓋邏輯)
if len(import_dates) > 0:
# 過濾掉 None 值
valid_dates = _normalise_date_values_for_sql(import_dates)
if valid_dates:
date_placeholders, date_params = _build_in_clause("snapshot_date", valid_dates)
snapshot_date_expr = _date_filter_expr("snapshot_date")
with engine.connect() as conn:
# 刪除相同日期的舊資料
delete_query = text(
f"DELETE FROM {table_name} WHERE {snapshot_date_expr} IN ({date_placeholders})"
)
result = conn.execute(delete_query, date_params)
deleted_count = result.rowcount
conn.commit()
if deleted_count > 0:
logger.info(f"已刪除 {deleted_count} 筆舊資料(覆蓋模式)")
# 寫入資料庫(帶驗證和重試機制)
max_retries = 2
retry_count = 0
write_success = False
while retry_count <= max_retries and not write_success:
try:
if retry_count > 0:
logger.warning(f"任務 {job_id}{retry_count} 次重試寫入...")
self.update_job_status(job_id, 'importing', 60, f'重試寫入中 ({retry_count}/{max_retries})...')
df.to_sql(
table_name,
engine,
if_exists='append',
index=False,
method='multi',
chunksize=1000
)
# V-Fix: 匯入後驗證 - 確認資料已正確寫入資料庫
self.update_job_status(job_id, 'importing', 85, '驗證資料寫入...')
# 取得本次匯入的日期
import_dates = df['snapshot_date'].dropna().unique()
if len(import_dates) > 0:
# 查詢資料庫中這些日期的資料筆數
raw_valid_dates = [d for d in import_dates if d is not None]
valid_dates = _normalise_date_values_for_sql(raw_valid_dates)
date_placeholders, date_params = _build_in_clause("verify_date", valid_dates)
snapshot_date_expr = _date_filter_expr("snapshot_date")
with engine.connect() as conn:
verify_query = text(
f"SELECT COUNT(*) FROM {table_name} WHERE {snapshot_date_expr} IN ({date_placeholders})"
)
result = conn.execute(verify_query, date_params)
db_count = result.scalar()
# 驗證:資料庫筆數應該 >= 本次匯入筆數(可能有其他日期的舊資料)
expected_count = len(df[df['snapshot_date'].isin(raw_valid_dates)])
if db_count >= expected_count:
logger.info(f"任務 {job_id} 驗證成功: 預期 {expected_count} 筆, 資料庫有 {db_count}")
write_success = True
else:
logger.warning(f"任務 {job_id} 驗證失敗: 預期 {expected_count} 筆, 資料庫只有 {db_count}")
retry_count += 1
else:
# 沒有有效日期,跳過驗證
logger.warning(f"任務 {job_id} 無法驗證: 沒有有效的 snapshot_date")
write_success = True
except Exception as write_error:
logger.error(f"任務 {job_id} 寫入失敗 (嘗試 {retry_count + 1}): {str(write_error)}")
retry_count += 1
if retry_count > max_retries:
raise write_error
if not write_success:
error_msg = f"資料寫入驗證失敗,已重試 {max_retries}"
self.update_job_status(job_id, 'failed', 85, '驗證失敗', error_msg)
logger.error(f"任務 {job_id} {error_msg}")
return False
# === V-New 2026-01-15: 同步寫入 realtime_sales_monthly ===
# 目的:讓當日業績 raw data 同時呈現在「業績分析儀表板」
# 2026-01-30 修復:加強欄位驗證、同步狀態追蹤、失敗告警
self.update_job_status(job_id, 'importing', 90, '同步至業績分析儀表板...')
sync_success = False
sync_error_msg = None
monthly_table = 'realtime_sales_monthly'
try:
# 準備資料:移除 snapshot_date 欄位realtime_sales_monthly 不需要此欄位)
df_monthly = df.drop(columns=['snapshot_date'], errors='ignore')
# 2026-01-30 修正:強化欄位名稱轉換
# 將特殊字符轉換為 PostgreSQL 安全格式
column_mapping = {}
for col in df_monthly.columns:
new_col = col.replace('%', '_pct').replace('(', '_').replace(')', '_')
column_mapping[col] = new_col
df_monthly = df_monthly.rename(columns=column_mapping)
# 記錄轉換的欄位
converted_cols = [f"'{k}' -> '{v}'" for k, v in column_mapping.items() if k != v]
if converted_cols:
logger.info(f"任務 {job_id} 欄位名稱轉換: {', '.join(converted_cols)}")
logger.info(f"任務 {job_id} 欄位轉換完成,共 {len(df_monthly.columns)} 個欄位")
# 2026-01-30 新增:驗證 DataFrame 欄位和目標表欄位是否一致
with engine.connect() as conn:
col_query = text(f"""
SELECT column_name FROM information_schema.columns
WHERE table_name = '{monthly_table}' AND column_name != 'id'
ORDER BY ordinal_position
""")
result = conn.execute(col_query)
target_columns = set([row[0] for row in result])
df_columns = set(df_monthly.columns)
missing_in_table = df_columns - target_columns
missing_in_df = target_columns - df_columns
if missing_in_table:
logger.warning(f"任務 {job_id} 欄位警告: DataFrame 有但表中沒有: {missing_in_table}")
# 移除表中沒有的欄位,避免 INSERT 失敗
df_monthly = df_monthly.drop(columns=list(missing_in_table), errors='ignore')
logger.info(f"任務 {job_id} 已移除多餘欄位,剩餘 {len(df_monthly.columns)} 個欄位")
if missing_in_df:
logger.warning(f"任務 {job_id} 欄位警告: 表中有但 DataFrame 沒有: {missing_in_df}")
# 取得本次匯入的日期列表(使用原始「日期」欄位)
unique_dates = []
if '日期' in df.columns:
unique_dates = df['日期'].dropna().unique().tolist()
logger.info(f"任務 {job_id} 準備同步 {len(unique_dates)} 個日期的資料")
if len(unique_dates) > 0:
# 刪除 realtime_sales_monthly 中相同日期的舊資料(去重)
date_placeholders, date_params = _build_in_clause("monthly_date", unique_dates)
with engine.connect() as conn:
delete_monthly_query = text(
f'DELETE FROM {monthly_table} WHERE "日期" IN ({date_placeholders})'
)
result = conn.execute(delete_monthly_query, date_params)
deleted_monthly = result.rowcount
conn.commit()
if deleted_monthly > 0:
logger.info(f"任務 {job_id} 已從 {monthly_table} 刪除 {deleted_monthly} 筆同日期舊資料")
# 寫入 realtime_sales_monthly
df_monthly.to_sql(
monthly_table,
engine,
if_exists='append',
index=False,
method='multi',
chunksize=1000
)
logger.info(f"任務 {job_id} 已同步 {len(df_monthly)} 筆資料至 {monthly_table}")
# 驗證同步結果
if len(unique_dates) > 0:
with engine.connect() as conn:
date_placeholders, date_params = _build_in_clause("monthly_verify_date", unique_dates)
verify_query = text(
f'SELECT COUNT(*) FROM {monthly_table} WHERE "日期" IN ({date_placeholders})'
)
verify_count = conn.execute(verify_query, date_params).scalar()
if verify_count >= len(df_monthly):
logger.info(f"任務 {job_id} 同步驗證成功: {monthly_table} 現有 {verify_count} 筆資料")
sync_success = True
else:
sync_error_msg = f"同步驗證失敗: 預期 {len(df_monthly)} 筆, 實際 {verify_count}"
logger.error(f"任務 {job_id} {sync_error_msg}")
else:
sync_success = True # 沒有日期資料時視為成功
except Exception as sync_error:
# 同步失敗,記錄完整錯誤
import traceback
sync_error_msg = str(sync_error)
logger.error(f"任務 {job_id} 同步至 {monthly_table} 失敗: {sync_error_msg}")
logger.error(f"任務 {job_id} 同步錯誤堆疊:\n{traceback.format_exc()}")
# 2026-01-30 新增:發送同步失敗告警
try:
from services.notification_manager import NotificationManager
notifier = NotificationManager()
alert_msg = (
f"⚠️ 業績資料同步失敗告警\n"
f"{'='*30}\n"
f"任務 ID: {job_id}\n"
f"目標表: {monthly_table}\n"
f"錯誤: {sync_error_msg[:200]}\n"
f"{'='*30}\n"
f"daily_sales_snapshot 已匯入成功,但業績分析儀表板需要手動同步"
)
notifier._send_telegram_messages([alert_msg])
logger.info(f"任務 {job_id} 已發送同步失敗告警")
except Exception as notify_error:
logger.error(f"任務 {job_id} 發送告警失敗: {notify_error}")
# 更新成功資訊
self.update_job_progress(
job_id,
processed_rows=total_rows,
success_rows=total_rows
)
# 2026-01-30 修正:根據同步狀態設置完成訊息
if sync_success:
completion_msg = '匯入完成(已同步至業績分析儀表板)'
else:
completion_msg = '匯入完成(警告:業績分析儀表板同步失敗,需手動處理)'
self.update_job_status(
job_id,
'completed',
100,
completion_msg
)
# 計算日期範圍
date_min = None
date_max = None
valid_dates = df['snapshot_date'].dropna().unique()
if len(valid_dates) > 0:
sorted_dates = sorted([d for d in valid_dates if d is not None])
if sorted_dates:
date_min = str(sorted_dates[0])
date_max = str(sorted_dates[-1])
logger.info(f"任務 {job_id} 日期範圍: {date_min} ~ {date_max}")
# 更新匯入摘要 (2026-01-30 修正:加入同步狀態)
if sync_success:
sync_message = f'成功匯入 {total_rows} 筆資料,已同步至業績分析儀表板'
else:
sync_message = f'成功匯入 {total_rows} 筆資料,但同步至業績分析儀表板失敗: {sync_error_msg}'
summary = {
'imported_count': total_rows,
'table_name': table_name,
'synced_to': 'realtime_sales_monthly' if sync_success else None,
'sync_success': sync_success,
'sync_error': sync_error_msg,
'verified': True, # daily_sales_snapshot 驗證
'date_min': date_min,
'date_max': date_max,
'source_sheet': excel_metadata.get("sheet_name"),
'source_header_row': excel_metadata.get("header_row"),
'message': sync_message
}
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if job:
job.import_summary = json.dumps(summary, ensure_ascii=False)
session.commit()
finally:
session.close()
logger.info(f"任務 {job_id} 匯入成功: {total_rows}")
try:
from services.cache_service import clear_growth_cache
clear_growth_cache()
except Exception as cache_error:
logger.warning(f"任務 {job_id} 成長分析快取清除失敗: {cache_error}")
# daily_sales cache 失效仍靠 _get_data_fingerprintDB max(snapshot_date)+count(*)
# growth cache 另有 shared file + source fingerprint匯入後主動清掉避免短 TTL 內看到舊圖。
return True
except Exception as e:
error_msg = f"匯入過程發生異常: {str(e)}"
self.update_job_status(job_id, 'failed', 50, '匯入失敗', error_msg)
logger.error(f"任務 {job_id} 匯入異常: {str(e)}")
import traceback
logger.error(traceback.format_exc())
return False
def auto_import_from_drive(self) -> Dict[str, Any]:
"""
從 Google Drive 自動匯入檔案
Returns:
Dict: 執行結果
"""
try:
# 取得配置
folder_path = self.get_config('gdrive_folder_path', '業績報表/當日業績')
file_pattern = self.get_config('gdrive_file_pattern', '即時業績_當日')
logger.info(f"開始檢查 Google Drive: {folder_path}")
# 列出檔案
files = drive_service.list_files_in_folder(folder_path, file_pattern)
if not files:
logger.info("沒有找到待匯入的檔案")
# Staleness gate (critic-approved 2026-05-03)
# 'move-then-success' 反模式:成功 import 後 move_file 把 Excel 搬到
# 「已匯入」資料夾 → 後續排程 list 回空 → 走此分支 silent return success
# → 4/27~5/2 daily_sales_snapshot 停更 8 天無告警。補主動偵測:
# Drive 空 + DB ≥3 天無新資料時主動發催促告警(週末跨假期不誤觸)。
try:
from database.manager import get_session
from sqlalchemy import text
from datetime import date
from services.openclaw_strategist_service import _send_data_stale_alert
_stale_session = get_session()
try:
last_date = _stale_session.execute(
text("SELECT MAX(snapshot_date)::date FROM daily_sales_snapshot")
).scalar()
finally:
_stale_session.close()
if last_date:
days_since = (date.today() - last_date).days
if days_since >= 3:
_send_data_stale_alert(
report_type="upstream_drive",
last_date=str(last_date),
period=f"已停更 {days_since}",
)
except Exception:
logger.error(
"staleness check failed in auto_import_from_drive",
exc_info=True,
)
return {
'success': True,
'message': '沒有找到待匯入的檔案',
'file_count': 0
}
# 處理每個檔案
imported_count = 0
total_rows = 0
all_dates = [] # 收集所有匯入的日期
failed_files = []
for file in files:
file_id = file['id']
file_name = file['name']
file_size = file.get('size', 0)
logger.info(f"發現檔案: {file_name}")
# 建立匯入任務
job_id = self.create_import_job('daily_sales', file_id, file_name, file_size)
if not job_id:
continue
# 下載檔案
self.update_job_status(job_id, 'downloading', 10, '正在下載檔案...')
temp_dir = 'data/temp'
os.makedirs(temp_dir, exist_ok=True)
local_path = os.path.join(temp_dir, file_name)
if not drive_service.download_file(file_id, local_path):
err = '無法從 Google Drive 下載檔案'
self.update_job_status(job_id, 'failed', 10, '下載失敗', err)
failed_files.append({'file': file_name, 'error': err})
continue
# 更新本地路徑
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if job:
job.local_file_path = local_path
session.commit()
finally:
session.close()
self.update_job_status(job_id, 'downloading', 40, '下載完成')
# 匯入資料
if self.process_daily_sales_import(job_id, local_path):
# 移動 Google Drive 檔案到「已匯入」資料夾
self.update_job_status(job_id, 'completed', 90, '正在移動雲端檔案...')
# 取得「已匯入」資料夾路徑配置
archive_folder = self.get_config('gdrive_archive_folder', '已匯入')
if drive_service.move_file(file_id, archive_folder):
logger.info(f"已移動 Google Drive 檔案到「{archive_folder}」: {file_name}")
else:
logger.warning(f"無法移動 Google Drive 檔案: {file_name}")
self.update_job_status(job_id, 'completed', 100, '完成')
imported_count += 1
# 讀取 job summary 取得匯入筆數和日期範圍
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
if job and job.import_summary:
summary = json.loads(job.import_summary)
total_rows += summary.get('imported_count', 0)
if summary.get('date_min'):
all_dates.append(summary['date_min'])
if summary.get('date_max'):
all_dates.append(summary['date_max'])
finally:
session.close()
# 清理本地檔案
try:
os.remove(local_path)
logger.info(f"已清理本地檔案: {local_path}")
except Exception as e:
logger.warning(f"清理本地檔案失敗: {str(e)}")
else:
session = Session()
try:
job = session.query(ImportJob).filter_by(id=job_id).first()
error_message = (job.error_message if job else None) or '匯入失敗,請查看 import_jobs'
quarantined = False
failed_folder = None
if _should_quarantine_failed_import(error_message):
default_failed_folder = (
f"{folder_path.rstrip('/')}/匯入失敗"
if folder_path and folder_path.strip("/")
else "匯入失敗"
)
failed_folder = self.get_config('gdrive_failed_folder', default_failed_folder)
if drive_service.move_file(file_id, failed_folder, create_missing=True):
quarantined = True
error_message = (
f"{error_message}(已移至「{failed_folder}」避免重複告警)"
)
self.update_job_status(job_id, 'failed', 100, '已移至匯入失敗', error_message)
logger.warning(
"已隔離不可自動修復的匯入失敗檔案: %s%s",
file_name,
failed_folder,
)
else:
logger.error("匯入失敗檔案隔離失敗: %s%s", file_name, failed_folder)
failed_files.append({
'file': file_name,
'job_id': job_id,
'error': error_message,
'quarantined': quarantined,
'failed_folder': failed_folder,
})
finally:
session.close()
try:
os.remove(local_path)
logger.info(f"已清理失敗匯入本地檔案: {local_path}")
except Exception as e:
logger.warning(f"清理失敗匯入本地檔案失敗: {str(e)}")
# 計算日期範圍
date_range = None
if all_dates:
sorted_dates = sorted(set(all_dates))
if sorted_dates:
date_range = {
'min': sorted_dates[0],
'max': sorted_dates[-1]
}
if failed_files:
first_error = failed_files[0].get('error') or '未知錯誤'
message = (
f'找到 {len(files)} 個檔案,成功匯入 {imported_count} 個,'
f'失敗 {len(failed_files)} 個。首筆錯誤:{first_error[:220]}'
)
else:
message = f'成功匯入 {imported_count} 個檔案'
return {
'success': len(failed_files) == 0,
'message': message,
'file_count': len(files),
'imported_count': imported_count,
'failed_count': len(failed_files),
'errors': failed_files,
'total_rows': total_rows,
'date_range': date_range
}
except Exception as e:
error_msg = str(e)
logger.error(f"自動匯入失敗: {error_msg}")
# 區分連線錯誤和真正的匯入錯誤
# 連線錯誤(如 Broken pipe、網路問題不應發送告警
connection_errors = [
'Broken pipe',
'Connection refused',
'Connection reset',
'Connection timed out',
'Name or service not known',
'No route to host',
'Network is unreachable',
'SSL',
'authenticate',
'credentials',
'token'
]
is_connection_error = any(err.lower() in error_msg.lower() for err in connection_errors)
if is_connection_error:
# 連線錯誤:返回成功但無檔案(避免發送告警)
logger.warning(f"Google Drive 連線問題,跳過本次匯入檢查: {error_msg}")
return {
'success': True, # 標記為成功避免告警
'message': f'Google Drive 連線問題,跳過本次檢查',
'file_count': 0,
'imported_count': 0,
'connection_error': True # 標記為連線錯誤供日誌記錄
}
else:
# 真正的匯入錯誤:返回失敗
return {
'success': False,
'message': f'自動匯入失敗: {error_msg}',
'file_count': 0,
'imported_count': 0
}
# 建立全域服務實例
import_service = ImportService()