Files
awoooi/apps/api/src/services/approval_db.py
Your Name cc6140230d
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
fix(api): 產生 MCP PlayBook 修復候選
2026-06-11 15:42:11 +08:00

1234 lines
44 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Database-based Approval Service
================================
Phase 5: 永久記憶植入
將 TrustEngine 的 in-memory 邏輯轉換為資料庫 CRUD 操作。
重啟後資料完好無缺。
Features:
- SQLAlchemy async CRUD
- ApprovalRecord 持久化
- TimelineEvent 持久化
- 與原有 API 契約相容
"""
from datetime import UTC, datetime, timedelta
from typing import Any
from uuid import UUID
import structlog
from sqlalchemy import and_, or_, select, update
from src.core.redis_client import get_redis
from src.core.trust_engine import classify_risk, get_required_signatures
from src.db.base import get_db_context
from src.db.models import ApprovalRecord, TimelineEvent
from src.models.approval import (
ApprovalRequest,
ApprovalRequestCreate,
ApprovalStatus,
BlastRadius,
DataImpact,
DryRunCheck,
RiskLevel,
Signature,
)
from src.services.approval_action_classifier import (
is_executable_repair_approval_action,
is_no_action_approval_action,
)
logger = structlog.get_logger(__name__)
# =============================================================================
# Conversion Helpers
# =============================================================================
def approval_record_to_request(record: ApprovalRecord) -> ApprovalRequest:
"""
Convert DB ApprovalRecord to Pydantic ApprovalRequest
保持 API 契約相容性
"""
# Parse blast_radius from JSON
blast_radius = None
if record.blast_radius:
br = record.blast_radius
blast_radius = BlastRadius(
affected_pods=br.get("affected_pods", 0),
estimated_downtime=br.get("estimated_downtime", "0"),
related_services=br.get("related_services", []),
data_impact=DataImpact(br.get("data_impact", "none").lower())
if br.get("data_impact")
else DataImpact.NONE,
)
# Parse dry_run_checks from JSON
dry_run_checks = []
if record.dry_run_checks:
for check in record.dry_run_checks:
dry_run_checks.append(
DryRunCheck(
name=check.get("name", ""),
passed=check.get("passed", True),
message=check.get("message"),
)
)
# Parse signatures from JSON
signatures = []
if record.signatures:
for sig in record.signatures:
signatures.append(
Signature(
signer_id=sig.get("signer_id", ""),
signer_name=sig.get("signer_name", ""),
timestamp=datetime.fromisoformat(sig["timestamp"])
if sig.get("timestamp")
else datetime.now(UTC),
comment=sig.get("comment"),
)
)
return ApprovalRequest(
id=UUID(record.id),
action=record.action,
description=record.description,
status=ApprovalStatus(record.status.value if hasattr(record.status, 'value') else record.status),
risk_level=RiskLevel(record.risk_level.value if hasattr(record.risk_level, 'value') else record.risk_level),
blast_radius=blast_radius,
dry_run_checks=dry_run_checks,
required_signatures=record.required_signatures,
current_signatures=record.current_signatures,
signatures=signatures,
requested_by=record.requested_by,
created_at=record.created_at,
expires_at=record.expires_at,
resolved_at=record.resolved_at,
rejection_reason=record.rejection_reason,
metadata=record.extra_metadata,
# 戰略 B: 告警風暴收斂
fingerprint=record.fingerprint,
hit_count=record.hit_count,
last_seen_at=record.last_seen_at,
# B3 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要)
incident_id=getattr(record, "incident_id", None),
matched_playbook_id=getattr(record, "matched_playbook_id", None),
telegram_message_id=getattr(record, "telegram_message_id", None),
telegram_chat_id=getattr(record, "telegram_chat_id", None),
)
def approval_request_to_record_data(
request: ApprovalRequestCreate,
risk_level: RiskLevel,
required_sigs: int,
fingerprint: str | None = None, # 戰略 B: 告警指紋
) -> dict[str, Any]:
"""
Convert ApprovalRequestCreate to dict for ApprovalRecord creation
"""
metadata = dict(request.metadata or {})
preallocated_approval_id = str(metadata.pop("preallocated_approval_id", "") or "").strip()
if preallocated_approval_id:
UUID(preallocated_approval_id)
blast_radius_dict = None
if request.blast_radius:
blast_radius_dict = {
"affected_pods": request.blast_radius.affected_pods,
"estimated_downtime": request.blast_radius.estimated_downtime,
"related_services": request.blast_radius.related_services,
"data_impact": request.blast_radius.data_impact.value.lower()
if request.blast_radius.data_impact
else "none",
}
dry_run_checks_list = []
if request.dry_run_checks:
for check in request.dry_run_checks:
dry_run_checks_list.append({
"name": check.name,
"passed": check.passed,
"message": check.message,
})
now = datetime.now(UTC)
record_data = {
"action": request.action,
"description": request.description,
"status": ApprovalStatus.APPROVED if risk_level == RiskLevel.LOW else ApprovalStatus.PENDING,
"risk_level": risk_level,
"required_signatures": required_sigs,
"current_signatures": 0,
"signatures": [],
"blast_radius": blast_radius_dict or {},
"dry_run_checks": dry_run_checks_list,
"requested_by": request.requested_by,
"expires_at": request.expires_at,
"extra_metadata": metadata or None,
"resolved_at": now if risk_level == RiskLevel.LOW else None,
# 戰略 B: 告警風暴收斂
"fingerprint": fingerprint,
"hit_count": 1,
"last_seen_at": now,
# 2026-04-14 Claude Sonnet 4.6: 補漏 — 原本 incident_id/telegram_message_id
# 不在 dict 裡導致 DB 欄位永遠 NULLTelegram 卡片顯示 INC 號是空白
# 用戶在 Telegram 根本認不出對應的告警,審核閉環名存實亡
"incident_id": request.incident_id,
# B5 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補 matched_playbook_id
# 原本缺失 → Playbook 人工審核後 trust score 永遠不更新(學習閉環斷鏈)
"matched_playbook_id": getattr(request, "matched_playbook_id", None),
}
if preallocated_approval_id:
record_data["id"] = preallocated_approval_id
return record_data
def _record_value(value: Any) -> str:
if hasattr(value, "value"):
value = value.value
text = str(value or "").strip()
if "." in text:
text = text.rsplit(".", 1)[-1]
return text.lower()
def _record_int(value: Any) -> int:
try:
return int(value or 0)
except (TypeError, ValueError):
return 0
def _approval_gate_stage(record: ApprovalRecord) -> str:
status = _record_value(record.status)
current = _record_int(getattr(record, "current_signatures", 0))
required = _record_int(getattr(record, "required_signatures", 0))
if status == "pending":
return "approval_required"
if status == "approved" and current == 0 and required == 0:
return "approval_auto_approved"
if status == "approved":
return "approval_approved"
if status == "execution_success":
return "execution_verified"
if status == "execution_failed":
return "execution_failed"
if status in {"rejected", "expired"}:
return f"approval_{status}"
return "approval_status_recorded"
def _approval_gate_status(record: ApprovalRecord) -> str:
status = _record_value(record.status)
if status == "pending":
return "warning"
if status in {"approved", "execution_success"}:
return "success"
if status in {"rejected", "expired", "execution_failed"}:
return "error"
return "info"
def _approval_needs_human(record: ApprovalRecord) -> bool:
status = _record_value(record.status)
current = _record_int(getattr(record, "current_signatures", 0))
required = _record_int(getattr(record, "required_signatures", 0))
return status == "pending" and current < required
def _approval_next_action(record: ApprovalRecord) -> str:
status = _record_value(record.status)
if status == "pending":
return (
"operator_approve_or_reject"
if _approval_needs_human(record)
else "execute_or_verify"
)
if status == "approved":
return "execute_or_verify"
if status == "execution_success":
return "verify_or_close"
if status == "execution_failed":
return "manual_fix_or_rollback"
if status in {"rejected", "expired"}:
return "review_or_close"
return "review_status_chain"
def _approval_blocked_reason(record: ApprovalRecord) -> str:
status = _record_value(record.status)
if status == "pending" and _approval_needs_human(record):
return "waiting_for_required_signatures"
if status == "execution_failed":
return "execution_failed"
if status == "rejected":
return "operator_rejected"
if status == "expired":
return "approval_expired"
return "none"
def _approval_decision_mode(record: ApprovalRecord) -> str:
current = _record_int(getattr(record, "current_signatures", 0))
required = _record_int(getattr(record, "required_signatures", 0))
risk_level = _record_value(record.risk_level)
if _approval_needs_human(record) or current > 0:
return "manual"
if risk_level == "low" and required == 0:
return "auto"
return "manual"
def build_approval_created_timeline_event(record: ApprovalRecord) -> TimelineEvent:
"""Create the raw audit rail event that mirrors a newly-created approval gate."""
current = _record_int(getattr(record, "current_signatures", 0))
required = _record_int(getattr(record, "required_signatures", 0))
risk_level = _record_value(record.risk_level)
needs_human = _approval_needs_human(record)
stage = _approval_gate_stage(record)
next_action = _approval_next_action(record)
mode = _approval_decision_mode(record)
description = "; ".join(
[
f"stage={stage}",
f"next_action={next_action}",
f"blocked_reason={_approval_blocked_reason(record)}",
f"auto_or_manual={mode}",
f"needs_human={'yes' if needs_human else 'no'}",
f"risk_level={risk_level}",
f"signatures={current}/{required}",
f"action={str(getattr(record, 'action', '') or '')[:240]}",
]
)
title = (
"Approval gate waiting for human decision"
if needs_human
else "Approval gate passed"
)
return TimelineEvent(
event_type="human",
status=_approval_gate_status(record),
title=title,
description=description,
actor=getattr(record, "requested_by", None),
actor_role="approval_gate",
risk_level=risk_level,
approval_id=str(record.id),
incident_id=getattr(record, "incident_id", None),
)
def add_approval_created_timeline_event(db: Any, record: ApprovalRecord) -> TimelineEvent:
event = build_approval_created_timeline_event(record)
db.add(event)
return event
# =============================================================================
# Database Approval Service
# =============================================================================
class ApprovalDBService:
"""
資料庫授權服務 - 替代 in-memory TrustEngine
所有操作皆為資料庫 CRUD重啟後資料保持
Phase 16 R3.4: 支援 Repository 注入
- 有注入 repository: 使用 Repository 層
- 無注入: 使用內嵌 DB 操作 (向下相容)
"""
def __init__(self, repository=None):
"""
初始化 ApprovalDBService
Args:
repository: IApprovalRepository 實例 (可選Phase 16 DI)
"""
self._repository = repository
async def create_approval(
self,
request: ApprovalRequestCreate,
) -> ApprovalRequest:
"""
建立新授權請求 (寫入資料庫)
"""
# 分類風險
risk_level = classify_risk(
action=request.action,
blast_radius=request.blast_radius,
explicit_level=request.risk_level,
)
# 取得所需簽核數
required_sigs = get_required_signatures(risk_level)
# 準備資料
data = approval_request_to_record_data(request, risk_level, required_sigs)
async with get_db_context() as db:
record = ApprovalRecord(**data)
db.add(record)
await db.flush()
await db.refresh(record)
add_approval_created_timeline_event(db, record)
logger.info(
"approval_created_db",
id=record.id,
risk_level=risk_level.value,
status=record.status.value if hasattr(record.status, 'value') else record.status,
)
return approval_record_to_request(record)
# =========================================================================
# 戰略 B: 告警風暴收斂
# =========================================================================
async def create_approval_with_fingerprint(
self,
request: ApprovalRequestCreate,
fingerprint: str,
) -> ApprovalRequest:
"""
建立帶指紋的授權請求 (戰略 B)
用於告警收斂:相同指紋的告警會被聚合。
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
所有 webhook 路徑都未傳 expires_at導致 DB 欄位為 NULL
get_pending_approvals() 的自動過期邏輯 (WHERE expires_at < now)
永遠不觸發,殭屍 PENDING 記錄無限堆積。
修正:凡未傳 expires_at自動注入 48h 預設值。
"""
DEFAULT_APPROVAL_TTL_HOURS = 48 # 給人類 48h 決定視窗
if not request.expires_at:
now = datetime.now(UTC)
request = request.model_copy(
update={"expires_at": now + timedelta(hours=DEFAULT_APPROVAL_TTL_HOURS)}
)
risk_level = classify_risk(
action=request.action,
blast_radius=request.blast_radius,
explicit_level=request.risk_level,
)
required_sigs = get_required_signatures(risk_level)
data = approval_request_to_record_data(request, risk_level, required_sigs, fingerprint=fingerprint)
async with get_db_context() as db:
record = ApprovalRecord(**data)
db.add(record)
await db.flush()
await db.refresh(record)
add_approval_created_timeline_event(db, record)
logger.info(
"approval_created_with_fingerprint",
id=record.id,
fingerprint=fingerprint,
risk_level=risk_level.value,
)
return approval_record_to_request(record)
async def find_by_fingerprint(
self,
fingerprint: str,
debounce_minutes: int = 5,
) -> ApprovalRequest | None:
"""
根據指紋查詢現有的告警記錄 (戰略 B)
查詢條件:
1. 相同指紋
2. 狀態為 PENDING 且在 24 小時內建立(超過 24h 的 PENDING 視為過期,不再收斂)
3. 或在 debounce_minutes 分鐘內建立(不論狀態)
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
原邏輯 PENDING 無 TTL → 3 天前 PENDING 記錄永久封鎖同指紋告警。
修正PENDING 收斂窗口上限 PENDING_TTL_HOURS24h
Returns:
ApprovalRequest if found, None otherwise
"""
PENDING_TTL_HOURS = 24 # PENDING 記錄最長收斂時效(超過則視為已過期)
now = datetime.now(UTC)
cutoff_time = now - timedelta(minutes=debounce_minutes)
pending_cutoff = now - timedelta(hours=PENDING_TTL_HOURS)
async with get_db_context() as db:
result = await db.execute(
select(ApprovalRecord)
.where(ApprovalRecord.fingerprint == fingerprint)
.where(
or_(
# PENDING 狀態但必須在 24h 內,防止老 PENDING 永久封鎖
and_(
ApprovalRecord.status == ApprovalStatus.PENDING,
ApprovalRecord.created_at >= pending_cutoff,
),
# 最近 debounce_minutes 分鐘內建立的任何記錄
ApprovalRecord.created_at >= cutoff_time,
)
)
.order_by(ApprovalRecord.created_at.desc())
.limit(1)
)
record = result.scalar_one_or_none()
if record:
logger.info(
"fingerprint_match_found",
fingerprint=fingerprint,
approval_id=record.id,
hit_count=record.hit_count,
status=record.status.value if hasattr(record.status, 'value') else record.status,
)
# 2026-04-20 ogt + Claude Opus 4.7: ADR-092 tg_sent Redis 驗證
# PENDING 記錄不代表 Telegram 已發送(可能因網路/Token錯誤而靜默失敗
# 僅在 debounce 窗口外的 PENDING 收斂時,必須確認 Redis 有 tg_sent 標記
within_debounce = record.created_at >= cutoff_time
if not within_debounce:
try:
r = get_redis()
tg_confirmed = await r.exists(f"tg_sent:{fingerprint}")
except Exception as _re:
tg_confirmed = False
logger.warning("tg_sent_redis_check_failed", fingerprint=fingerprint, error=str(_re))
if not tg_confirmed:
logger.warning(
"fingerprint_pending_no_tg_confirmation",
fingerprint=fingerprint,
approval_id=str(record.id),
created_at=record.created_at.isoformat(),
)
return None # 視為新告警,重新發送 Telegram
return approval_record_to_request(record)
return None
async def mark_telegram_confirmed(self, fingerprint: str, ttl: int = 108000) -> None:
"""
2026-04-20 ogt + Claude Opus 4.7: ADR-092
記錄 Telegram 已成功發送,防止 PENDING 誤收斂造成永久靜默。
ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei)
TTL 從 86400s24h延長至 108000s30h = 24h + 6h buffer
避免 PENDING 存活期與 tg_sent key 同時到期時的邊緣誤判。
W-2 現已改用 telegram_message_id IS NULL 判斷(不再依賴此 key
此 TTL 保留供收斂去重邏輯approval_db.py 第 342 行)使用。
"""
try:
r = get_redis()
await r.setex(f"tg_sent:{fingerprint}", ttl, "1")
logger.debug("tg_sent_marked", fingerprint=fingerprint, ttl=ttl)
except Exception as e:
logger.warning("tg_sent_mark_failed", fingerprint=fingerprint, error=str(e))
async def increment_hit_count(
self,
approval_id: UUID,
) -> ApprovalRequest | None:
"""
增加告警聚合次數 (戰略 B)
當相同指紋的告警再次觸發時:
1. hit_count += 1
2. last_seen_at = now
這樣可以跳過 LLM 分析,節省 API 成本!
"""
now = datetime.now(UTC)
async with get_db_context() as db:
# 更新 hit_count 和 last_seen_at
result = await db.execute(
update(ApprovalRecord)
.where(ApprovalRecord.id == str(approval_id))
.values(
hit_count=ApprovalRecord.hit_count + 1,
last_seen_at=now,
)
.returning(ApprovalRecord.hit_count)
)
new_count = result.scalar_one_or_none()
if new_count is None:
return None
# 重新讀取完整記錄
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record:
logger.info(
"hit_count_incremented",
approval_id=str(approval_id),
new_hit_count=new_count,
last_seen_at=now.isoformat(),
)
return approval_record_to_request(record)
return None
async def get_approval(self, approval_id: UUID) -> ApprovalRequest | None:
"""
取得單一授權請求
Phase 16 R3.4: 支援 Repository 注入
"""
# Phase 16: 使用 Repository (如果有注入)
if self._repository:
return await self._repository.get_by_id(approval_id)
# Legacy: 內嵌 DB 操作
async with get_db_context() as db:
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record is None:
return None
return approval_record_to_request(record)
async def get_pending_approvals(self) -> list[ApprovalRequest]:
"""
取得所有待簽核請求
Phase 16 R3.4: 支援 Repository 注入
"""
# Phase 16: 使用 Repository (如果有注入)
if self._repository:
return await self._repository.get_pending()
# Legacy: 內嵌 DB 操作
now = datetime.now(UTC)
async with get_db_context() as db:
# 先更新過期的請求
await db.execute(
update(ApprovalRecord)
.where(ApprovalRecord.status == ApprovalStatus.PENDING)
.where(ApprovalRecord.expires_at < now)
.values(status=ApprovalStatus.EXPIRED, resolved_at=now)
)
# 取得所有 PENDING
result = await db.execute(
select(ApprovalRecord)
.where(ApprovalRecord.status == ApprovalStatus.PENDING)
.order_by(ApprovalRecord.created_at.desc())
)
records = result.scalars().all()
return [approval_record_to_request(r) for r in records]
async def sign_approval(
self,
approval_id: UUID,
signer_id: str,
signer_name: str,
comment: str | None = None,
) -> tuple[ApprovalRequest | None, str, bool]:
"""
簽核授權請求
Phase 5: 使用 FOR UPDATE 行鎖防止 Race Condition
當多人同時簽核時,確保只有一人能成功取得鎖並更新
Returns:
(approval, message, execution_triggered)
"""
async with get_db_context() as db:
# Phase 5: FOR UPDATE 行級鎖 - 防止併發簽核競爭
# SQLite 不支援 FOR UPDATE但 PostgreSQL 完整支援
result = await db.execute(
select(ApprovalRecord)
.where(ApprovalRecord.id == str(approval_id))
.with_for_update() # Row-Level Lock
)
record = result.scalar_one_or_none()
logger.info(
"sign_approval_lock_acquired",
approval_id=str(approval_id),
signer_id=signer_id,
)
if record is None:
return None, "Approval not found", False
# 檢查狀態
status_value = record.status.value if hasattr(record.status, 'value') else record.status
if status_value != "pending":
return (
approval_record_to_request(record),
f"Cannot sign: status is {status_value}",
False,
)
# 檢查是否已簽核
signatures = record.signatures or []
for sig in signatures:
if sig.get("signer_id") == signer_id:
return (
approval_record_to_request(record),
f"User {signer_name} has already signed this approval",
False,
)
# Phase 5: 樂觀鎖 - 記錄更新前的簽名數
old_sig_count = record.current_signatures
# 新增簽章
new_signature = {
"signer_id": signer_id,
"signer_name": signer_name,
"timestamp": datetime.now(UTC).isoformat(),
"comment": comment,
}
signatures.append(new_signature)
new_sig_count = len(signatures)
# 計算新狀態
execution_triggered = False
new_status = record.status
resolved_at = None
if new_sig_count >= record.required_signatures:
new_status = ApprovalStatus.APPROVED
resolved_at = datetime.now(UTC)
execution_triggered = is_executable_repair_approval_action(
record.action
)
# Phase 5: 樂觀鎖更新 - 使用 WHERE current_signatures = old_value
# 如果其他人已更新,這個 UPDATE 會更新 0 行
metadata = dict(record.extra_metadata or {})
if is_no_action_approval_action(record.action):
metadata["execution_kind"] = metadata.get("execution_kind") or "no_action"
metadata["repair_executed"] = False
metadata["repair_attempted"] = False
metadata["execution_suppressed_reason"] = (
"approval_action_has_no_executable_repair"
)
result = await db.execute(
update(ApprovalRecord)
.where(and_(
ApprovalRecord.id == str(approval_id),
ApprovalRecord.current_signatures == old_sig_count, # 樂觀鎖條件
))
.values(
signatures=signatures,
current_signatures=new_sig_count,
status=new_status,
resolved_at=resolved_at,
extra_metadata=metadata,
)
)
# 檢查是否更新成功
if result.rowcount == 0:
logger.warning(
"sign_approval_optimistic_lock_conflict",
approval_id=str(approval_id),
signer_id=signer_id,
old_sig_count=old_sig_count,
)
return (
approval_record_to_request(record),
"Concurrent modification detected. Please retry.",
False,
)
# 重新讀取更新後的記錄
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one()
if execution_triggered:
message = f"Approval complete! ({new_sig_count}/{record.required_signatures} signatures)"
else:
message = f"Signature added ({new_sig_count}/{record.required_signatures})"
logger.info(
"approval_signed_db",
id=record.id,
signer=signer_name,
current=record.current_signatures,
required=record.required_signatures,
execution_triggered=execution_triggered,
)
return approval_record_to_request(record), message, execution_triggered
async def reject_approval(
self,
approval_id: UUID,
rejector_id: str,
rejector_name: str,
reason: str,
) -> tuple[ApprovalRequest | None, str]:
"""
拒絕授權請求
"""
async with get_db_context() as db:
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record is None:
return None, "Approval not found"
status_value = record.status.value if hasattr(record.status, 'value') else record.status
if status_value != "pending":
return (
approval_record_to_request(record),
f"Cannot reject: status is {status_value}",
)
record.status = ApprovalStatus.REJECTED
record.rejection_reason = f"{rejector_name}: {reason}"
record.resolved_at = datetime.now(UTC)
await db.flush()
await db.refresh(record)
logger.info(
"approval_rejected_db",
id=record.id,
rejector=rejector_name,
reason=reason,
)
return approval_record_to_request(record), "Approval rejected"
async def update_execution_status(
self,
approval_id: UUID,
success: bool,
error_message: str | None = None,
execution_kind: str | None = None,
repair_executed: bool | None = None,
repair_attempted: bool | None = None,
) -> None:
"""
更新執行狀態
2026-04-18 ogt + Claude Opus 4.7: ADR-090 L5 斷鏈修復 — P0.2
失敗時必寫 rejection_reason,讓診斷不再黑盒
(之前 EXECUTION_FAILED 216 筆 reason 全空)
"""
async with get_db_context() as db:
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record is None:
logger.warning(
"approval_execution_status_update_missing",
id=str(approval_id),
success=success,
)
return
record.status = status
if not success and error_message:
# 截斷至合理長度,避免爆欄位
record.rejection_reason = str(error_message)[:2000]
if execution_kind:
# 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態,
# 但前台/報表必須能分辨「未執行修復」而非真正 execution success。
metadata = dict(record.extra_metadata or {})
metadata["execution_kind"] = execution_kind
metadata["repair_executed"] = (
repair_executed
if repair_executed is not None
else execution_kind not in {
"no_action",
"diagnostic",
"parse_failed",
"unsupported_action",
}
)
if repair_attempted is not None:
metadata["repair_attempted"] = repair_attempted
record.extra_metadata = metadata
logger.info(
"approval_execution_status_updated",
id=str(approval_id),
success=success,
has_error=bool(error_message),
execution_kind=execution_kind,
)
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
"""
2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 approval_records
讓 Playbook 萃取和 KM 寫入能找到對應的 Incident
2026-04-14 Claude Sonnet 4.6 診斷: Live-fire #7 發現 approval.incident_id 仍 NULL
加 rowcount 與 pre/post 值檢查,若 0 rows affected 則 log warning
"""
async with get_db_context() as db:
result = await db.execute(
update(ApprovalRecord)
.where(ApprovalRecord.id == str(approval_id))
.values(incident_id=incident_id)
)
rowcount = result.rowcount if hasattr(result, "rowcount") else -1
if rowcount == 0:
# 找不到對應 approval — 可能 id 型別或 session 不同步
logger.warning(
"update_incident_id_zero_rows",
approval_id=str(approval_id),
approval_id_type=type(approval_id).__name__,
incident_id=incident_id,
reason="UPDATE 0 rows affected — approval 不存在或 id mismatch",
)
else:
logger.info(
"update_incident_id_success",
approval_id=str(approval_id),
incident_id=incident_id,
rowcount=rowcount,
)
async def update_telegram_message(
self, incident_id: str, telegram_message_id: int, telegram_chat_id: int | None = None
) -> None:
"""
2026-04-09 Claude Sonnet 4.6: 持久化 Telegram message_id 到 DB
讓告警訊息 ID 不再只存 Redis24h TTL支援長期狀態追蹤和訊息更新。
以 incident_id 查找最新 PENDING approval record 並回填。
"""
async with get_db_context() as db:
from sqlalchemy import text as _text
params: dict = {
"incident_id": incident_id,
"telegram_message_id": telegram_message_id,
"status": "PENDING",
}
chat_clause = ""
if telegram_chat_id is not None:
params["telegram_chat_id"] = telegram_chat_id
chat_clause = ", telegram_chat_id = :telegram_chat_id"
await db.execute(
_text(f"""
UPDATE approval_records
SET telegram_message_id = :telegram_message_id{chat_clause}
WHERE incident_id = :incident_id
AND status = :status
"""),
params,
)
async def update_action_by_incident_id(self, incident_id: str, new_action: str) -> int:
"""
Agent Orchestrator 分析完成後覆寫 ApprovalRecord.action。
設計動機 (2026-04-16 ogt + Claude Sonnet 4.6):
- Webhook inline LLM 寫入垃圾 action如 kubectl rollout restart for postgres disk
- Agent 分析正確但只發新 Telegram 卡,未覆寫 ApprovalRecord
- 用戶批准 Agent 卡 → 系統查 incident_id → 執行舊 webhook 垃圾 action
- 修復Agent 完成後呼叫此方法,讓用戶批准時執行正確 action
Args:
incident_id: INC-xxx 格式 Incident ID
new_action: Agent 決定的 action空字串 → 不覆寫)
Returns:
int: rowcount0 表示找不到對應 PENDING approval
"""
if not new_action:
return 0
async with get_db_context() as db:
from sqlalchemy import text as _text
result = await db.execute(
_text("""
UPDATE approval_records
SET action = :new_action
WHERE incident_id = :incident_id
AND status = 'PENDING'
"""),
{"incident_id": incident_id, "new_action": new_action},
)
rowcount = result.rowcount if hasattr(result, "rowcount") else -1
logger.info(
"approval_action_updated_by_agent",
incident_id=incident_id,
new_action=new_action[:80],
rowcount=rowcount,
)
return rowcount
async def update_decision_fusion(
self,
incident_id: str,
composite_score: float,
complexity_tier: str,
fusion_details: dict,
) -> int:
"""
P2.1 DecisionFusionEngine 結果回寫到 approval_records。
2026-04-26 P2-DB-Fix by Claude — db-expert P0 三修P0.3:
ADR-085 鐵律fusion 分數必須落地 PG不能只存 Redis token
Args:
incident_id: INC-xxx 格式 Incident ID
composite_score: FusionScore.composite0.0-1.0
complexity_tier: ComplexityTier.valuelow/medium/high/critical
fusion_details: FusionScore.to_dict() 完整 dict
Returns:
int: rowcount0 表示找不到對應 PENDING approval
"""
async with get_db_context() as db:
result = await db.execute(
update(ApprovalRecord)
.where(
and_(
ApprovalRecord.incident_id == incident_id,
ApprovalRecord.status == ApprovalStatus.PENDING,
)
)
.values(
composite_score=composite_score,
complexity_tier=complexity_tier,
decision_fusion_details=fusion_details,
)
)
rowcount = result.rowcount if hasattr(result, "rowcount") else -1
logger.info(
"approval_decision_fusion_updated",
incident_id=incident_id,
composite_score=composite_score,
complexity_tier=complexity_tier,
rowcount=rowcount,
)
return rowcount
# =========================================================================
# Phase 6.4h: Proposals API 支援方法
# =========================================================================
async def get_approval_by_id(self, approval_id: UUID) -> ApprovalRequest | None:
"""
根據 ID 取得單一授權請求 (Phase 6.4h)
Args:
approval_id: 授權請求 UUID
Returns:
ApprovalRequest if found, None otherwise
"""
async with get_db_context() as db:
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record is None:
return None
return approval_record_to_request(record)
async def get_all_approvals(
self,
status: ApprovalStatus | None = None,
incident_id: str | None = None,
limit: int = 50,
offset: int = 0,
) -> list[ApprovalRequest]:
"""
取得所有授權請求 (Phase 6.4h)
Args:
status: 狀態篩選 (可選)
incident_id: Incident ID 篩選 (可選)
limit: 每頁數量
offset: 偏移量
Returns:
ApprovalRequest 清單
"""
async with get_db_context() as db:
query = select(ApprovalRecord)
# 狀態篩選
if status is not None:
query = query.where(ApprovalRecord.status == status)
# 2026-04-09 Claude Sonnet 4.6: 修復 incident_id 篩選 — 直接用 DB 欄位
# 舊版在應用層查 a.metadata.get("incident_id") 但 ApprovalRecord.incident_id
# 是直接欄位,不在 extra_metadata JSON 裡,導致 telegram_approval_not_found_by_incident
if incident_id:
query = query.where(ApprovalRecord.incident_id == incident_id)
query = query.order_by(ApprovalRecord.created_at.desc())
query = query.offset(offset).limit(limit)
result = await db.execute(query)
records = result.scalars().all()
approvals = [approval_record_to_request(r) for r in records]
return approvals
# =============================================================================
# Timeline Event Service
# =============================================================================
class TimelineDBService:
"""
時間軸事件服務 - Phase 4 Action Timeline 持久化
"""
async def add_event(
self,
event_type: str,
status: str,
title: str,
description: str | None = None,
actor: str | None = None,
actor_role: str | None = None,
risk_level: str | None = None,
approval_id: str | None = None,
incident_id: str | None = None,
) -> dict[str, Any]:
"""
新增時間軸事件
"""
async with get_db_context() as db:
event = TimelineEvent(
event_type=event_type,
status=status,
title=title,
description=description,
actor=actor,
actor_role=actor_role,
risk_level=risk_level,
approval_id=approval_id,
incident_id=incident_id,
)
db.add(event)
await db.flush()
await db.refresh(event)
logger.info(
"timeline_event_added",
id=event.id,
type=event_type,
title=title,
incident_id=incident_id,
)
return {
"id": event.id,
"type": event.event_type,
"status": event.status,
"title": event.title,
"incident_id": event.incident_id,
"created_at": event.created_at.isoformat(),
}
async def get_events(
self,
limit: int = 50,
incident_id: str | None = None,
approval_ids: list[str] | None = None,
) -> list[dict[str, Any]]:
"""
取得最近的時間軸事件
"""
async with get_db_context() as db:
query = select(TimelineEvent)
if incident_id:
from sqlalchemy import or_
filters = [TimelineEvent.incident_id == incident_id]
if approval_ids:
filters.append(TimelineEvent.approval_id.in_(approval_ids))
query = query.where(or_(*filters))
result = await db.execute(query.order_by(TimelineEvent.created_at.desc()).limit(limit))
events = result.scalars().all()
return [
{
"id": e.id,
"type": e.event_type,
"status": e.status,
"title": e.title,
"description": e.description,
"actor": e.actor,
"actor_role": e.actor_role,
"risk_level": e.risk_level,
"approval_id": e.approval_id,
"incident_id": e.incident_id,
"created_at": e.created_at.isoformat(),
}
for e in events
]
# =============================================================================
# Singleton Instances
# =============================================================================
_approval_service: ApprovalDBService | None = None
_timeline_service: TimelineDBService | None = None
def get_approval_service(use_repository: bool = False) -> ApprovalDBService:
"""
取得授權服務實例
Args:
use_repository: 是否使用 Repository 層 (Phase 16 R3.4)
Phase 16: 絞殺者模式
- use_repository=False: 使用內嵌 DB 操作 (預設,向下相容)
- use_repository=True: 使用 ApprovalDBRepository
"""
global _approval_service
if _approval_service is None:
if use_repository:
from src.repositories import get_approval_repository
_approval_service = ApprovalDBService(repository=get_approval_repository())
logger.info("approval_service_with_repository")
else:
_approval_service = ApprovalDBService()
return _approval_service
def get_timeline_service() -> TimelineDBService:
"""取得時間軸服務實例"""
global _timeline_service
if _timeline_service is None:
_timeline_service = TimelineDBService()
return _timeline_service