diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index 0b87427b..269536ce 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -235,7 +235,7 @@ async def _try_auto_repair_background(
"risk_level": decision.risk_level.value if decision.risk_level else None,
"langfuse_trace_id": _langfuse_trace_id,
"langfuse_url": (
- f"http://192.168.0.110:3100/trace/{_langfuse_trace_id}"
+ f"{settings.LANGFUSE_URL}/trace/{_langfuse_trace_id}"
if _langfuse_trace_id else None
),
},
diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py
index 266b2001..bbcc53ac 100644
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -42,6 +42,8 @@ from src.services.global_repair_cooldown import (
check_global_repair_cooldown,
record_global_repair_action,
)
+# Sprint 5.1: Service Registry Guardrail (ADR-062)
+from src.services.service_registry import StatefulLevel, get_service_registry
from src.services.playbook_service import IPlaybookService, get_playbook_service
logger = structlog.get_logger(__name__)
@@ -195,8 +197,8 @@ class AutoRepairService:
# 0.5 Sprint 5.1 Guardrail: Service Registry 服務分級檢查
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
# 全域熔斷之後、嚴重度之前,BLOCK 等級直接拒絕
+ # 保守原則:Registry 讀取失敗也 block(優先安全,不放行)
try:
- from src.services.service_registry import StatefulLevel, get_service_registry
_registry = get_service_registry()
_service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else ""
if not _service_name and incident.affected_services:
@@ -215,8 +217,13 @@ class AutoRepairService:
blocked_by="SERVICE_REGISTRY_BLOCK",
)
except Exception as _guardrail_err:
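+ # S1-3 fix: fail closed. Any exception raised inside this guardrail check (not only a registry read failure) denies auto-repair (ADR-062 review fix, 2026-04-08)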
logger.error("guardrail_check_failed", error=str(_guardrail_err))
- # 保守原則:失敗時繼續(不阻擋,但記錄)
+ return AutoRepairDecision(
+ can_auto_repair=False,
+ reason="Guardrail Service Registry 讀取異常,保守拒絕自動修復",
+ blocked_by="GUARDRAIL_ERROR",
+ )
# 1. 檢查 Incident 嚴重度
if incident.severity and incident.severity.value in ["P0", "P1"]:
diff --git a/apps/api/src/services/preflight_service.py b/apps/api/src/services/preflight_service.py
index 837d9104..6c8f2b53 100644
--- a/apps/api/src/services/preflight_service.py
+++ b/apps/api/src/services/preflight_service.py
@@ -6,7 +6,7 @@
from __future__ import annotations
-import logging
+import structlog
import time
from dataclasses import dataclass
from enum import Enum
@@ -14,7 +14,7 @@ from enum import Enum
from .service_registry import ServiceRegistryClient, get_service_registry
from .velero_client import VeleroClient, get_velero_client
-logger = logging.getLogger(__name__)
+logger = structlog.get_logger(__name__)
class PreflightResult(str, Enum):
diff --git a/apps/api/src/services/service_registry.py b/apps/api/src/services/service_registry.py
index c15472b1..2c38010c 100644
--- a/apps/api/src/services/service_registry.py
+++ b/apps/api/src/services/service_registry.py
@@ -6,14 +6,14 @@
from __future__ import annotations
-import logging
+import structlog
from enum import Enum
from pathlib import Path
from typing import Any
import yaml
-logger = logging.getLogger(__name__)
+logger = structlog.get_logger(__name__)
# YAML 路徑(相對於 repo root)
_DEFAULT_REGISTRY_PATH = Path(__file__).parents[5] / "ops" / "config" / "service-registry.yaml"
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index 4d267ee2..2d1699e8 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -2666,16 +2666,19 @@ class TelegramGateway:
reason: str,
) -> None:
"""T1: GUARDRAIL_BLOCKED — 服務屬於 BLOCK 等級,禁止自動修復"""
- text = (
- "🚫 [服務保護] 自動修復已阻擋\n"
- "━━━━━━━━━━━━━━━━━\n"
- f"服務: {html.escape(service_name)}\n"
- f"告警: {html.escape(alertname)}\n"
- f"原因: {html.escape(reason)}\n"
- "━━━━━━━━━━━━━━━━━\n"
- "⚠️ 請人工評估並手動處理"
- )
- await self.send_notification(text)
+ try:
+ text = (
+ "🚫 [服務保護] 自動修復已阻擋\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ f"服務: {html.escape(service_name)}\n"
+ f"告警: {html.escape(alertname)}\n"
+ f"原因: {html.escape(reason)}\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ "⚠️ 請人工評估並手動處理"
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t1_guardrail_blocked_notify_failed", service=service_name, error=str(e))
async def send_preflight_failed(
self,
@@ -2685,21 +2688,24 @@ class TelegramGateway:
backup_name: str | None,
) -> None:
"""T2: PRE_FLIGHT_FAILED + BACKUP_TRIGGERED — 備份過期,修復暫停"""
- backup_status = (
- f"緊急備份: 已啟動 {html.escape(backup_name)}"
- if backup_name
- else "緊急備份: 啟動失敗,請人工處理"
- )
- text = (
- "⏸ [Pre-flight 阻擋] 備份已過期,修復暫停\n"
- "━━━━━━━━━━━━━━━━━\n"
- f"服務: {html.escape(service_name)}\n"
- f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)\n"
- f"{backup_status}\n"
- "━━━━━━━━━━━━━━━━━\n"
- "請等待備份完成後,人工重新評估修復方案"
- )
- await self.send_notification(text)
+ try:
+ backup_status = (
+ f"緊急備份: 已啟動 {html.escape(backup_name)}"
+ if backup_name
+ else "緊急備份: 啟動失敗,請人工處理"
+ )
+ text = (
+ "⏸ [Pre-flight 阻擋] 備份已過期,修復暫停\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ f"服務: {html.escape(service_name)}\n"
+ f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)\n"
+ f"{backup_status}\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ "請等待備份完成後,人工重新評估修復方案"
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t2_preflight_failed_notify_failed", service=service_name, error=str(e))
async def send_backup_result(
self,
@@ -2708,21 +2714,24 @@ class TelegramGateway:
error_msg: str | None = None,
) -> None:
"""T3: BACKUP_COMPLETED / BACKUP_FAILED — 緊急備份結果"""
- if success:
- text = (
- "✅ 緊急備份完成\n"
- f"備份: {html.escape(backup_name)}\n"
- "可繼續手動執行修復"
- )
- else:
- err = html.escape(error_msg or "未知錯誤")
- text = (
- "❌ 緊急備份失敗\n"
- f"備份: {html.escape(backup_name)}\n"
- f"錯誤: {err}\n"
- "請人工介入,備份異常"
- )
- await self.send_notification(text)
+ try:
+ if success:
+ text = (
+ "✅ 緊急備份完成\n"
+ f"備份: {html.escape(backup_name)}\n"
+ "可繼續手動執行修復"
+ )
+ else:
+ err = html.escape(error_msg or "未知錯誤")
+ text = (
+ "❌ 緊急備份失敗\n"
+ f"備份: {html.escape(backup_name)}\n"
+ f"錯誤: {err}\n"
+ "請人工介入,備份異常"
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t3_backup_result_notify_failed", backup=backup_name, error=str(e))
async def send_multisig_waiting(
self,
@@ -2733,18 +2742,21 @@ class TelegramGateway:
approval_id: str,
) -> None:
"""T4: APPROVAL_ESCALATED — 第 1 票完成,等待第 2 票"""
- text = (
- "🔐 [MultiSig] 等待第 2 票授權\n"
- "━━━━━━━━━━━━━━━━━\n"
- f"操作: {html.escape(action)}\n"
- f"服務: {html.escape(service_name)}\n"
- f"風險: CRITICAL(HITL 雙簽)\n"
- f"已獲授權: {votes_received}/{votes_required} 票\n"
- f"審核 ID: {html.escape(approval_id)}\n"
- "━━━━━━━━━━━━━━━━━\n"
- "請第二位審核者登入確認"
- )
- await self.send_notification(text)
+ try:
+ text = (
+ "🔐 [MultiSig] 等待第 2 票授權\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ f"操作: {html.escape(action)}\n"
+ f"服務: {html.escape(service_name)}\n"
+ f"風險: CRITICAL(HITL 雙簽)\n"
+ f"已獲授權: {votes_received}/{votes_required} 票\n"
+ f"審核 ID: {html.escape(approval_id)}\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ "請第二位審核者登入確認"
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t4_multisig_waiting_notify_failed", approval=approval_id, error=str(e))
async def send_multisig_approved(
self,
@@ -2752,13 +2764,16 @@ class TelegramGateway:
service_name: str,
) -> None:
"""T5: MultiSig 完成(2/2)"""
- text = (
- "✅ [MultiSig 完成] 雙簽授權通過\n"
- f"操作: {html.escape(action)}\n"
- f"服務: {html.escape(service_name)}\n"
- "授權: 2/2 票 開始執行..."
- )
- await self.send_notification(text)
+ try:
+ text = (
+ "✅ [MultiSig 完成] 雙簽授權通過\n"
+ f"操作: {html.escape(action)}\n"
+ f"服務: {html.escape(service_name)}\n"
+ "授權: 2/2 票 開始執行..."
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t5_multisig_approved_notify_failed", service=service_name, error=str(e))
async def send_change_applied(
self,
@@ -2767,14 +2782,17 @@ class TelegramGateway:
timestamp: str,
) -> None:
"""T6: CHANGE_APPLIED — 手動變更記錄"""
- text = (
- "📝 [變更記錄] 手動操作已記錄\n"
- "━━━━━━━━━━━━━━━━━\n"
- f"操作者: {html.escape(operator)}\n"
- f"動作: {html.escape(action_description)}\n"
- f"時間: {html.escape(timestamp)}"
- )
- await self.send_notification(text)
+ try:
+ text = (
+ "📝 [變更記錄] 手動操作已記錄\n"
+ "━━━━━━━━━━━━━━━━━\n"
+ f"操作者: {html.escape(operator)}\n"
+ f"動作: {html.escape(action_description)}\n"
+ f"時間: {html.escape(timestamp)}"
+ )
+ await self.send_notification(text)
+ except Exception as e:
+ logger.error("t6_change_applied_notify_failed", operator=operator, error=str(e))
async def send_notification(
self,
diff --git a/apps/api/src/services/velero_client.py b/apps/api/src/services/velero_client.py
index 7ad411d2..4fb91123 100644
--- a/apps/api/src/services/velero_client.py
+++ b/apps/api/src/services/velero_client.py
@@ -8,11 +8,14 @@ from __future__ import annotations
import asyncio
import json
-import logging
import time
-from datetime import UTC, datetime
+from datetime import datetime
-logger = logging.getLogger(__name__)
+import structlog
+
+from src.utils.timezone import now_taipei
+
+logger = structlog.get_logger(__name__)
_VELERO_NAMESPACE = "velero"
_KUBECTL_TIMEOUT = 30 # 秒
@@ -40,7 +43,7 @@ class VeleroClient:
data = json.loads(result)
items = data.get("items", [])
if not items:
- logger.warning("Velero: 找不到任何 Completed 備份")
+ logger.warning("velero_no_completed_backups")
return 999.0
latest = max(
@@ -52,15 +55,15 @@ class VeleroClient:
return 999.0
completed_at = datetime.fromisoformat(completion_ts.replace("Z", "+00:00"))
- age = (datetime.now(UTC) - completed_at).total_seconds() / 3600
- logger.info(f"Velero 最近備份: {completion_ts},距今 {age:.1f} 小時")
+ age = (now_taipei() - completed_at).total_seconds() / 3600
+ logger.info("velero_backup_age_checked", completion_ts=completion_ts, age_hours=round(age, 1))
return age
except asyncio.TimeoutError:
- logger.error("Velero kubectl 查詢超時")
+ logger.error("velero_kubectl_timeout")
return 999.0
except Exception as e:
- logger.error(f"Velero 查詢失敗: {e}")
+ logger.error("velero_query_failed", error=str(e))
return 999.0
async def trigger_emergency_backup(self, backup_name: str | None = None) -> bool:
@@ -68,21 +71,38 @@ class VeleroClient:
觸發緊急備份(非同步,不等待完成)
返回 True 表示指令已成功發送
"""
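+ # S3-3 fix: create the Backup CR via `kubectl apply` (kubectl has no `create backup` subcommand). NOTE(review): `name` is interpolated into the YAML unquoted; validate caller-supplied backup_name.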
+ # (2026-04-08 審查修正 Claude Sonnet 4.6 Asia/Taipei)
name = backup_name or f"emergency-{int(time.time())}"
+ manifest = (
+ f"apiVersion: velero.io/v1\n"
+ f"kind: Backup\n"
+ f"metadata:\n"
+ f" name: {name}\n"
+ f" namespace: {_VELERO_NAMESPACE}\n"
+ f"spec:\n"
+ f" includedNamespaces:\n"
+ f" - awoooi-prod\n"
+ f" ttl: 720h0m0s\n"
+ )
try:
- await asyncio.wait_for(
- self._run_kubectl([
- "create", "backup", name,
- "-n", _VELERO_NAMESPACE,
- "--include-namespaces", "awoooi-prod",
- "--wait=false",
- ]),
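+ # kubectl apply -f - (manifest fed via stdin). NOTE(review): wait_for below only bounds process spawn; proc.communicate() runs without a timeout.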
+ proc = await asyncio.wait_for(
+ asyncio.create_subprocess_exec(
+ "kubectl", "apply", "-f", "-",
+ stdin=asyncio.subprocess.PIPE,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ ),
timeout=_KUBECTL_TIMEOUT,
)
- logger.info(f"Velero 緊急備份已啟動: {name}")
+ stdout, stderr = await proc.communicate(input=manifest.encode())
+ if proc.returncode != 0:
+ raise RuntimeError(f"kubectl apply 失敗: {stderr.decode()}")
+ logger.info("velero_emergency_backup_triggered", backup_name=name)
return True
except Exception as e:
- logger.error(f"Velero 緊急備份失敗: {e}")
+ logger.error("velero_emergency_backup_failed", backup_name=name, error=str(e))
return False
async def _run_kubectl(self, args: list[str]) -> str: