fix(drift): dedupe blocked auto-adopt escalations
All checks were successful
Code Review / ai-code-review (push) Successful in 9s
CD Pipeline / tests (push) Successful in 1m4s
CD Pipeline / build-and-deploy (push) Successful in 3m41s
CD Pipeline / post-deploy-checks (push) Successful in 1m39s

This commit is contained in:
Your Name
2026-05-19 00:13:41 +08:00
parent fb9b0b3b7c
commit 0367dde686
3 changed files with 46 additions and 6 deletions

View File

@@ -277,6 +277,12 @@ class DriftAdoptService:
# Step 3: 找出受影響的 YAML 檔並 commit 更新
committed_files = await self._commit_drift_yaml(client, headers, branch_name, report)
if not committed_files:
return {
"success": False,
"message": "無直接匹配的 YAML 檔,未建立零 diff 承認 PR",
"pr_url": None,
}
# Step 4: 建立 PR
pr_url = await self._create_pr(
@@ -292,7 +298,7 @@ class DriftAdoptService:
logger.info("drift_adopt_pr_created", report_id=report.report_id, pr_url=pr_url)
return {
"success": True,
"message": f"PR 已建立,請 SRE review 後 merge",
"message": "PR 已建立,請 SRE review 後 merge",
"pr_url": pr_url,
}
@@ -345,7 +351,9 @@ class DriftAdoptService:
if not item.is_allowlisted
}
for yaml_file in sorted(self._k8s_dir.glob("*.yaml")):
for yaml_file in sorted(self._k8s_dir.rglob("*.yaml")):
if not yaml_file.is_file():
continue
# 判斷此 YAML 是否與漂移相關
file_stem = yaml_file.stem.lower()
if not any(kind in file_stem for kind in affected_kinds):

View File

@@ -16,6 +16,24 @@ from src.core.redis_client import get_redis
logger = structlog.get_logger(__name__)
def _drift_emergency_fingerprint(report: Any) -> str:
    """Build the fingerprint used to dedupe drift emergency escalations.

    The report's ``namespace`` (coerced to ``str``) and ``items`` (coerced to
    ``list``) are handed to ``build_drift_fingerprint``; missing or falsy
    attributes become ``""`` and ``[]`` respectively.

    This helper is best-effort: if the import or the fingerprint computation
    raises, a warning is logged and the report's ``report_id`` is returned as
    a string instead, or ``"unknown"`` when that is missing or falsy.

    Args:
        report: Drift report object; only ``namespace``, ``items`` and
            ``report_id`` are read, all via ``getattr`` with defaults.

    Returns:
        The fingerprint string, or the fallback identifier described above.
    """
    try:
        # Imported inside the function, as in the original implementation.
        # NOTE(review): presumably to avoid an import cycle — confirm.
        from src.services.drift_repeat_state import build_drift_fingerprint

        namespace = str(getattr(report, "namespace", "") or "")
        drift_items = list(getattr(report, "items", []) or [])
        return build_drift_fingerprint(namespace, drift_items)
    except Exception as exc:
        # Never let fingerprinting failures propagate to the caller; fall
        # back to the report id instead.
        report_id = getattr(report, "report_id", None)
        logger.warning(
            "drift_emergency_fingerprint_failed",
            report_id=report_id,
            error=str(exc),
        )
        return str(report_id or "unknown")
async def escalate_auto_repair_unavailable(
*,
incident_id: str,
@@ -120,9 +138,14 @@ async def escalate_drift_auto_adopt_blocked(
) -> None:
"""Notify the emergency channel when drift cannot be auto-adopted safely."""
dedup_key = f"drift:auto_adopt_emergency:{report.report_id}"
if not await _dedup_first_send(dedup_key, ttl=3600, event="drift"):
logger.info("drift_emergency_escalation_dedup_skipped", report_id=report.report_id)
fingerprint = _drift_emergency_fingerprint(report)
dedup_key = f"drift:auto_adopt_emergency:fp:{fingerprint}"
if not await _dedup_first_send(dedup_key, ttl=86400, event="drift"):
logger.info(
"drift_emergency_escalation_dedup_skipped",
report_id=report.report_id,
fingerprint=fingerprint,
)
return
try:
@@ -150,7 +173,8 @@ async def escalate_drift_auto_adopt_blocked(
current_impact=(
f"namespace={report.namespace} high={report.high_count} "
f"medium={report.medium_count} actionable={actionable_count} "
f"intent={intent} confidence={confidence:.0%} risk={risk}"
f"intent={intent} confidence={confidence:.0%} risk={risk} "
f"fingerprint={fingerprint}"
),
group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
)
@@ -169,6 +193,7 @@ async def escalate_drift_auto_adopt_blocked(
"intent": intent,
"confidence": confidence,
"risk": risk,
"fingerprint": fingerprint,
},
)
try:
@@ -198,6 +223,7 @@ async def escalate_drift_auto_adopt_blocked(
high=report.high_count,
medium=report.medium_count,
actionable=actionable_count,
fingerprint=fingerprint,
)
except Exception as exc:
logger.warning(

View File

@@ -10,8 +10,10 @@ async def test_drift_emergency_escalation_writes_aol_and_timeline(monkeypatch):
sent_cards = []
aol_calls = []
timeline_calls = []
dedup_calls = []
async def fake_dedup(*args, **kwargs):
dedup_calls.append((args, kwargs))
return True
class FakeGateway:
@@ -65,7 +67,11 @@ async def test_drift_emergency_escalation_writes_aol_and_timeline(monkeypatch):
)
assert sent_cards and sent_cards[0]["incident_id"] == "drift-123"
assert "fingerprint=dfp_" in sent_cards[0]["current_impact"]
assert dedup_calls[0][0][0].startswith("drift:auto_adopt_emergency:fp:dfp_")
assert dedup_calls[0][1]["ttl"] == 86400
assert aol_calls and aol_calls[0][0][0] == "APPROVAL_ESCALATED"
assert aol_calls[0][1]["actor"] == "drift_auto_adopt"
assert aol_calls[0][1]["context"]["intent"] == "emergency_hotfix"
assert aol_calls[0][1]["context"]["fingerprint"].startswith("dfp_")
assert timeline_calls and timeline_calls[0]["actor_role"] == "emergency_intervention"