fix(api): suppress batch reconcile postmortems
Some checks failed
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m18s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-19 12:18:16 +08:00
parent f9d53469f9
commit 1d285dd9d4
3 changed files with 51 additions and 38 deletions

View File

@@ -76,6 +76,7 @@ async def reconcile_stuck_incidents(limit: int = BATCH_LIMIT) -> tuple[int, int]
result = await incident_service.resolve_incident(
candidate.incident_id,
resolution_type=candidate.resolution_type,
emit_postmortem=False,
)
if result is not None:
resolved += 1

View File

@@ -1143,6 +1143,7 @@ class IncidentService:
self,
incident_id: str,
resolution_type: str = "manual",
emit_postmortem: bool = True,
) -> Incident | None:
"""
將 Incident 狀態更新為 RESOLVED
@@ -1152,6 +1153,8 @@ class IncidentService:
Args:
incident_id: 事件 ID
resolution_type: "manual"(預設)| "timeout"(Approval 48h 逾期自動結案)
emit_postmortem: 是否送出使用者可見 Postmortem。批次歷史 reconciler
會關閉此開關,避免一次補關大量舊 incident 時洗版 Telegram。
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
新增 resolution_type="timeout" 路徑 — Approval EXPIRED 時由
@@ -1323,47 +1326,54 @@ class IncidentService:
except Exception as _disp_e:
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
# MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝
# Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成
# 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑
try:
import asyncio
if emit_postmortem:
# MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝
# Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成
# 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑
try:
import asyncio
from src.services.report_generation_service import (
get_report_generation_service,
)
alertname = (
incident.signals[0].labels.get("alertname", "UnknownAlert")
if incident.signals else "UnknownAlert"
)
title = f"{alertname}{', '.join(incident.affected_services or ['N/A'])}"
root_cause = None
resolution_action = None
ai_provider = None
auto_repaired = False
if incident.decision_chain:
root_cause = incident.decision_chain.hypothesis
ai_provider = incident.decision_chain.model_used
if incident.outcome:
resolution_action = (incident.outcome.learning_notes or None)
auto_repaired = bool(incident.outcome.execution_success)
asyncio.create_task(
get_report_generation_service().trigger_postmortem(
incident_id=incident.incident_id,
title=title,
created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at,
resolved_at=incident.resolved_at,
root_cause=root_cause,
resolution_action=resolution_action,
ai_provider=ai_provider,
auto_repaired=auto_repaired,
from src.services.report_generation_service import (
get_report_generation_service,
)
alertname = (
incident.signals[0].labels.get("alertname", "UnknownAlert")
if incident.signals else "UnknownAlert"
)
title = f"{alertname}{', '.join(incident.affected_services or ['N/A'])}"
root_cause = None
resolution_action = None
ai_provider = None
auto_repaired = False
if incident.decision_chain:
root_cause = incident.decision_chain.hypothesis
ai_provider = incident.decision_chain.model_used
if incident.outcome:
resolution_action = (incident.outcome.learning_notes or None)
auto_repaired = bool(incident.outcome.execution_success)
asyncio.create_task(
get_report_generation_service().trigger_postmortem(
incident_id=incident.incident_id,
title=title,
created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at,
resolved_at=incident.resolved_at,
root_cause=root_cause,
resolution_action=resolution_action,
ai_provider=ai_provider,
auto_repaired=auto_repaired,
)
)
except Exception as _pm_e:
logger.exception("postmortem_trigger_failed",
incident_id=incident_id, error=str(_pm_e))
else:
logger.info(
"postmortem_suppressed_for_batch_reconcile",
incident_id=incident_id,
resolution_type=resolution_type,
)
except Exception as _pm_e:
logger.exception("postmortem_trigger_failed",
incident_id=incident_id, error=str(_pm_e))
return incident

View File

@@ -43,8 +43,10 @@ async def test_reconcile_stuck_incidents_resolves_strong_evidence(monkeypatch):
)
assert service.resolve_incident.await_args_list[0].kwargs == {
"resolution_type": "auto_repair",
"emit_postmortem": False,
}
assert service.resolve_incident.await_args_list[1].args == ("INC-TIMEOUT",)
assert service.resolve_incident.await_args_list[1].kwargs == {
"resolution_type": "timeout",
"emit_postmortem": False,
}