From 1d285dd9d4a1ccae7f5b49e9843f9dbaf786d55e Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 19 May 2026 12:18:16 +0800 Subject: [PATCH] fix(api): suppress batch reconcile postmortems --- .../src/jobs/incident_lifecycle_reconciler.py | 1 + apps/api/src/services/incident_service.py | 86 +++++++++++-------- .../test_incident_lifecycle_reconciler.py | 2 + 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/apps/api/src/jobs/incident_lifecycle_reconciler.py b/apps/api/src/jobs/incident_lifecycle_reconciler.py index 19c15da3..a45e8a8c 100644 --- a/apps/api/src/jobs/incident_lifecycle_reconciler.py +++ b/apps/api/src/jobs/incident_lifecycle_reconciler.py @@ -76,6 +76,7 @@ async def reconcile_stuck_incidents(limit: int = BATCH_LIMIT) -> tuple[int, int] result = await incident_service.resolve_incident( candidate.incident_id, resolution_type=candidate.resolution_type, + emit_postmortem=False, ) if result is not None: resolved += 1 diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index a16cacae..8d3aabc4 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -1143,6 +1143,7 @@ class IncidentService: self, incident_id: str, resolution_type: str = "manual", + emit_postmortem: bool = True, ) -> Incident | None: """ 將 Incident 狀態更新為 RESOLVED @@ -1152,6 +1153,8 @@ class IncidentService: Args: incident_id: 事件 ID resolution_type: "manual"(預設)| "timeout"(Approval 48h 逾期自動結案) + emit_postmortem: 是否送出使用者可見 Postmortem。批次歷史 reconciler + 會關閉此開關,避免一次補關大量舊 incident 時洗版 Telegram。 ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6: 新增 resolution_type="timeout" 路徑 — Approval EXPIRED 時由 @@ -1323,47 +1326,54 @@ class IncidentService: except Exception as _disp_e: logger.warning("disposition_manual_resolve_failed", error=str(_disp_e)) - # MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝 - # Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成 - # 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑 - try: - import asyncio + if emit_postmortem: + # MASTER Task 4.2 (2026-04-14 Claude Sonnet 4.6): Postmortem 自動組裝 + # Incident duration > POSTMORTEM_MIN_DURATION_MINUTES(10min) 時自動生成 + # 孤兒 report_generation_service.trigger_postmortem 本次接上 resolve 路徑 + try: + import asyncio - from src.services.report_generation_service import ( - get_report_generation_service, - ) - - alertname = ( - incident.signals[0].labels.get("alertname", "UnknownAlert") - if incident.signals else "UnknownAlert" - ) - title = f"{alertname} — {', '.join(incident.affected_services or ['N/A'])}" - root_cause = None - resolution_action = None - ai_provider = None - auto_repaired = False - if incident.decision_chain: - root_cause = incident.decision_chain.hypothesis - ai_provider = incident.decision_chain.model_used - if incident.outcome: - resolution_action = (incident.outcome.learning_notes or None) - auto_repaired = bool(incident.outcome.execution_success) - - asyncio.create_task( - get_report_generation_service().trigger_postmortem( - incident_id=incident.incident_id, - title=title, - created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at, - resolved_at=incident.resolved_at, - root_cause=root_cause, - resolution_action=resolution_action, - ai_provider=ai_provider, - auto_repaired=auto_repaired, + from src.services.report_generation_service import ( + get_report_generation_service, ) + + alertname = ( + incident.signals[0].labels.get("alertname", "UnknownAlert") + if incident.signals else "UnknownAlert" + ) + title = f"{alertname} — {', '.join(incident.affected_services or ['N/A'])}" + root_cause = None + resolution_action = None + ai_provider = None + auto_repaired = False + if incident.decision_chain: + root_cause = incident.decision_chain.hypothesis + ai_provider = incident.decision_chain.model_used + if incident.outcome: + resolution_action = (incident.outcome.learning_notes or None) + auto_repaired = bool(incident.outcome.execution_success) + + asyncio.create_task( + get_report_generation_service().trigger_postmortem( + incident_id=incident.incident_id, + title=title, + created_at=incident.signals[0].fired_at if incident.signals else incident.resolved_at, + resolved_at=incident.resolved_at, + root_cause=root_cause, + resolution_action=resolution_action, + ai_provider=ai_provider, + auto_repaired=auto_repaired, + ) + ) + except Exception as _pm_e: + logger.exception("postmortem_trigger_failed", + incident_id=incident_id, error=str(_pm_e)) + else: + logger.info( + "postmortem_suppressed_for_batch_reconcile", + incident_id=incident_id, + resolution_type=resolution_type, ) - except Exception as _pm_e: - logger.exception("postmortem_trigger_failed", - incident_id=incident_id, error=str(_pm_e)) return incident diff --git a/apps/api/tests/test_incident_lifecycle_reconciler.py b/apps/api/tests/test_incident_lifecycle_reconciler.py index 638df6eb..68aaa2b8 100644 --- a/apps/api/tests/test_incident_lifecycle_reconciler.py +++ b/apps/api/tests/test_incident_lifecycle_reconciler.py @@ -43,8 +43,10 @@ async def test_reconcile_stuck_incidents_resolves_strong_evidence(monkeypatch): ) assert service.resolve_incident.await_args_list[0].kwargs == { "resolution_type": "auto_repair", + "emit_postmortem": False, } assert service.resolve_incident.await_args_list[1].args == ("INC-TIMEOUT",) assert service.resolve_incident.await_args_list[1].kwargs == { "resolution_type": "timeout", + "emit_postmortem": False, }