From cf8bb364a39655f47a2fb2e8862ef36d7ed7797a Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 May 2026 08:54:45 +0800 Subject: [PATCH] feat(awooop): surface source evidence review work items --- apps/api/src/api/v1/platform/events.py | 3 + .../services/channel_event_dossier_service.py | 117 +++++++++++++++--- .../test_channel_event_dossier_service.py | 101 ++++++++++++++- apps/web/messages/en.json | 13 +- apps/web/messages/zh-TW.json | 13 +- .../web/src/app/[locale]/awooop/runs/page.tsx | 18 ++- .../app/[locale]/awooop/work-items/page.tsx | 30 +++++ docs/LOGBOOK.md | 49 ++++++++ 8 files changed, 324 insertions(+), 20 deletions(-) diff --git a/apps/api/src/api/v1/platform/events.py b/apps/api/src/api/v1/platform/events.py index 75b1d23d..d956cdf0 100644 --- a/apps/api/src/api/v1/platform/events.py +++ b/apps/api/src/api/v1/platform/events.py @@ -172,6 +172,7 @@ class ChannelEventRecurrenceSummary(BaseModel): manual_gate_group_total: int = 0 automation_gap_group_total: int = 0 failed_repair_group_total: int = 0 + source_correlation_review_group_total: int = 0 latest_received_at: datetime | None @@ -183,6 +184,7 @@ class ChannelEventRecurrenceItem(BaseModel): namespace: str | None target_resource: str | None fingerprint: str | None + latest_stage: str | None = None latest_event_id: UUID | None latest_provider_event_id: str | None latest_content_preview: str | None @@ -201,6 +203,7 @@ class ChannelEventRecurrenceItem(BaseModel): sentry_ref_total: int signoz_ref_total: int alert_ref_total: int + stage_counts: dict[str, int] = Field(default_factory=dict) run_state_counts: dict[str, int] first_received_at: datetime | None latest_received_at: datetime | None diff --git a/apps/api/src/services/channel_event_dossier_service.py b/apps/api/src/services/channel_event_dossier_service.py index d05c0559..982439f2 100644 --- a/apps/api/src/services/channel_event_dossier_service.py +++ b/apps/api/src/services/channel_event_dossier_service.py @@ -23,6 +23,9 @@ _MAX_DOSSIER_EVENTS = 50 _MAX_COVERAGE_EVENTS = 200 _MAX_RECURRENCE_EVENTS = 300 _MAX_REPAIR_INCIDENTS = 200 +_SOURCE_CORRELATION_REVIEW_PROVIDERS = {"sentry", "signoz"} +_SOURCE_CORRELATION_REVIEW_EXCLUDED_STAGES = {"heartbeat"} +_SOURCE_CORRELATION_WORK_ITEM_ID_MAX = 180 _INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b") RecurrenceWorkItemMode = Literal["auto", "ticket", "reverify", "approval_review", "observe"] RecurrenceWorkItemHandoffKind = Literal["ticket_proposal", "manual_review"] @@ -53,6 +56,34 @@ def _ref_count(source_refs: dict[str, Any], key: str) -> int: return 1 if value else 0 +def _source_correlation_ref_total(group: dict[str, Any]) -> int: + return int(group.get("sentry_ref_total") or 0) + int( + group.get("signoz_ref_total") or 0 + ) + + +def _needs_source_correlation_review( + group: dict[str, Any], + latest_incident_id: str | None, +) -> bool: + if latest_incident_id: + return False + provider = str(group.get("provider") or "").lower() + stage = str(group.get("latest_stage") or "").lower() + if provider not in _SOURCE_CORRELATION_REVIEW_PROVIDERS: + return False + if stage in _SOURCE_CORRELATION_REVIEW_EXCLUDED_STAGES: + return False + return _source_correlation_ref_total(group) > 0 + + +def _source_correlation_work_item_id(group: dict[str, Any]) -> str: + source_id = str( + group.get("latest_provider_event_id") or group.get("recurrence_key") or "unknown" + ).strip() + return f"source-evidence:{source_id}"[:_SOURCE_CORRELATION_WORK_ITEM_ID_MAX] + + def _append_unique(values: list[str], candidate: Any) -> None: text_value = str(candidate or "").strip() if text_value and text_value not in values: @@ -132,6 +163,7 @@ def build_dossier_recurrence( "namespace": event.get("namespace"), "target_resource": event.get("target_resource"), "fingerprint": event.get("fingerprint"), + "latest_stage": event.get("stage"), "latest_event_id": event.get("event_id"), "latest_provider_event_id": event.get("provider_event_id"), "latest_content_preview": event.get("content_preview"), @@ -148,6 +180,7 @@ def build_dossier_recurrence( "sentry_ref_total": 0, "signoz_ref_total": 0, "alert_ref_total": 0, + "stage_counts": {}, "run_state_counts": {}, "first_received_at": received_at, "latest_received_at": received_at, @@ -162,6 +195,10 @@ def build_dossier_recurrence( if event.get("is_duplicate"): group["duplicate_total"] += 1 + stage = str(event.get("stage") or "received") + stage_counts = group["stage_counts"] + stage_counts[stage] = int(stage_counts.get(stage, 0)) + 1 + for incident_id in incident_ids: _append_unique(group["incident_ids"], incident_id) @@ -189,6 +226,10 @@ def build_dossier_recurrence( or str(received_at) > str(group.get("latest_received_at")) ): group["latest_received_at"] = received_at + group["latest_event_id"] = event.get("event_id") + group["latest_provider_event_id"] = event.get("provider_event_id") + group["latest_content_preview"] = event.get("content_preview") + group["latest_stage"] = event.get("stage") group["latest_incident_id"] = ( incident_ids[0] if incident_ids else group.get("latest_incident_id") ) @@ -260,6 +301,12 @@ def build_dossier_recurrence( if _as_dict(item.get("repair_summary")).get("status") == "auto_repair_failed" ), + "source_correlation_review_group_total": sum( + 1 + for item in items + if _as_dict(item.get("repair_summary")).get("status") + == "source_correlation_review" + ), "latest_received_at": latest_received_at, }, "items": items, @@ -304,6 +351,8 @@ def _work_item_status(repair_status: str) -> str: def _work_item_kind(repair_status: str, auto_repair_id: Any) -> str: + if repair_status == "source_correlation_review": + return "source_correlation_review" if auto_repair_id: return "verification" if repair_status == "run_completed_no_repair": @@ -317,6 +366,7 @@ def _work_item_kind(repair_status: str, auto_repair_id: Any) -> str: def _work_item_next_step(repair_status: str) -> str: return { + "source_correlation_review": "review_provider_source_match", "auto_repair_succeeded_unverified": "run_post_verification", "auto_repair_failed": "triage_failed_repair", "auto_repair_recorded": "review_repair_record", @@ -329,6 +379,7 @@ def _work_item_next_step(repair_status: str) -> str: def _work_item_reason(repair_status: str) -> str: return { + "source_correlation_review": "provider_native_evidence_unlinked", "auto_repair_succeeded_unverified": "auto_repair_missing_verification", "auto_repair_failed": "auto_repair_failed", "auto_repair_recorded": "auto_repair_record_needs_review", @@ -359,6 +410,9 @@ def _attach_work_item_summary( latest_run_state=group.get("latest_run_state"), repair_summary=repair_summary, ) + if _needs_source_correlation_review(group, latest_incident_id): + status_value = "source_correlation_review" + if repair_summary: repair_payload = dict(repair_summary) repair_payload["status"] = status_value @@ -383,6 +437,8 @@ def _attach_work_item_summary( if auto_repair_id else f"incident:{latest_incident_id}" ) + elif status_value == "source_correlation_review" and work_status != "none": + work_item_id = _source_correlation_work_item_id(group) group["latest_incident_id"] = latest_incident_id group["repair_summary"] = repair_payload @@ -408,6 +464,7 @@ def _recurrence_work_item_target(item: dict[str, Any]) -> dict[str, Any]: "namespace": item.get("namespace"), "target_resource": item.get("target_resource"), "fingerprint": item.get("fingerprint"), + "latest_stage": item.get("latest_stage"), "latest_event_id": item.get("latest_event_id"), "latest_provider_event_id": item.get("latest_provider_event_id"), "latest_run_id": item.get("latest_run_id"), @@ -445,6 +502,11 @@ def _recurrence_work_item_checks( ) -> list[dict[str, Any]]: repair_summary = _as_dict(item.get("repair_summary")) source_ref_total = int(item.get("source_ref_total") or 0) + is_source_review = work_item.get("kind") == "source_correlation_review" + evidence_linked = bool(item.get("latest_provider_event_id")) and source_ref_total > 0 + incident_or_source_linked = bool(work_item.get("incident_id")) or ( + is_source_review and evidence_linked + ) return [ { "name": "work_item_open", @@ -452,9 +514,13 @@ def _recurrence_work_item_checks( "detail": str(work_item.get("status") or "unknown"), }, { - "name": "incident_linked", - "passed": bool(work_item.get("incident_id")), - "detail": str(work_item.get("incident_id") or "missing incident_id"), + "name": "incident_or_source_evidence_linked", + "passed": incident_or_source_linked, + "detail": str( + work_item.get("incident_id") + or item.get("latest_provider_event_id") + or "missing incident_id/source evidence" + ), }, { "name": "known_next_step", @@ -521,19 +587,35 @@ def _ticket_preview(item: dict[str, Any], work_item: dict[str, Any]) -> dict[str alertname = str(item.get("alertname") or item.get("provider") or "recurrence") incident_id = str(work_item.get("incident_id") or item.get("latest_incident_id") or "") kind = str(work_item.get("kind") or "recurrence") - title = f"[AwoooP] {alertname} recurrence work item: {incident_id or 'unlinked'}" - labels = ["awooop", "recurrence", kind] - body_lines = [ - f"Incident: {incident_id or '--'}", - f"Alert: {alertname}", - f"Namespace/Target: {item.get('namespace') or '--'} / {item.get('target_resource') or '--'}", - f"Occurrences: {item.get('occurrence_total') or 0}", - f"Duplicates: {item.get('duplicate_total') or 0}", - f"Latest run: {item.get('latest_run_id') or '--'} ({item.get('latest_run_state') or '--'})", - f"Repair status: {_as_dict(item.get('repair_summary')).get('status') or '--'}", - f"Next step: {work_item.get('next_step') or '--'}", - "Writes: none in preview/dry-run; ticket creation requires a later explicit apply path.", - ] + if kind == "source_correlation_review": + title = f"[AwoooP] Source evidence review: {alertname}" + labels = ["awooop", "source-correlation", "review", str(item.get("provider") or "source")] + body_lines = [ + f"Provider event: {item.get('latest_provider_event_id') or '--'}", + f"Stage: {item.get('latest_stage') or '--'}", + f"Provider: {item.get('provider') or '--'}", + f"Alert: {alertname}", + f"Namespace/Target: {item.get('namespace') or '--'} / {item.get('target_resource') or '--'}", + f"Source refs: {item.get('source_ref_total') or 0}", + f"Sentry refs: {item.get('sentry_ref_total') or 0}", + f"SignOz refs: {item.get('signoz_ref_total') or 0}", + f"Next step: {work_item.get('next_step') or '--'}", + "Writes: none in preview/dry-run; source matching requires a later explicit review/apply path.", + ] + else: + title = f"[AwoooP] {alertname} recurrence work item: {incident_id or 'unlinked'}" + labels = ["awooop", "recurrence", kind] + body_lines = [ + f"Incident: {incident_id or '--'}", + f"Alert: {alertname}", + f"Namespace/Target: {item.get('namespace') or '--'} / {item.get('target_resource') or '--'}", + f"Occurrences: {item.get('occurrence_total') or 0}", + f"Duplicates: {item.get('duplicate_total') or 0}", + f"Latest run: {item.get('latest_run_id') or '--'} ({item.get('latest_run_state') or '--'})", + f"Repair status: {_as_dict(item.get('repair_summary')).get('status') or '--'}", + f"Next step: {work_item.get('next_step') or '--'}", + "Writes: none in preview/dry-run; ticket creation requires a later explicit apply path.", + ] return { "would_create": False, "title": title[:180], @@ -556,6 +638,9 @@ def _recurrence_current_state_summary( "duplicate_total": int(item.get("duplicate_total") or 0), "linked_run_total": int(item.get("linked_run_total") or 0), "run_state_counts": item.get("run_state_counts") or {}, + "stage_counts": item.get("stage_counts") or {}, + "latest_stage": item.get("latest_stage"), + "latest_provider_event_id": item.get("latest_provider_event_id"), "latest_run_state": item.get("latest_run_state"), "latest_run_id": item.get("latest_run_id"), "repair_status": repair_summary.get("status"), diff --git a/apps/api/tests/test_channel_event_dossier_service.py b/apps/api/tests/test_channel_event_dossier_service.py index 4f0326bd..6e74de29 100644 --- a/apps/api/tests/test_channel_event_dossier_service.py +++ b/apps/api/tests/test_channel_event_dossier_service.py @@ -263,10 +263,11 @@ def test_build_dossier_recurrence_groups_events_and_run_state() -> None: assert recurrence["summary"]["linked_run_total"] == 2 assert recurrence["summary"]["unlinked_event_total"] == 1 assert recurrence["summary"]["auto_repair_linked_total"] == 1 - assert recurrence["summary"]["open_work_item_group_total"] == 1 + assert recurrence["summary"]["open_work_item_group_total"] == 2 assert recurrence["summary"]["verified_repair_group_total"] == 0 assert recurrence["summary"]["automation_gap_group_total"] == 0 assert recurrence["summary"]["failed_repair_group_total"] == 1 + assert recurrence["summary"]["source_correlation_review_group_total"] == 1 host_group = recurrence["items"][0] assert host_group["recurrence_key"] == "fingerprint:fp-host-disk" @@ -276,6 +277,8 @@ def test_build_dossier_recurrence_groups_events_and_run_state() -> None: assert host_group["latest_run_state"] == "waiting_approval" assert host_group["latest_incident_id"] == "INC-20260513-ABCD" assert host_group["incident_ids"] == ["INC-20260513-ABCD"] + assert host_group["latest_stage"] == "received" + assert host_group["stage_counts"] == {"received": 2} assert host_group["run_state_counts"] == {"waiting_approval": 1, "completed": 1} assert host_group["alert_ref_total"] == 2 assert host_group["repair_summary"]["status"] == "auto_repair_failed" @@ -292,6 +295,96 @@ def test_build_dossier_recurrence_groups_events_and_run_state() -> None: "needs_human": True, } + source_group = recurrence["items"][1] + assert source_group["provider"] == "sentry" + assert source_group["latest_stage"] == "received" + assert source_group["stage_counts"] == {"received": 1} + assert source_group["repair_summary"]["status"] == "source_correlation_review" + assert source_group["work_item"] == { + "schema_version": "awooop_recurrence_work_item_link_v1", + "work_item_id": "source-evidence:sentry:received:issue-1", + "incident_id": None, + "auto_repair_id": None, + "status": "open", + "kind": "source_correlation_review", + "next_step": "review_provider_source_match", + "reason": "provider_native_evidence_unlinked", + "needs_human": True, + } + + +def test_build_recurrence_work_item_preview_allows_source_correlation_review() -> None: + recurrence = build_dossier_recurrence( + [ + { + "event_id": "event-1", + "project_id": "awoooi", + "channel_type": "internal", + "provider_event_id": "signoz:upstream_canary:canary-1", + "content_hash": "a" * 64, + "content_preview": "SignOz upstream canary", + "content_redacted": "SignOz upstream canary", + "redaction_version": "audit_sink_v1", + "source_envelope": { + "provider": "signoz", + "stage": "upstream_canary", + "source_refs": { + "signoz_alerts": ["alert-1"], + "alert_ids": ["signoz:upstream_canary:canary-1"], + }, + "log_correlation": { + "alertname": "Source Provider Upstream Canary", + "severity": "info", + "namespace": "observability", + "target_resource": "signoz", + "fingerprint": "fp-signoz-canary", + }, + }, + "is_duplicate": False, + "provider_ts": None, + "received_at": "2026-05-20T13:01:00", + "run_id": None, + "run_state": None, + "run_agent_id": None, + } + ], + project_id="awoooi", + limit=20, + ) + + item = recurrence["items"][0] + work_item_id = "source-evidence:signoz:upstream_canary:canary-1" + assert recurrence["summary"]["source_correlation_review_group_total"] == 1 + assert recurrence["summary"]["open_work_item_group_total"] == 1 + assert item["latest_stage"] == "upstream_canary" + assert item["repair_summary"]["status"] == "source_correlation_review" + assert item["work_item"]["work_item_id"] == work_item_id + assert item["work_item"]["kind"] == "source_correlation_review" + assert item["work_item"]["next_step"] == "review_provider_source_match" + + preview = build_recurrence_work_item_preview( + recurrence, + work_item_id=work_item_id, + ) + + assert preview["mode"] == "observe" + assert preview["allowed"] is True + assert preview["plan"]["target_action"] == "review_provider_source_match" + assert preview["plan"]["target"]["latest_stage"] == "upstream_canary" + + dry_run = build_recurrence_work_item_dry_run( + recurrence, + work_item_id=work_item_id, + ) + + assert dry_run["verification_result_preview"] == "observe_only" + assert dry_run["ticket_preview"]["would_create"] is False + assert "Source evidence review" in dry_run["ticket_preview"]["title"] + assert dry_run["current_state_summary"]["latest_stage"] == "upstream_canary" + assert dry_run["current_state_summary"]["latest_provider_event_id"] == ( + "signoz:upstream_canary:canary-1" + ) + def test_build_dossier_recurrence_opens_work_item_for_completed_run_without_repair() -> None: recurrence = build_dossier_recurrence( @@ -556,6 +649,7 @@ def test_recurrence_response_model_preserves_repair_work_item_fields() -> None: "manual_gate_group_total": 0, "automation_gap_group_total": 0, "failed_repair_group_total": 0, + "source_correlation_review_group_total": 0, "latest_received_at": "2026-05-13T13:47:00", }, "items": [ @@ -567,6 +661,7 @@ def test_recurrence_response_model_preserves_repair_work_item_fields() -> None: "namespace": "node", "target_resource": "host-110", "fingerprint": "fp-host-disk", + "latest_stage": "incident_linked", "latest_event_id": "11111111-1111-4111-8111-111111111111", "latest_provider_event_id": "alertmanager:received:1", "latest_content_preview": "Host disk pressure", @@ -591,6 +686,7 @@ def test_recurrence_response_model_preserves_repair_work_item_fields() -> None: "sentry_ref_total": 0, "signoz_ref_total": 0, "alert_ref_total": 1, + "stage_counts": {"incident_linked": 1}, "run_state_counts": {"completed": 1}, "first_received_at": "2026-05-13T13:47:00", "latest_received_at": "2026-05-13T13:47:00", @@ -601,7 +697,10 @@ def test_recurrence_response_model_preserves_repair_work_item_fields() -> None: payload = response.model_dump() assert payload["summary"]["auto_repair_linked_total"] == 1 + assert payload["summary"]["source_correlation_review_group_total"] == 0 assert payload["items"][0]["latest_incident_id"] == "INC-20260513-ABCD" + assert payload["items"][0]["latest_stage"] == "incident_linked" + assert payload["items"][0]["stage_counts"] == {"incident_linked": 1} assert payload["items"][0]["repair_summary"]["status"] == "auto_repair_verified" assert payload["items"][0]["work_item"]["status"] == "closed" diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 5f19d5de..1783763d 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1909,7 +1909,7 @@ "evidence": { "channelEvents": "Recent Alertmanager channel events: {count}", "autoRepair": "Verified auto-repairs: {verified}/{evaluated}", - "recurrenceWorkItems": "Recurring alert work: {open}; no repair: {gap}; failed repair: {failed}; manual gates: {manual}", + "recurrenceWorkItems": "Recurring alert work: {open}; no repair: {gap}; failed repair: {failed}; manual gates: {manual}; source review: {source}", "recurrenceLatest": "Latest: {alert} / {incident}", "recurrenceReason": "Reason: {reason}", "recurrenceEmpty": "No open recurring-alert work item in the recent window", @@ -2155,10 +2155,14 @@ "open": "Open {count}", "automationGap": "No repair {count}", "failed": "Failed {count}", + "sourceReview": "Source review {count}", "unavailable": "The recurrence API has not responded, so work item state cannot be claimed.", "empty": "No open recurring-alert work items in the recent window.", "occurrences": "{count}x", "incident": "Incident: {incident}", + "stage": "Stage: {stage}", + "sourceEvent": "Source event: {event}", + "sourceRefs": "Source refs: {refs} (Sentry {sentry} / SignOz {signoz})", "workItem": "Work item: {id}", "repair": "Repair status: {status}", "reason": "Reason: {reason}", @@ -2219,6 +2223,7 @@ "manual_gate": "Manual gate needed", "investigating": "Investigating", "run_completed_no_repair": "Run completed without repair", + "source_correlation_review": "Source evidence needs matching", "no_repair_record": "No repair record", "unknown": "Unknown" }, @@ -2229,6 +2234,7 @@ "approval_required": "Approval required", "run_still_investigating": "Run is still investigating", "completed_run_without_auto_repair": "Run completed without an auto-repair record", + "provider_native_evidence_unlinked": "Provider-native source evidence is stored but not matched to an Incident", "incident_without_repair_record": "Incident has no repair record", "none": "None", "unknown": "Unknown" @@ -2240,6 +2246,7 @@ "review_approval": "Review approval", "wait_for_run_completion": "Wait for Run completion", "create_repair_ticket": "Create repair ticket", + "review_provider_source_match": "Review source-to-Incident match", "triage_missing_repair_record": "Fill missing repair record", "none": "None" } @@ -2340,6 +2347,7 @@ "duplicates": "Duplicate events", "linkedRuns": "Linked Runs", "autoRepair": "Auto repair", + "sourceReview": "Source review", "openWorkItems": "Open work items" }, "details": { @@ -2348,6 +2356,7 @@ "unlinked": "{count} items not linked to a Run", "limit": "Latest {count} item window", "verifiedRepair": "{count} verified repair groups", + "sourceReview": "{count} Sentry / SignOz source groups need matching review", "manualGates": "{count} manual gates" }, "states": { @@ -2370,6 +2379,7 @@ "manual_gate": "Manual gate needed", "investigating": "Investigating", "run_completed_no_repair": "Run completed without repair", + "source_correlation_review": "Source evidence needs matching", "no_repair_record": "No repair record" }, "item": { @@ -2377,6 +2387,7 @@ "duplicates": "Duplicates {count}", "refs": "Refs {count}", "linkedRuns": "Runs {count}", + "stage": "Stage {stage}", "incident": "Incident {incidentId}", "repair": "Repair {status}", "openRun": "Open Run", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index f470960f..e0fe1c99 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1910,7 +1910,7 @@ "evidence": { "channelEvents": "最近 Alertmanager channel events:{count}", "autoRepair": "已驗證自動修復:{verified}/{evaluated}", - "recurrenceWorkItems": "重複告警待處理:{open};無修復:{gap};修復失敗:{failed};人工閘門:{manual}", + "recurrenceWorkItems": "重複告警待處理:{open};無修復:{gap};修復失敗:{failed};人工閘門:{manual};來源待審:{source}", "recurrenceLatest": "最新:{alert} / {incident}", "recurrenceReason": "原因:{reason}", "recurrenceEmpty": "近期重複告警尚無待處理工作項", @@ -2156,10 +2156,14 @@ "open": "待處理 {count}", "automationGap": "無修復 {count}", "failed": "修復失敗 {count}", + "sourceReview": "來源待審 {count}", "unavailable": "recurrence API 尚未回應,不能判定工作項狀態。", "empty": "近期重複告警沒有待處理工作項。", "occurrences": "{count} 次", "incident": "Incident:{incident}", + "stage": "階段:{stage}", + "sourceEvent": "來源事件:{event}", + "sourceRefs": "來源 refs:{refs}(Sentry {sentry} / SignOz {signoz})", "workItem": "Work item:{id}", "repair": "修復狀態:{status}", "reason": "原因:{reason}", @@ -2220,6 +2224,7 @@ "manual_gate": "需人工閘門", "investigating": "調查中", "run_completed_no_repair": "Run 完成無修復", + "source_correlation_review": "來源證據待配對", "no_repair_record": "無修復記錄", "unknown": "未知" }, @@ -2230,6 +2235,7 @@ "approval_required": "需要審批", "run_still_investigating": "Run 尚在調查", "completed_run_without_auto_repair": "Run 已完成但沒有自動修復紀錄", + "provider_native_evidence_unlinked": "Provider 原生來源已入庫,尚未配對 Incident", "incident_without_repair_record": "Incident 沒有修復紀錄", "none": "無", "unknown": "未知" @@ -2241,6 +2247,7 @@ "review_approval": "處理審批", "wait_for_run_completion": "等待 Run 完成", "create_repair_ticket": "建立修復 Ticket", + "review_provider_source_match": "審核來源與 Incident 配對", "triage_missing_repair_record": "補齊修復紀錄", "none": "無" } @@ -2341,6 +2348,7 @@ "duplicates": "重複事件", "linkedRuns": "已連 Run", "autoRepair": "自動修復", + "sourceReview": "來源待審", "openWorkItems": "待處理項" }, "details": { @@ -2349,6 +2357,7 @@ "unlinked": "{count} 筆尚未連 Run", "limit": "最近 {count} 筆視窗", "verifiedRepair": "{count} 組已驗證修復", + "sourceReview": "{count} 組 Sentry / SignOz 來源需人工配對", "manualGates": "{count} 組人工閘門" }, "states": { @@ -2371,6 +2380,7 @@ "manual_gate": "需人工閘門", "investigating": "調查中", "run_completed_no_repair": "Run 完成無修復", + "source_correlation_review": "來源證據待配對", "no_repair_record": "無修復記錄" }, "item": { @@ -2378,6 +2388,7 @@ "duplicates": "重複 {count}", "refs": "Refs {count}", "linkedRuns": "Run {count}", + "stage": "階段 {stage}", "incident": "Incident {incidentId}", "repair": "修復 {status}", "openRun": "開啟 Run", diff --git a/apps/web/src/app/[locale]/awooop/runs/page.tsx b/apps/web/src/app/[locale]/awooop/runs/page.tsx index 14ae6f6a..ec020e39 100644 --- a/apps/web/src/app/[locale]/awooop/runs/page.tsx +++ b/apps/web/src/app/[locale]/awooop/runs/page.tsx @@ -68,6 +68,7 @@ type RecurrenceRepairStatus = | "manual_gate" | "investigating" | "run_completed_no_repair" + | "source_correlation_review" | "no_repair_record"; interface RemediationSummary { @@ -203,6 +204,7 @@ interface EventRecurrenceSummary { manual_gate_group_total?: number; automation_gap_group_total?: number; failed_repair_group_total?: number; + source_correlation_review_group_total?: number; latest_received_at?: string | null; } @@ -246,6 +248,7 @@ interface EventRecurrenceItem { namespace?: string | null; target_resource?: string | null; fingerprint?: string | null; + latest_stage?: string | null; latest_event_id?: string | null; latest_provider_event_id?: string | null; latest_content_preview?: string | null; @@ -264,6 +267,7 @@ interface EventRecurrenceItem { sentry_ref_total: number; signoz_ref_total: number; alert_ref_total: number; + stage_counts: Record; run_state_counts: Record; first_received_at?: string | null; latest_received_at?: string | null; @@ -1047,6 +1051,7 @@ function recurrenceRepairStatusLabelKey(status?: string | null) { status === "manual_gate" || status === "investigating" || status === "run_completed_no_repair" || + status === "source_correlation_review" || status === "no_repair_record" ) { return `repairStatuses.${status}`; @@ -1103,6 +1108,16 @@ function EventRecurrencePanel({ detail: t("details.verifiedRepair", { count: summary?.verified_repair_group_total ?? 0 }), className: "border-[#b9a6d9] bg-[#f5f0ff] text-[#51358f]", }, + { + label: t("metrics.sourceReview"), + value: summary?.source_correlation_review_group_total ?? 0, + detail: t("details.sourceReview", { + count: summary?.source_correlation_review_group_total ?? 0, + }), + className: (summary?.source_correlation_review_group_total ?? 0) > 0 + ? "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]" + : "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]", + }, { label: t("metrics.openWorkItems"), value: summary?.open_work_item_group_total ?? 0, @@ -1134,7 +1149,7 @@ function EventRecurrencePanel({ ) : ( <> -
+
{metrics.map((item) => (
@@ -1204,6 +1219,7 @@ function EventRecurrencePanel({

{t("item.duplicates", { count: item.duplicate_total })}

{t("item.refs", { count: item.source_ref_total })}

{t("item.linkedRuns", { count: item.linked_run_total })}

+

{t("item.stage", { stage: item.latest_stage || "--" })}

{t("item.incident", { incidentId: item.latest_incident_id || "--" })}

{t("item.repair", { status: repairLabel })}

diff --git a/apps/web/src/app/[locale]/awooop/work-items/page.tsx b/apps/web/src/app/[locale]/awooop/work-items/page.tsx index a490f9ac..558f84fe 100644 --- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx +++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx @@ -114,11 +114,16 @@ type RecurrenceItem = { severity?: string | null; namespace?: string | null; target_resource?: string | null; + latest_stage?: string | null; + latest_provider_event_id?: string | null; latest_run_id?: string | null; latest_run_state?: string | null; latest_incident_id?: string | null; occurrence_total: number; duplicate_total: number; + source_ref_total?: number; + sentry_ref_total?: number; + signoz_ref_total?: number; repair_summary?: { status?: string | null; latest_auto_repair_id?: string | null; @@ -142,6 +147,7 @@ type RecurrenceResponse = { manual_gate_group_total?: number; automation_gap_group_total?: number; failed_repair_group_total?: number; + source_correlation_review_group_total?: number; }; items: RecurrenceItem[]; }; @@ -170,6 +176,8 @@ type RecurrenceWorkItemActionResult = { occurrence_total?: number | null; duplicate_total?: number | null; linked_run_total?: number | null; + latest_stage?: string | null; + latest_provider_event_id?: string | null; } | null; ticket_preview?: { would_create?: boolean | null; @@ -543,6 +551,7 @@ function recurrenceRepairStatusKey(status?: string | null) { status === "manual_gate" || status === "investigating" || status === "run_completed_no_repair" || + status === "source_correlation_review" || status === "no_repair_record" ) { return status; @@ -892,6 +901,8 @@ function buildWorkItems( const recurrenceAutomationGap = recurrenceSummary?.automation_gap_group_total ?? 0; const recurrenceFailedRepair = recurrenceSummary?.failed_repair_group_total ?? 0; const recurrenceManualGate = recurrenceSummary?.manual_gate_group_total ?? 0; + const recurrenceSourceReview = + recurrenceSummary?.source_correlation_review_group_total ?? 0; const latestRecurrenceOpenItem = recurrenceOpenItems(telemetry.eventRecurrence)[0] ?? null; const driftState = telemetry.driftFingerprintState; const driftFsmKey = driftFsmStateKey(driftState?.fsm_state); @@ -951,6 +962,7 @@ function buildWorkItems( gap: recurrenceAutomationGap, failed: recurrenceFailedRepair, manual: recurrenceManualGate, + source: recurrenceSourceReview, }), evidenceDetails: latestRecurrenceOpenItem ? [ @@ -1342,6 +1354,11 @@ function RecurrenceWorkQueuePanel({ {t("failed", { count: summary?.failed_repair_group_total ?? 0 })} + + {t("sourceReview", { + count: summary?.source_correlation_review_group_total ?? 0, + })} +
@@ -1396,6 +1413,19 @@ function RecurrenceWorkQueuePanel({

{t("incident", { incident: item.latest_incident_id ?? "--" })}

+

{t("stage", { stage: item.latest_stage ?? "--" })}

+

+ {t("sourceEvent", { + event: item.latest_provider_event_id ?? "--", + })} +

+

+ {t("sourceRefs", { + refs: item.source_ref_total ?? 0, + sentry: item.sentry_ref_total ?? 0, + signoz: item.signoz_ref_total ?? 0, + })} +

{t("workItem", { id: workItem?.work_item_id ?? "--" })}

{t("repair", { diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 47c28f81..f23bd1ea 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,52 @@ +## 2026-05-21|T116 Provider source evidence review work items + +**觸發**: + +- T115 已證明 Sentry / SigNoz provider-native upstream canary 會寫入 AwoooP source dossier。 +- 但未連到 Incident 的 provider 原生事件仍只停在 source evidence,Operator 在前端看不到「已進來源鏈路,但需要審核是否配對到 Incident」。 + +**修正**: + +- `channel_event_dossier_service.build_dossier_recurrence()` 新增 `latest_stage` / `stage_counts`,讓 recurrence group 顯示事件跑到 heartbeat、upstream_canary、received 或 incident_linked 哪個階段。 +- Sentry / SigNoz 事件若有 provider refs、不是 heartbeat、且尚未連 Incident,會形成 read-only `source_correlation_review` work item: + - `kind=source_correlation_review` + - `next_step=review_provider_source_match` + - `reason=provider_native_evidence_unlinked` + - 不寫入 Incident / AutoRepair / Ticket,只提供 preview / dry-run / handoff read model。 +- `/api/v1/platform/events/dossier/recurrence` summary 新增 `source_correlation_review_group_total`。 +- AwoooP Runs 前端「重複告警關聯」新增「來源待審」指標,卡片顯示事件 stage,讓 operator 可看見 provider-native evidence 已進 AwoooP 但仍需配對審核。 +- AwoooP Work Items 同步顯示 source review count、stage、provider event id、Sentry / SignOz refs,避免從 Runs 點進工作項後掉成 unknown。 + +**Verification**: + +```text +python -m py_compile apps/api/src/services/channel_event_dossier_service.py apps/api/src/api/v1/platform/events.py + -> pass +DATABASE_URL=postgresql+asyncpg://test:test@localhost/test pytest -q tests/test_channel_event_dossier_service.py + -> 14 passed +pnpm --dir apps/web exec tsc --noEmit + -> pass +NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --dir apps/web run build + -> compiled successfully, 90/90 static pages +python -m json.tool apps/web/messages/zh-TW.json +python -m json.tool apps/web/messages/en.json + -> pass +git diff --check + -> pass +python -m ruff check src/services/channel_event_dossier_service.py src/api/v1/platform/events.py tests/test_channel_event_dossier_service.py + -> pre-existing FastAPI Query B008 in events.py; no new logic failures observed +``` + +**目前整體進度**: + +- Provider-native upstream ingestion 可驗證性:99.5% → 99.6%。 +- Source refs / Sentry / SigNoz 可見性:99.9% → 99.93%。 +- Incident-level source correlation 可見性:86% → 88%。 +- AwoooP 告警可觀測鏈:99.985% → 99.988%。 +- 前端 AI 自動化管理介面同步:99.99%(Runs / Work Items recurrence panel 已同步來源待審)。 +- 完整 AI 自動化管理產品化:99.65% → 99.68%。 +- 剩餘:推 Gitea main、等待 CI/CD、production API / frontend 驗證。 + ## 2026-05-20|T115 Provider-native upstream canary 接入 **觸發**: