diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 3f4d9a4e..e8107db5 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1636,6 +1636,10 @@ async def _process_new_alert_background( # 2026-04-27 ogt + Claude Sonnet 4.6: CS2 規則引擎自動執行 # 設計:is_rule_based=True 確定性高,滿足條件直接執行,不等人工審核 # 安全防線:CRITICAL / destructive patterns / NO_ACTION / 空 kubectl → 全部降級 PENDING + _cs2_auto_approval = None + _cs2_executor = None + _cs2_exec_success: bool | None = None + _cs2_exec_error: str | None = None try: from src.models.approval import ApprovalRequest, ApprovalStatus from src.services.approval_execution import ApprovalExecutionService @@ -1659,6 +1663,7 @@ async def _process_new_alert_background( ) # 使用 DB 中剛建立的 approval.id 讓 executor 可回寫 _auto_approval.id = approval.id + _cs2_auto_approval = _auto_approval _cs2_executor = ApprovalExecutionService() _cs2_exec_success = await _cs2_executor.execute_approved_action(_auto_approval) @@ -1681,6 +1686,8 @@ async def _process_new_alert_background( exec_success=_cs2_exec_success, ) except Exception as _auto_err: + _cs2_exec_success = False if _cs2_auto_approval is not None else None + _cs2_exec_error = str(_auto_err) logger.warning( "cs2_auto_execute_failed_degraded_to_pending", approval_id=str(approval.id), @@ -1712,6 +1719,23 @@ async def _process_new_alert_background( error=str(_meta_err), ) + if _cs2_auto_approval is not None and _cs2_exec_success is not None: + try: + _cs2_auto_approval.incident_id = incident_id + _cs2_executor = _cs2_executor or ApprovalExecutionService() + await _cs2_executor.finalize_auto_approved_execution( + _cs2_auto_approval, + success=_cs2_exec_success, + error_message=_cs2_exec_error, + ) + except Exception as _cs2_finalize_err: + logger.warning( + "cs2_auto_execute_finalize_failed", + approval_id=str(approval.id), + incident_id=incident_id, + error=str(_cs2_finalize_err), + ) + _is_heartbeat = is_heartbeat_alertname(alertname) if can_auto_repair 
and not _is_heartbeat: await _try_auto_repair_background( @@ -1875,8 +1899,15 @@ async def _process_new_alert_background( and "NO_ACTION" not in (analysis_result.action_title or "") and is_safe_kubectl_action(_cs3_kubectl) ) + _cs3_auto_approval = None + _cs3_executor = None + _cs3_exec_success: bool | None = None + _cs3_exec_error: str | None = None if _cs3_can_auto: try: + from src.models.approval import ApprovalRequest, ApprovalStatus + from src.services.approval_execution import ApprovalExecutionService + _cs3_auto_approval = ApprovalRequest( action=approval_create.action, description=approval_create.description, @@ -1893,8 +1924,17 @@ async def _process_new_alert_background( else "cs3_auto_confident_execution", }, ) + _cs3_auto_approval.id = approval.id _cs3_executor = ApprovalExecutionService() _cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval) + try: + await service.update_execution_status(approval.id, _cs3_exec_success) + except Exception as _cs3_upd_err: + logger.warning( + "cs3_auto_execute_status_update_failed", + approval_id=str(approval.id), + error=str(_cs3_upd_err), + ) logger.info( "cs3_llm_auto_executed", approval_id=str(approval.id), @@ -1910,6 +1950,8 @@ async def _process_new_alert_background( ), ) except Exception as _cs3_exec_err: + _cs3_exec_success = False if _cs3_auto_approval is not None else None + _cs3_exec_error = str(_cs3_exec_err) logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err)) incident_id = await create_incident_for_approval( @@ -1937,6 +1979,23 @@ async def _process_new_alert_background( error=str(_meta_err), ) + if _cs3_auto_approval is not None and _cs3_exec_success is not None: + try: + _cs3_auto_approval.incident_id = incident_id + _cs3_executor = _cs3_executor or ApprovalExecutionService() + await _cs3_executor.finalize_auto_approved_execution( + _cs3_auto_approval, + success=_cs3_exec_success, + error_message=_cs3_exec_error, + ) + except Exception as _cs3_finalize_err: 
@staticmethod
def _is_auto_approved_request(approval: "ApprovalRequest") -> bool:
    """Return True when ``approval.requested_by`` starts with ``auto_approve``.

    The comparison is case-insensitive; a missing or empty ``requested_by``
    yields False.
    """
    requested_by = (getattr(approval, "requested_by", "") or "").lower()
    return requested_by.startswith("auto_approve")


@staticmethod
def _is_observation_only_action(action: str | None) -> bool:
    """Return True when *action* is empty or only asks to observe/investigate.

    Matches (case-insensitively) any spelling of NO_ACTION / NO-ACTION /
    NOACTION anywhere in the text, or an action that starts with OBSERVE or
    INVESTIGATE.
    """
    action_upper = (action or "").strip().upper()
    if not action_upper:
        return True
    if any(marker in action_upper for marker in ("NO_ACTION", "NO-ACTION", "NOACTION")):
        return True
    return action_upper.startswith(("OBSERVE", "INVESTIGATE"))


@staticmethod
def _approval_risk_value(approval: "ApprovalRequest") -> str | None:
    """Return the approval's risk level as a plain value, or None if unset.

    Uses ``risk_level.value`` when present (enum-like objects), otherwise the
    ``str()`` of the raw attribute.
    """
    risk_level = getattr(approval, "risk_level", None)
    if risk_level is None:
        return None
    return getattr(risk_level, "value", str(risk_level))


async def finalize_auto_approved_execution(
    self,
    approval: "ApprovalRequest",
    *,
    success: bool,
    error_message: str | None = None,
) -> None:
    """Backfill the incident-linked evidence chain for an auto-approved execution.

    The CS2/CS3 webhook paths call ``execute_approved_action()`` before the
    Incident exists, so the executor has no ``incident_id`` at that moment
    and the verifier / KM / auto_repair_executions records cannot be tied
    back to the same alert card. This method is meant to run after the
    incident has been created; it only writes the durable trace and does
    not execute the action again.

    Steps (each one is best-effort: a failure is logged and the next step
    still runs):
      1. create an auto-repair execution record unless an equivalent one
         already exists for the incident,
      2. add an ``exec`` timeline event,
      3. write the execution result to KM,
      4. run the post-execution verifier when its feature flag is enabled,
      5. resolve the incident when ``success`` is True.

    Args:
        approval: The auto-approved request; must carry ``incident_id``.
        success: Outcome reported by the executor.
        error_message: Optional failure detail; a generic message is
            substituted when ``success`` is False and none is given.

    Returns:
        None. The method returns early (writing nothing) when the request is
        not auto-approved, has no ``incident_id``, or is observation-only.
    """
    if not self._is_auto_approved_request(approval):
        return

    incident_id = getattr(approval, "incident_id", None)
    if not incident_id:
        logger.warning(
            "auto_approved_execution_finalize_skipped_no_incident",
            approval_id=str(getattr(approval, "id", "")),
            requested_by=getattr(approval, "requested_by", None),
        )
        return

    if self._is_observation_only_action(getattr(approval, "action", None)):
        logger.info(
            "auto_approved_execution_finalize_skipped_observation_only",
            approval_id=str(approval.id),
            incident_id=incident_id,
            action=(approval.action or "")[:120],
        )
        return

    parsed = parse_operation_from_action(approval.action)
    operation_type = parsed.operation_type
    resource_name = parsed.resource_name or "unknown"
    namespace = parsed.namespace or "default"

    # Identifiers are truncated to fixed lengths (36 / 200 / 50 characters).
    playbook_id = str(getattr(approval, "matched_playbook_id", None) or approval.id)[:36]
    operation_label = operation_type.value if operation_type else "unknown"
    playbook_name = f"approval_auto_execute:{operation_label}:{resource_name}"[:200]
    triggered_by = (getattr(approval, "requested_by", None) or "auto_approve")[:50]
    action_taken = f"auto_repair_playbook:{playbook_id}:{operation_label}:{resource_name}"
    if not success:
        action_taken = f"{action_taken}:FAILED"
        error_message = error_message or "auto-approved executor returned failure; see approval/aol logs"

    # Step 1: durable execution record, skipped when an equivalent row exists.
    try:
        from src.repositories.audit_log_repository import get_auto_repair_execution_repository

        repo = get_auto_repair_execution_repository()
        existing = await repo.list_by_incident(incident_id)
        already_recorded = any(
            str(getattr(row, "playbook_id", "")) == playbook_id
            and getattr(row, "triggered_by", "") == triggered_by
            and (approval.action or "") in list(getattr(row, "executed_steps", []) or [])
            for row in existing
        )
        if not already_recorded:
            await repo.create(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=[approval.action],
                error_message=error_message,
                triggered_by=triggered_by,
                risk_level=self._approval_risk_value(approval),
            )
        else:
            logger.info(
                "auto_approved_execution_record_already_exists",
                approval_id=str(approval.id),
                incident_id=incident_id,
                playbook_id=playbook_id,
            )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_record_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 2: timeline event linked to both the approval and the incident.
    try:
        timeline = get_timeline_service()
        await timeline.add_event(
            event_type="exec",
            status="success" if success else "error",
            title=f"{'✅' if success else '❌'} 自動批准執行已補鏈: {operation_label}",
            description=(
                f"Target: {resource_name} @ {namespace}; "
                f"source={triggered_by}; action={approval.action[:160]}"
            ),
            actor="leWOOOgo",
            actor_role="executor",
            approval_id=str(approval.id),
            incident_id=incident_id,
        )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_timeline_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 3: KM write-back.
    try:
        await self.write_execution_result_to_km(approval, success, error_message)
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_km_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 4: post-execution verifier (feature-flagged, bounded by a timeout).
    # Any failure here -- not only a timeout -- is logged instead of raised, so
    # a verifier crash cannot skip the incident-resolve step below.
    try:
        from src.core.feature_flags import aiops_flags

        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            await asyncio.wait_for(
                self._run_post_execution_verify(
                    approval=approval,
                    action_taken=action_taken,
                ),
                timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
            )
    except asyncio.TimeoutError:
        logger.warning(
            "auto_approved_execution_post_verify_timeout",
            approval_id=str(approval.id),
            incident_id=incident_id,
            timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
        )
    except Exception as exc:
        logger.warning(
            "auto_approved_execution_post_verify_failed",
            approval_id=str(approval.id),
            incident_id=incident_id,
            error=str(exc),
        )

    # Step 5: resolve the incident only when the executor reported success.
    if success:
        try:
            from src.services.incident_service import get_incident_service

            await get_incident_service().resolve_incident(incident_id)
            logger.info(
                "incident_resolved_after_auto_approved_execution_finalize",
                incident_id=incident_id,
                approval_id=str(approval.id),
            )
        except Exception as exc:
            logger.warning(
                "incident_resolve_after_auto_approved_execution_finalize_failed",
                incident_id=incident_id,
                approval_id=str(approval.id),
                error=str(exc),
            )
@pytest.mark.asyncio
async def test_finalize_auto_approved_execution_skips_no_action(monkeypatch):
    """An observation-only (NO_ACTION) approval must leave no durable trace.

    Every collaborator the finalize step could touch is replaced with a
    stub, so the test stays hermetic even if the early return regresses,
    and each one is asserted to be untouched.
    """
    repo = _FakeAutoRepairRepo()
    timeline = SimpleNamespace(add_event=AsyncMock())
    incident_service = SimpleNamespace(resolve_incident=AsyncMock())
    write_km = AsyncMock()
    run_verify = AsyncMock()

    monkeypatch.setattr(
        "src.repositories.audit_log_repository.get_auto_repair_execution_repository",
        lambda: repo,
    )
    monkeypatch.setattr(
        "src.services.approval_execution.get_timeline_service",
        lambda: timeline,
    )
    monkeypatch.setattr(
        "src.services.incident_service.get_incident_service",
        lambda: incident_service,
    )
    monkeypatch.setattr(
        ApprovalExecutionService,
        "write_execution_result_to_km",
        write_km,
    )
    monkeypatch.setattr(
        ApprovalExecutionService,
        "_run_post_execution_verify",
        run_verify,
    )

    approval = SimpleNamespace(
        id="22222222-2222-2222-2222-222222222222",
        incident_id="INC-20260513-002",
        action="NO_ACTION: observe only",
        requested_by="auto_approve_rule_engine",
        matched_playbook_id="pb-auto-002",
        risk_level=RiskLevel.LOW,
    )

    await ApprovalExecutionService().finalize_auto_approved_execution(
        approval,
        success=True,
    )

    # No execution record, timeline event, KM write, verification or resolve.
    assert repo.created == []
    timeline.add_event.assert_not_awaited()
    write_km.assert_not_awaited()
    run_verify.assert_not_awaited()
    incident_service.resolve_incident.assert_not_awaited()
`verification_result` + - `matched_playbook_id`(可由 `auto_repair_playbook:*` / `auto_repair:*` 萃取) + - `mcp_health.post_execution_verifier` + - `evidence_summary` 標明 `pre_execution_state=missing` +- `AutoRepairService.execute_auto_repair()` 新增 `run_post_verification` 參數,預設維持原行為;webhook `_try_auto_repair_background()` 改以 `run_post_verification=False`,由 webhook 集中 await verifier / learning / incident resolve,避免同一個修復跑兩次驗證。 +- CD 修復:第一次推版 `518a16e8` 的 image build/push 已成功,但 `Inject K8s Secrets` 因 runner known_hosts 缺 ED25519 host key 失敗。`.gitea/workflows/cd.yaml` 已改為 `ssh-keyscan -t ed25519,rsa,ecdsa` 並檢查 known_hosts 非空。 + +**local verification**: + +```text +python3 -m py_compile apps/api/src/services/post_execution_verifier.py apps/api/src/services/auto_repair_service.py apps/api/src/api/v1/webhooks.py apps/api/tests/test_post_execution_verifier.py apps/api/tests/test_learning_chain_e2e.py +OK + +ruff check --select F821 apps/api/src/services/post_execution_verifier.py apps/api/src/services/auto_repair_service.py apps/api/src/api/v1/webhooks.py apps/api/tests/test_post_execution_verifier.py apps/api/tests/test_learning_chain_e2e.py +OK + +DATABASE_URL=postgresql+asyncpg://u:p@localhost:5432/db /Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest tests/test_post_execution_verifier.py tests/test_learning_chain_e2e.py tests/test_awooop_truth_chain_service.py tests/test_platform_router_order.py -q +55 passed + +ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/cd.yaml"); puts "yaml ok"' +yaml ok +``` + +**production deploy / smoke(完成)**: + +```text +Gitea: +2061 code-review 3bad3544 -> success +2062 CD Pipeline workflow_dispatch 3bad3544 -> success + tests -> success + build-and-deploy -> success + post-deploy-checks -> success +Deploy marker: 9c9cf680 chore(cd): deploy 3bad354 [skip ci] + +K8s image: +awoooi-api 192.168.0.110:5000/awoooi/api:3bad354414edcef35406796b9b9e2cfb90b0740f +awoooi-worker 
192.168.0.110:5000/awoooi/api:3bad354414edcef35406796b9b9e2cfb90b0740f +awoooi-web 192.168.0.110:5000/awoooi/web:3bad354414edcef35406796b9b9e2cfb90b0740f + +health: +https://awoooi.wooo.work/api/v1/health -> 200 + +quality summary, hours=24, limit=30: +verified_auto_repair_total=0 +production_claim.can_claim_full_auto_repair=false +by_verdict: + manual_required_no_action=18 + received_only=11 + approval_required=1 + +DB baseline after deploy time 2026-05-13T11:02:32Z: +auto_repair_since_deploy=0 +verified_evidence_since_deploy=0 +verified_evidence_24h=0 +auto_repair_24h=6 +``` + +判讀: + +- T14a 已完成並推版:未來只要 webhook auto-repair 真的觸發,即使 pre-decision snapshot 尚未可用,verifier 結果也會有 durable evidence row 可查。 +- 目前 production smoke 沒有新的 auto-repair 事件可驗證 fallback 寫入,因此仍不能宣稱完整閉環;這是正確保守判讀。 +- 下一步 T14b:等下一筆 `auto_repair=true` 事件或設計安全 live-fire,驗證 `auto_repair_executions -> incident_evidence.verification_result -> learning/KM -> truth-chain auto_repaired_verified` 是否全鏈路成立;同時補 auto-approved approval execution 的 incident linkage / durable execution record。 +- 目前整體進度更新:約 80%。 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index eff02b1e..f210d36f 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2032,6 +2032,14 @@ Phase 6 完成後 - Smoke:quality summary `hours=24&limit=30` 由舊的 `execution_unverified=11` 校正為 `manual_required_no_action=18`、`received_only=12`、`execution_unverified=0`、`verified_auto_repair_total=0`、`production_claim=false`。 - 判讀:T13 完成的是「真相分類校正」,不是自動修復閉環。下一步 T14 必須讓可安全處理的低風險事件產生 durable `auto_repair_executions`、post-execution `verification_result`、KM / learning writeback;禁止再用 NO_ACTION 或 dry-run audit 假裝自動修復。 +**T14a auto-repair verifier durable evidence production verified(2026-05-13 台北)**: +- 觸發:live DB 證實 24h `auto_repair_executions=6`,但 
`incident_evidence.verification_result=0`;部分 auto-repair event log 內已有 verifier `degraded` 判定,卻沒有 durable evidence 給 truth-chain / Operator Console 回查。 +- 修正:`PostExecutionVerifier.verify(snapshot=None)` 會補寫 fallback `EvidenceSnapshot`,包含 `post_execution_state`、`verification_result`、`matched_playbook_id` 與 `pre_execution_state=missing` 摘要;webhook auto-repair path 改以 `run_post_verification=False` 呼叫 `AutoRepairService`,避免 service fire-and-forget 與 webhook await verifier 雙重驗證 / 雙重 emergency escalation。 +- CD 修正:第一次 `518a16e8` deploy 失敗在 runner known_hosts 缺 ED25519;`.gitea/workflows/cd.yaml` 改為 `ssh-keyscan -t ed25519,rsa,ecdsa` 並檢查 known_hosts 非空。 +- Production:`3bad3544 fix(cd): include ed25519 deploy host keyscan` 已用 `workflow_dispatch` 跑 CD,Gitea run `2062` tests/build-and-deploy/post-deploy-checks 全 success,deploy marker `9c9cf680`,API/Worker/Web image 均為 `3bad354414edcef35406796b9b9e2cfb90b0740f`,health 200。 +- Smoke:quality summary 仍為 `verified_auto_repair_total=0`、`production_claim=false`;deploy 後尚無新 auto-repair 事件(`auto_repair_since_deploy=0`),所以不能宣稱完整閉環,只能宣稱「未來 auto-repair verifier 結果會有 durable evidence target」。 +- 下一步 T14b:等待下一筆 `auto_repair=true` 事件或設計安全 live-fire,驗證 `auto_repair_executions -> incident_evidence.verification_result -> learning/KM -> truth-chain auto_repaired_verified` 全鏈路;並補 auto-approved approval execution 的 incident linkage / durable execution record。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d)