Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
## Phase 5 交付(ADR-086)
### 新增服務(4 個)
- blast_radius_calculator.py: 爆炸半徑計算器(0-100 純函數)
- 18 種 kubectl 動作基礎分 + 命名空間倍率 + 特殊 flag 修正
- HARD_RULES 永擋:delete ns/pv/pvc/clusterrole + rm -rf + DROP TABLE
- 分級:≤10 auto / 11-50 human / 51-99 dual / 100 blocked
- declarative_remediation.py: DeclarativeSpec 不可變規格(frozen dataclass)
- evaluate() 封裝 Blast Radius + dry-run + rollback_plan + constraints
- rollback_plan 從 kubectl 動作類型自動推導(不呼叫 LLM)
- gitops_pr_service.py: Gitea Issue 高風險修復審核(tier=dual)
- 含 Blast Radius + 目標狀態 + 回滾計畫 + 雙人審核流程
- AIOPS_P5_GITOPS_PR flag 守衛
- rollback_manager.py: 驗證失敗自動回滾
- 先驗 rollout history ≥ 2 revision,防止無版本可回滾
- kubectl rollout undo + 120s 收斂等待
### decision_manager.py 接線(AIOPS_P5_BLAST_RADIUS_CHECK)
- _auto_execute() 在安全守衛後、ApprovalRequest 前插入分級守衛
- blocked → 永擋 + 人工審核通知
- dual → 非同步 GitOps Issue + 升級人工審核
- human → 升級人工審核(不自動執行)
- auto(≤10)→ 原有自動執行流程
- 失敗降級:計算異常 → 保守升人工
### learning_service.py
- record_declarative_outcome(): 記錄 DeclarativeSpec 執行結果
anomaly_key=declarative:{incident_id},含 blast_radius_score/tier/rollback
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 全部完成
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
236 lines
7.9 KiB
Python
236 lines
7.9 KiB
Python
"""
|
||
AWOOOI AIOps Phase 5 — GitOps PR Service(GitOps 高風險修復 PR)
|
||
================================================================
|
||
職責:當 Blast Radius > 50(tier=dual)時,在 Gitea 建立 Issue 記錄修復計畫,
|
||
等待雙人審核後方可執行。
|
||
|
||
設計原則:
|
||
1. 只建立 Gitea Issue(不直接推 PR)— 修復計畫在 Issue 描述,含 rollback plan
|
||
2. 非阻塞:建立失敗不影響主路徑(fallback → 人工審核 Telegram 通知)
|
||
3. Issue 標題含 [AI-Repair] 前綴,方便篩選
|
||
4. 連線失敗時記錄 warning,不拋出例外
|
||
|
||
NOTE: 使用 Issue 而非 PR 的原因——
|
||
修復動作是 kubectl 命令,不是代碼變更,沒有對應 diff 可以 PR。
|
||
Issue 提供人類可讀的審計軌跡,並觸發 Gitea 通知。
|
||
|
||
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
|
||
if TYPE_CHECKING:
|
||
from src.services.declarative_remediation import DeclarativeSpec
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class GitOpsPRResult:
    """Result of an attempt to create a GitOps review issue.

    Only ``success`` is mandatory. The remaining fields default to ``None`` so
    a caller can pass just the values that are meaningful for the outcome
    (for example ``GitOpsPRResult(success=False, error="...")``). Positional
    and keyword construction with all four values keeps working unchanged.
    """

    success: bool
    issue_url: str | None = None  # Gitea issue URL
    issue_number: int | None = None  # issue number
    error: str | None = None  # error message when the attempt failed
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class GitOpsPRService:
    """Creates Gitea issues that request review of high-risk repairs.

    Usage:
        svc = GitOpsPRService()
        result = await svc.create_repair_issue(spec, incident_id="INC-001")
        if result.success:
            print(result.issue_url)
    """

    async def create_repair_issue(
        self,
        spec: "DeclarativeSpec",
        incident_id: str,
        diagnosis: str = "",
    ) -> GitOpsPRResult:
        """Create a review issue in Gitea for a high-risk repair.

        Args:
            spec: The repair spec. Nothing is created unless
                ``spec.requires_gitops_pr`` is truthy.
            incident_id: ID of the incident the repair belongs to.
            diagnosis: Diagnosis summary giving reviewers context.

        Returns:
            GitOpsPRResult. ``success`` is False (with ``error`` set) when the
            spec does not require an issue, when the AIOPS_P5_GITOPS_PR flag
            is off, or when the Gitea call fails.
        """
        if not spec.requires_gitops_pr:
            return GitOpsPRResult(
                success=False,
                issue_url=None,
                issue_number=None,
                error="spec.tier 不是 dual,無需 GitOps PR",
            )

        # Imported lazily, as in the original, so the flag module is only
        # loaded when an issue is actually being considered.
        from src.core.feature_flags import aiops_flags

        if not aiops_flags.AIOPS_P5_GITOPS_PR:
            logger.info(
                "gitops_pr_skipped_feature_flag",
                incident_id=incident_id,
                blast_radius=spec.blast_radius_score,
            )
            return GitOpsPRResult(
                success=False,
                issue_url=None,
                issue_number=None,
                error="AIOPS_P5_GITOPS_PR=False,跳過 Gitea Issue 建立",
            )

        title = f"[AI-Repair] 高風險修復審核(Blast={spec.blast_radius_score})— {incident_id}"
        body = _build_issue_body(spec, incident_id, diagnosis)

        return await self._create_gitea_issue(title, body, incident_id)

    @staticmethod
    def _issues_endpoint(api_url: str, owner: str, repo: str) -> str:
        """Build the issues URL for a repo, tolerating a trailing '/' on *api_url*."""
        base = str(api_url).rstrip("/")
        return f"{base}/repos/{owner}/{repo}/issues"

    @staticmethod
    def _describe_error(exc: BaseException) -> str:
        """Return ``str(exc)``, or the exception class name when that is blank.

        An exception raised without a message stringifies to ``""``, which
        would leave the result's ``error`` field empty and falsy.
        """
        text = str(exc)
        return text if text.strip() else type(exc).__name__

    async def _create_gitea_issue(
        self,
        title: str,
        body: str,
        incident_id: str,
    ) -> GitOpsPRResult:
        """POST a new issue to Gitea and report the outcome without raising.

        Args:
            title: Issue title; truncated to 255 characters before sending.
            body: Issue description (Markdown).
            incident_id: Incident ID, used only for log correlation.

        Returns:
            GitOpsPRResult with the issue number/URL from the response on
            success, or ``success=False`` and an error message on any failure.
        """
        try:
            # Everything that can fail (imports, configuration lookup, the
            # HTTP call, response decoding) lives inside the try block so
            # that a failure is logged and reported instead of propagating.
            import httpx

            from src.core.config import settings

            url = self._issues_endpoint(
                settings.GITEA_API_URL,
                settings.GITEA_REPO_OWNER,
                settings.GITEA_REPO_NAME,
            )
            headers = {
                "Authorization": f"token {settings.GITEA_API_TOKEN}",
                "Content-Type": "application/json",
            }
            payload = {
                "title": title[:255],
                "body": body,
                "labels": [],
            }

            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url, json=payload, headers=headers)
                resp.raise_for_status()
                data = resp.json()

            issue_number = data.get("number")
            issue_url = data.get("html_url", "")

            logger.info(
                "gitops_issue_created",
                incident_id=incident_id,
                issue_number=issue_number,
                issue_url=issue_url,
            )
            return GitOpsPRResult(
                success=True,
                issue_url=issue_url,
                issue_number=issue_number,
                error=None,
            )

        except Exception as e:
            # Deliberately broad: issue creation is best-effort and must not
            # break the caller's main path.
            message = self._describe_error(e)
            logger.warning(
                "gitops_issue_create_failed",
                incident_id=incident_id,
                error=message,
            )
            return GitOpsPRResult(
                success=False,
                issue_url=None,
                issue_number=None,
                error=message,
            )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _fence_for(text: str) -> str:
    """Return a backtick fence longer than any backtick run inside *text*.

    The fence is never shorter than three backticks, so text without
    backticks still gets the usual fence.
    """
    longest = current = 0
    for ch in text:
        current = current + 1 if ch == "`" else 0
        if current > longest:
            longest = current
    return "`" * max(3, longest + 1)


def _build_issue_body(
    spec: "DeclarativeSpec",
    incident_id: str,
    diagnosis: str,
) -> str:
    """Build the Gitea issue description in Markdown.

    Args:
        spec: Repair spec; the fields read here are ``constraints``,
            ``blast_radius_score``, ``tier``, ``target_state``, ``action``,
            ``namespace``, ``target``, ``dry_run_required``,
            ``rollback_plan`` and ``blast_reason``.
        incident_id: Incident ID shown in the header and in the approval
            command.
        diagnosis: Diagnosis summary; only the first 1000 characters are
            included, and a placeholder is used when it is empty.

    Returns:
        The Markdown body, ending with a newline.
    """
    constraints_md = "\n".join(f"- {c}" for c in spec.constraints) or "(無額外約束)"

    # The command and rollback plan are embedded in fenced code blocks. A
    # fixed ``` fence would be closed early by backticks inside the text, so
    # each fence is sized from the content it wraps.
    action_text = str(spec.action)
    rollback_text = str(spec.rollback_plan)
    action_fence = _fence_for(action_text)
    rollback_fence = _fence_for(rollback_text)

    dry_run_md = "✅ 是" if spec.dry_run_required else "⬜ 否"
    diagnosis_md = diagnosis[:1000] if diagnosis else "(未提供診斷摘要)"

    return f"""## AI 自主修復審核請求

**Incident ID**: `{incident_id}`
**Blast Radius Score**: `{spec.blast_radius_score}` / 100(tier: `{spec.tier}`)
**需要**: 雙人審核後方可執行

---

## 修復計畫

**目標狀態**: {spec.target_state}

**執行命令**:
{action_fence}bash
{action_text}
{action_fence}

**命名空間**: `{spec.namespace}`
**目標資源**: `{spec.target}`
**需要 dry-run**: {dry_run_md}

---

## 安全約束

{constraints_md}

---

## 回滾計畫

{rollback_fence}bash
{rollback_text}
{rollback_fence}

---

## 爆炸半徑計分依據

{spec.blast_reason}

---

## 診斷摘要

{diagnosis_md}

---

## 審核流程

1. SRE-1 確認問題診斷正確,評估修復計畫
2. SRE-2 交叉驗證,確認回滾計畫可行
3. 兩人均在 Telegram 回覆 `/approve {incident_id}` 後,系統自動執行

> 此 Issue 由 AWOOOI AI 自主修復系統(Phase 5 ADR-086)自動建立。
"""
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Module-wide service instance; stays None until the accessor is first called.
_service: GitOpsPRService | None = None


def get_gitops_pr_service() -> GitOpsPRService:
    """Return the shared GitOpsPRService, creating it on first use."""
    global _service
    if _service is not None:
        return _service
    _service = GitOpsPRService()
    return _service
|