feat(awooop): add auto repair canary live-fire target
This commit is contained in:
56
k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml
Normal file
56
k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml
Normal file
@@ -0,0 +1,56 @@
|
||||
# AwoooP auto-repair live-fire canary
# Purpose: provide a low-risk Deployment target so that T16 can verify the real
# alert -> Playbook -> executor -> verifier -> learning/KM closed loop.
# Note: this Pod serves no traffic, mounts no Secret, and does not automatically
# expose a Service.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: awoooi-auto-repair-canary
  namespace: awoooi-prod
  labels:
    app: awoooi-auto-repair-canary
    system: awoooi
    environment: prod
    component: auto-repair-canary
spec:
  replicas: 1
  # Keep only two old ReplicaSets around for rollback.
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: awoooi-auto-repair-canary
      environment: prod
      system: awoooi
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # With a single replica, surge one new Pod and never drop below one
      # available Pod while a rollout (e.g. a rollout restart) is in progress.
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: awoooi-auto-repair-canary
        system: awoooi
        environment: prod
        component: auto-repair-canary
    spec:
      # The canary does not talk to the Kubernetes API, so no token is mounted.
      automountServiceAccountToken: false
      containers:
        - name: canary
          # NOTE(review): IMAGE_TAG_PLACEHOLDER is presumably substituted by the
          # deploy pipeline before apply - confirm.
          image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
          imagePullPolicy: Always
          # Override the image entrypoint with an idle loop. sleep runs in the
          # background and the shell waits on it so the TERM/INT trap can fire
          # without waiting for the full hour, letting the Pod exit 0 promptly.
          command: ["/bin/sh", "-c"]
          args:
            - |
              trap 'exit 0' TERM INT
              while true; do sleep 3600 & wait $!; done
          env:
            # Marker identifying this container as the live-fire target.
            - name: AWOOOP_CANARY_ROLE
              value: "auto-repair-live-fire-target"
          resources:
            requests:
              cpu: "5m"
              memory: "32Mi"
            limits:
              cpu: "50m"
              memory: "96Mi"
|
||||
@@ -28,6 +28,7 @@ resources:
|
||||
- 07-rbac.yaml
|
||||
- 08-deployment-worker.yaml
|
||||
- 09-pdb.yaml
|
||||
- 10-deployment-auto-repair-canary.yaml
|
||||
- 13-cronjob-k3s-report.yaml
|
||||
- 14-cronjob-weekly-report.yaml
|
||||
- 15-cronjob-km-vectorize.yaml
|
||||
|
||||
155
scripts/ops/awooop-seed-auto-repair-canary-playbook.py
Normal file
155
scripts/ops/awooop-seed-auto-repair-canary-playbook.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed the AwoooP T16 auto-repair canary Playbook.
|
||||
|
||||
Run from an API pod so it uses the same PostgreSQL/Redis context as production:
|
||||
|
||||
python scripts/ops/awooop-seed-auto-repair-canary-playbook.py
|
||||
|
||||
The Playbook is intentionally scoped to the no-traffic
|
||||
``awoooi-auto-repair-canary`` Deployment. It exists to prove the automation
|
||||
chain with real execution evidence; it is not an organic symptom rule.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Make ``src.*`` importable when the script runs from a repository checkout
# (<repo>/scripts/ops/<this file>): the API package lives in <repo>/apps/api.
# The script may also be copied to a shallow location (e.g. /tmp/seed.py inside
# a pod), where ``parents[2]`` would raise IndexError; fall back to the topmost
# ancestor so the existence check below simply finds nothing and sys.path is
# left untouched.
_SCRIPT_ANCESTORS = Path(__file__).resolve().parents
_REPO_API_ROOT = (
    _SCRIPT_ANCESTORS[2] if len(_SCRIPT_ANCESTORS) > 2 else _SCRIPT_ANCESTORS[-1]
) / "apps" / "api"
if (_REPO_API_ROOT / "src").exists():
    sys.path.insert(0, str(_REPO_API_ROOT))
|
||||
|
||||
from src.models.playbook import (
|
||||
ActionType,
|
||||
Playbook,
|
||||
PlaybookSource,
|
||||
PlaybookStatus,
|
||||
RepairStep,
|
||||
RiskLevel,
|
||||
SymptomPattern,
|
||||
)
|
||||
from src.repositories.playbook_repository import get_playbook_repository
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
|
||||
# Defaults for the CLI options; together they describe the synthetic canary
# alert and the dedicated Deployment that the seeded Playbook restarts.
# Alert name placed in the Playbook's symptom pattern; this exact value maps to
# the fixed playbook id "PB-AWOOOP-T16-CANARY".
DEFAULT_ALERTNAME = "AwoooPAutoRepairCanaryT16"
# Deployment name interpolated into the rollout restart / undo commands.
DEFAULT_TARGET = "awoooi-auto-repair-canary"
# Namespace passed to kubectl via ``-n``.
DEFAULT_NAMESPACE = "awoooi-prod"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SeedResult:
    """Immutable summary of one seeding run, printed as JSON by the CLI.

    Attributes:
        playbook_id: Id of the Playbook that was written.
        alertname: Alert name the Playbook matches.
        target: Deployment name used in the repair command.
        namespace: Namespace used in the repair command.
        status: String value of the Playbook's status enum.
        source: String value of the Playbook's source enum.
        command: The kubectl command stored as the single repair step.
        preserved_success_count: ``success_count`` on the written Playbook
            (carried over from an existing record, otherwise 0).
        preserved_failure_count: ``failure_count`` on the written Playbook
            (carried over from an existing record, otherwise 0).
        trust_score: ``trust_score`` on the written Playbook.
    """

    playbook_id: str
    alertname: str
    target: str
    namespace: str
    status: str
    source: str
    command: str
    preserved_success_count: int
    preserved_failure_count: int
    trust_score: float
|
||||
|
||||
|
||||
def _playbook_id_for_alertname(alertname: str) -> str:
    """Derive the Playbook id for *alertname*.

    The default alert name maps to the fixed id ``PB-AWOOOP-T16-CANARY``.
    Any other name is upper-cased, every run of characters outside ``A-Z0-9``
    is collapsed to a single ``-``, a leading/trailing ``-`` is stripped, the
    ``AWOOOP-AUTO-REPAIR-CANARY-`` marker is removed, and the result is cut to
    18 characters (``T16`` when nothing is left) and appended to
    ``PB-AWOOOP-CANARY-``.

    NOTE(review): the 18-character cut can leave a trailing ``-``, and distinct
    alert names that share their first 18 normalized characters map to the
    same id.
    """
    if alertname != DEFAULT_ALERTNAME:
        normalized = re.sub(r"[^A-Z0-9]+", "-", alertname.upper()).strip("-")
        shortened = normalized.replace("AWOOOP-AUTO-REPAIR-CANARY-", "")[:18]
        if not shortened:
            shortened = "T16"
        return "PB-AWOOOP-CANARY-" + shortened
    return "PB-AWOOOP-T16-CANARY"
|
||||
|
||||
|
||||
async def seed_canary_playbook(
    *,
    alertname: str,
    target: str,
    namespace: str,
) -> SeedResult:
    """Write the approved canary Playbook and return a summary of what was stored.

    The Playbook has a single low-risk step that runs
    ``kubectl rollout restart deployment/<target> -n <namespace>`` with
    ``kubectl rollout undo`` as its rollback. When a Playbook with the same id
    already exists, its usage history (success/failure counts, trust score,
    last-used time, source incident ids, approval and creation timestamps) is
    carried over into the new record.

    Args:
        alertname: Alert name the Playbook should match.
        target: Name of the Deployment to restart.
        namespace: Namespace of that Deployment.

    Returns:
        A ``SeedResult`` describing the stored Playbook.

    Raises:
        ValueError: If ``target`` or ``namespace`` is not a valid Kubernetes
            object name.
    """
    # Both values are interpolated into command strings stored on an APPROVED
    # playbook whose step has requires_approval=False, so reject anything that
    # is not a plain Kubernetes name (no spaces, quotes or shell metacharacters)
    # before touching the repository.
    dns_label = r"[a-z0-9](?:[-a-z0-9]*[a-z0-9])?"
    if len(target) > 253 or not re.fullmatch(
        rf"{dns_label}(?:\.{dns_label})*", target
    ):
        raise ValueError(f"invalid Kubernetes Deployment name: {target!r}")
    if len(namespace) > 63 or not re.fullmatch(dns_label, namespace):
        raise ValueError(f"invalid Kubernetes namespace name: {namespace!r}")

    repo = get_playbook_repository()
    playbook_id = _playbook_id_for_alertname(alertname)
    existing = await repo.get_by_id(playbook_id)
    command = f"kubectl rollout restart deployment/{target} -n {namespace}"

    playbook = Playbook(
        playbook_id=playbook_id,
        name="AwoooP T16 auto-repair canary rollout restart",
        description=(
            "低風險 live-fire canary,用於驗證 alert -> Playbook -> executor "
            "-> verifier -> learning/KM -> truth-chain 的真實自動修復閉環。"
        ),
        status=PlaybookStatus.APPROVED,
        source=PlaybookSource.MANUAL,
        symptom_pattern=SymptomPattern(
            alert_names=[alertname],
            affected_services=[target],
            severity_range=["P2"],
            label_patterns={"component": target, "namespace": namespace},
            keywords=["auto repair canary", "live-fire"],
        ),
        repair_steps=[
            RepairStep(
                step_number=1,
                action_type=ActionType.KUBECTL,
                command=command,
                expected_result=f"Deployment {target} receives restartedAt annotation and remains Available.",
                rollback_command=f"kubectl rollout undo deployment/{target} -n {namespace}",
                requires_approval=False,
                risk_level=RiskLevel.LOW,
            )
        ],
        estimated_duration_minutes=1,
        # Carry learning/usage history over from a previous seed, if any.
        source_incident_ids=(existing.source_incident_ids if existing else []),
        ai_confidence=1.0,
        success_count=(existing.success_count if existing else 0),
        failure_count=(existing.failure_count if existing else 0),
        last_used_at=(existing.last_used_at if existing else None),
        trust_score=(existing.trust_score if existing else 0.6),
        approved_by="codex-t16-live-fire",
        approved_at=(existing.approved_at if existing and existing.approved_at else now_taipei()),
        tags=["awooop", "t16", "auto-repair", "canary", "live-fire"],
        notes=(
            "Synthetic low-risk canary for AwoooP automation verification. "
            "This does not authorize organic production remediation rules; "
            "it only proves the closed-loop plumbing with a no-traffic target."
        ),
        created_at=(existing.created_at if existing else now_taipei()),
    )

    # NOTE(review): create() is called even when a record with this id already
    # exists; presumably the repository overwrites/upserts - confirm.
    await repo.create(playbook)
    return SeedResult(
        playbook_id=playbook_id,
        alertname=alertname,
        target=target,
        namespace=namespace,
        status=playbook.status.value,
        source=playbook.source.value,
        command=command,
        preserved_success_count=playbook.success_count,
        preserved_failure_count=playbook.failure_count,
        trust_score=playbook.trust_score,
    )
|
||||
|
||||
|
||||
async def _amain() -> None:
    """Parse the CLI options, seed the canary Playbook, and print the result.

    The result is written to stdout as a single JSON object with sorted keys
    and non-ASCII characters left unescaped.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    # All three options are plain strings that fall back to the module defaults.
    for flag, fallback in (
        ("--alertname", DEFAULT_ALERTNAME),
        ("--target", DEFAULT_TARGET),
        ("--namespace", DEFAULT_NAMESPACE),
    ):
        cli.add_argument(flag, default=fallback)
    options = cli.parse_args()

    outcome = await seed_canary_playbook(
        alertname=options.alertname,
        target=options.target,
        namespace=options.namespace,
    )
    rendered = json.dumps(asdict(outcome), ensure_ascii=False, sort_keys=True)
    print(rendered)


if __name__ == "__main__":
    asyncio.run(_amain())
|
||||
Reference in New Issue
Block a user