feat(awooop): add auto repair canary live-fire target
Some checks failed
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / build-and-deploy (push) Failing after 6m52s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-05-13 22:30:20 +08:00
parent 0337b62349
commit 1778a692e0
3 changed files with 212 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
# AwoooP auto-repair live-fire canary.
# Purpose: provide a low-risk Deployment target so that T16 can verify the real
# closed loop: alert -> Playbook -> executor -> verifier -> learning/KM.
# Note: this Pod serves no traffic, mounts no Secret, and is not automatically
# exposed through a Service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: awoooi-auto-repair-canary
  namespace: awoooi-prod
  labels:
    app: awoooi-auto-repair-canary
    system: awoooi
    environment: prod
    component: auto-repair-canary
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: awoooi-auto-repair-canary
      environment: prod
      system: awoooi
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # With a single replica, surge one replacement Pod and keep the old one
      # until the replacement is available.
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: awoooi-auto-repair-canary
        system: awoooi
        environment: prod
        component: auto-repair-canary
    spec:
      # The canary does not get a ServiceAccount token mounted.
      automountServiceAccountToken: false
      containers:
        - name: canary
          # NOTE(review): IMAGE_TAG_PLACEHOLDER is presumably substituted by the
          # deploy pipeline before apply - confirm.
          image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
          imagePullPolicy: Always
          # Idle loop instead of the image's normal entrypoint: sleep runs in the
          # background and the shell waits on it, so the TERM/INT trap can exit 0
          # without waiting for the current sleep to finish.
          command: ["/bin/sh", "-c"]
          args:
            - |
              trap 'exit 0' TERM INT
              while true; do sleep 3600 & wait $!; done
          env:
            - name: AWOOOP_CANARY_ROLE
              value: "auto-repair-live-fire-target"
          resources:
            requests:
              cpu: "5m"
              memory: "32Mi"
            limits:
              cpu: "50m"
              memory: "96Mi"

View File

@@ -28,6 +28,7 @@ resources:
- 07-rbac.yaml
- 08-deployment-worker.yaml
- 09-pdb.yaml
- 10-deployment-auto-repair-canary.yaml
- 13-cronjob-k3s-report.yaml
- 14-cronjob-weekly-report.yaml
- 15-cronjob-km-vectorize.yaml

View File

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""Seed the AwoooP T16 auto-repair canary Playbook.
Run from an API pod so it uses the same PostgreSQL/Redis context as production:
python scripts/ops/awooop-seed-auto-repair-canary-playbook.py
The Playbook is intentionally scoped to the no-traffic
``awoooi-auto-repair-canary`` Deployment. It exists to prove the automation
chain with real execution evidence; it is not an organic symptom rule.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
# Make the API package importable when the script runs from a repository
# checkout. Per the usage line in the module docstring this file lives under
# scripts/ops/, so parents[2] is the repository root.
_REPO_API_ROOT = Path(__file__).resolve().parents[2] / "apps" / "api"
if (_REPO_API_ROOT / "src").exists():
    # Prepend so the ``src.*`` imports that follow resolve against apps/api.
    sys.path.insert(0, str(_REPO_API_ROOT))
from src.models.playbook import (
ActionType,
Playbook,
PlaybookSource,
PlaybookStatus,
RepairStep,
RiskLevel,
SymptomPattern,
)
from src.repositories.playbook_repository import get_playbook_repository
from src.utils.timezone import now_taipei
# Defaults for the --alertname / --target / --namespace flags parsed in _amain().
# The default alert name is special-cased by _playbook_id_for_alertname().
DEFAULT_ALERTNAME = "AwoooPAutoRepairCanaryT16"
DEFAULT_TARGET = "awoooi-auto-repair-canary"
DEFAULT_NAMESPACE = "awoooi-prod"
@dataclass(frozen=True)
class SeedResult:
    """Immutable summary of one seeding run, printed as JSON by ``_amain``."""

    # Id the playbook was stored under.
    playbook_id: str
    # Alert name the playbook's symptom pattern matches.
    alertname: str
    # Deployment name the repair step restarts.
    target: str
    # Namespace passed to kubectl with ``-n``.
    namespace: str
    # ``.value`` of the playbook's status enum.
    status: str
    # ``.value`` of the playbook's source enum.
    source: str
    # The kubectl command stored in repair step 1.
    command: str
    # Counters carried over from an existing playbook with the same id
    # (0 when there was none).
    preserved_success_count: int
    preserved_failure_count: int
    # Trust score of the stored playbook (existing value, or 0.6 for a new one).
    trust_score: float
def _playbook_id_for_alertname(alertname: str) -> str:
    """Derive the playbook id a canary alert name is seeded under.

    The default alert name maps to the fixed id ``PB-AWOOOP-T16-CANARY``.
    Any other name is upper-cased, every run of characters outside ``A-Z0-9``
    is collapsed to a single ``-``, the ``AWOOOP-AUTO-REPAIR-CANARY-`` marker
    is removed, and the remainder is capped at 18 characters.

    Args:
        alertname: Alert name the playbook will match.

    Returns:
        ``PB-AWOOOP-T16-CANARY`` for the default alert name, otherwise
        ``PB-AWOOOP-CANARY-<suffix>``; ``<suffix>`` falls back to ``T16`` when
        nothing usable is left of the alert name.
    """
    if alertname == DEFAULT_ALERTNAME:
        return "PB-AWOOOP-T16-CANARY"
    suffix = re.sub(r"[^A-Z0-9]+", "-", alertname.upper()).strip("-")
    suffix = suffix.replace("AWOOOP-AUTO-REPAIR-CANARY-", "")
    # Truncation can cut right after a separator; strip again so the id never
    # ends in a dangling "-".
    suffix = suffix[:18].rstrip("-") or "T16"
    return f"PB-AWOOOP-CANARY-{suffix}"
# RFC 1123 name shapes Kubernetes requires: a namespace is a single DNS label,
# a Deployment name is a DNS subdomain (dot-separated labels).
_DNS_LABEL = r"[a-z0-9](?:[-a-z0-9]*[a-z0-9])?"
_NAMESPACE_RE = re.compile(_DNS_LABEL)
_DEPLOYMENT_NAME_RE = re.compile(rf"{_DNS_LABEL}(?:\.{_DNS_LABEL})*")


def _require_k8s_name(
    value: str,
    *,
    field: str,
    pattern: re.Pattern[str],
    max_length: int,
) -> str:
    """Return *value* unchanged if it is a valid Kubernetes name, else raise ValueError."""
    if len(value) > max_length or pattern.fullmatch(value) is None:
        raise ValueError(
            f"invalid Kubernetes {field} {value!r}: expected lowercase "
            f"alphanumerics and '-' (at most {max_length} characters)"
        )
    return value


async def seed_canary_playbook(
    *,
    alertname: str,
    target: str,
    namespace: str,
) -> SeedResult:
    """Create or refresh the auto-repair canary playbook and summarize it.

    The playbook has a single kubectl step that rollout-restarts
    ``deployment/<target>`` in ``<namespace>``. It is stored as APPROVED with
    ``requires_approval=False``. When a playbook with the same id already
    exists, its incident ids, success/failure counters, last-used time, trust
    score, approval time and creation time are carried over.

    Args:
        alertname: Alert name the symptom pattern matches; also determines the
            playbook id.
        target: Deployment name to restart. Must be a valid Kubernetes DNS
            subdomain name.
        namespace: Namespace of the Deployment. Must be a valid Kubernetes DNS
            label.

    Returns:
        A ``SeedResult`` describing the stored playbook.

    Raises:
        ValueError: If ``target`` or ``namespace`` is not a valid Kubernetes
            name.
    """
    # Both values are interpolated into a command that is stored pre-approved,
    # so reject anything that is not a plain Kubernetes name before touching
    # the repository.
    _require_k8s_name(
        target, field="deployment name", pattern=_DEPLOYMENT_NAME_RE, max_length=253
    )
    _require_k8s_name(
        namespace, field="namespace", pattern=_NAMESPACE_RE, max_length=63
    )

    repo = get_playbook_repository()
    playbook_id = _playbook_id_for_alertname(alertname)
    existing = await repo.get_by_id(playbook_id)
    command = f"kubectl rollout restart deployment/{target} -n {namespace}"
    playbook = Playbook(
        playbook_id=playbook_id,
        name="AwoooP T16 auto-repair canary rollout restart",
        description=(
            "低風險 live-fire canary用於驗證 alert -> Playbook -> executor "
            "-> verifier -> learning/KM -> truth-chain 的真實自動修復閉環。"
        ),
        status=PlaybookStatus.APPROVED,
        source=PlaybookSource.MANUAL,
        symptom_pattern=SymptomPattern(
            alert_names=[alertname],
            affected_services=[target],
            severity_range=["P2"],
            label_patterns={"component": target, "namespace": namespace},
            keywords=["auto repair canary", "live-fire"],
        ),
        repair_steps=[
            RepairStep(
                step_number=1,
                action_type=ActionType.KUBECTL,
                command=command,
                expected_result=f"Deployment {target} receives restartedAt annotation and remains Available.",
                rollback_command=f"kubectl rollout undo deployment/{target} -n {namespace}",
                requires_approval=False,
                risk_level=RiskLevel.LOW,
            )
        ],
        estimated_duration_minutes=1,
        # Learning state is preserved across re-seeds; only the definition is
        # rewritten.
        source_incident_ids=(existing.source_incident_ids if existing else []),
        ai_confidence=1.0,
        success_count=(existing.success_count if existing else 0),
        failure_count=(existing.failure_count if existing else 0),
        last_used_at=(existing.last_used_at if existing else None),
        trust_score=(existing.trust_score if existing else 0.6),
        approved_by="codex-t16-live-fire",
        approved_at=(existing.approved_at if existing and existing.approved_at else now_taipei()),
        tags=["awooop", "t16", "auto-repair", "canary", "live-fire"],
        notes=(
            "Synthetic low-risk canary for AwoooP automation verification. "
            "This does not authorize organic production remediation rules; "
            "it only proves the closed-loop plumbing with a no-traffic target."
        ),
        created_at=(existing.created_at if existing else now_taipei()),
    )
    # NOTE(review): create() is called even when `existing` was found, so it is
    # presumably an upsert - confirm against the repository implementation.
    await repo.create(playbook)
    return SeedResult(
        playbook_id=playbook_id,
        alertname=alertname,
        target=target,
        namespace=namespace,
        status=playbook.status.value,
        source=playbook.source.value,
        command=command,
        preserved_success_count=playbook.success_count,
        preserved_failure_count=playbook.failure_count,
        trust_score=playbook.trust_score,
    )
async def _amain() -> None:
    """Parse the CLI flags, seed the canary playbook, and print the result as JSON."""
    cli = argparse.ArgumentParser(description=__doc__)
    # Each flag is optional and falls back to the module-level default.
    for flag, fallback in (
        ("--alertname", DEFAULT_ALERTNAME),
        ("--target", DEFAULT_TARGET),
        ("--namespace", DEFAULT_NAMESPACE),
    ):
        cli.add_argument(flag, default=fallback)
    opts = cli.parse_args()

    outcome = await seed_canary_playbook(
        alertname=opts.alertname,
        target=opts.target,
        namespace=opts.namespace,
    )
    # One sorted-key JSON line on stdout; non-ASCII text is emitted as-is.
    payload = json.dumps(asdict(outcome), ensure_ascii=False, sort_keys=True)
    print(payload)


if __name__ == "__main__":
    asyncio.run(_amain())