diff --git a/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml b/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml
new file mode 100644
index 00000000..0035ff1d
--- /dev/null
+++ b/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml
@@ -0,0 +1,66 @@
+# AwoooP auto-repair live-fire canary
+# Purpose: provide a low-risk Deployment target so that T16 can verify the real
+# closed loop alert -> Playbook -> executor -> verifier -> learning/KM.
+# Note: this Pod serves no traffic, mounts no Secret, and is not automatically
+# exposed through a Service.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: awoooi-auto-repair-canary
+  namespace: awoooi-prod
+  labels:
+    app: awoooi-auto-repair-canary
+    system: awoooi
+    environment: prod
+    component: auto-repair-canary
+spec:
+  replicas: 1
+  revisionHistoryLimit: 2
+  selector:
+    matchLabels:
+      app: awoooi-auto-repair-canary
+      environment: prod
+      system: awoooi
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        app: awoooi-auto-repair-canary
+        system: awoooi
+        environment: prod
+        component: auto-repair-canary
+    spec:
+      automountServiceAccountToken: false
+      containers:
+        - name: canary
+          image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
+          imagePullPolicy: Always
+          # Idle loop: sleep in the background and wait on it so the TERM/INT
+          # trap fires immediately and the Pod terminates without delay.
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              trap 'exit 0' TERM INT
+              while true; do sleep 3600 & wait $!; done
+          env:
+            - name: AWOOOP_CANARY_ROLE
+              value: "auto-repair-live-fire-target"
+          # The idle loop needs no privileges and writes no files, so drop
+          # everything the container runtime would grant by default.
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              cpu: "5m"
+              memory: "32Mi"
+            limits:
+              cpu: "50m"
+              memory: "96Mi"
diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml
index 721ba6d6..92c412b4 100644
--- a/k8s/awoooi-prod/kustomization.yaml
+++ b/k8s/awoooi-prod/kustomization.yaml
@@ -28,6 +28,7 @@ resources:
 - 07-rbac.yaml
 - 08-deployment-worker.yaml
 - 09-pdb.yaml
+- 10-deployment-auto-repair-canary.yaml
 - 13-cronjob-k3s-report.yaml
 - 14-cronjob-weekly-report.yaml
 - 15-cronjob-km-vectorize.yaml
diff --git a/scripts/ops/awooop-seed-auto-repair-canary-playbook.py
b/scripts/ops/awooop-seed-auto-repair-canary-playbook.py
new file mode 100644
index 00000000..2725601e
--- /dev/null
+++ b/scripts/ops/awooop-seed-auto-repair-canary-playbook.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""Seed the AwoooP T16 auto-repair canary Playbook.
+
+Run from an API pod so it uses the same PostgreSQL/Redis context as production:
+
+    python scripts/ops/awooop-seed-auto-repair-canary-playbook.py
+
+The Playbook is intentionally scoped to the no-traffic
+``awoooi-auto-repair-canary`` Deployment. It exists to prove the automation
+chain with real execution evidence; it is not an organic symptom rule.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+# When run from a repository checkout, make ``apps/api`` importable so the
+# ``src.*`` imports below resolve; otherwise rely on the existing sys.path.
+_REPO_API_ROOT = Path(__file__).resolve().parents[2] / "apps" / "api"
+if (_REPO_API_ROOT / "src").exists():
+    sys.path.insert(0, str(_REPO_API_ROOT))
+
+from src.models.playbook import (  # noqa: E402
+    ActionType,
+    Playbook,
+    PlaybookSource,
+    PlaybookStatus,
+    RepairStep,
+    RiskLevel,
+    SymptomPattern,
+)
+from src.repositories.playbook_repository import get_playbook_repository  # noqa: E402
+from src.utils.timezone import now_taipei  # noqa: E402
+
+
+DEFAULT_ALERTNAME = "AwoooPAutoRepairCanaryT16"
+DEFAULT_TARGET = "awoooi-auto-repair-canary"
+DEFAULT_NAMESPACE = "awoooi-prod"
+
+# Kubernetes object names follow RFC 1123: a namespace is a single DNS label
+# (<= 63 chars), a Deployment name is a DNS subdomain (<= 253 chars).
+_DNS_LABEL = r"[a-z0-9](?:[-a-z0-9]*[a-z0-9])?"
+_NAMESPACE_RE = re.compile(_DNS_LABEL)
+_DEPLOYMENT_RE = re.compile(rf"{_DNS_LABEL}(?:\.{_DNS_LABEL})*")
+_NON_ALNUM_RE = re.compile(r"[^A-Z0-9]+")
+
+
+@dataclass(frozen=True)
+class SeedResult:
+    """Summary of one seeding run; the CLI prints it as a JSON object."""
+
+    playbook_id: str
+    alertname: str
+    target: str
+    namespace: str
+    status: str
+    source: str
+    command: str
+    # Counters and score as stored on the seeded Playbook, i.e. carried over
+    # from a pre-existing Playbook when there was one.
+    preserved_success_count: int
+    preserved_failure_count: int
+    trust_score: float
+
+
+def _alert_name(value: str) -> str:
+    """Return *value* unchanged, or raise ``ValueError`` if it is blank."""
+    if not value.strip():
+        raise ValueError("alertname must not be empty")
+    return value
+
+
+def _deployment_name(value: str) -> str:
+    """Return *value* if it is a valid Deployment name, else raise ``ValueError``."""
+    if len(value) > 253 or not _DEPLOYMENT_RE.fullmatch(value):
+        raise ValueError(f"invalid Kubernetes Deployment name: {value!r}")
+    return value
+
+
+def _namespace_name(value: str) -> str:
+    """Return *value* if it is a valid namespace name, else raise ``ValueError``."""
+    if len(value) > 63 or not _NAMESPACE_RE.fullmatch(value):
+        raise ValueError(f"invalid Kubernetes namespace: {value!r}")
+    return value
+
+
+def _playbook_id_for_alertname(alertname: str) -> str:
+    """Derive the Playbook ID for *alertname*.
+
+    The default alert name maps to the fixed ID ``PB-AWOOOP-T16-CANARY``. Any
+    other name is upper-cased, runs of characters outside ``A-Z0-9`` become
+    ``-``, the ``AWOOOP-AUTO-REPAIR-CANARY-`` prefix is dropped and the rest is
+    capped at 18 characters (falling back to ``T16`` when nothing is left).
+    """
+    if alertname == DEFAULT_ALERTNAME:
+        return "PB-AWOOOP-T16-CANARY"
+    suffix = _NON_ALNUM_RE.sub("-", alertname.upper()).strip("-")
+    suffix = suffix.replace("AWOOOP-AUTO-REPAIR-CANARY-", "")
+    # Truncation can cut right after a separator; trim it so the ID never
+    # ends with a dangling "-".
+    suffix = suffix[:18].rstrip("-") or "T16"
+    return f"PB-AWOOOP-CANARY-{suffix}"
+
+
+async def seed_canary_playbook(
+    *,
+    alertname: str,
+    target: str,
+    namespace: str,
+) -> SeedResult:
+    """Build the canary Playbook and store it through the Playbook repository.
+
+    When a Playbook with the derived ID already exists, its history fields
+    (incident IDs, success/failure counters, ``last_used_at``, trust score,
+    ``approved_at`` and ``created_at``) are carried over into the new object.
+
+    Args:
+        alertname: Alert name the Playbook matches; must not be blank.
+        target: Deployment restarted by the repair step. Validated as a
+            Kubernetes name because it is interpolated into kubectl commands.
+        namespace: Namespace of ``target``; validated for the same reason.
+
+    Returns:
+        A ``SeedResult`` describing the Playbook that was written.
+
+    Raises:
+        ValueError: If any argument fails validation.
+    """
+    # Validate before touching the repository: the command below is stored
+    # with ``requires_approval=False``, so it must not carry arbitrary text.
+    alertname = _alert_name(alertname)
+    target = _deployment_name(target)
+    namespace = _namespace_name(namespace)
+
+    repo = get_playbook_repository()
+    playbook_id = _playbook_id_for_alertname(alertname)
+    existing = await repo.get_by_id(playbook_id)
+    command = f"kubectl rollout restart deployment/{target} -n {namespace}"
+
+    playbook = Playbook(
+        playbook_id=playbook_id,
+        name="AwoooP T16 auto-repair canary rollout restart",
+        description=(
+            "低風險 live-fire canary,用於驗證 alert -> Playbook -> executor "
+            "-> verifier -> learning/KM -> truth-chain 的真實自動修復閉環。"
+        ),
+        status=PlaybookStatus.APPROVED,
+        source=PlaybookSource.MANUAL,
+        symptom_pattern=SymptomPattern(
+            alert_names=[alertname],
+            affected_services=[target],
+            severity_range=["P2"],
+            label_patterns={"component": target, "namespace": namespace},
+            keywords=["auto repair canary", "live-fire"],
+        ),
+        repair_steps=[
+            RepairStep(
+                step_number=1,
+                action_type=ActionType.KUBECTL,
+                command=command,
+                expected_result=f"Deployment {target} receives restartedAt annotation and remains Available.",
+                rollback_command=f"kubectl rollout undo deployment/{target} -n {namespace}",
+                requires_approval=False,
+                risk_level=RiskLevel.LOW,
+            )
+        ],
+        estimated_duration_minutes=1,
+        # Re-seeding must not wipe the evidence the Playbook has accumulated.
+        source_incident_ids=(existing.source_incident_ids if existing else []),
+        ai_confidence=1.0,
+        success_count=(existing.success_count if existing else 0),
+        failure_count=(existing.failure_count if existing else 0),
+        last_used_at=(existing.last_used_at if existing else None),
+        trust_score=(existing.trust_score if existing else 0.6),
+        approved_by="codex-t16-live-fire",
+        approved_at=(existing.approved_at if existing and existing.approved_at else now_taipei()),
+        tags=["awooop", "t16", "auto-repair", "canary", "live-fire"],
+        notes=(
+            "Synthetic low-risk canary for AwoooP automation verification. "
+            "This does not authorize organic production remediation rules; "
+            "it only proves the closed-loop plumbing with a no-traffic target."
+        ),
+        created_at=(existing.created_at if existing else now_taipei()),
+    )
+
+    # NOTE(review): this runs even when ``existing`` was found, so it assumes
+    # ``create`` overwrites a Playbook with the same ID — confirm in the repo.
+    await repo.create(playbook)
+    return SeedResult(
+        playbook_id=playbook_id,
+        alertname=alertname,
+        target=target,
+        namespace=namespace,
+        status=playbook.status.value,
+        source=playbook.source.value,
+        command=command,
+        preserved_success_count=playbook.success_count,
+        preserved_failure_count=playbook.failure_count,
+        trust_score=playbook.trust_score,
+    )
+
+
+async def _amain() -> None:
+    """Parse the CLI arguments, seed the Playbook and print the result as JSON."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    # The validators double as argparse ``type`` callables, so a bad value is
+    # reported as a usage error instead of a traceback.
+    parser.add_argument("--alertname", default=DEFAULT_ALERTNAME, type=_alert_name)
+    parser.add_argument("--target", default=DEFAULT_TARGET, type=_deployment_name)
+    parser.add_argument("--namespace", default=DEFAULT_NAMESPACE, type=_namespace_name)
+    args = parser.parse_args()
+
+    result = await seed_canary_playbook(
+        alertname=args.alertname,
+        target=args.target,
+        namespace=args.namespace,
+    )
+    print(json.dumps(asdict(result), ensure_ascii=False, sort_keys=True))
+
+
+if __name__ == "__main__":
+    asyncio.run(_amain())