_classify_alert() 與 classify_alert_early() 規則對齊, 確保回填腳本正確分類存量 incidents。 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
139 lines
5.6 KiB
Python
139 lines
5.6 KiB
Python
"""
ADR-073 backfill script: repair three NULL columns on existing incidents.

- alertname
- notification_type
- alert_category

Root cause: save_to_episodic_memory() left these three fields out when it
created the IncidentRecord.
2026-04-12 ogt (ADR-073 repair, Fix #1)

How to run:
    kubectl exec -n awoooi-prod <api-pod> -- python3 /app/scripts/backfill_alertname.py
"""
|
||
import asyncio
|
||
import sys
|
||
sys.path.insert(0, "/app")
|
||
|
||
from sqlalchemy import text
|
||
|
||
from src.db.base import get_db_context
|
||
|
||
|
||
def _classify_alert(alertname: str, severity: str) -> tuple[str, str]:
|
||
"""Python 版分類邏輯,與 classify_alert_early() 保持一致 (ADR-075 更新)"""
|
||
alertname_lower = alertname.lower()
|
||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||
return "config_drift", "TYPE-4D"
|
||
if severity in ("info", "none"):
|
||
return "info", "TYPE-1"
|
||
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
|
||
return "backup", "TYPE-1"
|
||
# ADR-075 新增: SecOps 優先
|
||
if any(alertname.startswith(p) for p in ("UnauthorizedSSH", "KubeAudit", "CVECritical", "WAFAttack", "PodAbnormal", "SecurityBreach")):
|
||
return "secops", "TYPE-5S"
|
||
# ADR-075 新增: Flywheel/META
|
||
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or any(
|
||
alertname.startswith(p) for p in ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
|
||
):
|
||
return "flywheel_health", "TYPE-8M"
|
||
# ADR-075 新增: Business/FinOps
|
||
if any(alertname.startswith(p) for p in ("AITokenCost", "GeminiAPIError", "SLOBurn", "APIErrorBudget", "MomoScraper", "ScraperSuccess")):
|
||
return "business", "TYPE-6B"
|
||
if alertname.startswith(("Docker", "Host")):
|
||
return "infrastructure", "TYPE-3"
|
||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||
return "kubernetes", "TYPE-3"
|
||
if alertname.startswith(("Postgres", "Redis")):
|
||
return "database", "TYPE-3"
|
||
return "general", "TYPE-3"
|
||
|
||
|
||
async def _fetch_null_counts(db):
    """Return one row with the NULL counts of the three backfilled columns.

    The row exposes ``alertname_null``, ``notification_type_null``,
    ``alert_category_null`` and ``total`` (all rows in ``incidents``).
    """
    counts_r = await db.execute(text("""
        SELECT
            COUNT(*) FILTER (WHERE alertname IS NULL) AS alertname_null,
            COUNT(*) FILTER (WHERE notification_type IS NULL) AS notification_type_null,
            COUNT(*) FILTER (WHERE alert_category IS NULL) AS alert_category_null,
            COUNT(*) AS total
        FROM incidents
    """))
    return counts_r.fetchone()


async def main() -> None:
    """Backfill alertname, notification_type and alert_category on incidents.

    Steps:
      1. Print the current NULL counts.
      2. Fill ``alertname`` in SQL from the first element of ``signals``.
      3. Classify every row that has an alertname but lacks
         ``notification_type`` or ``alert_category`` with _classify_alert()
         and write both columns back.
      4. Print the final NULL counts and a summary line.

    Steps 2 and 3 are committed separately.
    """
    async with get_db_context() as db:
        # --- Step 1: report the current NULL situation ---
        before = await _fetch_null_counts(db)
        print(f"總計: {before.total} 筆")
        print(f" alertname NULL: {before.alertname_null}")
        print(f" notification_type NULL: {before.notification_type_null}")
        print(f" alert_category NULL: {before.alert_category_null}")

        # --- Step 2: backfill alertname in SQL ---
        # signals is a JSON(B) array; the serialized Signal model uses the key
        # alert_name, and the labels dict also carries an alertname key.
        #
        # The filter tests the extracted value itself rather than calling
        # json_array_length(): that function does not accept jsonb and raises
        # on non-array values, whereas the -> / ->> operators return NULL when
        # the structure does not match, for both json and jsonb.  It also
        # keeps rowcount limited to rows that actually receive a value.
        result = await db.execute(text("""
            UPDATE incidents
            SET alertname = COALESCE(
                signals->0->'labels'->>'alertname',
                signals->0->>'alert_name',
                signals->0->>'alertname'
            )
            WHERE alertname IS NULL
              AND signals IS NOT NULL
              AND COALESCE(
                    signals->0->'labels'->>'alertname',
                    signals->0->>'alert_name',
                    signals->0->>'alertname'
                  ) IS NOT NULL
        """))
        await db.commit()
        print(f"\n✅ alertname 回填: {result.rowcount} 筆")

        # --- Step 3: backfill notification_type + alert_category in Python ---
        # The classification rules live in Python, so read alertname + severity
        # from the DB, classify, and write the results back in one batch.
        rows_r = await db.execute(text("""
            SELECT incident_id, alertname, severity
            FROM incidents
            WHERE (notification_type IS NULL OR alert_category IS NULL)
              AND alertname IS NOT NULL
        """))
        pending = rows_r.fetchall()
        print(f"\n待分類回填: {len(pending)} 筆")

        updates = []
        for record in pending:
            alert_category, notification_type = _classify_alert(
                alertname=record.alertname or "",
                severity=record.severity or "warning",
            )
            updates.append({
                "notification_type": notification_type,
                "alert_category": alert_category,
                "incident_id": record.incident_id,
            })

        # A list of parameter dicts runs the statement as a single executemany
        # instead of one round trip per row.  An empty list must be skipped:
        # the statement would otherwise run without its bind values.
        if updates:
            await db.execute(text("""
                UPDATE incidents
                SET notification_type = :notification_type,
                    alert_category = :alert_category
                WHERE incident_id = :incident_id
            """), updates)

        await db.commit()
        print(f"✅ notification_type + alert_category 回填: {len(updates)} 筆")

        # --- Step 4: final statistics ---
        after = await _fetch_null_counts(db)
        print("\n最終 NULL 統計:")
        print(f" alertname NULL: {after.alertname_null}")
        print(f" notification_type NULL: {after.notification_type_null}")
        print(f" alert_category NULL: {after.alert_category_null}")

        if after.alertname_null == 0 and after.notification_type_null == 0 and after.alert_category_null == 0:
            print("\n✅ 三個欄位全部回填完成")
        else:
            print("\n⚠️ 部分記錄仍為 NULL (可能 signals 為空或格式異常)")
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: drive the async backfill to completion.
    asyncio.run(main())
|