From b3dc41fcd462d13d422373a108565a136a6ea8b2 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 7 May 2026 15:32:47 +0800
Subject: [PATCH] =?UTF-8?q?fix(metrics):=20=E4=B8=B2=E5=85=A5=E9=A3=9B?=
 =?UTF-8?q?=E8=BC=AA=E6=8C=87=E6=A8=99=E5=88=B0=20/metrics=20=E4=B8=BB?=
 =?UTF-8?q?=E7=AB=AF=E9=BB=9E=EF=BC=8C=E4=BF=AE=E5=BE=A9=20FlywheelExecuti?=
 =?UTF-8?q?onRateMissing=20=E6=AD=BB=E5=91=8A=E8=AD=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

INC-20260507-99ADF2 根因(feedback_full_chain_first_then_fix.md 全鏈分析):

【鏈路斷點】規則層(5/3 加)vs 指標層(5/6 改)vs scrape 層(從沒同步)
- 577250a6(5/3)「反消音化」commit 加了 FlywheelExecutionRateMissing rule,要求
  110 Prom scrape 到 awoooi_flywheel_execution_success_rate;
- a2c4b3d4(5/6)Codex 改 FlywheelStatsService 用 auto_repair_executions 作
  source of truth(24h 樣本 1-9 筆回 None 給 W-3b watchdog 接管);
- 但 awoooi_flywheel_* 指標自始至終只在 /api/v1/stats/flywheel/metrics 暴露,110 Prom
  awoooi-api job 抓的是 /metrics → absent() 永遠 1 → 自 2026-05-06T04:14 UTC 起 firing
  26h+ 屬 dead alert

【修法】只動 awoooi-api 一處,不碰 Codex 設計、不碰 110 Prom 配置:
- main.py /metrics endpoint 改 async,在 generate_latest() 後串入
  FlywheelStatsService.compute() → to_prometheus_lines()。
- 既有 awoooi-api scrape job 自動拿到飛輪指標。
- 完全保留 Codex a2c4b3d4 設計:1-9 筆回 None 讓 W-3b watchdog 雙保險。

【不碰的部分】
- flywheel_stats_service.py 不動:Codex 5/6 LOGBOOK 已明確說明
  「Redis playbook counter 失準 → 用 auto_repair_executions 為唯一信任源」,
  1-9 筆 return None 是配合 ai_slo_watchdog_job W-3b grace+30min 設計的
  反消音化雙保險,不是 bug。

驗證計畫(部署後):
1. curl /metrics | grep awoooi_flywheel → 看到飛輪指標
2. Prom query awoooi_flywheel_execution_success_rate → 非空
3. ALERTS{alertname="FlywheelExecutionRateMissing"} → resolved
4. 30 分鐘觀察 Telegram 不再收 INC-20260507-99ADF2

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/main.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index bc5ac052..d20bb752 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -80,6 +80,7 @@ from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 fe
 from src.core.http_client import close_all_http_clients, init_all_http_clients
 from src.core.logging import get_logger, setup_logging
 from src.core.redis_client import close_redis_pool, init_redis_pool
+from src.services.flywheel_stats_service import get_flywheel_stats_service
 from src.core.sse import get_publisher
 from src.core.telemetry import setup_telemetry, shutdown_telemetry
 
@@ -1005,10 +1006,17 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
 @app.get("/metrics", include_in_schema=False)
 async def prometheus_metrics() -> Response:
     """Prometheus metrics endpoint for alerting"""
-    return Response(
-        content=generate_latest(),
-        media_type=CONTENT_TYPE_LATEST,
-    )
+    content = generate_latest().decode("utf-8")
+    # 2026-05-07 ogt + Claude Sonnet 4.6 - INC-20260507-99ADF2 fix.
+    # Flywheel metrics (awoooi_flywheel_*) were only exposed at /api/v1/stats/flywheel/metrics, so the
+    # 110 Prom awoooi-api job scraping /metrics missed them and FlywheelExecutionRateMissing fired forever.
+    # Append them here so the existing scrape job gets them; on failure log the traceback and carry on.
+    try:
+        flywheel_metrics = await get_flywheel_stats_service().compute()
+        content += flywheel_metrics.to_prometheus_lines()
+    except Exception:
+        logger.warning("prometheus_metrics_flywheel_error", exc_info=True)
+    return Response(content=content, media_type=CONTENT_TYPE_LATEST)
 
 
 # =============================================================================