После 24 спринтов regression-suite разросся, и нестабильные тесты подрывают доверие к нему. В этом спринте: ловим flaky-тесты, добавляем observability (Grafana + Prometheus alerts + RUNBOOK) и сертифицируем suite 10× cert-прогоном.

1. tests/regression/find-flaky.sh — 10× прогон + JSON-агрегатор → docs/flaky-tests.md (per-test последовательность pass/fail + инструкция по воспроизведению).
2. OrgFactory.signupWithRetry теперь учитывает заголовок Retry-After (api-client.ts: ApiError.retryAfterSec). Rate-limit на stage поднят: RATE_SIGNUP_HOUR=5000, RATE_PER_IP_MIN=5000 (~/food-market-stage/deploy/.env).
3. fullyParallel=true + workers=4: тесты идут в недетерминированном порядке; изоляция работает (OrgFactory per-test).
4. workers=4 даёт **2.4× ускорение** (66.6s → 27.7s). Worker-scoped фикстура lib/worker-org.ts добавлена как opt-in.
5. deploy/grafana/dashboards/quality-watchdog.json (10 панелей: smoke success ratio 7d, incidents, multi-tenant violations, current emoji, p95 by endpoint, step failures, RPS, DB p95, docs posted, disk free) + dashboards/README.md. quality-watchdog.sh пишет Prometheus textfile-экспорт в ~/.fm-watchdog/textfile/quality_watchdog.prom для node_exporter.
6. deploy/prometheus/alerts.yml — 10 правил, 4 группы (uptime, errors, database, quality-watchdog). MultiTenantViolation = P0. deploy/prometheus/prometheus.yml — reference config.
7. docs/RUNBOOK.md +178 строк: действие на каждый alert (api-down, rps-drop, http-errors-spike/growing, doc-posting-errors, db-p95-high, disk-free-low, watchdog-red, multi-tenant-violation, watchdog-incident). Junior-friendly, с конкретными командами.

**Cert-прогон (10× workers=4):** 420/420 passed, 0 flaky, avg 30.1s/run, total 300.6s (≈ 5min budget).

Изменения вне репо:
- ~/food-market-stage/deploy/.env — RATE_* limits bumped.
- ~/quality-watchdog.sh — добавлен .prom textfile-экспорт.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
351 lines
11 KiB
JSON
{
|
|
"annotations": {
|
|
"list": [
|
|
{
|
|
"builtIn": 1,
|
|
"datasource": {"type": "grafana", "uid": "-- Grafana --"},
|
|
"enable": true,
|
|
"hide": true,
|
|
"iconColor": "rgba(0, 211, 255, 1)",
|
|
"name": "Annotations & Alerts",
|
|
"type": "dashboard"
|
|
}
|
|
]
|
|
},
|
|
"description": "Sprint 26: quality-watchdog dashboard. Метрики из ~/quality-watchdog.sh (textfile exporter, см. docs/observability.md) + базовые food-market.api метрики (/metrics).",
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [],
|
|
"liveNow": false,
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"type": "stat",
|
|
"title": "Smoke success ratio (7 дней)",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 0},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": null},
|
|
{"color": "orange", "value": 0.80},
|
|
{"color": "green", "value": 0.95}
|
|
]
|
|
},
|
|
"unit": "percentunit",
|
|
"min": 0,
|
|
"max": 1
|
|
}
|
|
},
|
|
"options": {
|
|
"graphMode": "area",
|
|
"colorMode": "value",
|
|
"justifyMode": "center",
|
|
"reduceOptions": {"calcs": ["lastNotNull"]}
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum(increase(quality_watchdog_run_total{result=\"green\"}[7d])) / sum(increase(quality_watchdog_run_total[7d]))",
|
|
"legendFormat": "green ratio"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"type": "stat",
|
|
"title": "Incidents (7 дней)",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 0},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": null},
|
|
{"color": "orange", "value": 1},
|
|
{"color": "red", "value": 3}
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum(increase(quality_watchdog_incidents_total[7d]))",
|
|
"legendFormat": "incidents"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"type": "stat",
|
|
"title": "Multi-tenant violations (24h) — должно быть 0",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 0},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": null},
|
|
{"color": "red", "value": 1}
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum(increase(quality_watchdog_step_failure_total{step=\"multi_tenant\"}[24h]))",
|
|
"legendFormat": "leaks"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 4,
|
|
"type": "stat",
|
|
"title": "Текущий статус watchdog",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 0},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": null},
|
|
{"color": "orange", "value": 0.5},
|
|
{"color": "green", "value": 1}
|
|
]
|
|
},
|
|
"unit": "short",
|
|
"mappings": [
|
|
{"options": {"0": {"text": "🔴 RED"}, "1": {"text": "🟢 GREEN"}}, "type": "value"}
|
|
]
|
|
}
|
|
},
|
|
"options": {"colorMode": "background", "reduceOptions": {"calcs": ["lastNotNull"]}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "quality_watchdog_last_run_status",
|
|
"legendFormat": "status"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 5,
|
|
"type": "timeseries",
|
|
"title": "p95 latency по endpoint (мс)",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 5},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"drawStyle": "line",
|
|
"lineWidth": 2,
|
|
"fillOpacity": 10,
|
|
"showPoints": "never",
|
|
"spanNulls": false,
|
|
"stacking": {"mode": "none"}
|
|
},
|
|
"unit": "ms",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": null},
|
|
{"color": "orange", "value": 400},
|
|
{"color": "red", "value": 800}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "table", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "quality_watchdog_endpoint_p95_ms",
|
|
"legendFormat": "{{endpoint}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 6,
|
|
"type": "timeseries",
|
|
"title": "Шаги watchdog — pass/fail во времени",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 5},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"drawStyle": "bars",
|
|
"lineWidth": 1,
|
|
"fillOpacity": 60,
|
|
"stacking": {"mode": "normal"}
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "list", "placement": "right"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum by (step) (increase(quality_watchdog_step_failure_total[1h]))",
|
|
"legendFormat": "{{step}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 7,
|
|
"type": "timeseries",
|
|
"title": "HTTP request rate (rps)",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 10},
|
|
"unit": "reqps"
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "list"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum(rate(http_requests_received_total[5m])) by (code)",
|
|
"legendFormat": "code={{code}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 8,
|
|
"type": "timeseries",
|
|
"title": "DB query duration p95 (food_market_db_query_duration_seconds)",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 10},
|
|
"unit": "s",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": null},
|
|
{"color": "orange", "value": 0.5},
|
|
{"color": "red", "value": 1.0}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "list"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "histogram_quantile(0.95, sum(rate(food_market_db_query_duration_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p95 DB"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 9,
|
|
"type": "timeseries",
|
|
"title": "Документы проведены / ошибки",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 22},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 10},
|
|
"unit": "ops"
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "table", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "sum(rate(food_market_documents_posted_total[5m])) by (type)",
|
|
"legendFormat": "posted {{type}}"
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"expr": "sum(rate(food_market_documents_error_total[5m])) by (type)",
|
|
"legendFormat": "error {{type}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 10,
|
|
"type": "timeseries",
|
|
"title": "Свободное место на диске",
|
|
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 22},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 10},
|
|
"unit": "bytes",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": null},
|
|
{"color": "orange", "value": 5368709120},
|
|
{"color": "green", "value": 10737418240}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"options": {"legend": {"displayMode": "list"}, "tooltip": {"mode": "multi"}},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": "food_market_disk_free_bytes",
|
|
"legendFormat": "{{mount}}"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"refresh": "1m",
|
|
"schemaVersion": 38,
|
|
"tags": ["food-market", "quality-watchdog", "sprint26"],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
|
|
"hide": 0,
|
|
"includeAll": false,
|
|
"label": "Datasource",
|
|
"multi": false,
|
|
"name": "DS_PROMETHEUS",
|
|
"options": [],
|
|
"query": "prometheus",
|
|
"queryValue": "",
|
|
"refresh": 1,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"type": "datasource"
|
|
}
|
|
]
|
|
},
|
|
"time": {"from": "now-7d", "to": "now"},
|
|
"timepicker": {},
|
|
"timezone": "browser",
|
|
"title": "food-market — quality-watchdog",
|
|
"uid": "fm-quality-watchdog",
|
|
"version": 1
|
|
}
|