diff --git a/backend/kernelCI_app/management/commands/process_pending_aggregations.py b/backend/kernelCI_app/management/commands/process_pending_aggregations.py index c9c9e5db1..17351531e 100644 --- a/backend/kernelCI_app/management/commands/process_pending_aggregations.py +++ b/backend/kernelCI_app/management/commands/process_pending_aggregations.py @@ -1,13 +1,16 @@ import hashlib +import os import signal import time from datetime import datetime from typing import Literal, Optional, Sequence, TypedDict, Union +from django.conf import settings from django.core.management.base import BaseCommand from django.db import connection, transaction from kernelCI_app.constants.general import MAESTRO_DUMMY_BUILD_PREFIX from kernelCI_app.helpers.logger import out from kernelCI_app.management.commands.helpers.aggregation_helpers import simplify_status +from prometheus_client import start_http_server from kernelCI_app.models import ( Builds, Checkouts, @@ -17,6 +20,14 @@ SimplifiedStatusChoices, ) +from prometheus_client import Counter + +AGGREGATION_RECORDS_WRITTEN = Counter( + "aggregation_records_written_total", + "Total number of records written to destination tables", + ["table"], # values: "tree_listing", "hardware_status", "processed_items" +) + class ListingItemCount(TypedDict): build_pass: int @@ -516,6 +527,11 @@ def handle(self, *args, **options): loop = options["loop"] interval = options["interval"] + metrics_port = int(os.environ.get("PROMETHEUS_METRICS_PORT", 8001)) + if settings.PROMETHEUS_METRICS_ENABLED: + start_http_server(metrics_port) + out(f"Prometheus metrics server started on port {metrics_port}") + if loop: signal.signal(signal.SIGTERM, self.signal_handler) signal.signal(signal.SIGINT, self.signal_handler) @@ -585,6 +601,9 @@ def _process_new_processed_entries( f"bulk_create ProcessedListingItems: n={len(new_processed_entries)} " f"in {time.time() - t0:.3f}s" ) + AGGREGATION_RECORDS_WRITTEN.labels(table="processed_items").inc( + 
len(new_processed_entries) + ) def _process_tree_listing( self, @@ -640,6 +659,7 @@ def _process_tree_listing( ) out(f"Inserted {len(values)} tree_listing records in {time.time() - t0:.3f}s") + AGGREGATION_RECORDS_WRITTEN.labels(table="tree_listing").inc(len(values)) def _process_hardware_status( self, diff --git a/docker-compose.yml b/docker-compose.yml index 43e3debd5..a9d4fe7ad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -93,6 +93,8 @@ services: networks: - private - public + environment: + - PENDING_AGGREGATIONS_METRICS_PORT=${PENDING_AGGREGATIONS_METRICS_PORT:-8003} command: - poetry - run @@ -105,6 +107,11 @@ services: restart: always depends_on: - dashboard_db + ports: + - target: 8001 + published: ${PENDING_AGGREGATIONS_METRICS_PORT:-8003} + protocol: tcp + mode: host profiles: ["with_commands"] dashboard_db: diff --git a/docs/monitoring.md b/docs/monitoring.md index f7a242915..69673d846 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -49,7 +49,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload 3. Add data source 4. Select "Prometheus". URL: `http://prometheus:9090` 5. Import Dashboard by JSON File -6. Select: `monitoring/dashboard.json` +6. Select: `monitoring/dashboard.json` for API metrics +7. Repeat step 5 and select: `monitoring/aggregation_process.json` for Aggregation Process metrics ### 4. 
Verify Everything Works - **Prometheus**: http://localhost:9090 (show targets) @@ -58,6 +59,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload ## Dashboard Features +### API Dashboard + After importing the dashboard, you'll have: - **Average Response Time by Endpoint** - Shows response time per endpoint @@ -69,6 +72,15 @@ After importing the dashboard, you'll have: - Average Response Time - Total Time (cumulative time per endpoint) +### Aggregation Process Dashboard + +This dashboard provides visibility into the `process_pending_aggregations` command. Only **Records Written Rate** is currently defined in `monitoring/aggregation_process.json`; the other panels listed below are not yet part of that file: + +- **Records Written Rate**: Rate of records written to `tree_listing`, `hardware_status`, and `processed_items` tables. +- **Health Status**: Time since the last successful batch processing (alerts if > 5 minutes). +- **Batch Duration Percentiles**: p50, p95, and p99 duration of batch processing. +- **Error Rate**: Rate of errors encountered during processing. + ## Implementation Details ### Multi-Worker Gunicorn Support diff --git a/monitoring/aggregation_process.json b/monitoring/aggregation_process.json new file mode 100644 index 000000000..9dc0e2cfd --- /dev/null +++ b/monitoring/aggregation_process.json @@ -0,0 +1,160 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0-17142428006", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ef6i3x5negsu8f" + }, + "editorMode": "code", + "expr": "rate(aggregation_records_written_total{table=\"tree_listing\"}[$__rate_interval])", + "legendFormat": "Tree Listing", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef6i3x5negsu8f" + }, + "editorMode": "code", + "expr": "rate(aggregation_records_written_total{table=\"hardware_status\"}[$__rate_interval])", + "legendFormat": "Hardware Status", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef6i3x5negsu8f" + }, + "editorMode": "code", + "expr": "rate(aggregation_records_written_total{table=\"processed_items\"}[$__rate_interval])", + "legendFormat": "Processed Items", + "range": true, + "refId": "C" + } + ], + "title": "Records Written Rate", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "auto", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": 
"Aggregation Process", + "uid": "aggregation-process", + "version": 5 +} \ No newline at end of file diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index dfef6fc13..a1c1e87f3 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -18,3 +18,8 @@ scrape_configs: - targets: ['host.docker.internal:8002'] metrics_path: '/metrics/' scrape_interval: 1s + - job_name: 'kernelci-pending-aggregations-processor' + static_configs: + - targets: ['host.docker.internal:8003'] + metrics_path: '/metrics/' + scrape_interval: 10s