Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import hashlib
import os
import signal
import time
from datetime import datetime
from typing import Literal, Optional, Sequence, TypedDict, Union
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connection, transaction
from kernelCI_app.constants.general import MAESTRO_DUMMY_BUILD_PREFIX
from kernelCI_app.helpers.logger import out
from kernelCI_app.management.commands.helpers.aggregation_helpers import simplify_status
from prometheus_client import start_http_server
from kernelCI_app.models import (
Builds,
Checkouts,
Expand All @@ -17,6 +20,14 @@
SimplifiedStatusChoices,
)

from prometheus_client import Counter

AGGREGATION_RECORDS_WRITTEN = Counter(
"aggregation_records_written_total",
"Total number of records written to destination tables",
["table"], # values: "tree_listing", "hardware_status", "processed_items"
)


class ListingItemCount(TypedDict):
build_pass: int
Expand Down Expand Up @@ -516,6 +527,11 @@ def handle(self, *args, **options):
loop = options["loop"]
interval = options["interval"]

metrics_port = int(os.environ.get("PROMETHEUS_METRICS_PORT", 8001))
if settings.PROMETHEUS_METRICS_ENABLED:
start_http_server(metrics_port)
out(f"Prometheus metrics server started on port {metrics_port}")

if loop:
signal.signal(signal.SIGTERM, self.signal_handler)
signal.signal(signal.SIGINT, self.signal_handler)
Expand Down Expand Up @@ -585,6 +601,9 @@ def _process_new_processed_entries(
f"bulk_create ProcessedListingItems: n={len(new_processed_entries)} "
f"in {time.time() - t0:.3f}s"
)
AGGREGATION_RECORDS_WRITTEN.labels(table="processed_items").inc(
len(new_processed_entries)
)

def _process_tree_listing(
self,
Expand Down Expand Up @@ -640,6 +659,7 @@ def _process_tree_listing(
)

out(f"Inserted {len(values)} tree_listing records in {time.time() - t0:.3f}s")
AGGREGATION_RECORDS_WRITTEN.labels(table="tree_listing").inc(len(values))

def _process_hardware_status(
self,
Expand Down
7 changes: 7 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ services:
networks:
- private
- public
environment:
- PENDING_AGGREGATIONS_METRICS_PORT=${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
command:
- poetry
- run
Expand All @@ -105,6 +107,11 @@ services:
restart: always
depends_on:
- dashboard_db
ports:
- target: 8001
published: ${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a need for a default here, given that you already have a default in the environment section?

protocol: tcp
mode: host
profiles: ["with_commands"]

dashboard_db:
Expand Down
14 changes: 13 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload
3. Add data source
4. Select "Prometheus". URL: `http://prometheus:9090`
5. Import Dashboard by JSON File
6. Select: `monitoring/dashboard.json`
6. Select: `monitoring/dashboard.json` for API metrics
7. Select: `monitoring/aggregation_process.json` for Aggregation Process metrics

### 4. Verify Everything Works
- **Prometheus**: http://localhost:9090 (show targets)
Expand All @@ -58,6 +59,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload

## Dashboard Features

### API Dashboard

After importing the dashboard, you'll have:

- **Average Response Time by Endpoint** - Shows response time per endpoint
Expand All @@ -69,6 +72,15 @@ After importing the dashboard, you'll have:
- Average Response Time
- Total Time (cumulative time per endpoint)

### Aggregation Process Dashboard

This dashboard provides visibility into the `process_pending_aggregations` command:

- **Records Written Rate**: Rate of records written to `tree_listing`, `hardware_status`, and `processed_items` tables.
- **Health Status**: Time since the last successful batch processing (alerts if > 5 minutes).
- **Batch Duration Percentiles**: p50, p95, and p99 duration of batch processing.
- **Error Rate**: Rate of errors encountered during processing.

## Implementation Details

### Multi-Worker Gunicorn Support
Expand Down
160 changes: 160 additions & 0 deletions monitoring/aggregation_process.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 3,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.2.0-17142428006",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"tree_listing\"}[$__rate_interval])",
"legendFormat": "Tree Listing",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"hardware_status\"}[$__rate_interval])",
"legendFormat": "Hardware Status",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"processed_items\"}[$__rate_interval])",
"legendFormat": "Processed Items",
"range": true,
"refId": "C"
}
],
"title": "Records Written Rate",
"type": "timeseries"
}
],
"preload": false,
"refresh": "auto",
"schemaVersion": 41,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Aggregation Process",
"uid": "aggregation-process",
"version": 5
}
5 changes: 5 additions & 0 deletions monitoring/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ scrape_configs:
- targets: ['host.docker.internal:8002']
metrics_path: '/metrics/'
scrape_interval: 1s
- job_name: 'kernelci-pending-aggregations-processor'
static_configs:
- targets: ['host.docker.internal:8003']
metrics_path: '/metrics/'
scrape_interval: 10s