Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import hashlib
import os
import signal
import time
from datetime import datetime
from typing import Literal, Optional, Sequence, TypedDict, Union
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connection, transaction
from kernelCI_app.constants.general import MAESTRO_DUMMY_BUILD_PREFIX
from kernelCI_app.helpers.logger import out
from kernelCI_app.management.commands.helpers.aggregation_helpers import simplify_status
from prometheus_client import start_http_server
from kernelCI_app.models import (
Builds,
Checkouts,
Expand All @@ -17,6 +20,14 @@
SimplifiedStatusChoices,
)

from prometheus_client import Counter

AGGREGATION_RECORDS_WRITTEN = Counter(
"aggregation_records_written_total",
"Total number of records written to destination tables",
["table"], # values: "tree_listing", "hardware_status", "processed_items"
)


class ListingItemCount(TypedDict):
build_pass: int
Expand Down Expand Up @@ -516,6 +527,11 @@ def handle(self, *args, **options):
loop = options["loop"]
interval = options["interval"]

metrics_port = int(os.environ.get("PROMETHEUS_METRICS_PORT", 8001))
if settings.PROMETHEUS_METRICS_ENABLED:
start_http_server(metrics_port)
out(f"Prometheus metrics server started on port {metrics_port}")

if loop:
signal.signal(signal.SIGTERM, self.signal_handler)
signal.signal(signal.SIGINT, self.signal_handler)
Expand Down Expand Up @@ -585,6 +601,9 @@ def _process_new_processed_entries(
f"bulk_create ProcessedListingItems: n={len(new_processed_entries)} "
f"in {time.time() - t0:.3f}s"
)
AGGREGATION_RECORDS_WRITTEN.labels(table="processed_items").inc(
len(new_processed_entries)
)

def _process_tree_listing(
self,
Expand Down Expand Up @@ -640,6 +659,7 @@ def _process_tree_listing(
)

out(f"Inserted {len(values)} tree_listing records in {time.time() - t0:.3f}s")
AGGREGATION_RECORDS_WRITTEN.labels(table="tree_listing").inc(len(values))

def _process_hardware_status(
self,
Expand Down
7 changes: 7 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ services:
networks:
- private
- public
environment:
- PENDING_AGGREGATIONS_METRICS_PORT=${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
command:
- poetry
- run
Expand All @@ -105,6 +107,11 @@ services:
restart: always
depends_on:
- dashboard_db
ports:
- target: 8001
published: ${PENDING_AGGREGATIONS_METRICS_PORT:-8003}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a need for a default here, given that you already have a default in the environment section?

protocol: tcp
mode: host
profiles: ["with_commands"]

dashboard_db:
Expand Down
14 changes: 13 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload
3. Add data source
4. Select "Prometheus". URL: `http://prometheus:9090`
5. Import Dashboard by JSON File
6. Select: `monitoring/dashboard.json`
6. Select: `monitoring/dashboard.json` for API metrics
7. Select: `monitoring/aggregation_process.json` for Aggregation Process metrics

### 4. Verify Everything Works
- **Prometheus**: http://localhost:9090 (show targets)
Expand All @@ -58,6 +59,8 @@ poetry run python manage.py runserver 0.0.0.0:8000 --noreload

## Dashboard Features

### API Dashboard

After importing the dashboard, you'll have:

- **Average Response Time by Endpoint** - Shows response time per endpoint
Expand All @@ -69,6 +72,15 @@ After importing the dashboard, you'll have:
- Average Response Time
- Total Time (cumulative time per endpoint)

### Aggregation Process Dashboard

This dashboard provides visibility into the `process_pending_aggregations` command:

- **Records Written Rate**: Rate of records written to `tree_listing`, `hardware_status`, and `processed_items` tables.
- **Health Status**: Time since the last successful batch processing (alerts if > 5 minutes).
- **Batch Duration Percentiles**: p50, p95, and p99 duration of batch processing.
- **Error Rate**: Rate of errors encountered during processing.

## Implementation Details

### Multi-Worker Gunicorn Support
Expand Down
160 changes: 160 additions & 0 deletions monitoring/aggregation_process.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 3,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.2.0-17142428006",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"tree_listing\"}[$__rate_interval])",
"legendFormat": "Tree Listing",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"hardware_status\"}[$__rate_interval])",
"legendFormat": "Hardware Status",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "ef6i3x5negsu8f"
},
"editorMode": "code",
"expr": "rate(aggregation_records_written_total{table=\"processed_items\"}[$__rate_interval])",
"legendFormat": "Processed Items",
"range": true,
"refId": "C"
}
],
"title": "Records Written Rate",
"type": "timeseries"
}
],
"preload": false,
"refresh": "auto",
"schemaVersion": 41,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Aggregation Process",
"uid": "aggregation-process",
"version": 5
}
5 changes: 5 additions & 0 deletions monitoring/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ scrape_configs:
- targets: ['host.docker.internal:8002']
metrics_path: '/metrics/'
scrape_interval: 1s
- job_name: 'kernelci-pending-aggregations-processor'
static_configs:
- targets: ['host.docker.internal:8003']
metrics_path: '/metrics/'
scrape_interval: 10s