14 changes: 14 additions & 0 deletions tests/regression/mlp/Makefile
@@ -0,0 +1,14 @@
ROOT_DIR := $(realpath ../../..)
include $(ROOT_DIR)/config.mk

PROJECT := mlp

SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)

SRCS := $(SRC_DIR)/main.cpp

VX_SRCS := $(SRC_DIR)/kernel.cpp

OPTS ?=

include ../common.mk
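# Typical invocation (assuming the standard run targets that common.mk provides
# for Vortex regression tests, e.g. the simx driver):
#   make
#   make run-simx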
40 changes: 40 additions & 0 deletions tests/regression/mlp/common.h
@@ -0,0 +1,40 @@
#ifndef _COMMON_H_
#define _COMMON_H_

#include <stdint.h>

#ifndef TYPE
#define TYPE float
#endif

// MLP Network Configuration (all dimensions multiples of 16)
#define INPUT_DIM 128
#define HIDDEN1_DIM 256
#define HIDDEN2_DIM 128
#define HIDDEN3_DIM 64
#define OUTPUT_DIM 16

// Number of layers
#define NUM_LAYERS 4
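
// Per-layer parameter footprint implied by the dimensions above
// (weights: output_dim x input_dim, plus output_dim bias values, in TYPE elements):
//   layer 1: 256*128 + 256 = 33,024
//   layer 2: 128*256 + 128 = 32,896
//   layer 3:  64*128 +  64 =  8,256
//   layer 4:  16*64  +  16 =  1,040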

// Layer configuration structure
typedef struct {
  uint32_t input_dim;
  uint32_t output_dim;
  uint64_t weights_addr;  // Weight matrix: output_dim x input_dim
  uint64_t bias_addr;     // Bias vector: output_dim
} layer_config_t;
Comment on lines +20 to +26
Copilot AI  Feb 4, 2026
This header uses uint32_t/uint64_t but doesn't include <stdint.h>/<cstdint>. Relying on transitive includes makes builds fragile; include the standard integer header directly from common.h.

// Kernel arguments
typedef struct {
  uint32_t num_layers;
  uint32_t batch_size;          // Number of input samples
  uint64_t input_addr;          // Input data
  uint64_t output_addr;         // Final output
  uint64_t layer_configs_addr;  // Pointer to layer configurations
  // Intermediate buffers for layer outputs (ping-pong scheme, see kernel.cpp)
  uint64_t buffer1_addr;        // Layer 1 output / layer 2 input; reused as layer 3 output / layer 4 input
  uint64_t buffer2_addr;        // Layer 2 output / layer 3 input
} kernel_arg_t;
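
// Note: with the ping-pong scheme, the host must size buffer1 for at least
// max(HIDDEN1_DIM, HIDDEN3_DIM) elements and buffer2 for at least HIDDEN2_DIM elements.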

#endif
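
For reference, here is a minimal host-side sketch (not taken from the PR's main.cpp, which is not shown in this diff) of how the layer_config_t array that the kernel expects could be populated from the dimension chain above. The init_layer_dims helper is hypothetical; weights_addr and bias_addr are left to whatever device allocations the host makes.

#include "common.h"

// Dimension chain: INPUT_DIM -> HIDDEN1_DIM -> HIDDEN2_DIM -> HIDDEN3_DIM -> OUTPUT_DIM
static const uint32_t layer_dims[NUM_LAYERS + 1] = {
  INPUT_DIM, HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM, OUTPUT_DIM
};

// Hypothetical helper: fill in per-layer dimensions; device addresses are set elsewhere.
void init_layer_dims(layer_config_t configs[NUM_LAYERS]) {
  for (uint32_t i = 0; i < NUM_LAYERS; ++i) {
    configs[i].input_dim  = layer_dims[i];
    configs[i].output_dim = layer_dims[i + 1];
  }
}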
164 changes: 164 additions & 0 deletions tests/regression/mlp/kernel.cpp
@@ -0,0 +1,164 @@
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <math.h>
#include "common.h"

// This kernel is designed for single-core parallel execution (it scales with warps and
// threads). Multi-core execution would require inter-core synchronization between layers,
// which Vortex's vx_spawn_threads does not provide yet (to the best of my knowledge).

// ReLU activation function
inline TYPE relu(TYPE x) {
  return (x > 0) ? x : 0;
}

// Layer execution arguments passed to spawned threads
typedef struct {
  TYPE* input;
  TYPE* weights;
  TYPE* bias;
  TYPE* output;
  uint32_t input_dim;
  uint32_t output_dim;
  bool apply_relu;
} layer_args_t;

// Softmax execution arguments
typedef struct {
  TYPE* data;
  uint32_t size;
  TYPE* max_val;
  TYPE* sum_exp;
} softmax_args_t;

// Global shared variables for softmax reductions
TYPE g_softmax_max_val;
TYPE g_softmax_sum_exp;
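
// Numerically stable softmax: softmax(x_i) = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j).
// Subtracting the maximum before exponentiating keeps expf() from overflowing for large
// inputs without changing the result.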

// Thread function: thread 0 finds max, others wait
void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) {
  if (blockIdx.x == 0) {
    // Thread 0 does the reduction serially
    TYPE max_val = args->data[0];
    for (uint32_t i = 1; i < args->size; ++i) {
      if (args->data[i] > max_val) {
        max_val = args->data[i];
      }
    }
    *(args->max_val) = max_val;
  }
}

// Thread function: compute exp in parallel
void softmax_exp_thread(softmax_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x;

  // All threads compute their exp value
  if (i < args->size) {
    args->data[i] = expf(args->data[i] - *(args->max_val));
  }
}

// Thread function: single thread sums all exp values
void softmax_sum_thread(softmax_args_t* __UNIFORM__ args) {
  TYPE sum_exp = 0.0f;
  for (uint32_t j = 0; j < args->size; ++j) {
    sum_exp += args->data[j];
  }
  *(args->sum_exp) = sum_exp;
}

// Thread function: normalize in parallel
void softmax_normalize_thread(softmax_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x;
  if (i < args->size) {
    args->data[i] /= *(args->sum_exp);
  }
}

// Softmax implementation
void apply_softmax(TYPE* data, uint32_t size) {
  softmax_args_t args;
  args.data = data;
  args.size = size;
  args.max_val = &g_softmax_max_val;
  args.sum_exp = &g_softmax_sum_exp;

  uint32_t single_thread = 1;
  vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args);

  vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_thread, &args);

  vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_sum_thread, &args);

  vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args);
}

// Each thread computes one output neuron
// y[i] = sum_j(W[i][j] * x[j]) + b[i]
void fc_layer_thread(layer_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x; // Output neuron index

  if (i >= args->output_dim) return;

  TYPE sum = 0;

  // Compute dot product: W[i] · input
  for (uint32_t j = 0; j < args->input_dim; ++j) {
    sum += args->weights[i * args->input_dim + j] * args->input[j];
  }

  // Add bias
  sum += args->bias[i];

  // Apply ReLU activation (except for output layer)
  if (args->apply_relu) {
    sum = relu(sum);
  }

  args->output[i] = sum;
}

// MLP inference kernel
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
  auto layer_configs = reinterpret_cast<layer_config_t*>(arg->layer_configs_addr);

  // Layer buffers
  auto input = reinterpret_cast<TYPE*>(arg->input_addr);
  auto buffer1 = reinterpret_cast<TYPE*>(arg->buffer1_addr);
  auto buffer2 = reinterpret_cast<TYPE*>(arg->buffer2_addr);
  auto output = reinterpret_cast<TYPE*>(arg->output_addr);

  // Pointer arrays for ping-pong buffering
  TYPE* layer_inputs[4] = {input, buffer1, buffer2, buffer1};
  TYPE* layer_outputs[4] = {buffer1, buffer2, buffer1, output};
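  // Layer i reads layer_inputs[i] and writes layer_outputs[i]:
  //   layer 0: input   -> buffer1
  //   layer 1: buffer1 -> buffer2
  //   layer 2: buffer2 -> buffer1
  //   layer 3: buffer1 -> output
  // This mapping assumes num_layers == NUM_LAYERS (4), matching the arrays above.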

  layer_args_t layer_args;

  // Execute each layer with parallel threads
  for (uint32_t layer = 0; layer < arg->num_layers; ++layer) {
    auto cfg = &layer_configs[layer];

    layer_args.input = layer_inputs[layer];
    layer_args.weights = reinterpret_cast<TYPE*>(cfg->weights_addr);
    layer_args.bias = reinterpret_cast<TYPE*>(cfg->bias_addr);
    layer_args.output = layer_outputs[layer];
    layer_args.input_dim = cfg->input_dim;
    layer_args.output_dim = cfg->output_dim;
    layer_args.apply_relu = (layer < arg->num_layers - 1);

    // vx_spawn_threads returns only after all spawned threads complete,
    // acting as a barrier between layers
    vx_spawn_threads(1, &layer_args.output_dim, nullptr,
                     (vx_kernel_func_cb)fc_layer_thread, &layer_args);
  }

  // Apply softmax to final output
  uint32_t output_size = layer_configs[arg->num_layers - 1].output_dim;
  apply_softmax(output, output_size);
}

int main() {
  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);

  uint32_t grid_dim = 1;
  return vx_spawn_threads(1, &grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}
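
For verification, a plain CPU reference of the same forward pass is useful. The sketch below is not taken from the PR's main.cpp (which is not shown in this diff); it mirrors the kernel's per-layer dot products, the ReLU on hidden layers, and the final numerically stable softmax, and could be compared element-wise against the device output within a small tolerance.

#include <cmath>
#include <cstdint>
#include <vector>
#include "common.h"

// Reference forward pass for one input sample: fully connected layers with ReLU,
// followed by a numerically stable softmax on the last layer's output.
static void mlp_reference(const std::vector<std::vector<TYPE>>& weights, // [layer][out*in]
                          const std::vector<std::vector<TYPE>>& biases,  // [layer][out]
                          const std::vector<uint32_t>& dims,             // NUM_LAYERS + 1 entries
                          const TYPE* input,
                          TYPE* output) {
  std::vector<TYPE> cur(input, input + dims[0]);
  for (uint32_t l = 0; l < NUM_LAYERS; ++l) {
    std::vector<TYPE> next(dims[l + 1]);
    for (uint32_t i = 0; i < dims[l + 1]; ++i) {
      TYPE sum = biases[l][i];
      for (uint32_t j = 0; j < dims[l]; ++j) {
        sum += weights[l][i * dims[l] + j] * cur[j];
      }
      next[i] = (l < NUM_LAYERS - 1 && sum < 0) ? 0 : sum; // ReLU on hidden layers only
    }
    cur.swap(next);
  }
  // Softmax on the final layer output
  TYPE max_val = cur[0];
  for (TYPE v : cur) max_val = (v > max_val) ? v : max_val;
  TYPE sum_exp = 0;
  for (TYPE& v : cur) { v = std::exp(v - max_val); sum_exp += v; }
  for (uint32_t i = 0; i < dims[NUM_LAYERS]; ++i) output[i] = cur[i] / sum_exp;
}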