From ba5fae8d43dcea731bacc4f3901aacc5ef813365 Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Thu, 5 Feb 2026 00:16:03 +0530 Subject: [PATCH 1/2] Added MLP tests --- tests/regression/mlp/Makefile | 14 + tests/regression/mlp/common.h | 38 +++ tests/regression/mlp/kernel.cpp | 162 +++++++++ tests/regression/mlp/main.cpp | 346 +++++++++++++++++++ tests/regression/mlp_vegeta/Makefile | 14 + tests/regression/mlp_vegeta/common.h | 55 +++ tests/regression/mlp_vegeta/kernel.cpp | 220 ++++++++++++ tests/regression/mlp_vegeta/main.cpp | 451 +++++++++++++++++++++++++ 8 files changed, 1300 insertions(+) create mode 100644 tests/regression/mlp/Makefile create mode 100644 tests/regression/mlp/common.h create mode 100644 tests/regression/mlp/kernel.cpp create mode 100644 tests/regression/mlp/main.cpp create mode 100644 tests/regression/mlp_vegeta/Makefile create mode 100644 tests/regression/mlp_vegeta/common.h create mode 100644 tests/regression/mlp_vegeta/kernel.cpp create mode 100644 tests/regression/mlp_vegeta/main.cpp diff --git a/tests/regression/mlp/Makefile b/tests/regression/mlp/Makefile new file mode 100644 index 000000000..d50062bac --- /dev/null +++ b/tests/regression/mlp/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mlp + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= + +include ../common.mk diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h new file mode 100644 index 000000000..7cc37012a --- /dev/null +++ b/tests/regression/mlp/common.h @@ -0,0 +1,38 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +// MLP Network Configuration (all dimensions multiples of 16) +#define INPUT_DIM 128 +#define HIDDEN1_DIM 256 +#define HIDDEN2_DIM 128 +#define HIDDEN3_DIM 64 +#define OUTPUT_DIM 16 + +// Number of layers +#define NUM_LAYERS 4 + +// Layer configuration structure +typedef struct { + uint32_t input_dim; + uint32_t output_dim; + uint64_t weights_addr; // Weight matrix: output_dim x input_dim + uint64_t bias_addr; // Bias vector: output_dim +} layer_config_t; + +// Kernel arguments +typedef struct { + uint32_t num_layers; + uint32_t batch_size; // Number of input samples + uint64_t input_addr; // Input data + uint64_t output_addr; // Final output + uint64_t layer_configs_addr; // Pointer to layer configurations + // Intermediate buffers for layer outputs + uint64_t buffer1_addr; // For layer 1 output / layer 2 input + uint64_t buffer2_addr; // For layer 2 output / layer 3 input +} kernel_arg_t; + +#endif diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp new file mode 100644 index 000000000..50e4ed24e --- /dev/null +++ b/tests/regression/mlp/kernel.cpp @@ -0,0 +1,162 @@ +#include +#include +#include +#include "common.h" + +// This kernel is designed for single-core parallel execution (scales with warps/threads). Multi-core would require inter-core synchronization between layers, which Vortex's vx_spawn_threads doesn't provide yet (to the best of my knowledge)... + +// ReLU activation function +inline TYPE relu(TYPE x) { + return (x > 0) ? 
x : 0; +} + +// Layer execution arguments passed to spawned threads +typedef struct { + TYPE* input; + TYPE* weights; + TYPE* bias; + TYPE* output; + uint32_t input_dim; + uint32_t output_dim; + bool apply_relu; +} layer_args_t; + + +// Softmax execution arguments +typedef struct { + TYPE* data; + uint32_t size; + TYPE* max_val; + TYPE* sum_exp; +} softmax_args_t; + +// Global shared variables for softmax reductions +TYPE g_softmax_max_val; +TYPE g_softmax_sum_exp; + +// Thread function: thread 0 finds max, others wait +void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { + if (blockIdx.x == 0) { + // Thread 0 does the reduction serially + TYPE max_val = args->data[0]; + for (uint32_t i = 1; i < args->size; ++i) { + if (args->data[i] > max_val) { + max_val = args->data[i]; + } + } + *(args->max_val) = max_val; + } +} + +// Thread function: compute exp in parallel, thread 0 sums +void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + + // All threads compute their exp value + if (i < args->size) { + args->data[i] = expf(args->data[i] - *(args->max_val)); + } + + // Thread 0 does the sum reduction serially + if (i == 0) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; + } + *(args->sum_exp) = sum_exp; + } +} + +// Thread function: normalize in parallel +void softmax_normalize_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + if (i < args->size) { + args->data[i] /= *(args->sum_exp); + } +} + +// Softmax implementation +void apply_softmax(TYPE* data, uint32_t size) { + softmax_args_t args; + args.data = data; + args.size = size; + args.max_val = &g_softmax_max_val; + args.sum_exp = &g_softmax_sum_exp; + + uint32_t single_thread = 1; + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); +} + +// Each thread computes one output neuron +// y[i] = sum_j(W[i][j] * x[j]) + b[i] +void fc_layer_thread(layer_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; // Output neuron index + + if (i >= args->output_dim) return; + + TYPE sum = 0; + + // Compute dot product: W[i] · input + for (uint32_t j = 0; j < args->input_dim; ++j) { + sum += args->weights[i * args->input_dim + j] * args->input[j]; + } + + // Add bias + sum += args->bias[i]; + + // Apply ReLU activation (except for output layer) + if (args->apply_relu) { + sum = relu(sum); + } + + args->output[i] = sum; +} + +// MLP inference kernel +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + + // Layer buffers + auto input = reinterpret_cast(arg->input_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + auto output = reinterpret_cast(arg->output_addr); + + // Pointer arrays for ping-pong buffering + TYPE* layer_inputs[4] = {input, buffer1, buffer2, buffer1}; + TYPE* layer_outputs[4] = {buffer1, buffer2, buffer1, output}; + + layer_args_t layer_args; + + // Execute each layer with parallel threads + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto cfg = &layer_configs[layer]; + + layer_args.input = layer_inputs[layer]; + layer_args.weights = reinterpret_cast(cfg->weights_addr); + layer_args.bias = reinterpret_cast(cfg->bias_addr); + layer_args.output = 
layer_outputs[layer]; + layer_args.input_dim = cfg->input_dim; + layer_args.output_dim = cfg->output_dim; + layer_args.apply_relu = (layer < arg->num_layers - 1); + + // vx_spawn_threads provides barrier synchronization after all threads complete + vx_spawn_threads(1, &layer_args.output_dim, nullptr, + (vx_kernel_func_cb)fc_layer_thread, &layer_args); + } + + // Apply softmax to final output + uint32_t output_size = layer_configs[arg->num_layers - 1].output_dim; + apply_softmax(output, output_size); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + + uint32_t grid_dim = 1; + return vx_spawn_threads(1, &grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp/main.cpp b/tests/regression/mlp/main.cpp new file mode 100644 index 000000000..ca1d7373c --- /dev/null +++ b/tests/regression/mlp/main.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 150 // Higher tolerance for deep network (4 layers, accumulated FP errors) + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +// Device handles +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; + +// Weight and bias buffers for each layer +vx_buffer_h weights_buffers[NUM_LAYERS]; +vx_buffer_h bias_buffers[NUM_LAYERS]; + +kernel_arg_t kernel_arg = {}; + +// Layer dimensions +const uint32_t layer_input_dims[NUM_LAYERS] = {INPUT_DIM, HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}; +const uint32_t layer_output_dims[NUM_LAYERS] = {HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM, OUTPUT_DIM}; + +static void show_usage() { + std::cout << "Vortex MLP Test." << std::endl; +// std::cout << "Usage: [-k: kernel] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "k:h")) != -1) { + switch (c) { + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + vx_mem_free(layer_configs_buffer); + for (int i = 0; i < NUM_LAYERS; ++i) { + vx_mem_free(weights_buffers[i]); + vx_mem_free(bias_buffers[i]); + } + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +// Initialize weights with Xavier initialization +void init_weights(std::vector& weights, uint32_t fan_in, uint32_t fan_out) { + TYPE scale = std::sqrt(2.0f / (fan_in + fan_out)); + for (size_t i = 0; i < weights.size(); ++i) { + // Generate random value between -scale and scale + weights[i] = (static_cast(rand()) / RAND_MAX * 2.0f - 1.0f) * scale; + } +} + +// Initialize biases to zero +void init_bias(std::vector& bias) { + for (size_t i = 0; i < bias.size(); ++i) { + bias[i] = 0.0f; + } +} + +// CPU reference implementation of ReLU +TYPE relu_cpu(TYPE x) { + return (x > 0) ? 
x : 0; +} + +// CPU reference implementation of fully connected layer +void fc_layer_cpu(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, uint32_t input_dim, uint32_t output_dim, bool apply_relu) { + for (uint32_t i = 0; i < output_dim; ++i) { + TYPE sum = 0; + for (uint32_t j = 0; j < input_dim; ++j) { + sum += weights[i * input_dim + j] * input[j]; + } + sum += bias[i]; + output[i] = apply_relu ? relu_cpu(sum) : sum; + } +} + +// CPU reference implementation of softmax +void softmax_cpu(TYPE* data, uint32_t size) { + // Find max for numerical stability + TYPE max_val = data[0]; + for (uint32_t i = 1; i < size; ++i) { + if (data[i] > max_val) { + max_val = data[i]; + } + } + + // Compute exp(x - max) and sum + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < size; ++i) { + data[i] = std::exp(data[i] - max_val); + sum_exp += data[i]; + } + + // Normalize + for (uint32_t i = 0; i < size; ++i) { + data[i] /= sum_exp; + } +} + +// Run MLP inference on CPU for reference +void mlp_cpu(const TYPE* input, TYPE* output, + const std::vector>& weights, + const std::vector>& biases) { + + std::vector layer_output[NUM_LAYERS]; + + for (int l = 0; l < NUM_LAYERS; ++l) { + layer_output[l].resize(layer_output_dims[l]); + + const TYPE* layer_input = (l == 0) ? input : layer_output[l-1].data(); + bool apply_relu = (l < NUM_LAYERS - 1); + + fc_layer_cpu(layer_input, weights[l].data(), biases[l].data(), + layer_output[l].data(), layer_input_dims[l], layer_output_dims[l], apply_relu); + } + + // Copy final output + memcpy(output, layer_output[NUM_LAYERS-1].data(), OUTPUT_DIM * sizeof(TYPE)); + + // Apply softmax to final output + softmax_cpu(output, OUTPUT_DIM); +} + +// Compare floating point values with ULP tolerance +bool compare_float(TYPE a, TYPE b, int index, int& errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f (diff=%d ULP)\n", index, b, a, d); + } + return false; + } + return true; +} + +int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(42); // Fixed seed for reproducible results across config changes + + std::cout << "=== Vortex MLP Neural Network Test ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM + << " -> " << HIDDEN2_DIM << " -> " << HIDDEN3_DIM + << " -> " << OUTPUT_DIM << std::endl; + + // Open device connection + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Allocate host buffers for weights and biases + std::vector> h_weights(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + h_weights[l].resize(out_dim * in_dim); + h_biases[l].resize(out_dim); + + init_weights(h_weights[l], in_dim, out_dim); + init_bias(h_biases[l]); + } + + // Allocate host input/output buffers + std::vector h_input(INPUT_DIM); + std::vector h_output(OUTPUT_DIM); + std::vector h_ref_output(OUTPUT_DIM); + + // Initialize input with random values + for (uint32_t i = 0; i < INPUT_DIM; ++i) { + h_input[i] = static_cast(rand()) / RAND_MAX; + } + + // Print input summary + // std::cout << "Input (first 8 values): "; + // for (int i = 0; i < 8; ++i) { + // std::cout << h_input[i] << " "; + // } + // std::cout << "..." 
<< std::endl; + + // Allocate device memory for input/output + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + + RT_CHECK(vx_mem_alloc(device, OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + + // Intermediate buffers (sized for largest hidden layer) + uint32_t max_hidden = std::max(HIDDEN1_DIM, std::max(HIDDEN2_DIM, HIDDEN3_DIM)); + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + // Allocate and upload weights and biases + std::vector layer_configs(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + // Allocate weights + RT_CHECK(vx_mem_alloc(device, out_dim * in_dim * sizeof(TYPE), VX_MEM_READ, &weights_buffers[l])); + RT_CHECK(vx_mem_address(weights_buffers[l], &layer_configs[l].weights_addr)); + RT_CHECK(vx_copy_to_dev(weights_buffers[l], h_weights[l].data(), 0, out_dim * in_dim * sizeof(TYPE))); + + // Allocate biases + RT_CHECK(vx_mem_alloc(device, out_dim * sizeof(TYPE), VX_MEM_READ, &bias_buffers[l])); + RT_CHECK(vx_mem_address(bias_buffers[l], &layer_configs[l].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[l], h_biases[l].data(), 0, out_dim * sizeof(TYPE))); + + layer_configs[l].input_dim = in_dim; + layer_configs[l].output_dim = out_dim; + + std::cout << " Layer " << l << ": " << in_dim << " x " << out_dim + << " (weights: " << (out_dim * in_dim * sizeof(TYPE)) << " bytes)" << std::endl; + } + + // Upload layer configs + RT_CHECK(vx_mem_alloc(device, NUM_LAYERS * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, NUM_LAYERS * sizeof(layer_config_t))); + + // Upload input + std::cout << "Uploading input data..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, INPUT_DIM * sizeof(TYPE))); + + // Set kernel arguments + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.batch_size = 1; + + // Upload kernel binary + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + // Upload kernel arguments + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + // Start device + std::cout << "Starting MLP inference on device..." << std::endl; + auto time_start = std::chrono::high_resolution_clock::now(); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + + // Wait for completion + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + std::cout << "Elapsed time: " << elapsed << " ms" << std::endl; + + // Download output + std::cout << "Downloading output..." 
<< std::endl; + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, OUTPUT_DIM * sizeof(TYPE))); + + // Compute CPU reference + std::cout << "Computing CPU reference..." << std::endl; + mlp_cpu(h_input.data(), h_ref_output.data(), h_weights, h_biases); + + // Print outputs + // std::cout << "Device output: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_output[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << "CPU reference: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_ref_output[i] << " "; + // } + // std::cout << std::endl; + + // Verify results + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (uint32_t i = 0; i < OUTPUT_DIM; ++i) { + if (!compare_float(h_output[i], h_ref_output[i], i, errors)) { + ++errors; + } + } + + // Cleanup + std::cout << "Cleaning up..." << std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} diff --git a/tests/regression/mlp_vegeta/Makefile b/tests/regression/mlp_vegeta/Makefile new file mode 100644 index 000000000..b2b36e7cd --- /dev/null +++ b/tests/regression/mlp_vegeta/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mlp_vegeta + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= + +include ../common.mk diff --git a/tests/regression/mlp_vegeta/common.h b/tests/regression/mlp_vegeta/common.h new file mode 100644 index 000000000..ebbf9c6d3 --- /dev/null +++ b/tests/regression/mlp_vegeta/common.h @@ -0,0 +1,55 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#ifndef TYPE +#define TYPE float +#endif + +// MLP Network Configuration (same as mlp_test, all multiples of 16) +#define INPUT_DIM 128 +#define HIDDEN1_DIM 256 +#define HIDDEN2_DIM 128 +#define HIDDEN3_DIM 64 +#define OUTPUT_DIM 16 + +// Number of layers +#define NUM_LAYERS 4 + +// VEGETA tile dimensions +#define TILE_SIZE 16 +#define BATCH_SIZE 16 // VEGETA requires batch=16 for proper 16x16 tiles +#define T_TILE_BYTES (TILE_SIZE * TILE_SIZE * sizeof(TYPE)) // 1KB +#define U_TILE_BYTES (2 * T_TILE_BYTES) // 2KB +#define V_TILE_BYTES (4 * T_TILE_BYTES) // 4KB +#define M_TILE_BYTES (TILE_SIZE * TILE_SIZE / 2) // 128 bytes metadata + +// GEMM modes for VEGETA +typedef enum { + MLP_MODE_TGEMM = 0, // Dense × Dense (T × T) + MLP_MODE_UGEMM = 1, // Dense × 2:4 Sparse (T × U) + MLP_MODE_VGEMM = 2 // Dense × 1:4 Sparse (T × V) +} mlp_mode_t; + +// Layer configuration structure +typedef struct { + uint32_t input_dim; + uint32_t output_dim; + uint64_t weights_addr; // Weight matrix: input_dim x output_dim (row-major) + uint64_t bias_addr; // Bias vector: output_dim + uint64_t metadata_addr; // Sparsity metadata (for UGEMM/VGEMM modes) +} layer_config_t; + +// Kernel arguments +typedef struct { + uint32_t num_layers; + uint32_t mode; // MLP_MODE_TGEMM, MLP_MODE_UGEMM, or MLP_MODE_VGEMM + uint64_t input_addr; // Input data: [BATCH_SIZE x INPUT_DIM] + uint64_t output_addr; // Final output: [BATCH_SIZE x OUTPUT_DIM] + uint64_t layer_configs_addr; // Pointer to layer configurations + uint64_t buffer1_addr; // Intermediate buffer 1 + uint64_t buffer2_addr; // Intermediate buffer 2 +} kernel_arg_t; + +#endif diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp new file 
mode 100644 index 000000000..ad57212f0 --- /dev/null +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include "common.h" + +// Aligned tile buffer for intermediate results +static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); + +// ReLU activation (in-place) for batched data +inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + for (uint32_t i = 0; i < dim; ++i) { + TYPE& v = data[b * dim + i]; + if (v < 0) v = 0; + } + } +} + +// Softmax for each sample in batch +void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + TYPE* sample = data + b * dim; + + TYPE max_val = sample[0]; + for (uint32_t i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + + for (uint32_t i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } + } +} + +// Initialize C tile buffer to zeros +inline void clear_C_tile() { + for (uint32_t i = 0; i < TILE_SIZE * TILE_SIZE; ++i) { + g_C_tile[i] = 0.0f; + } +} + +// ============================================================================= +// TGEMM: Dense × Dense (T × T → T) +// ============================================================================= +void vegeta_tgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lt(2, (size_t)B_tile, 0); // Load B to T2 + vx_tgemm(0, 1, 2); // T0 += T1 × T2 + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_tgemm_batch( + TYPE* input, TYPE* weights, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; + vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// UGEMM: Dense × 2:4 Sparse (T × U → T) +// For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// ============================================================================= +void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 + vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_ugemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + 
clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: half the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// VGEMM: Dense × 1:4 Sparse (T × V → T) +// For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// ============================================================================= +void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 + vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_vgemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: quarter the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// Kernel Entry Point +// ============================================================================= +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto input = reinterpret_cast(arg->input_addr); + auto output = reinterpret_cast(arg->output_addr); + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + uint32_t mode = arg->mode; + + TYPE* layer_input = input; + TYPE* layer_output = buffer1; + + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto& config = layer_configs[layer]; + auto weights = reinterpret_cast(config.weights_addr); + auto bias = reinterpret_cast(config.bias_addr); + auto metadata = reinterpret_cast(config.metadata_addr); + + if (layer == arg->num_layers - 1) { + layer_output = output; + } else if (layer % 2 == 0) { + layer_output = buffer1; + } else { + layer_output = buffer2; + } + + // Execute layer based on mode + if (mode == MLP_MODE_TGEMM) { + vegeta_layer_tgemm_batch(layer_input, weights, bias, layer_output, + config.input_dim, config.output_dim); + } 
else if (mode == MLP_MODE_UGEMM) { + vegeta_layer_ugemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } else if (mode == MLP_MODE_VGEMM) { + vegeta_layer_vgemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } + + // Apply ReLU for hidden layers + if (layer < arg->num_layers - 1) { + apply_relu_batch(layer_output, BATCH_SIZE, config.output_dim); + } + + layer_input = layer_output; + } + + apply_softmax_batch(output, BATCH_SIZE, OUTPUT_DIM); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, nullptr, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp new file mode 100644 index 000000000..a801b854c --- /dev/null +++ b/tests/regression/mlp_vegeta/main.cpp @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +std::vector weight_buffers; +std::vector bias_buffers; +std::vector metadata_buffers; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; + +static mlp_mode_t mlp_mode = MLP_MODE_TGEMM; + +static void show_usage() { + std::cout << "Vortex VEGETA MLP Test (Batched)." 
<< std::endl; +// std::cout << "Usage: [-m mode] [-h: help]" << std::endl; +// std::cout << " -m mode: GEMM mode (0=TGEMM, 1=UGEMM, 2=VGEMM) [default: 0]" << std::endl; +// std::cout << " TGEMM (0): Dense × Dense" << std::endl; +// std::cout << " UGEMM (1): Dense × 2:4 Sparse" << std::endl; +// std::cout << " VGEMM (2): Dense × 1:4 Sparse" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "m:h")) != -1) { + switch (c) { + case 'm': + mlp_mode = static_cast(atoi(optarg)); + if (mlp_mode < MLP_MODE_TGEMM || mlp_mode > MLP_MODE_VGEMM) { + std::cerr << "Error: Invalid mode " << mlp_mode << std::endl; + show_usage(); + exit(-1); + } + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(layer_configs_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + for (auto& buf : weight_buffers) vx_mem_free(buf); + for (auto& buf : bias_buffers) vx_mem_free(buf); + for (auto& buf : metadata_buffers) vx_mem_free(buf); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Sparse Weight Compression +/////////////////////////////////////////////////////////////////////////////// + +// Compress weights to 2:4 sparsity (keep 2 largest of every 4) +// Input: dense weights [in_dim x out_dim] +// Output: compressed weights [in_dim/2 x out_dim], metadata +void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + // For the CPU reference, we need the "logical" dense weights with zeros + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 2) * out_dim); + + // Metadata: 128 bytes per 16x16 tile + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + // For each group of 4 consecutive values in K dimension, keep 2 largest + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Get 4 values + std::pair vals[4]; + for (int k = 0; k < 4; ++k) { + vals[k] = {std::abs(dense[(i + k) * out_dim + j]), k}; + } + // Sort by magnitude (descending) + std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); + + // Keep top 2, generate mask + uint8_t mask = 0; + for (int k = 0; k < 2; ++k) { + int pos = vals[k].second; + mask |= (1 << pos); + compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + + // Store metadata (simplified - in real impl would be tile-based) + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +// Compress weights to 1:4 sparsity (keep 1 largest of every 4) +void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 4) * 
out_dim); + + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Find largest of 4 + int max_pos = 0; + float max_val = std::abs(dense[i * out_dim + j]); + for (int k = 1; k < 4; ++k) { + float val = std::abs(dense[(i + k) * out_dim + j]); + if (val > max_val) { + max_val = val; + max_pos = k; + } + } + + compressed[comp_idx * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + comp_idx++; + + // Store metadata + uint8_t mask = (1 << max_pos); + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// CPU Reference +/////////////////////////////////////////////////////////////////////////////// + +static inline TYPE relu_cpu(TYPE x) { + return (x > 0) ? x : 0; +} + +static void softmax_cpu(TYPE* sample, int dim) { + TYPE max_val = sample[0]; + for (int i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + TYPE sum_exp = 0.0f; + for (int i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + for (int i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } +} + +static void layer_cpu_batch(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, int batch_size, int input_dim, int output_dim) { + for (int b = 0; b < batch_size; ++b) { + for (int j = 0; j < output_dim; ++j) { + TYPE sum = bias[j]; + for (int i = 0; i < input_dim; ++i) { + sum += input[b * input_dim + i] * weights[i * output_dim + j]; + } + output[b * output_dim + j] = sum; + } + } +} + +static void mlp_cpu_batch(const std::vector& input, + const std::vector>& weights, + const std::vector>& biases, + const std::vector>& dims, + std::vector& output) { + std::vector layer_input = input; + std::vector layer_output; + + for (size_t layer = 0; layer < dims.size(); ++layer) { + int in_dim = dims[layer].first; + int out_dim = dims[layer].second; + layer_output.resize(BATCH_SIZE * out_dim); + + layer_cpu_batch(layer_input.data(), weights[layer].data(), + biases[layer].data(), layer_output.data(), + BATCH_SIZE, in_dim, out_dim); + + if (layer < dims.size() - 1) { + for (auto& v : layer_output) { + v = relu_cpu(v); + } + } + + layer_input = layer_output; + } + + output = layer_output; + for (int b = 0; b < BATCH_SIZE; ++b) { + softmax_cpu(&output[b * OUTPUT_DIM], OUTPUT_DIM); + } +} + +// Compare floats with relative tolerance +static bool compare_float(TYPE a, TYPE b, int batch, int idx, int& errors) { + constexpr float REL_TOL = 0.05f; // 5% relative tolerance + constexpr float ABS_TOL = 1e-6f; + + float diff = std::abs(a - b); + float max_val = std::max(std::abs(a), std::abs(b)); + + if (diff > REL_TOL * max_val + ABS_TOL) { + if (errors < 10) { + printf("*** error: [batch=%d, idx=%d] expected=%.6f, actual=%.6f (diff=%.2f%%)\n", + batch, idx, b, a, 100.0f * diff / max_val); + } + ++errors; + return false; + } + return true; +} + +/////////////////////////////////////////////////////////////////////////////// + 
+int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(50); + + std::vector> layer_dims = { + {INPUT_DIM, HIDDEN1_DIM}, + {HIDDEN1_DIM, HIDDEN2_DIM}, + {HIDDEN2_DIM, HIDDEN3_DIM}, + {HIDDEN3_DIM, OUTPUT_DIM} + }; + + const char* mode_name; + switch (mlp_mode) { + case MLP_MODE_TGEMM: mode_name = "TGEMM (Dense × Dense)"; break; + case MLP_MODE_UGEMM: mode_name = "UGEMM (Dense × 2:4 Sparse)"; break; + case MLP_MODE_VGEMM: mode_name = "VGEMM (Dense × 1:4 Sparse)"; break; + default: mode_name = "Unknown"; + } + + std::cout << "=== Vortex VEGETA MLP Neural Network Test (Batched) ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM << " -> " + << HIDDEN2_DIM << " -> " << HIDDEN3_DIM << " -> " << OUTPUT_DIM << std::endl; + std::cout << "Mode: " << mode_name << std::endl; + std::cout << "Batch size: " << BATCH_SIZE << std::endl; + + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Generate batched input + std::vector h_input(BATCH_SIZE * INPUT_DIM); + for (auto& v : h_input) { + v = static_cast(rand()) / RAND_MAX; + } + + // Generate weights, biases, and optional sparse compression + std::vector> h_weights_dense(NUM_LAYERS); + std::vector> h_weights_compressed(NUM_LAYERS); + std::vector> h_weights_logical(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + std::vector> h_metadata(NUM_LAYERS); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + // Generate dense weights + h_weights_dense[layer].resize(in_dim * out_dim); + for (auto& v : h_weights_dense[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + h_biases[layer].resize(out_dim); + for (auto& v : h_biases[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + // Compress weights for sparse modes + if (mlp_mode == MLP_MODE_UGEMM) { + compress_2_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else if (mlp_mode == MLP_MODE_VGEMM) { + compress_1_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else { + h_weights_logical[layer] = h_weights_dense[layer]; + } + } + + // Allocate device memory + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + + int max_hidden = std::max({HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + + std::vector layer_configs(NUM_LAYERS); + weight_buffers.resize(NUM_LAYERS); + bias_buffers.resize(NUM_LAYERS); + metadata_buffers.resize(NUM_LAYERS, nullptr); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + layer_configs[layer].input_dim = in_dim; + layer_configs[layer].output_dim = out_dim; + + // Upload weights (compressed for sparse modes, dense for TGEMM) + std::vector& weights_to_upload = + (mlp_mode == MLP_MODE_TGEMM) ? 
h_weights_dense[layer] : h_weights_compressed[layer]; + + size_t weight_size = weights_to_upload.size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, weight_size, VX_MEM_READ, &weight_buffers[layer])); + RT_CHECK(vx_mem_address(weight_buffers[layer], &layer_configs[layer].weights_addr)); + RT_CHECK(vx_copy_to_dev(weight_buffers[layer], weights_to_upload.data(), 0, weight_size)); + + // Bias + size_t bias_size = h_biases[layer].size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, bias_size, VX_MEM_READ, &bias_buffers[layer])); + RT_CHECK(vx_mem_address(bias_buffers[layer], &layer_configs[layer].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[layer], h_biases[layer].data(), 0, bias_size)); + + // Metadata for sparse modes + if (mlp_mode != MLP_MODE_TGEMM && !h_metadata[layer].empty()) { + size_t meta_size = h_metadata[layer].size(); + RT_CHECK(vx_mem_alloc(device, meta_size, VX_MEM_READ, &metadata_buffers[layer])); + RT_CHECK(vx_mem_address(metadata_buffers[layer], &layer_configs[layer].metadata_addr)); + RT_CHECK(vx_copy_to_dev(metadata_buffers[layer], h_metadata[layer].data(), 0, meta_size)); + } else { + layer_configs[layer].metadata_addr = 0; + } + } + + RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t))); + + std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE))); + + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + kernel_arg_t kernel_arg = {}; + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.mode = mlp_mode; + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + std::cout << "Starting VEGETA MLP inference on device..." << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl; + + std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl; + std::vector h_output(BATCH_SIZE * OUTPUT_DIM); + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE))); + + // CPU reference uses logical weights (with zeros for pruned elements) + std::cout << "Computing CPU reference..." << std::endl; + std::vector h_ref; + mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref); + + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (int b = 0; b < BATCH_SIZE; ++b) { + for (int i = 0; i < OUTPUT_DIM; ++i) { + compare_float(h_output[b * OUTPUT_DIM + i], h_ref[b * OUTPUT_DIM + i], b, i, errors); + } + } + + std::cout << "Cleaning up..." 
<< std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} From 3f9e97b2e557058313c8503e9407ed46d6df60d4 Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Thu, 5 Feb 2026 21:43:32 +0530 Subject: [PATCH 2/2] MLP test corrections --- tests/regression/mlp/common.h | 2 + tests/regression/mlp/kernel.cpp | 24 ++--- tests/regression/mlp_vegeta/kernel.cpp | 93 ++++++++++++-------- tests/regression/mlp_vegeta/main.cpp | 117 +++++++++++++++++++------ 4 files changed, 164 insertions(+), 72 deletions(-) diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h index 7cc37012a..476f234b0 100644 --- a/tests/regression/mlp/common.h +++ b/tests/regression/mlp/common.h @@ -1,6 +1,8 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp index 50e4ed24e..92ead2ec1 100644 --- a/tests/regression/mlp/kernel.cpp +++ b/tests/regression/mlp/kernel.cpp @@ -48,23 +48,23 @@ void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { } } -// Thread function: compute exp in parallel, thread 0 sums -void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { +// Thread function: compute exp in parallel +void softmax_exp_thread(softmax_args_t* __UNIFORM__ args) { uint32_t i = blockIdx.x; // All threads compute their exp value if (i < args->size) { args->data[i] = expf(args->data[i] - *(args->max_val)); } - - // Thread 0 does the sum reduction serially - if (i == 0) { - TYPE sum_exp = 0.0f; - for (uint32_t j = 0; j < args->size; ++j) { - sum_exp += args->data[j]; - } - *(args->sum_exp) = sum_exp; +} + +// Thread function: single thread sums all exp values +void softmax_sum_thread(softmax_args_t* __UNIFORM__ args) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; } + *(args->sum_exp) = sum_exp; } // Thread function: normalize in parallel @@ -86,7 +86,9 @@ void apply_softmax(TYPE* data, uint32_t size) { uint32_t single_thread = 1; vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); - vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_thread, &args); + + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_sum_thread, &args); vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); } diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp index ad57212f0..ea516c616 100644 --- a/tests/regression/mlp_vegeta/kernel.cpp +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -6,21 +6,32 @@ // Aligned tile buffer for intermediate results static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); -// ReLU activation (in-place) for batched data +// ReLU activation (in-place) for tile-major batched data inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { - for (uint32_t b = 0; b < batch_size; ++b) { - for (uint32_t i = 0; i < dim; ++i) { - TYPE& v = data[b * dim + i]; - if (v < 0) v = 0; - } + // Data is in tile-major format: process all elements + uint32_t total_elements = batch_size * dim; + for (uint32_t i = 0; i < total_elements; ++i) { + if (data[i] < 0) data[i] = 0; } } -// Softmax for each sample in batch +// 
Softmax for each sample in batch (tile-major layout) void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + // Convert tile-major to row-major for softmax processing + // Since dim is OUTPUT_DIM=16 which is one tile, data layout is simple + int num_tile_cols = dim / TILE_SIZE; + for (uint32_t b = 0; b < batch_size; ++b) { - TYPE* sample = data + b * dim; + // Extract this sample's values across tiles + TYPE sample[OUTPUT_DIM]; + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + sample[tile_col * TILE_SIZE + col] = data[tile_idx]; + } + } + // Compute softmax TYPE max_val = sample[0]; for (uint32_t i = 1; i < dim; ++i) { if (sample[i] > max_val) max_val = sample[i]; @@ -35,6 +46,14 @@ void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { for (uint32_t i = 0; i < dim; ++i) { sample[i] /= sum_exp; } + + // Write back to tile-major + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + data[tile_idx] = sample[tile_col * TILE_SIZE + col]; + } + } } } @@ -67,30 +86,31 @@ void vegeta_layer_tgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major layout: each tile column stores BATCH_SIZE×TILE_SIZE contiguously + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// UGEMM: Dense × 2:4 Sparse (T × U → T) +// UGEMM: Dense × 2:4 Sparse Weights (T × U → T) // For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// T-reg contains sparse weights (with metadata), U-reg contains dense activations // ============================================================================= void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 - vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for weights into M1 + vx_lu(2, (size_t)A_tile, 0); // Load dense activations A to U2 + vx_ugemm(0, 1, 2); // T0 += T1(sparse weights) × U2(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -105,7 +125,8 @@ void vegeta_layer_ugemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: U-reg loads 2×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); 
// Compressed weights: half the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -113,25 +134,25 @@ void vegeta_layer_ugemm_batch( vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// VGEMM: Dense × 1:4 Sparse (T × V → T) +// VGEMM: Dense × 1:4 Sparse Weights (T × V → T) // For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// T-reg contains sparse weights (with metadata), V-reg contains dense activations // ============================================================================= void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 - vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load compressed sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for sparse weights to M1 + vx_lv(1, (size_t)A_tile, 0); // Load dense activations A to V1 + vx_vgemm(0, 1, 1); // T0 += T1(sparse weights) × V1(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -146,7 +167,8 @@ void vegeta_layer_vgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: V-reg loads 4×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); // Compressed weights: quarter the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -154,11 +176,10 @@ void vegeta_layer_vgemm_batch( vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp index a801b854c..ac9e4cf3b 100644 --- a/tests/regression/mlp_vegeta/main.cpp +++ b/tests/regression/mlp_vegeta/main.cpp @@ -84,6 +84,47 @@ void cleanup() { } } +/////////////////////////////////////////////////////////////////////////////// +// Tile-Major Layout Conversion +/////////////////////////////////////////////////////////////////////////////// + +// Convert row-major [batch × dim] to tile-major layout for VEGETA loads +// Input: row_major[batch × dim] where batch=16, dim is multiple of 16 +// Output: tile_major stores data as contiguous 16×16 tiles +// Memory layout: For each tile column, 
store all 16 rows contiguously +void convert_to_tile_major(const std::vector& row_major, int batch, int dim, + std::vector& tile_major) { + int num_tile_cols = dim / TILE_SIZE; + tile_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = row * dim + tile_col * TILE_SIZE + col; + int dst_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + tile_major[dst_idx] = row_major[src_idx]; + } + } + } +} + +// Convert tile-major back to row-major [batch × dim] +void convert_from_tile_major(const std::vector& tile_major, int batch, int dim, + std::vector& row_major) { + int num_tile_cols = dim / TILE_SIZE; + row_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + int dst_idx = row * dim + tile_col * TILE_SIZE + col; + row_major[dst_idx] = tile_major[src_idx]; + } + } + } +} + /////////////////////////////////////////////////////////////////////////////// // Sparse Weight Compression /////////////////////////////////////////////////////////////////////////////// @@ -115,25 +156,38 @@ void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim // Sort by magnitude (descending) std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); - // Keep top 2, generate mask + // Create bitmask for top 2 values uint8_t mask = 0; for (int k = 0; k < 2; ++k) { int pos = vals[k].second; mask |= (1 << pos); - compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; - logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; - comp_idx++; } - // Store metadata (simplified - in real impl would be tile-based) + // Store compressed values in POSITION ORDER (not magnitude order) + // Hardware iterates through bit positions 0-3 and expects values in that order + for (int pos = 0; pos < 4; ++pos) { + if (mask & (1 << pos)) { + compressed[comp_idx * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + } + + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles + // Byte layout per row: byte 0 = cols 0,1; byte 1 = cols 2,3; ...; byte 7 = cols 14,15 int tile_in = i / TILE_SIZE; int tile_out = j / TILE_SIZE; int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; - int row_in_tile = (i % TILE_SIZE) / 4; - int col_in_tile = j % TILE_SIZE; - int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; - if (byte_offset < (int)metadata.size()) { - metadata[byte_offset] = mask; + int row_in_tile = i % TILE_SIZE; + int k_grp = (i % TILE_SIZE) / 4; // Which group of 4 in this row + int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2; + if (byte_idx < (int)metadata.size()) { + if (k_grp % 2 == 0) { + metadata[byte_idx] = (mask << 4); // Upper nibble + } else { + metadata[byte_idx] |= mask; // Lower nibble + } } } } @@ -168,16 +222,21 @@ void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; comp_idx++; - // Store metadata + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles uint8_t mask = (1 << 
max_pos); int tile_in = i / TILE_SIZE; int tile_out = j / TILE_SIZE; int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; - int row_in_tile = (i % TILE_SIZE) / 4; - int col_in_tile = j % TILE_SIZE; - int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; - if (byte_offset < (int)metadata.size()) { - metadata[byte_offset] = mask; + int row_in_tile = i % TILE_SIZE; + int k_grp = (i % TILE_SIZE) / 4; // Which group of 4 in this row + int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2; + if (byte_idx < (int)metadata.size()) { + if (k_grp % 2 == 0) { + metadata[byte_idx] = (mask << 4); // Upper nibble + } else { + metadata[byte_idx] |= mask; // Lower nibble + } } } } @@ -301,12 +360,16 @@ int main(int argc, char *argv[]) { std::cout << "Opening device connection..." << std::endl; RT_CHECK(vx_dev_open(&device)); - // Generate batched input - std::vector h_input(BATCH_SIZE * INPUT_DIM); - for (auto& v : h_input) { + // Generate batched input (row-major) + std::vector h_input_row_major(BATCH_SIZE * INPUT_DIM); + for (auto& v : h_input_row_major) { v = static_cast(rand()) / RAND_MAX; } + // Convert to tile-major for device + std::vector h_input_tile_major; + convert_to_tile_major(h_input_row_major, BATCH_SIZE, INPUT_DIM, h_input_tile_major); + // Generate weights, biases, and optional sparse compression std::vector> h_weights_dense(NUM_LAYERS); std::vector> h_weights_compressed(NUM_LAYERS); @@ -394,8 +457,8 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t))); - std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl; - RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE))); + std::cout << "Uploading input data (" << BATCH_SIZE << " samples, tile-major)..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input_tile_major.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE))); std::cout << "Uploading kernel binary..." << std::endl; RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); @@ -420,14 +483,18 @@ int main(int argc, char *argv[]) { auto duration = std::chrono::duration_cast(end - start); std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl; - std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl; - std::vector h_output(BATCH_SIZE * OUTPUT_DIM); - RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE))); + std::cout << "Downloading output (" << BATCH_SIZE << " samples, tile-major)..." << std::endl; + std::vector h_output_tile_major(BATCH_SIZE * OUTPUT_DIM); + RT_CHECK(vx_copy_from_dev(h_output_tile_major.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE))); + + // Convert output back to row-major for comparison + std::vector h_output; + convert_from_tile_major(h_output_tile_major, BATCH_SIZE, OUTPUT_DIM, h_output); // CPU reference uses logical weights (with zeros for pruned elements) std::cout << "Computing CPU reference..." << std::endl; std::vector h_ref; - mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref); + mlp_cpu_batch(h_input_row_major, h_weights_logical, h_biases, layer_dims, h_ref); std::cout << "Verifying results..." << std::endl; int errors = 0;
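
As a quick cross-check of the tile-major layout introduced in the second commit (dst = tile_col * (BATCH_SIZE * TILE_SIZE) + row * TILE_SIZE + col, per convert_to_tile_major above), here is a small standalone round-trip sketch. It is not part of the patch: the function names to_tile_major / from_tile_major and the assert-based main are illustrative, and the constants simply mirror tests/regression/mlp_vegeta/common.h.

#include <cassert>
#include <cstdio>
#include <vector>

// Constants mirroring tests/regression/mlp_vegeta/common.h (illustrative copy)
constexpr int TILE_SIZE  = 16;
constexpr int BATCH_SIZE = 16;

// Row-major [batch x dim] -> tile-major: each 16-wide tile column is stored
// as a contiguous BATCH_SIZE x TILE_SIZE block (same index math as the patch).
static std::vector<float> to_tile_major(const std::vector<float>& rm, int batch, int dim) {
  std::vector<float> tm(batch * dim);
  for (int tc = 0; tc < dim / TILE_SIZE; ++tc)
    for (int r = 0; r < batch; ++r)
      for (int c = 0; c < TILE_SIZE; ++c)
        tm[tc * (batch * TILE_SIZE) + r * TILE_SIZE + c] = rm[r * dim + tc * TILE_SIZE + c];
  return tm;
}

// Inverse mapping, as used on the host to read back the device output.
static std::vector<float> from_tile_major(const std::vector<float>& tm, int batch, int dim) {
  std::vector<float> rm(batch * dim);
  for (int tc = 0; tc < dim / TILE_SIZE; ++tc)
    for (int r = 0; r < batch; ++r)
      for (int c = 0; c < TILE_SIZE; ++c)
        rm[r * dim + tc * TILE_SIZE + c] = tm[tc * (batch * TILE_SIZE) + r * TILE_SIZE + c];
  return rm;
}

int main() {
  const int dim = 128;  // e.g. INPUT_DIM
  std::vector<float> rm(BATCH_SIZE * dim);
  for (size_t i = 0; i < rm.size(); ++i) rm[i] = static_cast<float>(i);

  auto tm   = to_tile_major(rm, BATCH_SIZE, dim);
  auto back = from_tile_major(tm, BATCH_SIZE, dim);
  assert(back == rm);  // the conversion is a bijection, so the round trip is exact

  // Element (row 3, col 21) lives in tile column 1, local column 5:
  assert(tm[1 * (BATCH_SIZE * TILE_SIZE) + 3 * TILE_SIZE + 5] == rm[3 * dim + 21]);
  std::printf("tile-major round trip OK\n");
  return 0;
}

The kernel's tile-major stores rely on the same mapping: within one BATCH_SIZE x TILE_SIZE block, element i sits at row i / TILE_SIZE and column i % TILE_SIZE, which is why the bias add in the patch indexes bias[out_t * TILE_SIZE + (i % TILE_SIZE)].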