Added MLP test for baseline and VEGETA engine. #316
Open
tandonmitul27 wants to merge 2 commits into vortexgpgpu:veg from tandonmitul27:veg
tests/regression/mlp/Makefile (new file):

```makefile
ROOT_DIR := $(realpath ../../..)
include $(ROOT_DIR)/config.mk

PROJECT := mlp

SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)

SRCS := $(SRC_DIR)/main.cpp

VX_SRCS := $(SRC_DIR)/kernel.cpp

OPTS ?=

include ../common.mk
```
tests/regression/mlp/common.h (new file):

```cpp
#ifndef _COMMON_H_
#define _COMMON_H_

#include <stdint.h>

#ifndef TYPE
#define TYPE float
#endif

// MLP Network Configuration (all dimensions multiples of 16)
#define INPUT_DIM 128
#define HIDDEN1_DIM 256
#define HIDDEN2_DIM 128
#define HIDDEN3_DIM 64
#define OUTPUT_DIM 16

// Number of layers
#define NUM_LAYERS 4

// Layer configuration structure
typedef struct {
  uint32_t input_dim;
  uint32_t output_dim;
  uint64_t weights_addr;  // Weight matrix: output_dim x input_dim
  uint64_t bias_addr;     // Bias vector: output_dim
} layer_config_t;

// Kernel arguments
typedef struct {
  uint32_t num_layers;
  uint32_t batch_size;          // Number of input samples
  uint64_t input_addr;          // Input data
  uint64_t output_addr;         // Final output
  uint64_t layer_configs_addr;  // Pointer to layer configurations
  // Intermediate buffers for layer outputs
  uint64_t buffer1_addr;        // For layer 1 output / layer 2 input
  uint64_t buffer2_addr;        // For layer 2 output / layer 3 input
} kernel_arg_t;

#endif
```
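The host side (main.cpp) is not included in this diff. As a rough illustration of the contract these two structs define, here is a hedged sketch of how a host might lay out the kernel arguments. The function name, the base address, and the address arithmetic are hypothetical placeholders for illustration; this does not use the real Vortex host allocation API.

```cpp
// Hypothetical host-side sketch: how kernel_arg_t / layer_config_t
// might be populated. Device addresses are illustrative placeholders only.
#include <cstdint>
#include "common.h"

int prepare_args_example() {
  // Per-layer dimensions follow the #defines in common.h.
  const uint32_t dims[NUM_LAYERS + 1] = {
      INPUT_DIM, HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM, OUTPUT_DIM};

  layer_config_t configs[NUM_LAYERS];
  uint64_t next_addr = 0x80000000ull;  // placeholder device base address
  for (uint32_t l = 0; l < NUM_LAYERS; ++l) {
    configs[l].input_dim    = dims[l];
    configs[l].output_dim   = dims[l + 1];
    configs[l].weights_addr = next_addr;  // output_dim x input_dim values
    next_addr += uint64_t(dims[l + 1]) * dims[l] * sizeof(TYPE);
    configs[l].bias_addr    = next_addr;  // output_dim values
    next_addr += uint64_t(dims[l + 1]) * sizeof(TYPE);
  }

  kernel_arg_t arg{};
  arg.num_layers  = NUM_LAYERS;
  arg.batch_size  = 1;  // the kernel below processes a single sample
  arg.input_addr  = next_addr;                             // INPUT_DIM values
  arg.output_addr = next_addr + INPUT_DIM * sizeof(TYPE);  // OUTPUT_DIM values
  // buffer1 must hold the largest intermediate (HIDDEN1_DIM) and buffer2
  // HIDDEN2_DIM, since the kernel ping-pongs layer outputs between them.
  arg.buffer1_addr = arg.output_addr + OUTPUT_DIM * sizeof(TYPE);
  arg.buffer2_addr = arg.buffer1_addr + HIDDEN1_DIM * sizeof(TYPE);
  arg.layer_configs_addr = /* device copy of `configs` */ 0;  // placeholder
  return 0;
}
```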
tests/regression/mlp/kernel.cpp (new file):

```cpp
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <math.h>
#include "common.h"

// This kernel is designed for single-core parallel execution (scales with
// warps/threads). Multi-core execution would require inter-core
// synchronization between layers, which Vortex's vx_spawn_threads doesn't
// provide yet (to the best of my knowledge).

// ReLU activation function
inline TYPE relu(TYPE x) {
  return (x > 0) ? x : 0;
}

// Layer execution arguments passed to spawned threads
typedef struct {
  TYPE* input;
  TYPE* weights;
  TYPE* bias;
  TYPE* output;
  uint32_t input_dim;
  uint32_t output_dim;
  bool apply_relu;
} layer_args_t;

// Softmax execution arguments
typedef struct {
  TYPE* data;
  uint32_t size;
  TYPE* max_val;
  TYPE* sum_exp;
} softmax_args_t;

// Global shared variables for softmax reductions
TYPE g_softmax_max_val;
TYPE g_softmax_sum_exp;

// Thread function: thread 0 finds max, others wait
void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) {
  if (blockIdx.x == 0) {
    // Thread 0 does the reduction serially
    TYPE max_val = args->data[0];
    for (uint32_t i = 1; i < args->size; ++i) {
      if (args->data[i] > max_val) {
        max_val = args->data[i];
      }
    }
    *(args->max_val) = max_val;
  }
}

// Thread function: compute exp in parallel
void softmax_exp_thread(softmax_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x;

  // All threads compute their exp value
  if (i < args->size) {
    args->data[i] = expf(args->data[i] - *(args->max_val));
  }
}

// Thread function: single thread sums all exp values
void softmax_sum_thread(softmax_args_t* __UNIFORM__ args) {
  TYPE sum_exp = 0.0f;
  for (uint32_t j = 0; j < args->size; ++j) {
    sum_exp += args->data[j];
  }
  *(args->sum_exp) = sum_exp;
}

// Thread function: normalize in parallel
void softmax_normalize_thread(softmax_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x;
  if (i < args->size) {
    args->data[i] /= *(args->sum_exp);
  }
}

// Softmax implementation
void apply_softmax(TYPE* data, uint32_t size) {
  softmax_args_t args;
  args.data = data;
  args.size = size;
  args.max_val = &g_softmax_max_val;
  args.sum_exp = &g_softmax_sum_exp;

  uint32_t single_thread = 1;
  vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args);

  vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_thread, &args);

  vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_sum_thread, &args);

  vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args);
}

// Each thread computes one output neuron:
// y[i] = sum_j(W[i][j] * x[j]) + b[i]
void fc_layer_thread(layer_args_t* __UNIFORM__ args) {
  uint32_t i = blockIdx.x;  // Output neuron index

  if (i >= args->output_dim) return;

  TYPE sum = 0;

  // Compute dot product: W[i] · input
  for (uint32_t j = 0; j < args->input_dim; ++j) {
    sum += args->weights[i * args->input_dim + j] * args->input[j];
  }

  // Add bias
  sum += args->bias[i];

  // Apply ReLU activation (except for output layer)
  if (args->apply_relu) {
    sum = relu(sum);
  }

  args->output[i] = sum;
}

// MLP inference kernel
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
  auto layer_configs = reinterpret_cast<layer_config_t*>(arg->layer_configs_addr);

  // Layer buffers
  auto input = reinterpret_cast<TYPE*>(arg->input_addr);
  auto buffer1 = reinterpret_cast<TYPE*>(arg->buffer1_addr);
  auto buffer2 = reinterpret_cast<TYPE*>(arg->buffer2_addr);
  auto output = reinterpret_cast<TYPE*>(arg->output_addr);

  // Pointer arrays for ping-pong buffering
  TYPE* layer_inputs[4] = {input, buffer1, buffer2, buffer1};
  TYPE* layer_outputs[4] = {buffer1, buffer2, buffer1, output};

  layer_args_t layer_args;

  // Execute each layer with parallel threads
  for (uint32_t layer = 0; layer < arg->num_layers; ++layer) {
    auto cfg = &layer_configs[layer];

    layer_args.input = layer_inputs[layer];
    layer_args.weights = reinterpret_cast<TYPE*>(cfg->weights_addr);
    layer_args.bias = reinterpret_cast<TYPE*>(cfg->bias_addr);
    layer_args.output = layer_outputs[layer];
    layer_args.input_dim = cfg->input_dim;
    layer_args.output_dim = cfg->output_dim;
    layer_args.apply_relu = (layer < arg->num_layers - 1);

    // vx_spawn_threads provides barrier synchronization after all threads complete
    vx_spawn_threads(1, &layer_args.output_dim, nullptr,
                     (vx_kernel_func_cb)fc_layer_thread, &layer_args);
  }

  // Apply softmax to final output
  uint32_t output_size = layer_configs[arg->num_layers - 1].output_dim;
  apply_softmax(output, output_size);
}

int main() {
  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);

  uint32_t grid_dim = 1;
  return vx_spawn_threads(1, &grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}
```
Review comment:

> This header uses `uint32_t`/`uint64_t` but doesn't include `<stdint.h>`/`<cstdint>`. Relying on transitive includes makes builds fragile; include the standard integer header directly from `common.h`.