Optimize the reduction algorithm in kernel.cu so that the max value of the data
ID: 3853225 • Letter: O
Question
Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided)
kernel.cu
#define BLOCK_SIZE 512
#define SIMPLE
__global__ void reduction(float *out, float *in, unsigned size)
{
/********************************************************************
Load segment of input vector into shared memory
Traverse reduction tree
Write computed sum to output vector at correct index
********************************************************************/
// INSERT KERNEL CODE HERE
#ifdef SIMPLE
__shared__ float in_s[2 * BLOCK_SIZE];
int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;
in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f);
in_s[threadIdx.x + BLOCK_SIZE] = ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);
for (int stride = 1; stride < BLOCK_SIZE << 1; stride <<= 1) {
__syncthreads();
if (threadIdx.x % stride == 0)
in_s[2 * threadIdx.x] += in_s[2 * threadIdx.x + stride];
}
#else
__shared__ float in_s[BLOCK_SIZE];
int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;
in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f) +
((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);
for (int stride = BLOCK_SIZE >> 1; stride > 0; stride >>= 1) {
__syncthreads();
if (threadIdx.x < stride)
in_s[threadIdx.x] += in_s[threadIdx.x + stride];
}
#endif
if (threadIdx.x == 0)
out[blockIdx.x] = in_s[0];
}
main.cu
#include
#include "support.h"
#include "kernel.cu"
int main(int argc, char* argv[])
{
Timer timer;
// Initialize host variables ----------------------------------------------
printf(" Setting up the problem..."); fflush(stdout);
startTime(&timer);
float *in_h, *out_h;
float *in_d, *out_d;
unsigned in_elements, out_elements;
cudaError_t cuda_ret;
dim3 dim_grid, dim_block;
int i;
// Allocate and initialize host memory
if (argc == 1) {
in_elements = 1000000;
}
else if (argc == 2) {
in_elements = atoi(argv[1]);
}
else {
printf(" Invalid input parameters!"
" Usage: ./reduction # Input of size 1,000,000 is used"
" Usage: ./reduction # Input of size m is used"
" ");
exit(0);
}
initVector(&in_h, in_elements);
out_elements = in_elements / (BLOCK_SIZE << 1);
if (in_elements % (BLOCK_SIZE << 1)) out_elements++;
out_h = (float*)malloc(out_elements * sizeof(float));
if (out_h == NULL) FATAL("Unable to allocate host");
stopTime(&timer); printf("%f s ", elapsedTime(timer));
printf(" Input size = %u ", in_elements);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMalloc((void**)&in_d, in_elements * sizeof(float));
if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cuda_ret = cudaMalloc((void**)&out_d, out_elements * sizeof(float));
if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s ", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(in_d, in_h, in_elements * sizeof(float),
cudaMemcpyHostToDevice);
if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cuda_ret = cudaMemset(out_d, 0, out_elements * sizeof(float));
if (cuda_ret != cudaSuccess) FATAL("Unable to set device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s ", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
dim_grid.x = out_elements; dim_grid.y = dim_grid.z = 1;
reduction << > >(out_d, in_d, in_elements);
cuda_ret = cudaDeviceSynchronize();
if (cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
stopTime(&timer); printf("%f s ", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(out_h, out_d, out_elements * sizeof(float),
cudaMemcpyDeviceToHost);
if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s ", elapsedTime(timer));
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
/* Accumulate partial sums on host */
for (i = 1; i out_h[0] += out_h[i];
}
/* Verify the result */
verify(in_h, in_elements, out_h[0]);
// Free memory ------------------------------------------------------------
cudaFree(in_d); cudaFree(out_d);
free(in_h); free(out_h);
return 0;
}
support.cu
#include
#include
#include "support.h"
void initVector(float **vec_h, unsigned size)
{
*vec_h = (float*)malloc(size * sizeof(float));
if (*vec_h == NULL) {
FATAL("Unable to allocate host");
}
for (unsigned int i = 0; i < size; i++) {
(*vec_h)[i] = (rand() % 100) / 100.00;
}
}
void verify(float* input, unsigned num_elements, float result) {
const float relativeTolerance = 2e-5;
float sum = 0.0f;
for (int i = 0; i < num_elements; ++i) {
sum += input[i];
}
float relativeError = (sum - result) / sum;
if (relativeError > relativeTolerance
|| relativeError < -relativeTolerance) {
printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
exit(0);
}
printf("TEST PASSED ");
}
void startTime(Timer* timer) {
gettimeofday(&(timer->startTime), NULL);
}
void stopTime(Timer* timer) {
gettimeofday(&(timer->endTime), NULL);
}
float elapsedTime(Timer timer) {
return ((float)((timer.endTime.tv_sec - timer.startTime.tv_sec)
+ (timer.endTime.tv_usec - timer.startTime.tv_usec) / 1.0e6));
}
support.h
#ifndef __FILEH__
#define __FILEH__
#include
typedef struct {
struct timeval startTime;
struct timeval endTime;
} Timer;
#ifdef __cplusplus
extern "C" {
#endif
void initVector(float **vec_h, unsigned size);
void verify(float* input, unsigned num_elements, float result);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);
#ifdef __cplusplus
}
#endif
#define FATAL(msg, ...)
do {
fprintf(stderr, "[%s:%d] "msg" ", __FILE__, __LINE__, ##__VA_ARGS__);
exit(-1);
} while(0)
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif
Explanation / Answer
#define BLOCK_SIZE 512
__global__ void total(float * input, float * output, int len) {
__shared__ float partialSum[2 * BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2 * blockIdx.x * blockDim.x;
partialSum[t] = (t < len) ? input[start + t] : 0;
partialSum[blockDim.x + t] = ((blockDim.x + t) < len) ? input[start + blockDim.x + t] : 0;
for(unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
__syncthreads();
if(t < stride)
partialSum[t] += partialSum[t + stride];
}
if(t == 0) {
output[blockIdx.x + t] = partialSum[t];
}
}
int main(int argc, char ** argv) {
int ii;
float * hostInput;
float * hostOutput;
float * deviceInput;
float * deviceOutput;
int numInputElements;
int numOutputElements;
numOutputElements = numInputElements / (BLOCK_SIZE<<1);
if (numInputElements % (BLOCK_SIZE<<1)) {
numOutputElements++;
}
hostOutput = (float*) malloc(numOutputElements * sizeof(float));
cudaMalloc((void **) &deviceInput, numInputElements * sizeof(float));
cudaMalloc((void **) &deviceOutput, numOutputElements * sizeof(float));
cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float), cudaMemcpyHostToDevice);
dim3 DimGrid((numInputElements - 1)/BLOCK_SIZE + 1, 1, 1);
dim3 DimBlock(BLOCK_SIZE, 1, 1);
total<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numInputElements);
cudaDeviceSynchronize();
cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float), cudaMemcpyDeviceToHost);
for (ii = 1; ii < numOutputElements; ii++) {
hostOutput[0] += hostOutput[ii];
}
cudaFree(deviceInput);
cudaFree(deviceOutput);
free(hostInput);
free(hostOutput);
return 0;
}
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.