Modify the reduction algorithm so the output reads "TEST PASSED, cpu = 494503.906, gpu = 494503.906"

ID: 3861750 • Letter: M

Question

Modify the reduction algorithm so the output reads "TEST PASSED, cpu = 494503.906, gpu = 494503.906".

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 512

/*
 * Per-block sum reduction.
 *
 * Each thread block loads up to 2 * blockDim.x consecutive elements of
 * `input` (starting at 2 * blockIdx.x * blockDim.x) into shared memory,
 * folds them pairwise, and writes one partial sum to output[blockIdx.x].
 * Slots that fall past the end of the input are filled with zero so they do
 * not contribute to the sum.
 *
 * Launch requirements:
 *   - 1D grid and 1D block;
 *   - blockDim.x must be a power of two (the stride is halved every pass,
 *     so any other size would skip elements) and must not exceed BLOCK_SIZE
 *     (the shared array holds 2 * BLOCK_SIZE floats);
 *   - `output` must hold at least gridDim.x floats.
 *
 * input  : device array of `len` floats
 * output : device array receiving one partial sum per block
 * len    : number of valid elements in `input`
 */
__global__ void reduction(float *input, float *output, int len) {

    __shared__ float partialSum[2 * BLOCK_SIZE];

    unsigned int t = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * blockDim.x;

    // Global indices of the two elements this thread is responsible for.
    unsigned int first = start + t;
    unsigned int second = first + blockDim.x;

    // A non-positive length means there is nothing to read at all.
    unsigned int count = (len > 0) ? (unsigned int)len : 0u;

    // The bounds check has to use the GLOBAL index: comparing the block-local
    // index against len lets the last block read past the end of `input`.
    partialSum[t] = (first < count) ? input[first] : 0.0f;
    partialSum[blockDim.x + t] = (second < count) ? input[second] : 0.0f;

    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        // Every thread reaches this barrier; it makes the loads above (first
        // pass) and the previous pass's sums visible before they are read.
        __syncthreads();
        if (t < stride)
            partialSum[t] += partialSum[t + stride];
    }

    // Thread 0 performed the final addition itself, so no extra barrier is
    // needed before it publishes the block's total.
    if (t == 0) {
        output[blockIdx.x] = partialSum[0];
    }

}

/*
 * Allocates a host array of `size` floats and fills it with pseudo-random
 * values of the form k / 100 with k in [0, 99], drawn from rand().
 *
 * vec_h : out-parameter receiving the malloc'ed array; the caller owns it
 *         and must release it with free().
 * size  : number of elements; values <= 0 produce an empty array.
 *
 * Terminates the program with EXIT_FAILURE if the allocation fails, instead
 * of continuing and writing through a NULL pointer.
 */
void initVector(float **vec_h, int size)
{
    // Clamp negative sizes so they cannot turn into a huge unsigned count.
    size_t count = (size > 0) ? (size_t)size : 0;

    *vec_h = (float*)malloc(count * sizeof(float));

    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // request is treated as an error.
    if (*vec_h == NULL && count > 0) {
        fprintf(stderr, "Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < count; i++) {
        // Division is done in double and then narrowed, exactly as before,
        // so the generated data set is unchanged.
        (*vec_h)[i] = (float)((rand() % 100) / 100.00);
    }

}

/*
 * Checks a reduction result against a sequential single-precision sum
 * computed on the host.
 *
 * input        : host array holding the values that were reduced
 * num_elements : number of entries in `input`
 * result       : value to check against the host sum
 *
 * Prints "TEST PASSED" when the relative error is within 2e-5. Otherwise
 * prints "TEST FAILED" and terminates the process with a non-zero status so
 * scripts can detect the failure.
 */
void verify(float* input, unsigned num_elements, float result) {

    const float relativeTolerance = 2e-5f;

    float sum = 0.0f;
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    // Relative error, falling back to the absolute error when the reference
    // sum is zero so that 0 vs 0 does not become 0/0.
    float error = sum - result;
    if (sum != 0.0f) {
        error /= sum;
    }

    // Written as "is inside the band" so that a NaN result, for which every
    // comparison is false, is reported as a failure rather than a pass.
    int withinTolerance = (error <= relativeTolerance)
        && (error >= -relativeTolerance);

    if (!withinTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
        exit(EXIT_FAILURE);
    }
    printf("TEST PASSED, cpu = %0.3f, gpu = %0.3f ", sum, result);
}

/* Aborts the program with a diagnostic when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a random input vector, reduces it on the GPU to one partial
 * sum per block, adds the partial sums on the host and verifies the total
 * against a host-side reference.
 *
 * Usage: <program> [numInputElements]   (defaults to 1000000)
 */
int main(int argc, char ** argv) {

    float *hostInput = NULL;
    float *hostOutput = NULL;
    float *deviceInput = NULL;
    float *deviceOutput = NULL;
    unsigned numInputElements = 1000000;
    unsigned numOutputElements;
    dim3 dimGrid, dimBlock;

    // The kernel and initVector both take the element count as int.
    const long maxElements = 0x7fffffffL;

    if (argc == 2) {
        char *end = NULL;
        long parsed = strtol(argv[1], &end, 10);
        // Reject non-numeric text, trailing junk, and counts that are zero,
        // negative or too large: all of them would lead to invalid
        // allocations or an invalid launch configuration further down.
        if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > maxElements) {
            fprintf(stderr, "Invalid element count '%s'\n", argv[1]);
            return EXIT_FAILURE;
        }
        numInputElements = (unsigned)parsed;
    }
    else if (argc > 2) {
        // Previously this path left numInputElements uninitialised.
        fprintf(stderr, "Usage: %s [numInputElements]\n", argv[0]);
        return EXIT_FAILURE;
    }

    initVector(&hostInput, numInputElements);

    // Each block consumes 2 * BLOCK_SIZE inputs; round the block count up so
    // a partial tail still gets a block.
    const unsigned elementsPerBlock = BLOCK_SIZE << 1;
    numOutputElements = numInputElements / elementsPerBlock;
    if (numInputElements % elementsPerBlock) {
        numOutputElements++;
    }

    hostOutput = (float*)malloc(numOutputElements * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "Unable to allocate host output\n");
        free(hostInput);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&deviceInput, numInputElements * sizeof(float)),
              "cudaMalloc(deviceInput)");
    checkCuda(cudaMalloc((void**)&deviceOutput, numOutputElements * sizeof(float)),
              "cudaMalloc(deviceOutput)");
    checkCuda(cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float),
                         cudaMemcpyHostToDevice),
              "copy input to device");

    dimBlock.x = BLOCK_SIZE; dimBlock.y = dimBlock.z = 1;
    dimGrid.x = numOutputElements; dimGrid.y = dimGrid.z = 1;

    reduction<<<dimGrid,dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    // A launch does not return a status: configuration errors show up via
    // cudaGetLastError(), execution errors at the next synchronising call.
    checkCuda(cudaGetLastError(), "reduction kernel launch");
    checkCuda(cudaDeviceSynchronize(), "reduction kernel execution");

    checkCuda(cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "copy partial sums to host");

    // Fold the per-block partial sums into hostOutput[0] on the host.
    int count = 0;
    for (unsigned i = 1; i < numOutputElements; i++) {
        hostOutput[0] += hostOutput[i];
        count++;
        printf("%.1f ", hostOutput[0]);
    }

    // count is exactly the number of additions performed by the loop above.
    printf("Sums accumulated on host: %i ", count);

    verify(hostInput, numInputElements, hostOutput[0]);

    checkCuda(cudaFree(deviceInput), "cudaFree(deviceInput)");
    checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");
    free(hostInput);
    free(hostOutput);

    return 0;

}

Explanation / Answer