Modify the reduction algorithm so the output reads "TEST PASSED, cpu = 494503.906, gpu = 494503.906".
ID: 3861750 • Letter: M
Question
Modify the reduction algorithm so the output reads "TEST PASSED, cpu = 494503.906, gpu = 494503.906".
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 512
/**
 * Per-block sum reduction.
 *
 * Each thread block loads up to 2 * blockDim.x consecutive elements of
 * `input`, starting at element 2 * blockIdx.x * blockDim.x, into shared
 * memory, reduces them with a halving-stride tree, and writes the block's
 * partial sum to output[blockIdx.x].
 *
 * Launch requirements (derived from the code below):
 *   - 1D blocks with blockDim.x <= BLOCK_SIZE (the shared array holds
 *     2 * BLOCK_SIZE floats);
 *   - blockDim.x must be a power of two, otherwise the halving strides
 *     skip elements;
 *   - `output` must have room for gridDim.x floats.
 *
 * @param input   device array of `len` floats to be summed
 * @param output  device array receiving one partial sum per block
 * @param len     number of valid elements in `input`
 */
__global__ void reduction(float *input, float *output, int len) {
    __shared__ float partialSum[2 * BLOCK_SIZE];

    const unsigned int t = threadIdx.x;
    const unsigned int start = 2 * blockIdx.x * blockDim.x;
    // Treat a non-positive length as "nothing to load".
    const unsigned int n = (len > 0) ? (unsigned int)len : 0u;

    // Global indices of the two elements this thread is responsible for.
    const unsigned int first = start + t;
    const unsigned int second = first + blockDim.x;

    // The bounds check must use the GLOBAL index: the last block usually
    // extends past the end of the input and has to be padded with zeros
    // instead of reading out-of-bounds memory.
    partialSum[t] = (first < n) ? input[first] : 0.0f;
    partialSum[blockDim.x + t] = (second < n) ? input[second] : 0.0f;

    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        // Barrier is outside the divergent branch so every thread reaches it;
        // it makes the previous round's writes visible before they are read.
        __syncthreads();
        if (t < stride) {
            partialSum[t] += partialSum[t + stride];
        }
    }

    // Thread 0 performed the final addition itself, so no extra barrier is
    // needed before it publishes the block's result.
    if (t == 0) {
        output[blockIdx.x] = partialSum[0];
    }
}
/**
 * Allocates a host vector and fills it with pseudo-random values.
 *
 * Every element is (rand() % 100) / 100.00, i.e. one of 0.00, 0.01, ..., 0.99.
 * rand() is not seeded here, so the sequence depends on the caller's seed
 * state.
 *
 * @param vec_h  out-parameter; receives the malloc'ed buffer (caller frees)
 * @param size   number of floats to allocate; values <= 0 yield an empty vector
 *
 * Terminates the process with EXIT_FAILURE if the allocation fails, instead
 * of continuing and writing through a NULL pointer.
 */
void initVector(float **vec_h, int size)
{
    // Clamp once so that a negative size cannot wrap to a huge unsigned count.
    const size_t count = (size > 0) ? (size_t)size : 0;

    *vec_h = (float*)malloc(count * sizeof(float));
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // counts as a failure.
    if (*vec_h == NULL && count > 0) {
        printf("Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < count; i++) {
        (*vec_h)[i] = (rand() % 100) / 100.00;
    }
}
/**
 * Checks a reduction result against a sequential host-side sum.
 *
 * The reference is accumulated in single precision, in index order. The
 * result is accepted when the relative error (sum - result) / sum lies
 * within +/- 2e-5.
 *
 * @param input         host array holding the summed values
 * @param num_elements  number of elements in `input`
 * @param result        sum to be checked
 *
 * Prints "TEST PASSED ..." and returns on success. On mismatch prints
 * "TEST FAILED ..." and terminates the process with EXIT_FAILURE so that
 * scripts invoking the program can detect the failure.
 */
void verify(float* input, unsigned num_elements, float result) {
    const float relativeTolerance = 2e-5;

    float sum = 0.0f;
    // Unsigned index matches the type of num_elements (no sign conversion).
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    float relativeError = (sum - result) / sum;
    if (relativeError > relativeTolerance
        || relativeError < -relativeTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
        // A failed check must not report success to the shell.
        exit(EXIT_FAILURE);
    }
    printf("TEST PASSED, cpu = %0.3f, gpu = %0.3f ", sum, result);
}
/* Terminates with a diagnostic when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Driver: builds a random input vector, reduces it on the GPU to one partial
 * sum per block, adds the partial sums on the host and verifies the total.
 *
 * Usage: program [numInputElements]   (defaults to 1000000)
 */
int main(int argc, char ** argv) {
    unsigned i;
    float *hostInput;
    float *hostOutput;
    float *deviceInput;
    float *deviceOutput;
    unsigned numInputElements;
    unsigned numOutputElements;
    dim3 dimGrid, dimBlock;

    if (argc == 1) {
        numInputElements = 1000000;
    }
    else if (argc == 2) {
        // Reject zero, negative and non-numeric arguments instead of letting
        // them wrap to a huge unsigned element count.
        int requested = atoi(argv[1]);
        if (requested <= 0) {
            fprintf(stderr, "Element count must be a positive integer, got \"%s\"\n", argv[1]);
            return EXIT_FAILURE;
        }
        numInputElements = (unsigned)requested;
    }
    else {
        // Previously this path left numInputElements uninitialised.
        fprintf(stderr, "Usage: %s [numInputElements]\n", argv[0]);
        return EXIT_FAILURE;
    }

    initVector(&hostInput, numInputElements);

    // Each block consumes 2 * BLOCK_SIZE inputs; round up for the tail block.
    numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    }

    hostOutput = (float*)malloc(numOutputElements * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "Unable to allocate host output buffer\n");
        free(hostInput);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&deviceInput, numInputElements * sizeof(float)),
              "cudaMalloc(deviceInput)");
    checkCuda(cudaMalloc((void**)&deviceOutput, numOutputElements * sizeof(float)),
              "cudaMalloc(deviceOutput)");
    checkCuda(cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float),
                         cudaMemcpyHostToDevice),
              "copy input to device");

    dimBlock.x = BLOCK_SIZE; dimBlock.y = dimBlock.z = 1;
    dimGrid.x = numOutputElements; dimGrid.y = dimGrid.z = 1;

    reduction<<<dimGrid,dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults during execution surface at the following synchronisation.
    checkCuda(cudaGetLastError(), "reduction kernel launch");
    checkCuda(cudaDeviceSynchronize(), "reduction kernel execution");

    checkCuda(cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "copy partial sums to host");

    // Fold the per-block partial sums into hostOutput[0], printing the
    // running total after every addition.
    int count = 0;
    for (i = 1; i < numOutputElements; i++) {
        hostOutput[0] += hostOutput[i];
        count++;
        printf("%.1f ", hostOutput[0]);
    }
    // NOTE(review): `count` is the number of host-side additions, yet
    // `count - 1` is what gets printed; kept as in the original, confirm intent.
    printf("Sums accumulated on host: %i ", count - 1);

    verify(hostInput, numInputElements, hostOutput[0]);

    checkCuda(cudaFree(deviceInput), "cudaFree(deviceInput)");
    checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");
    free(hostInput);
    free(hostOutput);
    return 0;
}
Explanation / Answer
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 512
/**
 * Per-block sum reduction.
 *
 * Each thread block loads up to 2 * blockDim.x consecutive elements of
 * `input`, starting at element 2 * blockIdx.x * blockDim.x, into shared
 * memory, reduces them with a halving-stride tree, and writes the block's
 * partial sum to output[blockIdx.x].
 *
 * Launch requirements (derived from the code below):
 *   - 1D blocks with blockDim.x <= BLOCK_SIZE (the shared array holds
 *     2 * BLOCK_SIZE floats);
 *   - blockDim.x must be a power of two, otherwise the halving strides
 *     skip elements;
 *   - `output` must have room for gridDim.x floats.
 *
 * @param input   device array of `len` floats to be summed
 * @param output  device array receiving one partial sum per block
 * @param len     number of valid elements in `input`
 */
__global__ void reduction(float *input, float *output, int len) {
    __shared__ float partialSum[2 * BLOCK_SIZE];

    const unsigned int t = threadIdx.x;
    const unsigned int start = 2 * blockIdx.x * blockDim.x;
    // Treat a non-positive length as "nothing to load".
    const unsigned int n = (len > 0) ? (unsigned int)len : 0u;

    // Global indices of the two elements this thread is responsible for.
    const unsigned int first = start + t;
    const unsigned int second = first + blockDim.x;

    // The original guard compared the block-local index against len, so the
    // last block read past the end of the input. Comparing the GLOBAL index
    // pads the tail with zeros instead.
    partialSum[t] = (first < n) ? input[first] : 0.0f;
    partialSum[blockDim.x + t] = (second < n) ? input[second] : 0.0f;

    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        // Barrier is outside the divergent branch so every thread reaches it;
        // it makes the previous round's writes visible before they are read.
        __syncthreads();
        if (t < stride) {
            partialSum[t] += partialSum[t + stride];
        }
    }

    // Thread 0 performed the final addition itself, so no extra barrier is
    // needed before it publishes the block's result.
    if (t == 0) {
        output[blockIdx.x] = partialSum[0];
    }
}
/**
 * Allocates a host vector and fills it with pseudo-random values.
 *
 * Every element is (rand() % 100) / 100.00, i.e. one of 0.00, 0.01, ..., 0.99.
 * rand() is not seeded here, so the sequence depends on the caller's seed
 * state.
 *
 * @param vec_h  out-parameter; receives the malloc'ed buffer (caller frees)
 * @param size   number of floats to allocate; values <= 0 yield an empty vector
 *
 * Terminates the process with EXIT_FAILURE if the allocation fails, instead
 * of continuing and writing through a NULL pointer.
 */
void initVector(float **vec_h, int size)
{
    // Clamp once so that a negative size cannot wrap to a huge unsigned count.
    const size_t count = (size > 0) ? (size_t)size : 0;

    *vec_h = (float*)malloc(count * sizeof(float));
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // counts as a failure.
    if (*vec_h == NULL && count > 0) {
        printf("Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < count; i++) {
        (*vec_h)[i] = (rand() % 100) / 100.00;
    }
}
/**
 * Checks a reduction result against a sequential host-side sum.
 *
 * The reference is accumulated in single precision, in index order. The
 * result is accepted when the relative error (sum - result) / sum lies
 * within +/- 2e-5.
 *
 * @param input         host array holding the summed values
 * @param num_elements  number of elements in `input`
 * @param result        sum to be checked
 *
 * Prints "TEST PASSED ..." and returns on success. On mismatch prints
 * "TEST FAILED ..." and terminates the process with EXIT_FAILURE so that
 * scripts invoking the program can detect the failure.
 */
void verify(float* input, unsigned num_elements, float result) {
    const float relativeTolerance = 2e-5;

    float sum = 0.0f;
    // Unsigned index matches the type of num_elements (no sign conversion).
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    float relativeError = (sum - result) / sum;
    if (relativeError > relativeTolerance
        || relativeError < -relativeTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
        // A failed check must not report success to the shell.
        exit(EXIT_FAILURE);
    }
    printf("TEST PASSED, cpu = %0.3f, gpu = %0.3f ", sum, result);
}
/* Terminates with a diagnostic when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Driver: builds a random input vector, reduces it on the GPU to one partial
 * sum per block, adds the partial sums on the host and verifies the total.
 *
 * Usage: program [numInputElements]   (defaults to 1000000)
 */
int main(int argc, char ** argv) {
    unsigned i;
    float *hostInput;
    float *hostOutput;
    float *deviceInput;
    float *deviceOutput;
    unsigned numInputElements;
    unsigned numOutputElements;
    dim3 dimGrid, dimBlock;

    if (argc == 1) {
        numInputElements = 1000000;
    }
    else if (argc == 2) {
        // Reject zero, negative and non-numeric arguments instead of letting
        // them wrap to a huge unsigned element count.
        int requested = atoi(argv[1]);
        if (requested <= 0) {
            fprintf(stderr, "Element count must be a positive integer, got \"%s\"\n", argv[1]);
            return EXIT_FAILURE;
        }
        numInputElements = (unsigned)requested;
    }
    else {
        // Previously this path left numInputElements uninitialised.
        fprintf(stderr, "Usage: %s [numInputElements]\n", argv[0]);
        return EXIT_FAILURE;
    }

    initVector(&hostInput, numInputElements);

    // Each block consumes 2 * BLOCK_SIZE inputs; round up for the tail block.
    numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    }

    hostOutput = (float*)malloc(numOutputElements * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "Unable to allocate host output buffer\n");
        free(hostInput);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&deviceInput, numInputElements * sizeof(float)),
              "cudaMalloc(deviceInput)");
    checkCuda(cudaMalloc((void**)&deviceOutput, numOutputElements * sizeof(float)),
              "cudaMalloc(deviceOutput)");
    checkCuda(cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float),
                         cudaMemcpyHostToDevice),
              "copy input to device");

    dimBlock.x = BLOCK_SIZE; dimBlock.y = dimBlock.z = 1;
    dimGrid.x = numOutputElements; dimGrid.y = dimGrid.z = 1;

    reduction<<<dimGrid,dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults during execution surface at the following synchronisation.
    checkCuda(cudaGetLastError(), "reduction kernel launch");
    checkCuda(cudaDeviceSynchronize(), "reduction kernel execution");

    checkCuda(cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "copy partial sums to host");

    // Fold the per-block partial sums into hostOutput[0], printing the
    // running total after every addition.
    int count = 0;
    for (i = 1; i < numOutputElements; i++) {
        hostOutput[0] += hostOutput[i];
        count++;
        printf("%.1f ", hostOutput[0]);
    }
    // NOTE(review): `count` is the number of host-side additions, yet
    // `count - 1` is what gets printed; kept as in the original, confirm intent.
    printf("Sums accumulated on host: %i ", count - 1);

    verify(hostInput, numInputElements, hostOutput[0]);

    checkCuda(cudaFree(deviceInput), "cudaFree(deviceInput)");
    checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");
    free(hostInput);
    free(hostOutput);
    return 0;
}
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.