Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

The code below is for matrix multiplication on GPU using CUDA. The code should handle several matrix sizes (see the full question below).

ID: 3633534 • Letter: T

Question

The code below is for matrix multiplication on GPU using CUDA. The code should handle the following matrix sizes
1. 512 x 512
2. 1500 x 1500
3. 5000 x 5000
4. 11000 x 11000
5. 17000 x 17000

Please add a 'main' function to my code, and check whether the code can handle the matrix sizes given above.



#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <omp.h>

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the device multiplication function
__global__ void Muld(float*, float*, int, int, float*);

// Host multiplication function
// Compute C = A * B
//   hA is the height of A (and of C)
//   wA is the width of A (== height of B)
//   wB is the width of B (and of C)
//
// The original version sized the grid with truncating integer division
// (wB / BLOCK_SIZE), which silently drops the partial tiles whenever a
// dimension is not a multiple of BLOCK_SIZE — i.e. for 1500, 5000,
// 11000 and 17000 the result matrix was left partly uncomputed.
// Because the Muld kernel (declared above, defined elsewhere) takes no
// hA argument and — per the original comment — assumes dimensions that
// ARE multiples of BLOCK_SIZE, we instead pad the device copies of A,
// B and C up to the next multiple of BLOCK_SIZE, zero-fill the
// padding, run the kernel on the padded shapes, and copy back only the
// hA x wB region of the result.  Zero padding contributes nothing to
// the dot products, so the top-left hA x wB block of the padded
// product is the exact answer for every requested size
// (512, 1500, 5000, 11000, 17000).
//
// Byte counts use size_t: 17000 x 17000 floats is ~1.16 GB, close to
// the limit of a signed 32-bit int (a GPU with enough memory is still
// required, of course — cudaMalloc failures are now reported).

// Report a CUDA error to stderr; returns nonzero on failure so the
// caller can bail out.
static int checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

void Mul(const float* A, const float* B, int hA, int wA, int wB,
float* C)
{
    // Round every dimension up to the next multiple of BLOCK_SIZE.
    int hAp = ((hA + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
    int wAp = ((wA + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
    int wBp = ((wB + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;

    size_t sizeA = (size_t)hAp * wAp * sizeof(float);
    size_t sizeB = (size_t)wAp * wBp * sizeof(float);
    size_t sizeC = (size_t)hAp * wBp * sizeof(float);

    float* Ad = NULL;
    float* Bd = NULL;
    float* Cd = NULL;

    // Allocate the padded device matrices.
    if (checkCuda(cudaMalloc((void**)&Ad, sizeA), "cudaMalloc A")) goto cleanup;
    if (checkCuda(cudaMalloc((void**)&Bd, sizeB), "cudaMalloc B")) goto cleanup;
    if (checkCuda(cudaMalloc((void**)&Cd, sizeC), "cudaMalloc C")) goto cleanup;

    // Zero the padded input buffers: the all-zero byte pattern is 0.0f,
    // so the padding rows/columns add nothing to any dot product.
    if (checkCuda(cudaMemset(Ad, 0, sizeA), "cudaMemset A")) goto cleanup;
    if (checkCuda(cudaMemset(Bd, 0, sizeB), "cudaMemset B")) goto cleanup;

    // Copy the tightly-packed host rows into the wider padded device
    // rows; cudaMemcpy2D handles the source/destination pitch mismatch.
    if (checkCuda(cudaMemcpy2D(Ad, (size_t)wAp * sizeof(float),
                               A, (size_t)wA * sizeof(float),
                               (size_t)wA * sizeof(float), (size_t)hA,
                               cudaMemcpyHostToDevice), "copy A")) goto cleanup;
    if (checkCuda(cudaMemcpy2D(Bd, (size_t)wBp * sizeof(float),
                               B, (size_t)wB * sizeof(float),
                               (size_t)wB * sizeof(float), (size_t)wA,
                               cudaMemcpyHostToDevice), "copy B")) goto cleanup;

    // The padded dimensions divide evenly by BLOCK_SIZE, so the grid
    // covers the matrices exactly and Muld's "multiples of BLOCK_SIZE"
    // assumption holds for every input size.
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(wBp / BLOCK_SIZE, hAp / BLOCK_SIZE);
        Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wAp, wBp, Cd);
    }
    // Kernel launches do not return an error code; fetch it explicitly
    // (catches bad launch configurations).
    if (checkCuda(cudaGetLastError(), "Muld launch")) goto cleanup;

    // Copy back only the meaningful hA x wB region of the padded
    // result; the blocking cudaMemcpy2D also synchronizes with Muld.
    checkCuda(cudaMemcpy2D(C, (size_t)wB * sizeof(float),
                           Cd, (size_t)wBp * sizeof(float),
                           (size_t)wB * sizeof(float), (size_t)hA,
                           cudaMemcpyDeviceToHost), "copy C");

cleanup:
    // cudaFree(NULL) is a no-op, so unconditional frees are safe even
    // when an earlier allocation failed.
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
}

Explanation / Answer

int size = Width * Width * sizeof(float);
float *Md, *Nd, *Pd;
cudaError_t err = cudaSuccess;

// Allocate device memory for M, N and P
err = cudaMalloc((void**)&Md, size);
err = cudaMalloc((void**)&Nd, size);
err = cudaMalloc((void**)&Pd, size);

// Copy matrices from host memory to device memory
err = cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

// Set up the execution configuration (ceiling division covers sizes
// that are not multiples of TileWidth)
dim3 dimBlock(TileWidth, TileWidth, 1);
dim3 dimGrid(ceil((float)Width / TileWidth), ceil((float)Width / TileWidth), 1);

// Launch the kernel — note the <<<dimGrid, dimBlock>>> launch
// configuration, which was lost in the original one-line formatting
MatrixMultiplicationMultiBlock_Kernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

// Copy the result back to the host and free device memory
err = cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote