Question
Task 2.4 - Parallelizing Matrix Multiplication
You are given a skeleton sequential program for matrix multiplication below:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define M 500
#define N 500

int main(int argc, char *argv[]) {
    // set number of threads here
    omp_set_num_threads(8);

    int i, j, k;
    double sum;
    double **A, **B, **C;

    A = malloc(M * sizeof(double *));
    B = malloc(M * sizeof(double *));
    C = malloc(M * sizeof(double *));
    for (i = 0; i < M; i++) {
        A[i] = malloc(N * sizeof(double));
        B[i] = malloc(N * sizeof(double));
        C[i] = malloc(N * sizeof(double));
    }

    double start, end;

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            A[i][j] = j * 1;
            B[i][j] = i * j + 2;
            C[i][j] = j - i * 2;
        }
    }

    start = omp_get_wtime();
    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            sum = 0;
            for (k = 0; k < M; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
    end = omp_get_wtime();

    printf("Time of computation: %f\n", end - start);
}
You are to parallelize this algorithm in three different ways:
1. Add the necessary pragma to parallelize the outer for loop.
2. Remove the pragma from the outer for loop and create a pragma for the middle for loop.
3. Add the necessary pragmas to parallelize both the outer and middle for loops.
Then collect timing data for 2, 4, 8, 16 and 32 threads and three matrix sizes. You will find that when you run the same program several times, the timing values can vary significantly. Therefore, for each set of conditions, collect five data values and average them. Use a spreadsheet program from either MS Office or OpenOffice to collect and store your data, perform the necessary calculations, and illustrate the results visually with charts.
Here are the conditions you should collect data for:
1. No parallelization at all (that is, the given program).
2. Parallelizing the loops as above with 2, 4, 8, 16 and 32 threads, using matrix sizes 50x50, 500x500 and 5000x5000.
3. Test run set 1 using 1 CPU in your own VM.
4. Test run set 2 using the maximum number of cores supported by your VM (set the corresponding number of cores in your VM according to your physical machine), and describe in the learning report and Excel chart(s) the maximum number of cores used.
Explanation / Answer
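Before the two full examples, here is a minimal sketch of where the three pragmas asked for in the task could go in the given skeleton. It assumes the variables i, j, k, sum, A, B, C, M and N exactly as declared above; the clause choices shown (for instance making j, k and sum private because they are declared at function scope) are one reasonable option rather than the only correct one. Each fragment is a drop-in replacement for the timed loop nest.

/* 1. Outer loop parallelized: rows of C are divided among the threads. */
#pragma omp parallel for private(j, k, sum)
for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

/* 2. Only the middle loop parallelized: a team of threads shares one row of C at a time. */
for (i = 0; i < M; i++) {
    #pragma omp parallel for private(k, sum)
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

/* 3. Both loops parallelized, here by collapsing them into one iteration space
   (nested parallel regions are an alternative, but need nested parallelism enabled). */
#pragma omp parallel for collapse(2) private(k, sum)
for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

Variant 2 usually ends up the slowest of the three because a parallel region is forked and joined once for every value of i.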
1ST Method
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
int main() {
int i, j, n, a[719][719], b[719], c[719];
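/* note: b and c are one-dimensional here, so this example computes a matrix-vector product rather than the matrix-matrix product from the task */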
double start = omp_get_wtime(); // wall-clock time; clock() would sum CPU time across all threads
n = 100; //Max 719
printf("Matrix A ");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf(" ");
}
printf(" Matrix B ");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d ", b[i]);
}
}
printf(" A * B ");
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
c[i] = 0;
}
}
#pragma omp parallel private(i,j) shared(n,a,b,c)
{
#pragma omp for schedule(dynamic)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
c[i] += b[j] * a[j][i];
}
}
}
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
printf("%d ", c[i]);
}
}
double stop = omp_get_wtime();
double elapsed = stop - start;
printf(" Time elapsed: %.5f ", elapsed);
start = omp_get_wtime();
printf("Matrix A ");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf(" ");
}
printf(" Matrix B ");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d ", b[i]);
}
}
printf(" A * B ");
omp_set_num_threads(4);
// reset c so the second multiplication does not accumulate onto the first result
for (i = 0; i < n; ++i) {
c[i] = 0;
}
// j is declared at function scope, so it must be private here to avoid a data race
#pragma omp parallel for private(j)
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
stop = omp_get_wtime();
elapsed = stop - start;
printf(" Time elapsed: %.5f ", elapsed);
return 0;
}
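The second multiplication in this method requests four threads with omp_set_num_threads(4). When collecting the per-thread-count data the task asks for, it is worth confirming how many threads the runtime actually grants, since it may give fewer than requested. A minimal check, assuming the file is compiled with OpenMP support (for example gcc -fopenmp):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(8);   /* request 8 threads */
    #pragma omp parallel
    {
        #pragma omp single
        printf("threads in this team: %d\n", omp_get_num_threads());
    }
    printf("max threads for the next parallel region: %d\n", omp_get_max_threads());
    return 0;
}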
2ND Method
#include <algorithm>
#include <omp.h>
#include <bitset>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <deque>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <utility>
#include <vector>
#include <assert.h>
using namespace std;
#define READ(f) freopen(f, "r", stdin)
#define WRITE(f) freopen(f, "w", stdout)
#define pks printf("Case %d: ",++ks);
#define mx 1002
int a[mx][mx];
int b[mx][mx];
int c[mx][mx];
int d[mx][mx];
void generate_matrix(int n)
{
int i,j;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
a[i][j]=rand()%100;
b[i][j]=rand()%100;
}
}
}
void check(int n)
{
int i,j;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
assert(c[i][j]==d[i][j]);
}
}
void matrix_mult_serial(int n)
{
int i,j,k;
double st=omp_get_wtime();
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
for(k=0;k<n;k++)
{
c[i][j]+=a[i][k]*b[k][j];
}
}
}
double en=omp_get_wtime();
printf("Serial: %lf ",en-st);
}
void matrix_mult_parallel1(int n)
{
//Static Scheduler
memset(d,0,sizeof d);
int i,j,k;
double st=omp_get_wtime();
#pragma omp parallel for schedule(static,50) collapse(2) private(i,j,k) shared(a,b,d)
for(i=0;i<n;i++)
for(j=0;j<n;j++)
for(k=0;k<n;k++)
d[i][j]+=a[i][k]*b[k][j];
double en=omp_get_wtime();
printf("Parallel-1(Static Scheduler) %lf ",en-st);
check(n);
}
void matrix_mult_parallel2(int n)
{
//Dynamic Scheduler
memset(d,0,sizeof d);
int i,j,k;
double st=omp_get_wtime();
#pragma omp parallel for schedule(dynamic,50) collapse(2) private(i,j,k) shared(a,b,d)
for(i=0;i<n;i++)
for(j=0;j<n;j++)
for(k=0;k<n;k++)
d[i][j]+=a[i][k]*b[k][j];
double en=omp_get_wtime();
printf("Parallel-2(Dynamic Scheduler) %lf ",en-st);
check(n);
}
int main() {
//READ("in");
//WRITE("out2");
int n=500;
generate_matrix(n);
matrix_mult_serial(n);
matrix_mult_parallel1(n);
matrix_mult_parallel2(n);
return 0;
}
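Finally, the task points out that timings vary from run to run and asks for five measurements per configuration. One way to make that less tedious is to repeat the timed multiplication inside the program and report the average. The sketch below does this for the outer-loop variant on the skeleton's problem; REPS and the fixed 500x500 size are arbitrary choices for illustration, and the matrices are stored as flat 1-D arrays only to keep the sketch short.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define M 500
#define N 500
#define REPS 5   /* number of timed repetitions to average over */

int main(void) {
    int i, j, k, r;
    double sum, total = 0.0;
    double *A = malloc((size_t)M * N * sizeof *A);
    double *B = malloc((size_t)M * N * sizeof *B);
    double *C = malloc((size_t)M * N * sizeof *C);

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            A[i * N + j] = j;
            B[i * N + j] = i * j + 2;
        }
    }

    for (r = 0; r < REPS; r++) {
        double start = omp_get_wtime();
        #pragma omp parallel for private(j, k, sum)
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                sum = 0;
                for (k = 0; k < M; k++) {
                    sum += A[i * N + k] * B[k * N + j];
                }
                C[i * N + j] = sum;
            }
        }
        double t = omp_get_wtime() - start;
        printf("run %d: %f seconds\n", r + 1, t);
        total += t;
    }
    printf("average over %d runs: %f seconds\n", REPS, total / REPS);

    free(A); free(B); free(C);
    return 0;
}

The five values the task asks for can then be read straight off the per-run lines, and the average off the last line.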