Question
Task 2.4 - Parallelizing Matrix Multiplication
You are given a skeleton sequential program for matrix multiplication below:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define M 500
#define N 500

int main(int argc, char *argv[]) {
    // set number of threads here
    omp_set_num_threads(8);

    int i, j, k;
    double sum;
    double **A, **B, **C;

    A = malloc(M * sizeof(double *));
    B = malloc(M * sizeof(double *));
    C = malloc(M * sizeof(double *));
    for (i = 0; i < M; i++) {
        A[i] = malloc(N * sizeof(double));
        B[i] = malloc(N * sizeof(double));
        C[i] = malloc(N * sizeof(double));
    }

    double start, end;

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            A[i][j] = j * 1;
            B[i][j] = i * j + 2;
            C[i][j] = j - i * 2;
        }
    }

    start = omp_get_wtime();
    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            sum = 0;
            for (k = 0; k < M; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
    end = omp_get_wtime();

    printf("Time of computation: %f\n", end - start);
}
You are to parallelize this algorithm in three different ways:
1. Add the necessary pragma to parallelize the outer for loop.
2. Remove the pragma from the outer for loop and create a pragma for the middle for loop.
3. Add the necessary pragmas to parallelize both the outer and middle for loops.
Then collect timing data for 2, 4, 8, 16 and 32 threads and three matrix sizes. You will find that when you run the same program several times, the timing values can vary significantly. Therefore, for each set of conditions, collect five data values and average them. Use a spreadsheet program from either MS Office or OpenOffice to collect and store your data, perform the necessary calculations, and illustrate the results visually with charts.
Here are the conditions you should collect data for:
1. No parallelization at all (that is, the given program).
2. Parallelizing the loops as above with 2, 4, 8, 16 and 32 threads, using matrix sizes 50x50, 500x500 and 5000x5000.
3. Test run set 1 using 1 CPU in your own VM.
4. Test run set 2 using the maximum number of cores supported by your VM (set the corresponding number of cores in your VM according to your physical machine), and describe in the learning report and Excel chart(s) the maximum number of cores used.
Explanation / Answer
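Before the two full examples, here is a minimal sketch of where the three pragmas asked for in the task could go in the given skeleton. It assumes the variables i, j, k, sum, A, B, C, M and N exactly as declared above; the clause choices shown (for instance making j, k and sum private because they are declared at function scope) are one reasonable option rather than the only correct one. Each fragment is a drop-in replacement for the timed loop nest.

/* 1. Outer loop parallelized: rows of C are divided among the threads. */
#pragma omp parallel for private(j, k, sum)
for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

/* 2. Only the middle loop parallelized: a team of threads shares one row of C at a time. */
for (i = 0; i < M; i++) {
    #pragma omp parallel for private(k, sum)
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

/* 3. Both loops parallelized, here by collapsing them into one iteration space
   (nested parallel regions are an alternative, but need nested parallelism enabled). */
#pragma omp parallel for collapse(2) private(k, sum)
for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
        sum = 0;
        for (k = 0; k < M; k++) {
            sum += A[i][k] * B[k][j];
        }
        C[i][j] = sum;
    }
}

Variant 2 usually ends up the slowest of the three because a parallel region is forked and joined once for every value of i.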
1ST Method
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
int main() {
int i, j, n, a[719][719], b[719], c[719];
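/* note: b and c are one-dimensional here, so this example computes a matrix-vector product rather than the matrix-matrix product from the task */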
double start = omp_get_wtime(); // wall-clock time; clock() would sum CPU time across all threads
n = 100; //Max 719
printf("Matrix A ");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf(" ");
}
printf(" Matrix B ");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d ", b[i]);
}
}
printf(" A * B ");
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
c[i] = 0;
}
}
#pragma omp parallel private(i,j) shared(n,a,b,c)
{
#pragma omp for schedule(dynamic)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
c[i] += b[j] * a[j][i];
}
}
}
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
printf("%d ", c[i]);
}
}
double stop = omp_get_wtime();
double elapsed = stop - start;
printf(" Time elapsed: %.5f ", elapsed);
start = omp_get_wtime();
printf("Matrix A ");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf(" ");
}
printf(" Matrix B ");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d ", b[i]);
}
}
printf(" A * B ");
omp_set_num_threads(4);
// reset c so the second multiplication does not accumulate onto the first result
for (i = 0; i < n; ++i) {
c[i] = 0;
}
// j is declared at function scope, so it must be private here to avoid a data race
#pragma omp parallel for private(j)
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
stop = omp_get_wtime();
elapsed = stop - start;
printf(" Time elapsed: %.5f ", elapsed);
return 0;
}
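The second multiplication in this method requests four threads with omp_set_num_threads(4). When collecting the per-thread-count data the task asks for, it is worth confirming how many threads the runtime actually grants, since it may give fewer than requested. A minimal check, assuming the file is compiled with OpenMP support (for example gcc -fopenmp):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(8);   /* request 8 threads */
    #pragma omp parallel
    {
        #pragma omp single
        printf("threads in this team: %d\n", omp_get_num_threads());
    }
    printf("max threads for the next parallel region: %d\n", omp_get_max_threads());
    return 0;
}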
2ND Method
#include <algorithm>
#include <omp.h>
#include <bitset>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <deque>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <utility>
#include <vector>
#include <assert.h>
using namespace std;
#define READ(f) freopen(f, "r", stdin)
#define WRITE(f) freopen(f, "w", stdout)
#define pks printf("Case %d: ",++ks);
#define mx 1002
int a[mx][mx];
int b[mx][mx];
int c[mx][mx];
int d[mx][mx];
void generate_matrix(int n)
{
int i,j;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
a[i][j]=rand()%100;
b[i][j]=rand()%100;
}
}
}
void check(int n)
{
int i,j;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
assert(c[i][j]==d[i][j]);
}
}
void matrix_mult_serial(int n)
{
int i,j,k;
double st=omp_get_wtime();
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
for(k=0;k<n;k++)
{
c[i][j]+=a[i][k]*b[k][j];
}
}
}
double en=omp_get_wtime();
printf("Serial: %lf ",en-st);
}
void matrix_mult_parallel1(int n)
{
//Static Scheduler
memset(d,0,sizeof d);
int i,j,k;
double st=omp_get_wtime();
#pragma omp parallel for schedule(static,50) collapse(2) private(i,j,k) shared(a,b,d)
for(i=0;i<n;i++)
for(j=0;j<n;j++)
for(k=0;k<n;k++)
d[i][j]+=a[i][k]*b[k][j];
double en=omp_get_wtime();
printf("Parallel-1(Static Scheduler) %lf ",en-st);
check(n);
}
void matrix_mult_parallel2(int n)
{
//Dynamic Scheduler
memset(d,0,sizeof d);
int i,j,k;
double st=omp_get_wtime();
#pragma omp parallel for schedule(dynamic,50) collapse(2) private(i,j,k) shared(a,b,d)
for(i=0;i<n;i++)
for(j=0;j<n;j++)
for(k=0;k<n;k++)
d[i][j]+=a[i][k]*b[k][j];
double en=omp_get_wtime();
printf("Parallel-2(Dynamic Scheduler) %lf ",en-st);
check(n);
}
int main() {
//READ("in");
//WRITE("out2");
int n=500;
generate_matrix(n);
matrix_mult_serial(n);
matrix_mult_parallel1(n);
matrix_mult_parallel2(n);
return 0;
}
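Finally, the task points out that timings vary from run to run and asks for five measurements per configuration. One way to make that less tedious is to repeat the timed multiplication inside the program and report the average. The sketch below does this for the outer-loop variant on the skeleton's problem; REPS and the fixed 500x500 size are arbitrary choices for illustration, and the matrices are stored as flat 1-D arrays only to keep the sketch short.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define M 500
#define N 500
#define REPS 5   /* number of timed repetitions to average over */

int main(void) {
    int i, j, k, r;
    double sum, total = 0.0;
    double *A = malloc((size_t)M * N * sizeof *A);
    double *B = malloc((size_t)M * N * sizeof *B);
    double *C = malloc((size_t)M * N * sizeof *C);

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            A[i * N + j] = j;
            B[i * N + j] = i * j + 2;
        }
    }

    for (r = 0; r < REPS; r++) {
        double start = omp_get_wtime();
        #pragma omp parallel for private(j, k, sum)
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                sum = 0;
                for (k = 0; k < M; k++) {
                    sum += A[i * N + k] * B[k * N + j];
                }
                C[i * N + j] = sum;
            }
        }
        double t = omp_get_wtime() - start;
        printf("run %d: %f seconds\n", r + 1, t);
        total += t;
    }
    printf("average over %d runs: %f seconds\n", REPS, total / REPS);

    free(A); free(B); free(C);
    return 0;
}

The five values the task asks for can then be read straight off the per-run lines, and the average off the last line.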