Below are two programs written in C. When tested, the execution times vary. Plea
ID: 3667043 • Letter: B
Question
Below are two programs written in C. When tested, the execution times vary. Please explain why in terms of memory, caches, etc. I can already see that there is a difference in row leading vs column leading between the two programs but would like to know what else makes the execution times vary on a bigger scale.
PROGRAM 1:
#include <stdio.h>
#include <stdlib.h>
void matrix_vector_multiply(long long *y, long long *A, long long *x, long long size) {
long long i, j;
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
y[i] += A[i*size + j] * x[j];
}
}
}
int main(int argc, char **argv) {
long long size = 100;
if (argc >= 2) size = atoi(argv[1]);
// y = Ax
long long *A = (long long *)malloc(sizeof(long long) * size * size);
long long *x = (long long *)malloc(sizeof(long long) * size);
long long *y = (long long *)calloc(size, sizeof(long long));
long long i, j;
for (i = 0; i < size; ++i)
for (j = 0; j < size; j++)
A[i*size + j] = j;
for (i = 0; i < size; ++i)
x[i] = i;
matrix_vector_multiply(y, A, x, size);
for (i = 1; i < size; ++i)
if (y[i] != y[i-1]) {
printf("failed ");
return 1;
}
printf("passed(%ld) ", y[0]);
free(A);
free(x);
free(y);
return 0;
}
PROGRAM 2:
#include <stdio.h>
#include <stdlib.h>
void matrix_vector_multiply(long long *y, long long *A, long long *x, long long size) {
long long i, j;
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
y[j] += A[j*size + i] * x[i];
}
}
}
int main(int argc, char **argv) {
long long size = 100;
if (argc >= 2) size = atoi(argv[1]);
// y = Ax
long long *A = (long long *)malloc(sizeof(long long) * size * size);
long long *x = (long long *)malloc(sizeof(long long) * size);
long long *y = (long long *)calloc(size, sizeof(long long));
long long i, j;
for (i = 0; i < size; ++i)
for (j = 0; j < size; j++)
A[i*size + j] = j;
for (i = 0; i < size; ++i)
x[i] = i;
matrix_vector_multiply(y, A, x, size);
for (i = 1; i < size; ++i)
if (y[i] != y[i-1]) {
printf("failed ");
return 1;
}
printf("passed(%ld) ", y[0]);
free(A);
free(x);
free(y);
return 0;
}
Explanation / Answer
That is due to spatial locality. When your program needs some data from the memory, the processor not only reads that specific data but also the neighboring data. So, in the next iteration when you need the next set of data, it is already there in your cache.
In the other case, your program can't take advantage of spatial locality since you are not reading the neighboring data in consecutive iterations.
Say your data is laid out in the memory like:
When your program needs to read say data labeled 0, it reads the entire row:
0 1 2 3 4 5 6 7 8 9
So, that when you need data labeled 1, it is already in the cache and your program runs faster.
On the contrary if you are reading data column wise, this doesn't help you, each time you get a cache miss and the processor again has to do a memory read.
In short, memory read are costly, this is processor's way of optimizing reads to save time.
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.