Translate the above code (bottom using nested loops) using our DLX vector instru
ID: 3862616 • Letter: T
Question
Translate the above code (bottom using nested loops) using our DLX vector instruction set. Assume:
Vector registers of length 8
Load unit has a startup of L clocks
Adder unit has a startup of A clocks
Multiplier unit has a startup of M clocks
For vectors of length N, compute the number of clock cycles to execute the inner loop (the vector operations), both for normal execution and then when chaining of loads/stores/addition/multiplication is allowed. How much speedup do we achieve with chaining?
low = 0;
VL = (n % MVL);                          /* find odd-size piece using modulo op % */
for (j = 0; j <= (n/MVL); j = j+1) {     /* outer loop */
    for (i = low; i < (low+VL); i = i+1) /* runs for length VL */
        Y[i] = a * X[i] + Y[i];          /* main operation */
    low = low + VL;                      /* start of next vector */
    VL = MVL;                            /* reset the length to maximum vector length */
}
Explanation / Answer
Answer:
# ----------------------------------------------------------------------
# Scalar x86-64 (AT&T / GAS syntax) listing of a strip-mined loop nest
# that performs  dst[i] = scale * src[i] + dst[i]  on 32-bit integers.
#
# NOTE(review): this is x86-64 code, not DLX vector code; it contains no
# vector instructions.
#
# Frame slots (offsets from %rbp) as used below.  The names in
# parentheses are a presumed mapping to the pseudo-code variables in the
# question -- confirm against the original C source:
#   -100  running start index                      (presumably low)
#    -96  inner-loop index; also the element count
#         used for both stack allocations          (presumably i)
#    -92  outer-loop counter                       (presumably j)
#    -88  current strip length                     (presumably VL)
#    -84  dividend of the idivl instructions       (presumably n)
#    -80  divisor of the idivl instructions        (presumably MVL)
#    -76  multiplier                               (presumably a)
#    -72, -56  element count minus one, one per allocation
#    -64  base of the array that is read and written (presumably Y)
#    -48  base of the array that is only read        (presumably X)
#    -40  saved copy of the stack-guard value from %fs:40
#
# NOTE(review): slots -84, -80 and -76 are read but never stored to in
# this listing, and slot -96 is read in the first .L5 pass before its
# first store, so the values used there are indeterminate as far as
# this listing shows.
# NOTE(review): only the local label .LFB0 is visible; the function's
# global symbol name is not part of this listing.
# ----------------------------------------------------------------------
.LFB0:
.cfi_startproc
# Prologue: set up %rbp as the frame pointer.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Save %r13, %r12 and %rbx (all three are written below and restored in
# the epilogue), then reserve 88 bytes for locals.
pushq %r13
pushq %r12
pushq %rbx
subq $88, %rsp
.cfi_offset 13, -24
.cfi_offset 12, -32
.cfi_offset 3, -40
# Copy the stack-guard value from %fs:40 into slot -40; it is compared
# again just before returning.
movq %fs:40, %rax
movq %rax, -40(%rbp)
xorl %eax, %eax
# slot -100 = 0 (start index).
movl $0, -100(%rbp)
# slot -88 = slot -84 % slot -80: idivl leaves the remainder in %edx.
# cltd sign-extends %eax into %edx to form the 64-bit dividend.
movl -84(%rbp), %eax
cltd
idivl -80(%rbp)
movl %edx, -88(%rbp)
# slot -92 = 0 (outer counter); enter the outer loop at its test (.L2).
movl $0, -92(%rbp)
jmp .L2
# ---- Outer loop body ----
.L5:
# Remember %rsp in %rbx so both allocations below can be released at
# the end of this pass.
movq %rsp, %rax
movq %rax, %rbx
# First stack allocation, sized from slot -96.
# slot -72 = (sign-extended count) - 1.
movl -96(%rbp), %eax
movslq %eax, %rdx
subq $1, %rdx
movq %rdx, -72(%rbp)
# %rsi/%rdi and %r8/%r9 receive the count and zero; none of these
# values is read again in this listing.
movslq %eax, %rdx
movq %rdx, %rsi
movl $0, %edi
movslq %eax, %rdx
movq %rdx, %r8
movl $0, %r9d
# %rax = count * 4 (bytes for 32-bit elements).
cltq
salq $2, %rax
# %rax = ((count*4 + 3 + 15) / 16) * 16: unsigned divide by 16, then
# multiply by 16, giving a multiple of 16 bytes.
leaq 3(%rax), %rdx
movl $16, %eax
subq $1, %rax
addq %rdx, %rax
movl $16, %ecx
movl $0, %edx
divq %rcx
imulq $16, %rax, %rax
# Move %rsp down by that size; the new region's base, rounded up to a
# multiple of 4, is stored in slot -64.
subq %rax, %rsp
movq %rsp, %rax
addq $3, %rax
shrq $2, %rax
salq $2, %rax
movq %rax, -64(%rbp)
# Second stack allocation: the same sequence again, with the count
# minus one stored in slot -56 and the base stored in slot -48.
movl -96(%rbp), %eax
movslq %eax, %rdx
subq $1, %rdx
movq %rdx, -56(%rbp)
# %r10/%r11 and %r12/%r13 receive the count and zero; none of these
# values is read again in this listing.
movslq %eax, %rdx
movq %rdx, %r10
movl $0, %r11d
movslq %eax, %rdx
movq %rdx, %r12
movl $0, %r13d
cltq
salq $2, %rax
leaq 3(%rax), %rdx
movl $16, %eax
subq $1, %rax
addq %rdx, %rax
movl $16, %ecx
movl $0, %edx
divq %rcx
imulq $16, %rax, %rax
subq %rax, %rsp
movq %rsp, %rax
addq $3, %rax
shrq $2, %rax
salq $2, %rax
movq %rax, -48(%rbp)
# Inner index (slot -96) = start index (slot -100); enter the inner
# loop at its test (.L3).
movl -100(%rbp), %eax
movl %eax, -96(%rbp)
jmp .L3
# ---- Inner loop body: one element per pass ----
.L4:
# %ecx = src[index] * multiplier, where src is the slot -48 array and
# the multiplier is slot -76.
movq -48(%rbp), %rax
movl -96(%rbp), %edx
movslq %edx, %rdx
movl (%rax,%rdx,4), %eax
imull -76(%rbp), %eax
movl %eax, %ecx
# %ecx += dst[index], where dst is the slot -64 array.
movq -64(%rbp), %rax
movl -96(%rbp), %edx
movslq %edx, %rdx
movl (%rax,%rdx,4), %eax
addl %eax, %ecx
# dst[index] = %ecx.
movq -64(%rbp), %rax
movl -96(%rbp), %edx
movslq %edx, %rdx
movl %ecx, (%rax,%rdx,4)
# index += 1.
addl $1, -96(%rbp)
# ---- Inner loop test: continue while (start + strip length) > index,
# using a signed comparison ----
.L3:
movl -100(%rbp), %edx
movl -88(%rbp), %eax
addl %edx, %eax
cmpl -96(%rbp), %eax
jg .L4
# End of strip: start index += strip length.
movl -88(%rbp), %eax
addl %eax, -100(%rbp)
# Strip length = slot -80 for every later pass.
movl -80(%rbp), %eax
movl %eax, -88(%rbp)
# Release both stack allocations by restoring the saved %rsp.
movq %rbx, %rsp
# Outer counter += 1.
addl $1, -92(%rbp)
# ---- Outer loop test: continue while (slot -84 / slot -80) >= counter,
# using a signed comparison.  The quotient is recomputed each pass. ----
.L2:
movl -84(%rbp), %eax
cltd
idivl -80(%rbp)
cmpl -92(%rbp), %eax
jge .L5
# %eax = 0 (presumably the function's return value).
movl $0, %eax
# Stack-guard check: if slot -40 no longer matches %fs:40, call
# __stack_chk_fail instead of returning.
movq -40(%rbp), %rdi
xorq %fs:40, %rdi
je .L7
call __stack_chk_fail@PLT
# ---- Epilogue ----
.L7:
# Point %rsp at the three saved registers (they occupy the 24 bytes
# below %rbp), then pop them in reverse push order.
leaq -24(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.