Translate the above code (bottom using nested loops) using our DLX vector instru

ID: 3862616 • Letter: T

Question

Translate the above code (bottom using nested loops) using our DLX vector instruction set. Assume:

Vector registers of length 8

Load unit has a startup of L clocks

Adder unit has a startup of A clocks

Multiplier unit has a startup of M clocks

For vectors of length N, compute the number of clock cycles to execute the inner loop (the vector operations), both for normal execution and when chaining of loads/stores/addition/multiplication is allowed. How much speedup do we achieve with chaining?

low = 0;
VL = (n % MVL);                          /* find odd-size piece using modulo op % */
for (j = 0; j <= (n/MVL); j = j+1) {     /* outer loop */
    for (i = low; i < (low+VL); i = i+1) /* runs for length VL */
        Y[i] = a * x[i] + Y[i];          /* main operation */
    low = low + VL;                      /* start of next vector */
    VL = MVL;                            /* reset the length to maximum vector length */
}

Explanation / Answer

Answer:

# ----------------------------------------------------------------------
# Scalar x86-64 code (AT&T / GAS syntax) for a two-level strip-mined loop
# that computes  B[i] = A[i] * a + B[i]  on 4-byte integer elements.
# NOTE(review): this listing contains no vector instructions; it is not a
# DLX vector translation of the loop.
# NOTE(review): the function's own name label is not part of this listing;
# only the local label .LFB0 is visible.
#
# Stack slots (relative to %rbp) as used below:
#   -40   copy of the stack-protector value read from %fs:40
#   -100  "low"  : start index of the current strip (initialised to 0)
#   -88   "VL"   : current strip length (first n % MVL, then MVL)
#   -92   "j"    : outer-loop counter
#   -96   "i"    : inner-loop index; also read as the element count when
#                  the two stack arrays are sized
#   -84   dividend of both divisions (appears to be n)
#   -80   divisor of both divisions (appears to be MVL)
#   -76   scalar multiplier (appears to be a)
#   -64   base address of array B (read and written)
#   -48   base address of array A (read only)
#   -72, -56  element count minus one for each array; written, never read
# NOTE(review): no store to -84, -80 or -76 appears in this listing, and
# -96 is read (to size the arrays) before its first visible store on the
# first outer iteration -- confirm where these are initialised.
# ----------------------------------------------------------------------
.LFB0:
   .cfi_startproc
   # Prologue: set up a frame pointer and save the callee-saved registers
   # this function writes (%r13, %r12, %rbx), then reserve 88 bytes of locals.
   pushq   %rbp
   .cfi_def_cfa_offset 16
   .cfi_offset 6, -16
   movq   %rsp, %rbp
   .cfi_def_cfa_register 6
   pushq   %r13
   pushq   %r12
   pushq   %rbx
   subq   $88, %rsp
   .cfi_offset 13, -24
   .cfi_offset 12, -32
   .cfi_offset 3, -40
   # Stash the stack-protector value; it is re-checked before returning.
   movq   %fs:40, %rax
   movq   %rax, -40(%rbp)
   xorl   %eax, %eax
   # low = 0
   movl   $0, -100(%rbp)
   # VL = (-84) % (-80): cltd sign-extends %eax into %edx for idivl,
   # and the remainder comes back in %edx.
   movl   -84(%rbp), %eax
   cltd
   idivl   -80(%rbp)
   movl   %edx, -88(%rbp)
   # j = 0, then jump to the outer-loop condition at .L2.
   movl   $0, -92(%rbp)
   jmp   .L2
.L5:
   # ---- outer-loop body --------------------------------------------
   # Remember %rsp in %rbx so both stack arrays can be released at the
   # end of this iteration.
   movq   %rsp, %rax
   movq   %rax, %rbx
   # First stack array (B): element count is taken from -96(%rbp).
   movl   -96(%rbp), %eax
   movslq   %eax, %rdx
   subq   $1, %rdx
   movq   %rdx, -72(%rbp)
   # The values placed in %rsi/%rdi/%r8/%r9 are not read again in this listing.
   movslq   %eax, %rdx
   movq   %rdx, %rsi
   movl   $0, %edi
   movslq   %eax, %rdx
   movq   %rdx, %r8
   movl   $0, %r9d
   # Byte size = count * 4; compute (count*4 + 3 + 15) / 16 * 16 and
   # lower %rsp by that amount.  %edx is zeroed before the unsigned divq.
   cltq
   salq   $2, %rax
   leaq   3(%rax), %rdx
   movl   $16, %eax
   subq   $1, %rax
   addq   %rdx, %rax
   movl   $16, %ecx
   movl   $0, %edx
   divq   %rcx
   imulq   $16, %rax, %rax
   subq   %rax, %rsp
   # Round the new %rsp up to a multiple of 4 and keep it as B's base.
   movq   %rsp, %rax
   addq   $3, %rax
   shrq   $2, %rax
   salq   $2, %rax
   movq   %rax, -64(%rbp)
   # Second stack array (A): same sizing sequence, again using -96(%rbp).
   movl   -96(%rbp), %eax
   movslq   %eax, %rdx
   subq   $1, %rdx
   movq   %rdx, -56(%rbp)
   # The values placed in %r10-%r13 are not read again in this listing.
   movslq   %eax, %rdx
   movq   %rdx, %r10
   movl   $0, %r11d
   movslq   %eax, %rdx
   movq   %rdx, %r12
   movl   $0, %r13d
   cltq
   salq   $2, %rax
   leaq   3(%rax), %rdx
   movl   $16, %eax
   subq   $1, %rax
   addq   %rdx, %rax
   movl   $16, %ecx
   movl   $0, %edx
   divq   %rcx
   imulq   $16, %rax, %rax
   subq   %rax, %rsp
   movq   %rsp, %rax
   addq   $3, %rax
   shrq   $2, %rax
   salq   $2, %rax
   movq   %rax, -48(%rbp)
   # i = low, then jump to the inner-loop condition at .L3.
   movl   -100(%rbp), %eax
   movl   %eax, -96(%rbp)
   jmp   .L3
.L4:
   # ---- inner-loop body: B[i] = A[i] * a + B[i] ---------------------
   # %eax = A[i] * a   (A at -48, multiplier at -76)
   movq   -48(%rbp), %rax
   movl   -96(%rbp), %edx
   movslq   %edx, %rdx
   movl   (%rax,%rdx,4), %eax
   imull   -76(%rbp), %eax
   movl   %eax, %ecx
   # %ecx += B[i]      (B at -64)
   movq   -64(%rbp), %rax
   movl   -96(%rbp), %edx
   movslq   %edx, %rdx
   movl   (%rax,%rdx,4), %eax
   addl   %eax, %ecx
   # B[i] = %ecx
   movq   -64(%rbp), %rax
   movl   -96(%rbp), %edx
   movslq   %edx, %rdx
   movl   %ecx, (%rax,%rdx,4)
   # i++
   addl   $1, -96(%rbp)
.L3:
   # Inner-loop condition: continue while (low + VL) > i  (signed).
   movl   -100(%rbp), %edx
   movl   -88(%rbp), %eax
   addl   %edx, %eax
   cmpl   -96(%rbp), %eax
   jg   .L4
   # End of strip: low += VL; VL = (-80); release both stack arrays by
   # restoring %rsp from %rbx; j++.
   movl   -88(%rbp), %eax
   addl   %eax, -100(%rbp)
   movl   -80(%rbp), %eax
   movl   %eax, -88(%rbp)
   movq   %rbx, %rsp
   addl   $1, -92(%rbp)
.L2:
   # Outer-loop condition: continue while (-84) / (-80) >= j  (signed);
   # the quotient is recomputed on every pass.
   movl   -84(%rbp), %eax
   cltd
   idivl   -80(%rbp)
   cmpl   -92(%rbp), %eax
   jge   .L5
   # Return value 0 in %eax.
   movl   $0, %eax
   # Compare the saved stack-protector value with %fs:40; on mismatch
   # call __stack_chk_fail.
   movq   -40(%rbp), %rdi
   xorq   %fs:40, %rdi
   je   .L7
   call   __stack_chk_fail@PLT
.L7:
   # Epilogue: point %rsp at the three saved registers (rbp-24) and pop
   # them in reverse order of the prologue pushes.
   leaq   -24(%rbp), %rsp
   popq   %rbx
   popq   %r12
   popq   %r13
   popq   %rbp
   .cfi_def_cfa 7, 8
   ret
   .cfi_endproc

Navigate

Translate the above code (bottom using nested loops) using our DLX vector instru

Translate the code from the BASIC idiom to Java and then compile and run. You sh

Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.

Translate the above code (bottom using nested loops) using our DLX vector instru

Question

Explanation / Answer

Related Questions

Navigate