Separated fetch of bytecode instructions from its decode+dispatch.

  • From: "Gustavo Serra Scalet" <gustavo.scalet@xxxxxxxxxxxxxxx>
  • To: "luajit@xxxxxxxxxxxxx" <luajit@xxxxxxxxxxxxx>
  • Date: Fri, 6 Mar 2015 19:14:47 +0000

Hello LuaJIT enthusiasts,

I'm working on the PPC64 port along with Caio so I'm checking the .dasc file of 
the existing ported architectures. Currently I noticed that for x86 and arm64, 
the fetching of bytecode instructions is done right before the decode+dispatch 
(the "ins_next" macro). But for ppc and mips, the fetching 
and decode+dispatch are separated, with other instructions in between 
("ins_next1" and "ins_next2" or "ins_next3" macros). See BC_KSTR as an example 
below and its "ins_next*" macro invocations:

# grep -n -A11 KSTR: src/*.dasc
src/vm_arm.dasc:3174:  case BC_KSTR:
src/vm_arm.dasc-3175-    |  // RA = dst*8, RC = str_const (~)
src/vm_arm.dasc-3176-    |  mvn RC, RC
src/vm_arm.dasc-3177-    |   ins_next1
src/vm_arm.dasc-3178-    |  ldr CARG1, [KBASE, RC, lsl #2]
src/vm_arm.dasc-3179-    |  mvn CARG2, #~LJ_TSTR
src/vm_arm.dasc-3180-    |   ins_next2
src/vm_arm.dasc-3181-    |  strd CARG12, [BASE, RA]
src/vm_arm.dasc-3182-    |   ins_next3
src/vm_arm.dasc-3183-    break;
src/vm_arm.dasc-3184-  case BC_KCDATA:
src/vm_arm.dasc-3185-    |.if FFI
--
src/vm_arm64.dasc:2455:  case BC_KSTR:
src/vm_arm64.dasc-2456-    |  // RA = dst, RC = str_const (~)
src/vm_arm64.dasc-2457-    |  mvn RC, RC
src/vm_arm64.dasc-2458-    |  ldr TMP0, [KBASE, RC, lsl #3]
src/vm_arm64.dasc-2459-    |   movn TMP1, #~LJ_TSTR
src/vm_arm64.dasc-2460-    |  add TMP0, TMP0, TMP1, lsl #47
src/vm_arm64.dasc-2461-    |  str TMP0, [BASE, RA, lsl #3]
src/vm_arm64.dasc-2462-    |  ins_next
src/vm_arm64.dasc-2463-    break;
src/vm_arm64.dasc-2464-  case BC_KCDATA:
src/vm_arm64.dasc-2465-    |.if FFI
src/vm_arm64.dasc-2466-    |  // RA = dst, RC = cdata_const (~)
--
src/vm_mips.dasc:2842:  case BC_KSTR:
src/vm_mips.dasc-2843-    |  // RA = dst*8, RD = str_const*8 (~)
src/vm_mips.dasc-2844-    |  srl TMP1, RD, 1
src/vm_mips.dasc-2845-    |  subu TMP1, KBASE, TMP1
src/vm_mips.dasc-2846-    |  ins_next1
src/vm_mips.dasc-2847-    |  lw TMP0, -4(TMP1)          // KBASE-4-str_const*4
src/vm_mips.dasc-2848-    |  addu RA, BASE, RA
src/vm_mips.dasc-2849-    |   li TMP2, LJ_TSTR
src/vm_mips.dasc-2850-    |  sw TMP0, LO(RA)
src/vm_mips.dasc-2851-    |   sw TMP2, HI(RA)
src/vm_mips.dasc-2852-    |  ins_next2
src/vm_mips.dasc-2853-    break;
--
src/vm_ppc.dasc:3671:  case BC_KSTR:
src/vm_ppc.dasc-3672-    |  // RA = dst*8, RD = str_const*8 (~)
src/vm_ppc.dasc-3673-    |  srwi TMP1, RD, 1
src/vm_ppc.dasc-3674-    |  subfic TMP1, TMP1, -4
src/vm_ppc.dasc-3675-    |  ins_next1
src/vm_ppc.dasc-3676-    |  lwzx TMP0, KBASE, TMP1              // KBASE-4-str_const*4
src/vm_ppc.dasc-3677-    |  li TMP2, LJ_TSTR
src/vm_ppc.dasc-3678-    |  stwux TMP2, RA, BASE
src/vm_ppc.dasc-3679-    |  stw TMP0, 4(RA)
src/vm_ppc.dasc-3680-    |  ins_next2
src/vm_ppc.dasc-3681-    break;
src/vm_ppc.dasc-3682-  case BC_KCDATA:
--
src/vm_x86.dasc:3856:  case BC_KSTR:
src/vm_x86.dasc-3857-    |  ins_AND     // RA = dst, RD = str const (~)
src/vm_x86.dasc-3858-    |  mov RD, [KBASE+RD*4]
src/vm_x86.dasc-3859-    |  mov dword [BASE+RA*8+4], LJ_TSTR
src/vm_x86.dasc-3860-    |  mov [BASE+RA*8], RD
src/vm_x86.dasc-3861-    |  ins_next
src/vm_x86.dasc-3862-    break;
src/vm_x86.dasc-3863-  case BC_KCDATA:
src/vm_x86.dasc-3864-    |.if FFI
src/vm_x86.dasc-3865-    |  ins_AND     // RA = dst, RD = cdata const (~)
src/vm_x86.dasc-3866-    |  mov RD, [KBASE+RD*4]
src/vm_x86.dasc-3867-    |  mov dword [BASE+RA*8+4], LJ_TCDATA


What would be the reason for that? For readability purposes I'd leave them 
together but as they are separated for some architectures, one reason that I 
can think of is to optimize the processor's pipeline in order to avoid idling 
during the fetch of the next instruction. Other processors may have an 
optimized pipeline for this case, so the fetch and the decode+dispatch can stay together. Is that so? If 
not, why?!

Thanks

Other related posts: