Hello LuaJIT enthusiasts, I'm working on the PPC64 port along with Caio, so I'm checking the .dasc files of the existing ported architectures. I noticed that for x86 and arm64, the fetching of the next bytecode instruction is done right together with the decode+dispatch (a single "ins_next" macro). But for arm, ppc, and mips, the fetching and the decode+dispatch are separated, with other instructions interleaved between them (the "ins_next1" and "ins_next2" or "ins_next3" macros). See BC_KSTR as an example below and its "ins_next*" macro invocations: # grep -n -A11 KSTR: src/*.dasc src/vm_arm.dasc:3174: case BC_KSTR: src/vm_arm.dasc-3175- | // RA = dst*8, RC = str_const (~) src/vm_arm.dasc-3176- | mvn RC, RC src/vm_arm.dasc-3177- | ins_next1 src/vm_arm.dasc-3178- | ldr CARG1, [KBASE, RC, lsl #2] src/vm_arm.dasc-3179- | mvn CARG2, #~LJ_TSTR src/vm_arm.dasc-3180- | ins_next2 src/vm_arm.dasc-3181- | strd CARG12, [BASE, RA] src/vm_arm.dasc-3182- | ins_next3 src/vm_arm.dasc-3183- break; src/vm_arm.dasc-3184- case BC_KCDATA: src/vm_arm.dasc-3185- |.if FFI -- src/vm_arm64.dasc:2455: case BC_KSTR: src/vm_arm64.dasc-2456- | // RA = dst, RC = str_const (~) src/vm_arm64.dasc-2457- | mvn RC, RC src/vm_arm64.dasc-2458- | ldr TMP0, [KBASE, RC, lsl #3] src/vm_arm64.dasc-2459- | movn TMP1, #~LJ_TSTR src/vm_arm64.dasc-2460- | add TMP0, TMP0, TMP1, lsl #47 src/vm_arm64.dasc-2461- | str TMP0, [BASE, RA, lsl #3] src/vm_arm64.dasc-2462- | ins_next src/vm_arm64.dasc-2463- break; src/vm_arm64.dasc-2464- case BC_KCDATA: src/vm_arm64.dasc-2465- |.if FFI src/vm_arm64.dasc-2466- | // RA = dst, RC = cdata_const (~) -- src/vm_mips.dasc:2842: case BC_KSTR: src/vm_mips.dasc-2843- | // RA = dst*8, RD = str_const*8 (~) src/vm_mips.dasc-2844- | srl TMP1, RD, 1 src/vm_mips.dasc-2845- | subu TMP1, KBASE, TMP1 src/vm_mips.dasc-2846- | ins_next1 src/vm_mips.dasc-2847- | lw TMP0, -4(TMP1) // KBASE-4-str_const*4 src/vm_mips.dasc-2848- | addu RA, BASE, RA src/vm_mips.dasc-2849- | li TMP2, LJ_TSTR src/vm_mips.dasc-2850- | sw TMP0, LO(RA) src/vm_mips.dasc-2851- | sw TMP2, HI(RA) src/vm_mips.dasc-2852- | ins_next2 src/vm_mips.dasc-2853- break; -- src/vm_ppc.dasc:3671: case BC_KSTR: src/vm_ppc.dasc-3672- | // RA = dst*8, RD = str_const*8 (~) src/vm_ppc.dasc-3673- | srwi TMP1, RD, 1 src/vm_ppc.dasc-3674- | subfic TMP1, TMP1, -4 src/vm_ppc.dasc-3675- | ins_next1 src/vm_ppc.dasc-3676- | lwzx TMP0, KBASE, TMP1 // KBASE-4-str_const*4 src/vm_ppc.dasc-3677- | li TMP2, LJ_TSTR src/vm_ppc.dasc-3678- | stwux TMP2, RA, BASE src/vm_ppc.dasc-3679- | stw TMP0, 4(RA) src/vm_ppc.dasc-3680- | ins_next2 src/vm_ppc.dasc-3681- break; src/vm_ppc.dasc-3682- case BC_KCDATA: -- src/vm_x86.dasc:3856: case BC_KSTR: src/vm_x86.dasc-3857- | ins_AND // RA = dst, RD = str const (~) src/vm_x86.dasc-3858- | mov RD, [KBASE+RD*4] src/vm_x86.dasc-3859- | mov dword [BASE+RA*8+4], LJ_TSTR src/vm_x86.dasc-3860- | mov [BASE+RA*8], RD src/vm_x86.dasc-3861- | ins_next break; src/vm_x86.dasc-3862- break; src/vm_x86.dasc-3863- case BC_KCDATA: src/vm_x86.dasc-3864- |.if FFI src/vm_x86.dasc-3865- | ins_AND // RA = dst, RD = cdata const (~) src/vm_x86.dasc-3866- | mov RD, [KBASE+RD*4] src/vm_x86.dasc-3867- | mov dword [BASE+RA*8+4], LJ_TCDATA What would be the reason for that? For readability I would keep the fetch and the decode+dispatch together, but since they are deliberately interleaved on some architectures, one reason I can think of is pipeline scheduling: spreading the pieces of the dispatch sequence between the opcode's own work hides the load latency of fetching the next bytecode instruction, so the processor does not stall waiting for it. Out-of-order processors (such as modern x86 and arm64 cores) can schedule around that latency themselves, so on those architectures the whole sequence can stay together in a single macro. Is that the reason? If not, why? Thanks