m = mat4(1, 2, 3, …) 75x slower than m = mat4(); m.m11 = 1; … (was: Allocation sinking in git HEAD)

  • From: Adam Strzelecki <ono@xxxxxxx>
  • To: luajit@xxxxxxxxxxxxx
  • Date: Wed, 4 Jul 2012 20:09:12 +0200

I am not sure whether I should post this as a reply to this thread, but it is 
related to my previous post: the example below shows that allocation sinking 
doesn't apply to bigger structures.

This example also shows another problem: using cdata initializers is much, much 
slower (75x slower) than creating an empty cdata object and then assigning its 
structure fields. I thought it should be the opposite. Does anyone know why?

-- $ luajit test.lua 
-- matrix * matrix with assign in 0.043875 seconds
-- matrix * matrix with init in 3.036972 seconds

local ffi = require 'ffi'
local mat4

-- Declare the C type used by the benchmark: a typedef'd struct named mat4
-- holding sixteen float fields, m11 through m44, declared in the order below.
-- The positional initializers used later (mat4(v1, v2, ...)) are written in
-- this same field order.
ffi.cdef [[
typedef struct {
  float m11, m21, m31, m41;
  float m12, m22, m32, m42;
  float m13, m23, m33, m43;
  float m14, m24, m34, m44;
} mat4;
]]


-- Bind the 'mat4' ctype to a metatable and keep the resulting constructor in
-- the forward-declared local `mat4`. Both metamethods evaluate the same sixteen
-- sums of products from their operands `a` and `b` and return a new mat4; they
-- differ only in how the result object is populated, which is what the
-- benchmark below compares.
mat4 = ffi.metatype('mat4', {
  -- Variant 1: return via initializers. All sixteen values are passed as
  -- positional arguments to the mat4 constructor, in struct field order.
  -- (I know that this is multiplication, not addition ;P)
  __add = function(a, b)
    return mat4(
    a.m11*b.m11 + a.m21*b.m12 + a.m31*b.m13 + a.m41*b.m14,
    a.m11*b.m21 + a.m21*b.m22 + a.m31*b.m23 + a.m41*b.m24,
    a.m11*b.m31 + a.m21*b.m32 + a.m31*b.m33 + a.m41*b.m34,
    a.m11*b.m41 + a.m21*b.m42 + a.m31*b.m43 + a.m41*b.m44,

    a.m12*b.m11 + a.m22*b.m12 + a.m32*b.m13 + a.m42*b.m14,
    a.m12*b.m21 + a.m22*b.m22 + a.m32*b.m23 + a.m42*b.m24,
    a.m12*b.m31 + a.m22*b.m32 + a.m32*b.m33 + a.m42*b.m34,
    a.m12*b.m41 + a.m22*b.m42 + a.m32*b.m43 + a.m42*b.m44,

    a.m13*b.m11 + a.m23*b.m12 + a.m33*b.m13 + a.m43*b.m14,
    a.m13*b.m21 + a.m23*b.m22 + a.m33*b.m23 + a.m43*b.m24,
    a.m13*b.m31 + a.m23*b.m32 + a.m33*b.m33 + a.m43*b.m34,
    a.m13*b.m41 + a.m23*b.m42 + a.m33*b.m43 + a.m43*b.m44,

    a.m14*b.m11 + a.m24*b.m12 + a.m34*b.m13 + a.m44*b.m14,
    a.m14*b.m21 + a.m24*b.m22 + a.m34*b.m23 + a.m44*b.m24,
    a.m14*b.m31 + a.m24*b.m32 + a.m34*b.m33 + a.m44*b.m34,
    a.m14*b.m41 + a.m24*b.m42 + a.m34*b.m43 + a.m44*b.m44)
  end,
  -- Variant 2: first allocate, then init. An empty mat4 is created and each
  -- field is assigned individually; the expressions match variant 1 one-to-one.
  __mul = function(a, b)
    local ret = mat4()
    ret.m11 = a.m11*b.m11 + a.m21*b.m12 + a.m31*b.m13 + a.m41*b.m14
    ret.m21 = a.m11*b.m21 + a.m21*b.m22 + a.m31*b.m23 + a.m41*b.m24
    ret.m31 = a.m11*b.m31 + a.m21*b.m32 + a.m31*b.m33 + a.m41*b.m34
    ret.m41 = a.m11*b.m41 + a.m21*b.m42 + a.m31*b.m43 + a.m41*b.m44

    ret.m12 = a.m12*b.m11 + a.m22*b.m12 + a.m32*b.m13 + a.m42*b.m14
    ret.m22 = a.m12*b.m21 + a.m22*b.m22 + a.m32*b.m23 + a.m42*b.m24
    ret.m32 = a.m12*b.m31 + a.m22*b.m32 + a.m32*b.m33 + a.m42*b.m34
    ret.m42 = a.m12*b.m41 + a.m22*b.m42 + a.m32*b.m43 + a.m42*b.m44

    ret.m13 = a.m13*b.m11 + a.m23*b.m12 + a.m33*b.m13 + a.m43*b.m14
    ret.m23 = a.m13*b.m21 + a.m23*b.m22 + a.m33*b.m23 + a.m43*b.m24
    ret.m33 = a.m13*b.m31 + a.m23*b.m32 + a.m33*b.m33 + a.m43*b.m34
    ret.m43 = a.m13*b.m41 + a.m23*b.m42 + a.m33*b.m43 + a.m43*b.m44

    ret.m14 = a.m14*b.m11 + a.m24*b.m12 + a.m34*b.m13 + a.m44*b.m14
    ret.m24 = a.m14*b.m21 + a.m24*b.m22 + a.m34*b.m23 + a.m44*b.m24
    ret.m34 = a.m14*b.m31 + a.m24*b.m32 + a.m34*b.m33 + a.m44*b.m34
    ret.m44 = a.m14*b.m41 + a.m24*b.m42 + a.m34*b.m43 + a.m44*b.m44
    return ret
  end
})

-- Benchmark driver: time the two mat4 operators over the same number of
-- rounds, starting from two matrices with ones on the diagonal and zeros
-- elsewhere, and print the CPU time reported by os.clock() for each.
local iterations = 200000 -- rounds per variant
local a = mat4(
  1, 0, 0, 0,
  0, 1, 0, 0,
  0, 0, 1, 0,
  0, 0, 0, 1)
local b = mat4(
  1, 0, 0, 0,
  0, 1, 0, 0,
  0, 0, 1, 0,
  0, 0, 0, 1)

-- Pass 1: `*` dispatches to __mul (allocate an empty mat4, then assign fields).
local t0 = os.clock()
for _ = 1, iterations do
  a = a * b
end
print(string.format('matrix * matrix with assign in %f seconds', os.clock() - t0))
io.stdout:flush()

-- Pass 2: `+` dispatches to __add (build the result through the initializer).
t0 = os.clock()
for _ = 1, iterations do
  a = a + b
end
print(string.format('matrix * matrix with init in %f seconds', os.clock() - t0))
io.stdout:flush()

Cheers,
-- 
Adam Strzelecki | nanoant.com | twitter.com/nanoant


Other related posts: