[haiku-commits] r40768 - haiku/trunk/src/add-ons/media/plugins/ffmpeg

  • From: dlmcpaul@xxxxxxxxx
  • To: haiku-commits@xxxxxxxxxxxxx
  • Date: Tue, 1 Mar 2011 22:32:01 +0100 (CET)

Author: dlmcpaul
Date: 2011-03-01 22:32:01 +0100 (Tue, 01 Mar 2011)
New Revision: 40768
Changeset: http://dev.haiku-os.org/changeset/40768

Modified:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
Log:
Use multiplication instead of shifts and adds.  Include SSSE3 routine
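
For context: the conversion macros below drop the shift-and-add approximations of the YUV->RGB coefficients and instead multiply the chroma word pairs by 4.12 fixed-point constants with pmaddwd, then shift the products back down by 12 bits. A rough scalar C++ equivalent of the per-pixel math, using the same constants the diff adds to the .data sections, is sketched here; it is illustrative only and not part of the commit:

#include <algorithm>
#include <cstdint>

// Scalar sketch of the fixed-point math behind the new SIMD path: coefficients
// scaled by 4096 (5743/4096 ~ 1.402, -1409/4096 ~ -0.344, -2925/4096 ~ -0.714,
// 7258/4096 ~ 1.772), mirroring pmaddwd followed by psrad 12.  Names are
// illustrative.
static uint8_t
clamp_to_byte(int value)
{
	return (uint8_t)std::min(255, std::max(0, value));
}

void
yuv_to_rgb_reference(int y, int u, int v, uint8_t* r, uint8_t* g, uint8_t* b)
{
	u -= 128;					// psubsw with Const128
	v -= 128;
	int rOffset = (0 * u + 5743 * v) >> 12;		// RConst pair
	int gOffset = (-1409 * u - 2925 * v) >> 12;	// GConst pair
	int bOffset = (7258 * u + 0 * v) >> 12;		// BConst pair
	*r = clamp_to_byte(y + rOffset);		// paddsw + packuswb clamp
	*g = clamp_to_byte(y + gOffset);
	*b = clamp_to_byte(y + bOffset);
}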

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp   2011-03-01 20:24:04 UTC (rev 40767)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp   2011-03-01 21:32:01 UTC (rev 40768)
@@ -42,9 +42,11 @@
 
                        if (pixelFormat == PIX_FMT_YUV420P 
                                || pixelFormat == PIX_FMT_YUVJ420P) {
-                               if (cpu.HasSSE2() && width % 8 == 0 && height % 2 == 0) {
-                                       TRACE("resolve_colorspace: "
-                                               "gfx_conv_yuv420p_rgba32_sse2\n");
+                               if (cpu.HasSSSE3() && width % 8 == 0 && height % 2 == 0) {
+                                       TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_ssse3\n");
+                                       return gfx_conv_yuv420p_rgba32_ssse3;
+                               } else if (cpu.HasSSE2() && width % 8 == 0 && height % 2 == 0) {
+                                       TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n");
                                        return gfx_conv_yuv420p_rgba32_sse2;
                                } else if (cpu.HasSSE1() && width % 4 == 0
                                        && height % 2 == 0) {
@@ -58,7 +60,10 @@
 
                        if (pixelFormat == PIX_FMT_YUV422P
                                || pixelFormat == PIX_FMT_YUVJ422P) {
-                               if (cpu.HasSSE2() && width % 8 == 0) {
+                               if (cpu.HasSSSE3() && width % 8 == 0) {
+                                       TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_ssse3\n");
+                                       return gfx_conv_yuv422p_rgba32_ssse3;
+                               } else if (cpu.HasSSE2() && width % 8 == 0) {
                                        TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
                                        return gfx_conv_yuv422p_rgba32_sse2;
                                } else if (cpu.HasSSE1() && width % 4 == 0) {
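
The dispatch change itself just inserts the SSSE3 branch ahead of the existing SSE2 and SSE1 branches, keeping the same width/height alignment requirements. A minimal, self-contained sketch of that selection order; the CpuCaps struct and the stub converters stand in for the plugin's real CPU-detection class and converter signatures:

#include <cstdio>

// Capability-based dispatch order, sketched with stand-in types and stubs.
struct CpuCaps {
	bool ssse3, sse2, sse1;
};

typedef const char* (*Converter)();

static const char* conv_ssse3() { return "gfx_conv_yuv420p_rgba32_ssse3"; }
static const char* conv_sse2()  { return "gfx_conv_yuv420p_rgba32_sse2"; }
static const char* conv_sse1()  { return "sse1 routine"; }
static const char* conv_c()     { return "generic C converter"; }

static Converter
pick_yuv420p_converter(const CpuCaps& cpu, int width, int height)
{
	if (cpu.ssse3 && width % 8 == 0 && height % 2 == 0)
		return conv_ssse3;	// new, preferred path
	if (cpu.sse2 && width % 8 == 0 && height % 2 == 0)
		return conv_sse2;
	if (cpu.sse1 && width % 4 == 0 && height % 2 == 0)
		return conv_sse1;
	return conv_c;			// everything else falls back
}

int
main()
{
	CpuCaps cpu = { true, true, true };
	std::printf("%s\n", pick_yuv420p_converter(cpu, 1280, 720)());
	return 0;
}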

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm        2011-03-01 20:24:04 UTC (rev 40767)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm        2011-03-01 21:32:01 UTC (rev 40768)
@@ -42,7 +42,7 @@
 ; b = y + u + u >> 1 + u >> 2 + u >> 6
 ; subtract 16 from y
        movq mm7, [Const16]                             ; loads a constant using data cache (slower on first fetch but then cached)
-       psubsw mm0,mm7                                  ; y = y - 16
+;      psubsw mm0,mm7                                  ; y = y - 16
 ; subtract 128 from u and v
        movq mm7, [Const128]                    ; loads a constant using data cache (slower on first fetch but then cached)
        psubsw mm1,mm7                                  ; u = u - 128

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm       2011-03-01 20:24:04 UTC (rev 40767)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm       2011-03-01 21:32:01 UTC (rev 40768)
@@ -33,65 +33,137 @@
 %1:
 %endmacro
 
+SECTION .data align=16
+
+Const16        dw      16
+       dw      16
+       dw      16
+       dw      16
+       dw      16
+       dw      16
+       dw      16
+       dw      16
+
+Const128       dw      128
+       dw      128
+       dw      128
+       dw      128
+       dw      128
+       dw      128
+       dw      128
+       dw      128
+       
+RConst dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               
+GConst dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               
+BConst dw 7258
+               dw 0
+               dw 7258
+               dw 0
+               dw 7258
+               dw 0
+               dw 7258
+               dw 0
+
+shuffconst db 0x0
+               db 0x01
+               db 0x00
+               db 0x01
+               db 0x04
+               db 0x05
+               db 0x04
+               db 0x05
+               db 0x08
+               db 0x09
+               db 0x08
+               db 0x09
+               db 0x0c
+               db 0x0d
+               db 0x0c
+               db 0x0d
+               
+YMask  db      0x00
+       db      0x80
+       db      0x02
+       db      0x80
+       db      0x04
+       db      0x80
+       db      0x06
+       db      0x80
+       db      0x08
+       db      0x80
+       db      0x0a
+       db      0x80
+       db      0x0c
+       db      0x80
+       db      0x0e
+       db      0x80
+
+UVMask db      0x01
+       db      0x80
+       db      0x03
+       db      0x80
+       db      0x05
+       db      0x80
+       db      0x07
+       db      0x80
+       db      0x09
+       db      0x80
+       db      0x0b
+       db      0x80
+       db      0x0d
+       db      0x80
+       db      0x0f
+       db      0x80
+
 ; conversion code 
 %macro yuv2rgbsse2 0
 ; u = u - 128
 ; v = v - 128
-; r = y + v + v >> 2 + v >> 3 + v >> 5 
-; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
-; b = y + u + u >> 1 + u >> 2 + u >> 6
+; r = y + 0 * u + 1.402 * v
+; g = y + -0.344 * u + -0.714 * v
+; b = y + 1.772 * u + 0 * v
 ; subtract 16 from y
-       movdqa xmm7, [Const16]                  ; loads a constant using data cache (slower on first fetch but then cached)
-       psubsw xmm0,xmm7                                ; y = y - 16
+;      psubsw xmm0, [Const16]                  ; y = y - 16
 ; subtract 128 from u and v
-       movdqa xmm7, [Const128]                 ; loads a constant using data cache (slower on first fetch but then cached)
-       psubsw xmm1,xmm7                                ; u = u - 128
-       psubsw xmm2,xmm7                                ; v = v - 128
-; load r,b with y 
-       movdqa xmm3,xmm0                                ; r = y 
-       pshufd xmm5,xmm0, 0xE4                  ; b = y 
-
-; r = y + v + v >> 2 + v >> 3 + v >> 5
-       paddsw xmm3, xmm2                               ; add v to r
-       movdqa xmm7, xmm1                               ; move u to scratch
-       pshufd xmm6, xmm2, 0xE4                 ; move v to scratch
+       psubsw xmm3, [Const128]                 ; u = u - 128, v = v -128
        
-       psraw  xmm6,2                                   ; divide v by 4
-       paddsw xmm3, xmm6                               ; and add to r
-       psraw  xmm6,1                                   ; divide v by 2
-       paddsw xmm3, xmm6                               ; and add to r
-       psraw  xmm6,2                                   ; divide v by 4
-       paddsw xmm3, xmm6                               ; and add to r
+       movdqa xmm4, xmm3                               ; duplicate
+       pshufd xmm5, xmm3, 0xE4                 ; duplicate
+       
+       pmaddwd xmm3, [RConst]                  ; multiply and add
+       pmaddwd xmm4, [GConst]                  ; to get RGB offsets to Y
+       pmaddwd xmm5, [BConst]                  ;
 
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-       paddsw xmm5, xmm1                               ; add u to b
-       psraw  xmm7,1                                   ; divide u by 2
-       paddsw xmm5, xmm7                               ; and add to b
-       psraw  xmm7,1                                   ; divide u by 2
-       paddsw xmm5, xmm7                               ; and add to b
-       psraw  xmm7,4                                   ; divide u by 32
-       paddsw xmm5, xmm7                               ; and add to b
+       psrad xmm3, 12                                  ; Scale back to original range
+       psrad xmm4, 12                                  ;
+       psrad xmm5, 12                                  ;
        
-; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
-       movdqa xmm7,xmm2                                ; move v to scratch
-       pshufd xmm6,xmm1, 0xE4                  ; move u to scratch
-       movdqa xmm4,xmm0                                ; g = y 
+       pshuflw xmm3, xmm3, 0xa0                ; duplicate results
+       pshufhw xmm3, xmm3, 0xa0
+       pshuflw xmm4, xmm4, 0xa0
+       pshufhw xmm4, xmm4, 0xa0
+       pshuflw xmm5, xmm5, 0xa0
+       pshufhw xmm5, xmm5, 0xa0
        
-       psraw  xmm6,2                                   ; divide u by 4
-       psubsw xmm4,xmm6                                ; subtract from g
-       psraw  xmm6,2                                   ; divide u by 4
-       psubsw xmm4,xmm6                                ; subtract from g
-       psraw  xmm6,1                                   ; divide u by 2
-       psubsw xmm4,xmm6                                ; subtract from g
-
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,2                                   ; divide v by 4
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
+       paddsw xmm3, xmm0                               ; add to y
+       paddsw xmm4, xmm0                               ;
+       paddsw xmm5, xmm0                               ;
 %endmacro
 
 ; outputer
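
A note on the pmaddwd step above: the inputer changes later in this diff leave u and v interleaved as signed 16-bit pairs, so one pmaddwd against the interleaved RConst/GConst/BConst pairs computes coeffU * u + coeffV * v for each chroma pair in a single instruction, and psrad 12 scales the 4.12 fixed-point result back down. A scalar C++ emulation of that step for one (u, v) pair, as a sketch only:

#include <cstdint>
#include <cstdio>

// Emulation of one pmaddwd + psrad 12 step on a single (u, v) word pair,
// using the RConst/GConst/BConst values from the .data section above.
// Illustrative only; the SIMD code handles four chroma pairs per register.
static int32_t
pmaddwd_pair(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
	// pmaddwd: multiply corresponding signed words, add adjacent products
	return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int
main()
{
	int16_t u = 30 - 128;	// example chroma samples after the -128 bias
	int16_t v = 200 - 128;

	int rOffset = pmaddwd_pair(u, v, 0, 5743) >> 12;	// RConst pair
	int gOffset = pmaddwd_pair(u, v, -1409, -2925) >> 12;	// GConst pair
	int bOffset = pmaddwd_pair(u, v, 7258, 0) >> 12;	// BConst pair

	std::printf("dR=%d dG=%d dB=%d\n", rOffset, gOffset, bOffset);
	return 0;
}
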
@@ -112,26 +184,6 @@
        movntdq [edi+16], xmm0                  ; output second 4 pixels bypassing cache
 %endmacro
 
-SECTION .data align=16
-
-Const16        dw      16
-       dw      16
-       dw      16
-       dw      16
-       dw      16
-       dw      16
-       dw      16
-       dw      16
-
-Const128       dw      128
-       dw      128
-       dw      128
-       dw      128
-       dw      128
-       dw      128
-       dw      128
-       dw      128
-
 ; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
 width    equ   ebp+16
 toPtr    equ   ebp+12
@@ -162,28 +214,20 @@
        test ecx,ecx
        jng ENDLOOP
REPEATLOOP:                                                    ; loop over width / 8
+       prefetchnta [esi+256]
 ; YUV422 packed inputer
        movdqa xmm0, [esi]                              ; should have yuyv yuyv yuyv yuyv
-       pshufd xmm1, xmm0, 0xE4                 ; copy to xmm1
-       movdqa xmm2, xmm0                               ; copy to xmm2
+       pshufd xmm3, xmm0, 0xE4                 ; copy to xmm3
 ; extract y
-       pxor xmm7,xmm7                                  ; 00000000000000000000000000000000
-       pcmpeqd xmm6,xmm6                               ; ffffffffffffffffffffffffffffffff
-       punpcklbw xmm6,xmm7                             ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+       pxor xmm7, xmm7                                 ; 00000000000000000000000000000000
+       pcmpeqd xmm6, xmm6                              ; ffffffffffffffffffffffffffffffff
+       punpcklbw xmm6, xmm7                    ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
        pand xmm0, xmm6                                 ; clear all but y values leaving y0y0 etc
-; extract u and duplicate so each u in yuyv becomes 0u0u
-       psrld xmm6,8                                    ; 00ff0000 00ff0000 00ff0000 00ff0000
-       pand xmm1, xmm6                                 ; clear all yv values leaving 0u00 etc
-       psrld xmm1,8                                    ; rotate u to get u000
-       pshuflw xmm1,xmm1, 0xA0                 ; copy u values
-       pshufhw xmm1,xmm1, 0xA0                 ; to get u0u0
-; extract v
-       pslld xmm6,16                                   ; 000000ff000000ff 000000ff000000ff
-       pand xmm2, xmm6                                 ; clear all yu values leaving 000v etc
-       psrld xmm2,8                                    ; rotate v to get 00v0
-       pshuflw xmm2,xmm2, 0xF5                 ; copy v values
-       pshufhw xmm2,xmm2, 0xF5                 ; to get v0v0
-
+; extract u and v
+       psllw xmm6, 8                                   ; 00ff00ff00ff00ff00ff00ff00ff00ff
+       pand xmm3, xmm6                                 ; extract uv values 0u0v0u0v0u0v0u0v0u0v
+       psrlw xmm3, 8                                   ; covert to 16bit   u0v0u0v0u0v0u0v0u0v0
+       
 yuv2rgbsse2
        
 rgba32sse2output
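
The reworked packed-YUV422 inputer above no longer builds separate u0u0 and v0v0 registers; it masks out the luma bytes and shifts the chroma bytes down, leaving one register of interleaved 16-bit u,v values, which is exactly the layout the pmaddwd step consumes. In scalar terms the extraction amounts to something like the following sketch:

#include <cstdint>

// Scalar sketch of the new YUYV extraction: in each 16-bit word the low byte
// is luma and the high byte is chroma, so a byte mask yields the y values and
// a word shift leaves interleaved (u, v) samples.  Illustrative only; the
// SIMD loop handles 16 bytes (8 pixels) per iteration.
void
split_yuyv(const uint8_t* src, int pixelPairs, uint16_t* y, uint16_t* uv)
{
	for (int i = 0; i < pixelPairs; i++) {
		// source layout: y0 u y1 v (4 bytes per 2 pixels)
		y[2 * i + 0]  = src[4 * i + 0];		// pand against the byte mask
		y[2 * i + 1]  = src[4 * i + 2];
		uv[2 * i + 0] = src[4 * i + 1];		// psllw/psrlw leaves u,v words
		uv[2 * i + 1] = src[4 * i + 3];
	}
}
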
@@ -224,22 +268,16 @@
 REPEATLOOP1:                                           ; loop over width / 8
 ; YUV420 Planar inputer
        movq xmm0, [esi]                                ; fetch 8 y values (8 bit) yyyyyyyy00000000
-       movd xmm1, [eax]                                ; fetch 4 u values (8 bit) uuuu000000000000
-       movd xmm2, [ebx]                                ; fetch 4 v values (8 bit) vvvv000000000000
+       movd xmm3, [eax]                                ; fetch 4 u values (8 bit) uuuu000000000000
+       movd xmm1, [ebx]                                ; fetch 4 v values (8 bit) vvvv000000000000
        
 ; extract y
-       pxor xmm7,xmm7                                  ; 00000000000000000000000000000000
-       punpcklbw xmm0,xmm7                             ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
-; extract u and duplicate so each becomes 0u0u
-       punpcklbw xmm1,xmm7                             ; interleave xmm7 into xmm1 u0u0u0u000000000
-       punpcklwd xmm1,xmm7                             ; interleave again u000u000u000u000
-       pshuflw xmm1,xmm1, 0xA0                 ; copy u values
-       pshufhw xmm1,xmm1, 0xA0                 ; to get u0u0
-; extract v
-       punpcklbw xmm2,xmm7                             ; interleave xmm7 into xmm1 v0v0v0v000000000
-       punpcklwd xmm2,xmm7                             ; interleave again v000v000v000v000
-       pshuflw xmm2,xmm2, 0xA0                 ; copy v values
-       pshufhw xmm2,xmm2, 0xA0                 ; to get v0v0
+       pxor xmm7, xmm7                                 ; 00000000000000000000000000000000
+       punpcklbw xmm0, xmm7                    ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
+       
+; combine u and v
+       punpcklbw xmm3, xmm1                    ; uvuvuvuv00000000
+       punpcklbw xmm3, xmm7                    ; u0v0u0v0u0v0u0v0
 
 yuv2rgbsse2
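
For the planar YUV420 input the interleaved chroma layout is now produced directly with punpcklbw: the four u bytes and four v bytes are interleaved into uvuvuvuv and then widened against zero to 16-bit words. A scalar equivalent, as a sketch only:

#include <cstdint>

// Scalar sketch of the planar-input change: interleave four u and four v bytes
// (punpcklbw xmm3, xmm1) and widen them against zero (punpcklbw xmm3, xmm7),
// producing the u,v,u,v word layout the conversion macro expects.
void
interleave_uv(const uint8_t* u, const uint8_t* v, int count, uint16_t* uv)
{
	for (int i = 0; i < count; i++) {
		uv[2 * i + 0] = u[i];	// zero-extended to 16 bit
		uv[2 * i + 1] = v[i];
	}
}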
        
@@ -263,4 +301,58 @@
        pop ebp
        ret
 
+cglobal Test_SSE2
+; reserve variables
+       push ebp
+       mov ebp, esp
+       push edi
+       push esi
+       push ecx
+       push eax
+       push ebx
+       
+       mov esi, [fromPtr]
+       mov edi, [toPtr]
+
+       movdqa xmm0, [esi]                              ; should have yuyv yuyv yuyv yuyv
+       pshufd xmm1, xmm0, 0xE4                 ; copy to xmm1
+       movdqa xmm3, xmm0                               ; copy to xmm2
+; extract y
+       pxor xmm7,xmm7                                  ; 00000000000000000000000000000000
+       pcmpeqd xmm6,xmm6                               ; ffffffffffffffffffffffffffffffff
+       punpcklbw xmm6,xmm7                             ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+       pand xmm0, xmm6                                 ; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+       psrld xmm6,8                                    ; 00ff0000 00ff0000 00ff0000 00ff0000
+       pand xmm1, xmm6                                 ; clear all yv values leaving 0u00 etc
+       psrld xmm1,8                                    ; rotate u to get u000
+; extract v
+       pslld xmm6,16                                   ; 000000ff000000ff 000000ff000000ff
+       pand xmm3, xmm6                                 ; clear all yu values leaving 000v etc
+       psrld xmm3,8                                    ; rotate v to get 00v0
+       por xmm3, xmm1
+
+       psubsw xmm3, [Const128]                 ; u = u - 128, v = v -128
+
+       pmaddwd xmm3, [RConst]                  ; multiply and add
+       psrad xmm3, 12                                  ; Scale back to original range
+
+       pshufb xmm3, [shuffconst]               ; duplicate results
+;      paddsw xmm3, xmm0                               ; add to y
+
+;      pxor xmm7,xmm7
+;      packuswb xmm3,xmm7                              ; clamp to 0,255 and pack R to 8 bit per pixel
+
+       movntdq [edi], xmm3                             ; output first 4 pixels bypassing cache
+
+; Cleanup
+       pop ebx
+       pop eax
+       pop ecx
+       pop esi
+       pop edi
+       mov esp, ebp
+       pop ebp
+       ret
+       
 SECTION .note.GNU-stack noalloc noexec nowrite progbits

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm      2011-03-01 20:24:04 UTC (rev 40767)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm      2011-03-01 21:32:01 UTC (rev 40768)
@@ -33,85 +33,6 @@
 %1:
 %endmacro
 
-; conversion code 
-%macro yuv2rgbsse2 0
-; u = u - 128
-; v = v - 128
-; r = y + v + v >> 2 + v >> 3 + v >> 5 
-; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-; subtract 16 from y
-       movdqa xmm7, [Const16]                  ; loads a constant using data cache (slower on first fetch but then cached)
-       psubsw xmm0,xmm7                                ; y = y - 16
-; subtract 128 from u and v
-       movdqa xmm7, [Const128]                 ; loads a constant using data cache (slower on first fetch but then cached)
-       psubsw xmm1,xmm7                                ; u = u - 128
-       psubsw xmm2,xmm7                                ; v = v - 128
-; load r,b with y 
-       movdqa xmm3,xmm0                                ; r = y 
-       pshufd xmm5,xmm0, 0xE4                  ; b = y 
-
-; r = y + v + v >> 2 + v >> 3 + v >> 5
-       paddsw xmm3, xmm2                               ; add v to r
-       movdqa xmm7, xmm1                               ; move u to scratch
-       pshufd xmm6, xmm2, 0xE4                 ; move v to scratch
-       
-       psraw  xmm6,2                                   ; divide v by 4
-       paddsw xmm3, xmm6                               ; and add to r
-       psraw  xmm6,1                                   ; divide v by 2
-       paddsw xmm3, xmm6                               ; and add to r
-       psraw  xmm6,2                                   ; divide v by 4
-       paddsw xmm3, xmm6                               ; and add to r
-
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-       paddsw xmm5, xmm1                               ; add u to b
-       psraw  xmm7,1                                   ; divide u by 2
-       paddsw xmm5, xmm7                               ; and add to b
-       psraw  xmm7,1                                   ; divide u by 2
-       paddsw xmm5, xmm7                               ; and add to b
-       psraw  xmm7,4                                   ; divide u by 32
-       paddsw xmm5, xmm7                               ; and add to b
-       
-; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
-       movdqa xmm7,xmm2                                ; move v to scratch
-       pshufd xmm6,xmm1, 0xE4                  ; move u to scratch
-       movdqa xmm4,xmm0                                ; g = y 
-       
-       psraw  xmm6,2                                   ; divide u by 4
-       psubsw xmm4,xmm6                                ; subtract from g
-       psraw  xmm6,2                                   ; divide u by 4
-       psubsw xmm4,xmm6                                ; subtract from g
-       psraw  xmm6,1                                   ; divide u by 2
-       psubsw xmm4,xmm6                                ; subtract from g
-
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,2                                   ; divide v by 4
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
-       psraw  xmm7,1                                   ; divide v by 2
-       psubsw xmm4,xmm7                                ; subtract from g
-%endmacro
-
-; outputer
-%macro rgba32sse2output 0
-; clamp values
-       pxor xmm7,xmm7
-       packuswb xmm3,xmm7                              ; clamp to 0,255 and pack R to 8 bit per pixel
-       packuswb xmm4,xmm7                              ; clamp to 0,255 and pack G to 8 bit per pixel
-       packuswb xmm5,xmm7                              ; clamp to 0,255 and pack B to 8 bit per pixel
-; convert to bgra32 packed
-       punpcklbw xmm5,xmm4                             ; bgbgbgbgbgbgbgbg
-       movdqa xmm0, xmm5                               ; save bg values
-       punpcklbw xmm3,xmm7                             ; r0r0r0r0r0r0r0r0
-       punpcklwd xmm5,xmm3                             ; lower half bgr0bgr0bgr0bgr0
-       punpckhwd xmm0,xmm3                             ; upper half bgr0bgr0bgr0bgr0
-; write to output ptr
-       movntdq [edi], xmm5                             ; output first 4 pixels bypassing cache
-       movntdq [edi+16], xmm0                  ; output second 4 pixels bypassing cache
-%endmacro
-
 SECTION .data align=16
 
 Const16        dw      16
@@ -183,7 +104,119 @@
        db      0x0e
        db      0x80
 
+UVMask db      0x01
+       db      0x80
+       db      0x03
+       db      0x80
+       db      0x05
+       db      0x80
+       db      0x07
+       db      0x80
+       db      0x09
+       db      0x80
+       db      0x0b
+       db      0x80
+       db      0x0d
+       db      0x80
+       db      0x0f
+       db      0x80
 
+shuffconst db 0x0
+               db 0x01
+               db 0x00
+               db 0x01
+               db 0x04
+               db 0x05
+               db 0x04
+               db 0x05
+               db 0x08
+               db 0x09
+               db 0x08
+               db 0x09
+               db 0x0c
+               db 0x0d
+               db 0x0c
+               db 0x0d
+
+RConst dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               dw 0
+               dw 5743
+               
+GConst dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               dw -1409
+               dw -2925
+               
+BConst dw 7258
+               dw 0
+               dw 7258
+               dw 0
+               dw 7258
+               dw 0
+               dw 7258
+               dw 0
+
+; conversion code 
+%macro yuv2rgbssse3 0
+; u = u - 128
+; v = v - 128
+; r = y + 0 * u + 1.403 * v
+; g = y + -0.344 * u + -0.714 * v
+; b = y + 1.773 * u + 0 * v
+; subtract 128 from u and v
+       psubsw xmm3, [Const128]                 ; u = u - 128, v = v -128
+       
+       pshufd xmm5, xmm3, 0xE4                 ; duplicate
+       movdqa xmm4, xmm3                               ; duplicate
+       
+; subtract 16 from y
+;      psubsw xmm0, [Const16]                  ; y = y - 16
+
+       pmaddwd xmm3, [RConst]                  ; multiply and add
+       pmaddwd xmm4, [GConst]                  ; to get RGB offsets to Y
+       pmaddwd xmm5, [BConst]                  ;
+
+       psrad xmm3, 12                                  ; Scale back to original range
+       psrad xmm4, 12                                  ;
+       psrad xmm5, 12                                  ;
+
+       pshufb xmm3, [shuffconst]               ; duplicate results
+       pshufb xmm4, [shuffconst]               ; 2 y values per const
+       pshufb xmm5, [shuffconst]               ;
+
+       paddsw xmm3, xmm0                               ; and add to y
+       paddsw xmm4, xmm0                               ;
+       paddsw xmm5, xmm0                               ;
+%endmacro
+
+; outputer
+%macro rgba32ssse3output 0
+; clamp values
+       pxor xmm7,xmm7
+       packuswb xmm3,xmm7                              ; clamp to 0,255 and pack R to 8 bit per pixel
+       packuswb xmm4,xmm7                              ; clamp to 0,255 and pack G to 8 bit per pixel
+       packuswb xmm5,xmm7                              ; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+       punpcklbw xmm5,xmm4                             ; bgbgbgbgbgbgbgbg
+       movdqa xmm0, xmm5                               ; save bg values
+       punpcklbw xmm3,xmm7                             ; r0r0r0r0r0r0r0r0
+       punpcklwd xmm5,xmm3                             ; lower half bgr0bgr0bgr0bgr0
+       punpckhwd xmm0,xmm3                             ; upper half bgr0bgr0bgr0bgr0
+; write to output ptr
+       movntdq [edi], xmm5                             ; output first 4 pixels bypassing cache
+       movntdq [edi+16], xmm0                  ; output second 4 pixels bypassing cache
+%endmacro
+
+
 ; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
 width    equ   ebp+16
 toPtr    equ   ebp+12
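
One more note on the SSSE3 macro above: each pmaddwd result is a 32-bit lane whose low 16 bits hold the offset shared by two horizontally adjacent pixels, and the shuffconst mask (bytes 0,1,0,1, 4,5,4,5, ...) uses pshufb to copy that low word into both word positions of its lane, replacing the pshuflw/pshufhw pair used in the SSE2 version. A scalar rendering of that shuffle, as a sketch:

#include <cstdint>

// Scalar rendering of the pshufb/shuffconst step: duplicate the low 16-bit
// word of every 32-bit lane so one chroma-derived offset covers two pixels.
void
duplicate_low_words(const int32_t* in, int dwords, int16_t* out)
{
	for (int i = 0; i < dwords; i++) {
		int16_t offset = (int16_t)(in[i] & 0xffff);
		out[2 * i + 0] = offset;
		out[2 * i + 1] = offset;	// duplicated, as pshufb does above
	}
}
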
@@ -214,20 +247,18 @@
        test ecx,ecx
        jng ENDLOOP
REPEATLOOP:                                                    ; loop over width / 8
+       prefetchnta [esi+256]
 ; YUV422 packed inputer
        movdqa xmm0, [esi]                              ; should have yuyv yuyv yuyv yuyv
-       pshufd xmm1, xmm0, 0xE4                 ; copy to xmm1
-       movdqa xmm2, xmm0                               ; copy to xmm2
+       pshufd xmm3, xmm0, 0xE4                 ; copy to xmm1
 ; extract both y giving y0y0
        pshufb xmm0, [YMask]
-; extract u and duplicate so each u in yuyv becomes u0u0
-       pshufb xmm1, [UMask]
-; extract v and duplicate so each v in yuyv becomes v0v0
-       pshufb xmm2, [VMask]
+; extract u and v to have u0v0
+       pshufb xmm3, [UVMask]
 
-yuv2rgbsse2
+yuv2rgbssse3
        
-rgba32sse2output
+rgba32ssse3output
 
 ; endloop
        add edi,32
@@ -263,28 +294,26 @@
        test ecx,ecx
        jng ENDLOOP1
 REPEATLOOP1:                                           ; loop over width / 8
+       prefetchnta [esi+256]
+       prefetchnta [eax+128]
+       prefetchnta [ebx+128]
+
 ; YUV420 Planar inputer
        movq xmm0, [esi]                                ; fetch 8 y values (8 bit) yyyyyyyy00000000
-       movd xmm1, [eax]                                ; fetch 4 u values (8 bit) uuuu000000000000
-       movd xmm2, [ebx]                                ; fetch 4 v values (8 bit) vvvv000000000000
+       movd xmm3, [eax]                                ; fetch 4 u values (8 bit) uuuu000000000000
+       movd xmm1, [ebx]                                ; fetch 4 v values (8 bit) vvvv000000000000
        
-; extract y
+; convert y to 16 bit
        pxor xmm7,xmm7                                  ; 00000000000000000000000000000000
        punpcklbw xmm0,xmm7                             ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
-; extract u and duplicate so each becomes 0u0u
-       punpcklbw xmm1,xmm7                             ; interleave xmm7 into xmm1 u0u0u0u000000000
-       punpcklwd xmm1,xmm7                             ; interleave again u000u000u000u000
-       pshuflw xmm1,xmm1, 0xA0                 ; copy u values
-       pshufhw xmm1,xmm1, 0xA0                 ; to get u0u0
-; extract v
-       punpcklbw xmm2,xmm7                             ; interleave xmm7 into xmm1 v0v0v0v000000000
-       punpcklwd xmm2,xmm7                             ; interleave again v000v000v000v000
-       pshuflw xmm2,xmm2, 0xA0                 ; copy v values
-       pshufhw xmm2,xmm2, 0xA0                 ; to get v0v0
-
-yuv2rgbsse2
        
-rgba32sse2output
+; combine u and v
+       punpcklbw xmm3,xmm1                             ; uvuvuvuv00000000
+       punpcklbw xmm3,xmm7                             ; u0v0u0v0u0v0u0v0
+       
+yuv2rgbssse3
+       
+rgba32ssse3output
 
 ; endloop
        add edi,32

