Author: dlmcpaul
Date: 2010-12-21 08:25:40 +0100 (Tue, 21 Dec 2010)
New Revision: 39913
Changeset: http://dev.haiku-os.org/changeset/39913

Added:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
Removed:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
Modified:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
Log:
correct yuv422 planar conversion. Separate sse, sse2 and ssse3 asm code. Add packed converter

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile	2010-12-21 07:25:40 UTC (rev 39913)
@@ -27,7 +27,9 @@
     gfx_conv_c_lookup.cpp
     gfx_conv_mmx.cpp
     gfx_util.cpp
-    yuvrgb.nasm
+    yuvrgb_sse.nasm
+    yuvrgb_sse2.nasm
+    yuvrgb_ssse3.nasm
     :
     libavformat.a
     libavcodec.a

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp	2010-12-21 07:25:40 UTC (rev 39913)
@@ -1,22 +1,63 @@
 #include "gfx_conv_mmx.h"
 #include "gfx_conv_c.h"
-
-extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
-    void *fromVPtr, void *toPtr, int width);
+// Packed
+extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
+    int width);
 extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
     int width);
+extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromYPtr, void *toPtr,
+    int width);
+
+// Planar
 extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
     void *fromVPtr, void *toPtr, int width);
-extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
-    int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
+    void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
+    void *fromVPtr, void *toPtr, int width);
 
+// Planar YUV420 means 2 Y lines share a UV line
 void
-gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Planar YUV420
+    // in and out buffers must be aligned to 16 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 16 != 0) {
+        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i+=2) {
+        // First Y row
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        rgbbase += rgbBaseInc;
+        // Second Y row but same u and v row
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
+    }
+}
+
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
     // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 32 != 0) {

@@ -49,42 +90,83 @@
     }
 }
 
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+        return;
+    }
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i+=2) {
+        // First Y row
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        rgbbase += rgbBaseInc;
+
+        // Second Y row but same u and v row
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
+    }
+}
+
+// Planar YUV422 means each Y line has its own UV line
 void
-gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Packed YUV422
-
     // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 32 != 0) {
        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
        return;
     }
-
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
     uint8 *rgbbase = (uint8 *)out->data[0];
-
-    for (int i = 0; i <= height; i++) {
-        _Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
-        ybase += in->linesize[0];
-        rgbbase += out->linesize[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
     }
 }
-
+// Planar YUV422 means each Y line has its own UV line
 void
-gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Planar YUV420
-
-    // in and out buffers must be aligned to 16 bytes,
+    // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
-    if ((off_t)out->data[0] % 16 != 0) {
-        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
        return;
     }
-
+
     uint8 *ybase = (uint8 *)in->data[0];
     uint8 *ubase = (uint8 *)in->data[1];
     uint8 *vbase = (uint8 *)in->data[2];
@@ -95,14 +177,38 @@
     int vBaseInc = in->linesize[2];
     int rgbBaseInc = out->linesize[0];
 
-    for (int i=0;i<height;i+=2) {
-        // First Y row
-        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);
         ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
         rgbbase += rgbBaseInc;
+    }
+}
 
-        // Second Y row but same u and v row
-        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+// Planar YUV422 means each Y line has its own UV line
+void
+gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
         ybase += yBaseInc;
         ubase += uBaseInc;
         vbase += vBaseInc;
@@ -110,12 +216,10 @@
     }
 }
 
-
+// Packed YUV422 (YUYV)
 void
-gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Packed YUV422
-
     // in and out buffers must be aligned to 16 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 16 != 0) {
@@ -132,3 +236,45 @@
         rgbbase += out->linesize[0];
     }
 }
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    for (int i = 0; i <= height; i++) {
+        _Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
+        ybase += in->linesize[0];
+        rgbbase += out->linesize[0];
+    }
+}
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    for (int i = 0; i <= height; i++) {
+        _Convert_YUV422_RGBA32_SSSE3(ybase, rgbbase, width);
+        ybase += in->linesize[0];
+        rgbbase += out->linesize[0];
+    }
+}

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h	2010-12-21 07:25:40 UTC (rev 39913)
@@ -7,9 +7,17 @@
 
 void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
 
+// Planar
+void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+// Packed
+void gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+
 #endif

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp	2010-12-21 07:25:40 UTC (rev 39913)
@@ -29,6 +29,7 @@
 
     switch (colorSpace) {
         case B_RGB32:
+            // Planar Formats
            if (pixelFormat == PIX_FMT_YUV410P) {
                TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
                return gfx_conv_yuv410p_rgb32_c;
@@ -57,14 +58,32 @@
 
            if (pixelFormat == PIX_FMT_YUV422P
                || pixelFormat == PIX_FMT_YUVJ422P) {
-               if (cpu.HasSSE2() && width % 8 == 0)
+               if (cpu.HasSSE2() && width % 8 == 0) {
+                   TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
                    return gfx_conv_yuv422p_rgba32_sse2;
-               else if (cpu.HasSSE1() && width % 4 == 0)
+               } else if (cpu.HasSSE1() && width % 4 == 0) {
+                   TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse\n");
                    return gfx_conv_yuv422p_rgba32_sse;
-               else
+               } else {
+                   TRACE("resolve_colorspace: gfx_conv_YCbCr422p_RGB32_c\n");
                    return gfx_conv_YCbCr422_RGB32_c;
+               }
            }
-
+
+           // Packed Formats
+           if (pixelFormat == PIX_FMT_YUYV422) {
+               if (cpu.HasSSSE3() && width % 8 == 0) {
+                   return gfx_conv_yuv422_rgba32_ssse3;
+               } else if (cpu.HasSSE2() && width % 8 == 0) {
+                   return gfx_conv_yuv422_rgba32_sse2;
+               } else if (cpu.HasSSE1() && width % 4 == 0
+                   && height % 2 == 0) {
+                   return gfx_conv_yuv422_rgba32_sse;
+               } else {
+                   return gfx_conv_YCbCr422_RGB32_c;
+               }
+           }
+
            TRACE("resolve_colorspace: %s => B_RGB32: NULL\n",
                pixfmt_to_string(pixelFormat));
            return NULL;

Copied: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm (from rev 39136, haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm)
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,268 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
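; [Editorial annotation, not part of the committed file: the chroma scaling
; in all three converters is done with the shift-add sequences documented in
; the conversion macros below. Checking them against the usual BT.601
; YCbCr->RGB coefficients:
;   R: v + v>>2 + v>>3 + v>>5      = 1.40625   (nominal 1.402)
;   B: u + u>>1 + u>>2 + u>>6      = 1.765625  (nominal 1.772)
;   G: u>>2 + u>>4 + u>>5          = 0.34375   (nominal 0.344)
;      v>>1 + v>>3 + v>>4 + v>>5   = 0.71875   (nominal 0.714)
; so each term lands within roughly 0.7% of the exact matrix value.]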
+;
+
+; A rather unoptimised set of sse yuv to rgb converters
+; does 4 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in mm0
+; the u values converted to 16 bit and duplicated into mm1
+; the v values converted to 16 bit and duplicated into mm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in mm3
+; g values in mm4
+; b values in mm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; mm6 used for scratch
+; mm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movq mm7, [Const16]    ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw mm0,mm7         ; y = y - 16
+; subtract 128 from u and v
+    movq mm7, [Const128]   ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw mm1,mm7         ; u = u - 128
+    psubsw mm2,mm7         ; v = v - 128
+; load r,g,b with y
+    movq mm3,mm0           ; r = y
+    pshufw mm5,mm0, 0xE4   ; b = y
+
+; r = r + v + v >> 2 + v >> 3 + v >> 5
+    paddsw mm3, mm2        ; add v to r
+    movq mm7, mm1          ; move u to scratch
+    pshufw mm6, mm2, 0xE4  ; move v to scratch
+
+    psraw mm6,2            ; divide v by 4
+    paddsw mm3, mm6        ; and add to r
+    psraw mm6,1            ; divide v by 2
+    paddsw mm3, mm6        ; and add to r
+    psraw mm6,2            ; divide v by 4
+    paddsw mm3, mm6        ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw mm5, mm1        ; add u to b
+    psraw mm7,1            ; divide u by 2
+    paddsw mm5, mm7        ; and add to b
+    psraw mm7,1            ; divide u by 2
+    paddsw mm5, mm7        ; and add to b
+    psraw mm7,4            ; divide u by 32
+    paddsw mm5, mm7        ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movq mm7,mm2           ; move v to scratch
+    pshufw mm6,mm1, 0xE4   ; move u to scratch
+    movq mm4,mm0           ; g = y
+
+    psraw mm6,2            ; divide u by 4
+    psubsw mm4,mm6         ; subtract from g
+    psraw mm6,2            ; divide u by 4
+    psubsw mm4,mm6         ; subtract from g
+    psraw mm6,1            ; divide u by 2
+    psubsw mm4,mm6         ; subtract from g
+
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,2            ; divide v by 4
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+%endmacro
+
+; outputer
+%macro rgba32sseoutput 0
+; clamp values
+    pxor mm7,mm7
+    packuswb mm3,mm7       ; clamp to 0,255 and pack R to 8 bit per pixel
+    packuswb mm4,mm7       ; clamp to 0,255 and pack G to 8 bit per pixel
+    packuswb mm5,mm7       ; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+    punpcklbw mm5,mm4      ; bgbgbgbgbgbgbgbg
+    movq mm0, mm5          ; save bg values
+    punpcklbw mm3,mm7      ; r0r0r0r0
+    punpcklwd mm5,mm3      ; lower half bgr0bgr0
+    punpckhwd mm0,mm3      ; upper half bgr0bgr0
+; write to output ptr
+    movq [edi], mm5        ; output first 2 pixels
+    movq [edi+8], mm0      ; output second 2 pixels
+%endmacro
+
+SECTION .data align=16
+
+Const16    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+
+Const128    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+
+; Packed Convert
+; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
+width    equ ebp+16
+toPtr    equ ebp+12
+fromPtr  equ ebp+8
+
+; Planar Convert
+; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1   equ ebp+24
+toPtr1   equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+; YUY2 FOURCC
+cglobal Convert_YUV422_RGBA32_SSE
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+
+    mov esi, [fromPtr]
+    mov ecx, [width]
+    mov edi, [toPtr]
+; loop width / 4 times
+    shr ecx,2
+    test ecx,ecx
+    jng ENDLOOP2
+REPEATLOOP2:               ; loop over width / 4
+; YUV422 packed inputer
+    movq mm0, [esi]        ; should have yuyv yuyv
+    pshufw mm1, mm0, 0xE4  ; copy to mm1
+    movq mm2, mm0          ; copy to mm2
+; extract y
+    pxor mm7,mm7           ; 0000000000000000
+    pcmpeqb mm6,mm6        ; ffffffffffffffff
+    punpckhbw mm6,mm7      ; interleave mm7 into mm6 ff00ff00ff00ff00
+    pand mm0, mm6          ; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+    psrld mm6,8            ; 00ff0000 00ff0000
+    pand mm1, mm6          ; clear all yv values leaving 0u00 etc
+    psrld mm1,8            ; rotate u to get u000
+    pshufw mm1,mm1, 0xA0   ; copy u values to get u0u0 (SSE not MMX)
+; extract v
+    pslld mm6,16           ; 000000ff000000ff
+    pand mm2, mm6          ; clear all yu values leaving 000v etc
+    psrld mm2,8            ; rotate v to get 00v0
+    pshufw mm2,mm2, 0xF5   ; copy v values to get v0v0 (SSE not MMX)
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+    ; endloop
+    add edi,16
+    add esi,8
+    sub ecx, 1             ; apparently sub is better than dec
+    jnz REPEATLOOP2
+ENDLOOP2:
+; Cleanup
+    emms                   ; reset mmx regs back to float
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+cglobal Convert_YUV420P_RGBA32_SSE
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+    push eax
+    push ebx
+
+    mov esi, [fromYPtr]
+    mov eax, [fromUPtr]
+    mov ebx, [fromVPtr]
+    mov edi, [toPtr1]
+    mov ecx, [width1]
+; loop width / 4 times
+    shr ecx,2
+    test ecx,ecx
+    jng ENDLOOP3
+REPEATLOOP3:               ; loop over width / 4
+; YUV420 Planar inputer
+    movq mm0, [esi]        ; fetch 4 y values (8 bit) yyyy0000
+    movd mm1, [eax]        ; fetch 2 u values (8 bit) uu000000
+    movd mm2, [ebx]        ; fetch 2 v values (8 bit) vv000000
+
+; extract y
+    pxor mm7,mm7           ; 0000000000000000
+    punpcklbw mm0,mm7      ; interleave xmm7 into xmm0 y0y0y0y
+; extract u and duplicate so each becomes 0u0u
+    punpcklbw mm1,mm7      ; interleave xmm7 into xmm1 u0u00000
+    punpcklwd mm1,mm7      ; interleave again u000u000
+    pshufw mm1,mm1, 0xA0   ; copy u values to get u0u0
+; extract v
+    punpcklbw mm2,mm7      ; interleave xmm7 into xmm1 v0v00000
+    punpcklwd mm2,mm7      ; interleave again v000v000
+    pshufw mm2,mm2, 0xA0   ; copy v values to get v0v0
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+; endloop
+    add edi,16
+    add esi,4
+    add eax,2
+    add ebx,2
+    sub ecx, 1             ; apparently sub is better than dec
+    jnz REPEATLOOP3
+ENDLOOP3:
+; Cleanup
+    emms
+    pop ebx
+    pop eax
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits

Added: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,266 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
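; [Editorial annotation, not part of the committed file: a scalar C
; reference for the packed YUYV -> BGRA conversion implemented below, using
; the same shift-add arithmetic; handy as ground truth when testing the SIMD
; paths. Function and helper names are illustrative, and the asm saturates
; intermediates to 16 bit (psubsw/paddsw), which this sketch ignores.
;
;   static inline uint8 clamp255(int v)
;   {
;       return v < 0 ? 0 : (v > 255 ? 255 : v);
;   }
;
;   static void yuyv_to_bgra_ref(const uint8 *src, uint8 *dst, int width)
;   {
;       for (int x = 0; x < width; x += 2) {
;           int y0 = src[0] - 16, u = src[1] - 128;
;           int y1 = src[2] - 16, v = src[3] - 128;
;           int rAdd = v + (v >> 2) + (v >> 3) + (v >> 5);
;           int gSub = (u >> 2) + (u >> 4) + (u >> 5)
;               + (v >> 1) + (v >> 3) + (v >> 4) + (v >> 5);
;           int bAdd = u + (u >> 1) + (u >> 2) + (u >> 6);
;           int y = y0;
;           for (int i = 0; i < 2; i++) {
;               dst[0] = clamp255(y + bAdd);  // B
;               dst[1] = clamp255(y - gSub);  // G
;               dst[2] = clamp255(y + rAdd);  // R
;               dst[3] = 0;                   // alpha, the asm also writes 0
;               dst += 4;
;               y = y1;
;           }
;           src += 4;
;       }
;   }]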
+;
+
+; A rather unoptimised set of sse2 yuv to rgb converters
+; does 8 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse2 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movdqa xmm7, [Const16]  ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm0,xmm7        ; y = y - 16
+; subtract 128 from u and v
+    movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm1,xmm7        ; u = u - 128
+    psubsw xmm2,xmm7        ; v = v - 128
+; load r,b with y
+    movdqa xmm3,xmm0        ; r = y
+    pshufd xmm5,xmm0, 0xE4  ; b = y
+
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+    paddsw xmm3, xmm2       ; add v to r
+    movdqa xmm7, xmm1       ; move u to scratch
+    pshufd xmm6, xmm2, 0xE4 ; move v to scratch
+
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,1            ; divide v by 2
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw xmm5, xmm1       ; add u to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,4            ; divide u by 32
+    paddsw xmm5, xmm7       ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movdqa xmm7,xmm2        ; move v to scratch
+    pshufd xmm6,xmm1, 0xE4  ; move u to scratch
+    movdqa xmm4,xmm0        ; g = y
+
+    psraw xmm6,2            ; divide u by 4
+    psubsw xmm4,xmm6        ; subtract from g
+    psraw xmm6,2            ; divide u by 4
+    psubsw xmm4,xmm6        ; subtract from g
+    psraw xmm6,1            ; divide u by 2
+    psubsw xmm4,xmm6        ; subtract from g
+
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,2            ; divide v by 4
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+%endmacro
+
+; outputer
+%macro rgba32sse2output 0
+; clamp values
+    pxor xmm7,xmm7
+    packuswb xmm3,xmm7      ; clamp to 0,255 and pack R to 8 bit per pixel
+    packuswb xmm4,xmm7      ; clamp to 0,255 and pack G to 8 bit per pixel
+    packuswb xmm5,xmm7      ; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+    punpcklbw xmm5,xmm4     ; bgbgbgbgbgbgbgbg
+    movdqa xmm0, xmm5       ; save bg values
+    punpcklbw xmm3,xmm7     ; r0r0r0r0r0r0r0r0
+    punpcklwd xmm5,xmm3     ; lower half bgr0bgr0bgr0bgr0
+    punpckhwd xmm0,xmm3     ; upper half bgr0bgr0bgr0bgr0
+; write to output ptr
+    movntdq [edi], xmm5     ; output first 4 pixels bypassing cache
+    movntdq [edi+16], xmm0  ; output second 4 pixels bypassing cache
+%endmacro
+
+SECTION .data align=16
+
+Const16    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+
+Const128    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+
+; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
+width    equ ebp+16
+toPtr    equ ebp+12
+fromPtr  equ ebp+8
+
+; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1   equ ebp+24
+toPtr1   equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+cglobal Convert_YUV422_RGBA32_SSE2
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+
+    mov esi, [fromPtr]
+    mov edi, [toPtr]
+    mov ecx, [width]
+; loop width / 8 times
+    shr ecx,3
+    test ecx,ecx
+    jng ENDLOOP
+REPEATLOOP:                 ; loop over width / 8
+; YUV422 packed inputer
+    movdqa xmm0, [esi]      ; should have yuyv yuyv yuyv yuyv
+    pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
+    movdqa xmm2, xmm0       ; copy to xmm2
+; extract y
+    pxor xmm7,xmm7          ; 00000000000000000000000000000000
+    pcmpeqd xmm6,xmm6       ; ffffffffffffffffffffffffffffffff
+    punpcklbw xmm6,xmm7     ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+    pand xmm0, xmm6         ; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+    psrld xmm6,8            ; 00ff0000 00ff0000 00ff0000 00ff0000
+    pand xmm1, xmm6         ; clear all yv values leaving 0u00 etc
+    psrld xmm1,8            ; rotate u to get u000
+    pshuflw xmm1,xmm1, 0xA0 ; copy u values
+    pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
+; extract v
+    pslld xmm6,16           ; 000000ff000000ff 000000ff000000ff
+    pand xmm2, xmm6         ; clear all yu values leaving 000v etc
+    psrld xmm2,8            ; rotate v to get 00v0
+    pshuflw xmm2,xmm2, 0xF5 ; copy v values
+    pshufhw xmm2,xmm2, 0xF5 ; to get v0v0
+
+yuv2rgbsse2
+
+rgba32sse2output
+
+; endloop
+    add edi,32
+    add esi,16
+    sub ecx, 1              ; apparently sub is better than dec
+    jnz REPEATLOOP
+ENDLOOP:
+; Cleanup
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+cglobal Convert_YUV420P_RGBA32_SSE2
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+    push eax
+    push ebx
+
+    mov esi, [fromYPtr]
+    mov eax, [fromUPtr]
+    mov ebx, [fromVPtr]
+    mov edi, [toPtr1]
+    mov ecx, [width1]
+; loop width / 8 times
+    shr ecx,3
+    test ecx,ecx
+    jng ENDLOOP1
+REPEATLOOP1:                ; loop over width / 8
+; YUV420 Planar inputer
+    movq xmm0, [esi]        ; fetch 8 y values (8 bit) yyyyyyyy00000000
+    movd xmm1, [eax]        ; fetch 4 u values (8 bit) uuuu000000000000
+    movd xmm2, [ebx]        ; fetch 4 v values (8 bit) vvvv000000000000
+
+; extract y
+    pxor xmm7,xmm7          ; 00000000000000000000000000000000
+    punpcklbw xmm0,xmm7     ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
+; extract u and duplicate so each becomes 0u0u
+    punpcklbw xmm1,xmm7     ; interleave xmm7 into xmm1 u0u0u0u000000000
+    punpcklwd xmm1,xmm7     ; interleave again u000u000u000u000
+    pshuflw xmm1,xmm1, 0xA0 ; copy u values
+    pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
+; extract v
+    punpcklbw xmm2,xmm7     ; interleave xmm7 into xmm1 v0v0v0v000000000
+    punpcklwd xmm2,xmm7     ; interleave again v000v000v000v000
+    pshuflw xmm2,xmm2, 0xA0 ; copy v values
+    pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
+
+yuv2rgbsse2
+
+rgba32sse2output
+
+; endloop
+    add edi,32
+    add esi,8
+    add eax,4
+    add ebx,4
+    sub ecx, 1              ; apparently sub is better than dec
+    jnz REPEATLOOP1
+ENDLOOP1:
+; Cleanup
+    pop ebx
+    pop eax
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits

Added: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,307 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
+;
+
+; A rather unoptimised set of ssse3 yuv to rgb converters
+; does 8 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse2 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movdqa xmm7, [Const16]  ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm0,xmm7        ; y = y - 16
+; subtract 128 from u and v
+    movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm1,xmm7        ; u = u - 128
+    psubsw xmm2,xmm7        ; v = v - 128
+; load r,b with y
+    movdqa xmm3,xmm0        ; r = y
+    pshufd xmm5,xmm0, 0xE4  ; b = y
+
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+    paddsw xmm3, xmm2       ; add v to r
+    movdqa xmm7, xmm1       ; move u to scratch
+    pshufd xmm6, xmm2, 0xE4 ; move v to scratch
+
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,1            ; divide v by 2
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw xmm5, xmm1       ; add u to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,4            ; divide u by 32
+    paddsw xmm5, xmm7       ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movdqa xmm7,xmm2        ; move v to scratch

[... truncated: 232 lines follow ...]
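Since the SSE, SSE2 and SSSE3 packed converters are meant to produce the same
pixels from the same shift-add math, a small standalone harness that feeds one
random YUYV row to each _Convert_YUV422_RGBA32_* entry point and compares the
outputs makes a quick smoke test. A minimal sketch under stated assumptions:
the extern declarations mirror gfx_conv_mmx.cpp, buffers are 32-byte aligned
as the dispatch code above requires, the width is a multiple of 8 so every
path qualifies, and posix_memalign is only illustrative; whether the SSSE3
path matches bit-for-bit depends on its constants, whose hunk is truncated
above.

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <stdint.h>

    // Row converters as declared in gfx_conv_mmx.cpp (the leading underscore
    // comes from the cglobal macro in the .nasm files).
    extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width);
    extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width);
    extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width);

    int main()
    {
        const int width = 64;       // multiple of 8, so all three paths qualify
        void *src, *a, *b;
        // 32-byte alignment mirrors the checks in gfx_conv_mmx.cpp and covers
        // the movdqa loads and movntdq stores in the asm.
        if (posix_memalign(&src, 32, width * 2)     // packed YUYV, 2 bytes/pixel
            || posix_memalign(&a, 32, width * 4)    // BGRA, 4 bytes/pixel
            || posix_memalign(&b, 32, width * 4))
            return 1;
        for (int i = 0; i < width * 2; i++)
            ((uint8_t *)src)[i] = rand() & 0xff;    // one row of random YUYV

        _Convert_YUV422_RGBA32_SSE(src, a, width);
        _Convert_YUV422_RGBA32_SSE2(src, b, width);
        printf("SSE vs SSE2:  %s\n", memcmp(a, b, width * 4) == 0 ? "match" : "DIFFER");

        _Convert_YUV422_RGBA32_SSSE3(src, b, width);
        printf("SSE vs SSSE3: %s\n", memcmp(a, b, width * 4) == 0 ? "match" : "DIFFER");

        free(src);
        free(a);
        free(b);
        return 0;
    }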