Author: dlmcpaul
Date: 2010-12-21 08:25:40 +0100 (Tue, 21 Dec 2010)
New Revision: 39913
Changeset: http://dev.haiku-os.org/changeset/39913

Added:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
Removed:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
Modified:
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
   haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
Log:
correct yuv422 planar conversion. Separate sse, sse2 and ssse3 asm code. Add packed converter

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/Jamfile	2010-12-21 07:25:40 UTC (rev 39913)
@@ -27,7 +27,9 @@
     gfx_conv_c_lookup.cpp
     gfx_conv_mmx.cpp
     gfx_util.cpp
-    yuvrgb.nasm
+    yuvrgb_sse.nasm
+    yuvrgb_sse2.nasm
+    yuvrgb_ssse3.nasm
     :
     libavformat.a
     libavcodec.a

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp	2010-12-21 07:25:40 UTC (rev 39913)
@@ -1,22 +1,63 @@
 #include "gfx_conv_mmx.h"
 #include "gfx_conv_c.h"
-
-extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
-    void *fromVPtr, void *toPtr, int width);
+// Packed
+extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
+    int width);
 extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
     int width);
+extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromYPtr, void *toPtr,
+    int width);
+
+// Planar
 extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
     void *fromVPtr, void *toPtr, int width);
-extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
-    int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
+    void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
+    void *fromVPtr, void *toPtr, int width);
 
+// Planar YUV420 means 2 Y lines share a UV line
 void
-gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Planar YUV420
+    // in and out buffers must be aligned to 16 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 16 != 0) {
+        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i+=2) {
+        // First Y row
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        rgbbase += rgbBaseInc;
+        // Second Y row but same u and v row
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
+    }
+}
+
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
     // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 32 != 0) {

@@ -49,42 +90,83 @@
     }
 }
 
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+        return;
+    }
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i+=2) {
+        // First Y row
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        rgbbase += rgbBaseInc;
+
+        // Second Y row but same u and v row
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
+    }
+}
+
+// Planar YUV422 means each Y line has its own UV line
 void
-gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Packed YUV422
-
     // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 32 != 0) {
        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
        return;
     }
-
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
     uint8 *rgbbase = (uint8 *)out->data[0];
-
-    for (int i = 0; i <= height; i++) {
-        _Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
-        ybase += in->linesize[0];
-        rgbbase += out->linesize[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+        ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
+        rgbbase += rgbBaseInc;
     }
 }
-
+// Planar YUV422 means each Y line has its own UV line
 void
-gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Planar YUV420
-
-    // in and out buffers must be aligned to 16 bytes,
+    // in and out buffers must be aligned to 32 bytes,
     // in should be as ffmpeg allocates it
-    if ((off_t)out->data[0] % 16 != 0) {
-        gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
        return;
     }
-
+
     uint8 *ybase = (uint8 *)in->data[0];
     uint8 *ubase = (uint8 *)in->data[1];
     uint8 *vbase = (uint8 *)in->data[2];
@@ -95,14 +177,38 @@
     int vBaseInc = in->linesize[2];
     int rgbBaseInc = out->linesize[0];
 
-    for (int i=0;i<height;i+=2) {
-        // First Y row
-        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);
         ybase += yBaseInc;
+        ubase += uBaseInc;
+        vbase += vBaseInc;
         rgbbase += rgbBaseInc;
+    }
+}
 
-        // Second Y row but same u and v row
-        _Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+// Planar YUV422 means each Y line has its own UV line
+void
+gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *ubase = (uint8 *)in->data[1];
+    uint8 *vbase = (uint8 *)in->data[2];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    int yBaseInc = in->linesize[0];
+    int uBaseInc = in->linesize[1];
+    int vBaseInc = in->linesize[2];
+    int rgbBaseInc = out->linesize[0];
+
+    for (int i=0;i<height;i++) {
+        _Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
         ybase += yBaseInc;
         ubase += uBaseInc;
         vbase += vBaseInc;
@@ -110,12 +216,10 @@
     }
 }
 
-
+// Packed YUV422 (YUYV)
 void
-gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 {
-    // Packed YUV422
-
     // in and out buffers must be aligned to 16 bytes,
     // in should be as ffmpeg allocates it
     if ((off_t)out->data[0] % 16 != 0) {
@@ -132,3 +236,45 @@
         rgbbase += out->linesize[0];
     }
 }
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    for (int i = 0; i <= height; i++) {
+        _Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
+        ybase += in->linesize[0];
+        rgbbase += out->linesize[0];
+    }
+}
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+    // in and out buffers must be aligned to 32 bytes,
+    // in should be as ffmpeg allocates it
+    if ((off_t)out->data[0] % 32 != 0) {
+        gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+        return;
+    }
+
+    uint8 *ybase = (uint8 *)in->data[0];
+    uint8 *rgbbase = (uint8 *)out->data[0];
+
+    for (int i = 0; i <= height; i++) {
+        _Convert_YUV422_RGBA32_SSSE3(ybase, rgbbase, width);
+        ybase += in->linesize[0];
+        rgbbase += out->linesize[0];
+    }
+}

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h	2010-12-21 07:25:40 UTC (rev 39913)
@@ -7,9 +7,17 @@
 
 void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
 
+// Planar
+void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+// Packed
+void gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+
 #endif

Modified: haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp	2010-12-21 01:09:15 UTC (rev 39912)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp	2010-12-21 07:25:40 UTC (rev 39913)
@@ -29,6 +29,7 @@
 
     switch (colorSpace) {
         case B_RGB32:
+            // Planar Formats
            if (pixelFormat == PIX_FMT_YUV410P) {
                TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
                return gfx_conv_yuv410p_rgb32_c;
@@ -57,14 +58,32 @@
 
            if (pixelFormat == PIX_FMT_YUV422P
                || pixelFormat == PIX_FMT_YUVJ422P) {
-               if (cpu.HasSSE2() && width % 8 == 0)
+               if (cpu.HasSSE2() && width % 8 == 0) {
+                   TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
                    return gfx_conv_yuv422p_rgba32_sse2;
-               else if (cpu.HasSSE1() && width % 4 == 0)
+               } else if (cpu.HasSSE1() && width % 4 == 0) {
+                   TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse\n");
                    return gfx_conv_yuv422p_rgba32_sse;
-               else
+               } else {
+                   TRACE("resolve_colorspace: gfx_conv_YCbCr422p_RGB32_c\n");
                    return gfx_conv_YCbCr422_RGB32_c;
+               }
            }
-
+
+           // Packed Formats
+           if (pixelFormat == PIX_FMT_YUYV422) {
+               if (cpu.HasSSSE3() && width % 8 == 0) {
+                   return gfx_conv_yuv422_rgba32_ssse3;
+               } else if (cpu.HasSSE2() && width % 8 == 0) {
+                   return gfx_conv_yuv422_rgba32_sse2;
+               } else if (cpu.HasSSE1() && width % 4 == 0
+                   && height % 2 == 0) {
+                   return gfx_conv_yuv422_rgba32_sse;
+               } else {
+                   return gfx_conv_YCbCr422_RGB32_c;
+               }
+           }
+
            TRACE("resolve_colorspace: %s => B_RGB32: NULL\n",
                pixfmt_to_string(pixelFormat));
            return NULL;

Copied: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm (from rev 39136, haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm)
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,268 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
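; [Editorial annotation, not part of the committed file: the chroma scaling
; in all three converters is done with the shift-add sequences documented in
; the conversion macros below. Checking them against the usual BT.601
; YCbCr->RGB coefficients:
;   R: v + v>>2 + v>>3 + v>>5      = 1.40625   (nominal 1.402)
;   B: u + u>>1 + u>>2 + u>>6      = 1.765625  (nominal 1.772)
;   G: u>>2 + u>>4 + u>>5          = 0.34375   (nominal 0.344)
;      v>>1 + v>>3 + v>>4 + v>>5   = 0.71875   (nominal 0.714)
; so each term lands within roughly 0.7% of the exact matrix value.]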
+;
+
+; A rather unoptimised set of sse yuv to rgb converters
+; does 4 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in mm0
+; the u values converted to 16 bit and duplicated into mm1
+; the v values converted to 16 bit and duplicated into mm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in mm3
+; g values in mm4
+; b values in mm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; mm6 used for scratch
+; mm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movq mm7, [Const16]    ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw mm0,mm7         ; y = y - 16
+; subtract 128 from u and v
+    movq mm7, [Const128]   ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw mm1,mm7         ; u = u - 128
+    psubsw mm2,mm7         ; v = v - 128
+; load r,g,b with y
+    movq mm3,mm0           ; r = y
+    pshufw mm5,mm0, 0xE4   ; b = y
+
+; r = r + v + v >> 2 + v >> 3 + v >> 5
+    paddsw mm3, mm2        ; add v to r
+    movq mm7, mm1          ; move u to scratch
+    pshufw mm6, mm2, 0xE4  ; move v to scratch
+
+    psraw mm6,2            ; divide v by 4
+    paddsw mm3, mm6        ; and add to r
+    psraw mm6,1            ; divide v by 2
+    paddsw mm3, mm6        ; and add to r
+    psraw mm6,2            ; divide v by 4
+    paddsw mm3, mm6        ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw mm5, mm1        ; add u to b
+    psraw mm7,1            ; divide u by 2
+    paddsw mm5, mm7        ; and add to b
+    psraw mm7,1            ; divide u by 2
+    paddsw mm5, mm7        ; and add to b
+    psraw mm7,4            ; divide u by 32
+    paddsw mm5, mm7        ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movq mm7,mm2           ; move v to scratch
+    pshufw mm6,mm1, 0xE4   ; move u to scratch
+    movq mm4,mm0           ; g = y
+
+    psraw mm6,2            ; divide u by 4
+    psubsw mm4,mm6         ; subtract from g
+    psraw mm6,2            ; divide u by 4
+    psubsw mm4,mm6         ; subtract from g
+    psraw mm6,1            ; divide u by 2
+    psubsw mm4,mm6         ; subtract from g
+
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,2            ; divide v by 4
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+    psraw mm7,1            ; divide v by 2
+    psubsw mm4,mm7         ; subtract from g
+%endmacro
+
+; outputer
+%macro rgba32sseoutput 0
+; clamp values
+    pxor mm7,mm7
+    packuswb mm3,mm7       ; clamp to 0,255 and pack R to 8 bit per pixel
+    packuswb mm4,mm7       ; clamp to 0,255 and pack G to 8 bit per pixel
+    packuswb mm5,mm7       ; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+    punpcklbw mm5,mm4      ; bgbgbgbgbgbgbgbg
+    movq mm0, mm5          ; save bg values
+    punpcklbw mm3,mm7      ; r0r0r0r0
+    punpcklwd mm5,mm3      ; lower half bgr0bgr0
+    punpckhwd mm0,mm3      ; upper half bgr0bgr0
+; write to output ptr
+    movq [edi], mm5        ; output first 2 pixels
+    movq [edi+8], mm0      ; output second 2 pixels
+%endmacro
+
+SECTION .data align=16
+
+Const16    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+
+Const128    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+
+; Packed Convert
+; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
+width    equ ebp+16
+toPtr    equ ebp+12
+fromPtr  equ ebp+8
+
+; Planar Convert
+; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1   equ ebp+24
+toPtr1   equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+; YUY2 FOURCC
+cglobal Convert_YUV422_RGBA32_SSE
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+
+    mov esi, [fromPtr]
+    mov ecx, [width]
+    mov edi, [toPtr]
+; loop width / 4 times
+    shr ecx,2
+    test ecx,ecx
+    jng ENDLOOP2
+REPEATLOOP2:               ; loop over width / 4
+; YUV422 packed inputer
+    movq mm0, [esi]        ; should have yuyv yuyv
+    pshufw mm1, mm0, 0xE4  ; copy to mm1
+    movq mm2, mm0          ; copy to mm2
+; extract y
+    pxor mm7,mm7           ; 0000000000000000
+    pcmpeqb mm6,mm6        ; ffffffffffffffff
+    punpckhbw mm6,mm7      ; interleave mm7 into mm6 ff00ff00ff00ff00
+    pand mm0, mm6          ; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+    psrld mm6,8            ; 00ff0000 00ff0000
+    pand mm1, mm6          ; clear all yv values leaving 0u00 etc
+    psrld mm1,8            ; rotate u to get u000
+    pshufw mm1,mm1, 0xA0   ; copy u values to get u0u0 (SSE not MMX)
+; extract v
+    pslld mm6,16           ; 000000ff000000ff
+    pand mm2, mm6          ; clear all yu values leaving 000v etc
+    psrld mm2,8            ; rotate v to get 00v0
+    pshufw mm2,mm2, 0xF5   ; copy v values to get v0v0 (SSE not MMX)
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+    ; endloop
+    add edi,16
+    add esi,8
+    sub ecx, 1             ; apparently sub is better than dec
+    jnz REPEATLOOP2
+ENDLOOP2:
+; Cleanup
+    emms                   ; reset mmx regs back to float
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+cglobal Convert_YUV420P_RGBA32_SSE
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+    push eax
+    push ebx
+
+    mov esi, [fromYPtr]
+    mov eax, [fromUPtr]
+    mov ebx, [fromVPtr]
+    mov edi, [toPtr1]
+    mov ecx, [width1]
+; loop width / 4 times
+    shr ecx,2
+    test ecx,ecx
+    jng ENDLOOP3
+REPEATLOOP3:               ; loop over width / 4
+; YUV420 Planar inputer
+    movq mm0, [esi]        ; fetch 4 y values (8 bit) yyyy0000
+    movd mm1, [eax]        ; fetch 2 u values (8 bit) uu000000
+    movd mm2, [ebx]        ; fetch 2 v values (8 bit) vv000000
+
+; extract y
+    pxor mm7,mm7           ; 0000000000000000
+    punpcklbw mm0,mm7      ; interleave xmm7 into xmm0 y0y0y0y
+; extract u and duplicate so each becomes 0u0u
+    punpcklbw mm1,mm7      ; interleave xmm7 into xmm1 u0u00000
+    punpcklwd mm1,mm7      ; interleave again u000u000
+    pshufw mm1,mm1, 0xA0   ; copy u values to get u0u0
+; extract v
+    punpcklbw mm2,mm7      ; interleave xmm7 into xmm1 v0v00000
+    punpcklwd mm2,mm7      ; interleave again v000v000
+    pshufw mm2,mm2, 0xA0   ; copy v values to get v0v0
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+; endloop
+    add edi,16
+    add esi,4
+    add eax,2
+    add ebx,2
+    sub ecx, 1             ; apparently sub is better than dec
+    jnz REPEATLOOP3
+ENDLOOP3:
+; Cleanup
+    emms
+    pop ebx
+    pop eax
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits

Added: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,266 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
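; [Editorial annotation, not part of the committed file: a scalar C
; reference for the packed YUYV -> BGRA conversion implemented below, using
; the same shift-add arithmetic; handy as ground truth when testing the SIMD
; paths. Function and helper names are illustrative, and the asm saturates
; intermediates to 16 bit (psubsw/paddsw), which this sketch ignores.
;
;   static inline uint8 clamp255(int v)
;   {
;       return v < 0 ? 0 : (v > 255 ? 255 : v);
;   }
;
;   static void yuyv_to_bgra_ref(const uint8 *src, uint8 *dst, int width)
;   {
;       for (int x = 0; x < width; x += 2) {
;           int y0 = src[0] - 16, u = src[1] - 128;
;           int y1 = src[2] - 16, v = src[3] - 128;
;           int rAdd = v + (v >> 2) + (v >> 3) + (v >> 5);
;           int gSub = (u >> 2) + (u >> 4) + (u >> 5)
;               + (v >> 1) + (v >> 3) + (v >> 4) + (v >> 5);
;           int bAdd = u + (u >> 1) + (u >> 2) + (u >> 6);
;           int y = y0;
;           for (int i = 0; i < 2; i++) {
;               dst[0] = clamp255(y + bAdd);  // B
;               dst[1] = clamp255(y - gSub);  // G
;               dst[2] = clamp255(y + rAdd);  // R
;               dst[3] = 0;                   // alpha, the asm also writes 0
;               dst += 4;
;               y = y1;
;           }
;           src += 4;
;       }
;   }]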
+;
+
+; A rather unoptimised set of sse2 yuv to rgb converters
+; does 8 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse2 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movdqa xmm7, [Const16]  ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm0,xmm7        ; y = y - 16
+; subtract 128 from u and v
+    movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm1,xmm7        ; u = u - 128
+    psubsw xmm2,xmm7        ; v = v - 128
+; load r,b with y
+    movdqa xmm3,xmm0        ; r = y
+    pshufd xmm5,xmm0, 0xE4  ; b = y
+
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+    paddsw xmm3, xmm2       ; add v to r
+    movdqa xmm7, xmm1       ; move u to scratch
+    pshufd xmm6, xmm2, 0xE4 ; move v to scratch
+
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,1            ; divide v by 2
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw xmm5, xmm1       ; add u to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,4            ; divide u by 32
+    paddsw xmm5, xmm7       ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movdqa xmm7,xmm2        ; move v to scratch
+    pshufd xmm6,xmm1, 0xE4  ; move u to scratch
+    movdqa xmm4,xmm0        ; g = y
+
+    psraw xmm6,2            ; divide u by 4
+    psubsw xmm4,xmm6        ; subtract from g
+    psraw xmm6,2            ; divide u by 4
+    psubsw xmm4,xmm6        ; subtract from g
+    psraw xmm6,1            ; divide u by 2
+    psubsw xmm4,xmm6        ; subtract from g
+
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,2            ; divide v by 4
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+    psraw xmm7,1            ; divide v by 2
+    psubsw xmm4,xmm7        ; subtract from g
+%endmacro
+
+; outputer
+%macro rgba32sse2output 0
+; clamp values
+    pxor xmm7,xmm7
+    packuswb xmm3,xmm7      ; clamp to 0,255 and pack R to 8 bit per pixel
+    packuswb xmm4,xmm7      ; clamp to 0,255 and pack G to 8 bit per pixel
+    packuswb xmm5,xmm7      ; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+    punpcklbw xmm5,xmm4     ; bgbgbgbgbgbgbgbg
+    movdqa xmm0, xmm5       ; save bg values
+    punpcklbw xmm3,xmm7     ; r0r0r0r0r0r0r0r0
+    punpcklwd xmm5,xmm3     ; lower half bgr0bgr0bgr0bgr0
+    punpckhwd xmm0,xmm3     ; upper half bgr0bgr0bgr0bgr0
+; write to output ptr
+    movntdq [edi], xmm5     ; output first 4 pixels bypassing cache
+    movntdq [edi+16], xmm0  ; output second 4 pixels bypassing cache
+%endmacro
+
+SECTION .data align=16
+
+Const16    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+    dw 16
+
+Const128    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+    dw 128
+
+; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
+width    equ ebp+16
+toPtr    equ ebp+12
+fromPtr  equ ebp+8
+
+; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1   equ ebp+24
+toPtr1   equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+cglobal Convert_YUV422_RGBA32_SSE2
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+
+    mov esi, [fromPtr]
+    mov edi, [toPtr]
+    mov ecx, [width]
+; loop width / 8 times
+    shr ecx,3
+    test ecx,ecx
+    jng ENDLOOP
+REPEATLOOP:                 ; loop over width / 8
+; YUV422 packed inputer
+    movdqa xmm0, [esi]      ; should have yuyv yuyv yuyv yuyv
+    pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
+    movdqa xmm2, xmm0       ; copy to xmm2
+; extract y
+    pxor xmm7,xmm7          ; 00000000000000000000000000000000
+    pcmpeqd xmm6,xmm6       ; ffffffffffffffffffffffffffffffff
+    punpcklbw xmm6,xmm7     ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+    pand xmm0, xmm6         ; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+    psrld xmm6,8            ; 00ff0000 00ff0000 00ff0000 00ff0000
+    pand xmm1, xmm6         ; clear all yv values leaving 0u00 etc
+    psrld xmm1,8            ; rotate u to get u000
+    pshuflw xmm1,xmm1, 0xA0 ; copy u values
+    pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
+; extract v
+    pslld xmm6,16           ; 000000ff000000ff 000000ff000000ff
+    pand xmm2, xmm6         ; clear all yu values leaving 000v etc
+    psrld xmm2,8            ; rotate v to get 00v0
+    pshuflw xmm2,xmm2, 0xF5 ; copy v values
+    pshufhw xmm2,xmm2, 0xF5 ; to get v0v0
+
+yuv2rgbsse2
+
+rgba32sse2output
+
+; endloop
+    add edi,32
+    add esi,16
+    sub ecx, 1              ; apparently sub is better than dec
+    jnz REPEATLOOP
+ENDLOOP:
+; Cleanup
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+cglobal Convert_YUV420P_RGBA32_SSE2
+; reserve variables
+    push ebp
+    mov ebp, esp
+    push edi
+    push esi
+    push ecx
+    push eax
+    push ebx
+
+    mov esi, [fromYPtr]
+    mov eax, [fromUPtr]
+    mov ebx, [fromVPtr]
+    mov edi, [toPtr1]
+    mov ecx, [width1]
+; loop width / 8 times
+    shr ecx,3
+    test ecx,ecx
+    jng ENDLOOP1
+REPEATLOOP1:                ; loop over width / 8
+; YUV420 Planar inputer
+    movq xmm0, [esi]        ; fetch 8 y values (8 bit) yyyyyyyy00000000
+    movd xmm1, [eax]        ; fetch 4 u values (8 bit) uuuu000000000000
+    movd xmm2, [ebx]        ; fetch 4 v values (8 bit) vvvv000000000000
+
+; extract y
+    pxor xmm7,xmm7          ; 00000000000000000000000000000000
+    punpcklbw xmm0,xmm7     ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
+; extract u and duplicate so each becomes 0u0u
+    punpcklbw xmm1,xmm7     ; interleave xmm7 into xmm1 u0u0u0u000000000
+    punpcklwd xmm1,xmm7     ; interleave again u000u000u000u000
+    pshuflw xmm1,xmm1, 0xA0 ; copy u values
+    pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
+; extract v
+    punpcklbw xmm2,xmm7     ; interleave xmm7 into xmm1 v0v0v0v000000000
+    punpcklwd xmm2,xmm7     ; interleave again v000v000v000v000
+    pshuflw xmm2,xmm2, 0xA0 ; copy v values
+    pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
+
+yuv2rgbsse2
+
+rgba32sse2output
+
+; endloop
+    add edi,32
+    add esi,8
+    add eax,4
+    add ebx,4
+    sub ecx, 1              ; apparently sub is better than dec
+    jnz REPEATLOOP1
+ENDLOOP1:
+; Cleanup
+    pop ebx
+    pop eax
+    pop ecx
+    pop esi
+    pop edi
+    mov esp, ebp
+    pop ebp
+    ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits

Added: haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
===================================================================
--- haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm	(rev 0)
+++ haiku/trunk/src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm	2010-12-21 07:25:40 UTC (rev 39913)
@@ -0,0 +1,307 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
+;
+
+; A rather unoptimised set of ssse3 yuv to rgb converters
+; does 8 pixels per loop
+
+; inputer:
+; reads 128 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro cglobal 1
+    global _%1
+    %define %1 _%1
+    align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse2 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+    movdqa xmm7, [Const16]  ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm0,xmm7        ; y = y - 16
+; subtract 128 from u and v
+    movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
+    psubsw xmm1,xmm7        ; u = u - 128
+    psubsw xmm2,xmm7        ; v = v - 128
+; load r,b with y
+    movdqa xmm3,xmm0        ; r = y
+    pshufd xmm5,xmm0, 0xE4  ; b = y
+
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+    paddsw xmm3, xmm2       ; add v to r
+    movdqa xmm7, xmm1       ; move u to scratch
+    pshufd xmm6, xmm2, 0xE4 ; move v to scratch
+
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,1            ; divide v by 2
+    paddsw xmm3, xmm6       ; and add to r
+    psraw xmm6,2            ; divide v by 4
+    paddsw xmm3, xmm6       ; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+    paddsw xmm5, xmm1       ; add u to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,1            ; divide u by 2
+    paddsw xmm5, xmm7       ; and add to b
+    psraw xmm7,4            ; divide u by 32
+    paddsw xmm5, xmm7       ; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+    movdqa xmm7,xmm2        ; move v to scratch

[... truncated: 232 lines follow ...]
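Since the SSE, SSE2 and SSSE3 packed converters are meant to produce the same
pixels from the same shift-add math, a small standalone harness that feeds one
random YUYV row to each _Convert_YUV422_RGBA32_* entry point and compares the
outputs makes a quick smoke test. A minimal sketch under stated assumptions:
the extern declarations mirror gfx_conv_mmx.cpp, buffers are 32-byte aligned
as the dispatch code above requires, the width is a multiple of 8 so every
path qualifies, and posix_memalign is only illustrative; whether the SSSE3
path matches bit-for-bit depends on its constants, whose hunk is truncated
above.

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <stdint.h>

    // Row converters as declared in gfx_conv_mmx.cpp (the leading underscore
    // comes from the cglobal macro in the .nasm files).
    extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width);
    extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width);
    extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width);

    int main()
    {
        const int width = 64;       // multiple of 8, so all three paths qualify
        void *src, *a, *b;
        // 32-byte alignment mirrors the checks in gfx_conv_mmx.cpp and covers
        // the movdqa loads and movntdq stores in the asm.
        if (posix_memalign(&src, 32, width * 2)     // packed YUYV, 2 bytes/pixel
            || posix_memalign(&a, 32, width * 4)    // BGRA, 4 bytes/pixel
            || posix_memalign(&b, 32, width * 4))
            return 1;
        for (int i = 0; i < width * 2; i++)
            ((uint8_t *)src)[i] = rand() & 0xff;    // one row of random YUYV

        _Convert_YUV422_RGBA32_SSE(src, a, width);
        _Convert_YUV422_RGBA32_SSE2(src, b, width);
        printf("SSE vs SSE2:  %s\n", memcmp(a, b, width * 4) == 0 ? "match" : "DIFFER");

        _Convert_YUV422_RGBA32_SSSE3(src, b, width);
        printf("SSE vs SSSE3: %s\n", memcmp(a, b, width * 4) == 0 ? "match" : "DIFFER");

        free(src);
        free(a);
        free(b);
        return 0;
    }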