/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar
 *          Victorien Le Couviour--Tuffet
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#ifdef COPY_TEST
# undef NDEBUG
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift);

#define ASSERT_PLANE(i) assert(src[i]); \
    assert(src_pitch[i])

#define ASSERT_2PLANES \
    assert(dst); \
    ASSERT_PLANE(0); \
    ASSERT_PLANE(1); \
    assert(height)

#define ASSERT_3PLANES ASSERT_2PLANES; \
    ASSERT_PLANE(2)

int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
    cache->buffer = aligned_alloc(64, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache;
    (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    aligned_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}
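
/* Typical call sequence (illustrative sketch only; the caller-side names are
 * hypothetical):
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, width_in_bytes) != VLC_SUCCESS)
 *         return VLC_EGENERIC;
 *     Copy420_SP_to_P(dst_pic, src_planes, src_pitches, visible_height, &cache);
 *     CopyCleanCache(&cache);
 *
 * The cache is a 64-byte aligned bounce buffer (at least 16 KiB, or one
 * 64-byte padded line, whichever is larger) used by the SSE paths below to
 * stream lines out of USWC video memory before reading them back. */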
#ifdef CAN_COMPILE_SSE2

/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
#define COPY16_SHIFTR(x) \
    "psrlw "x", %%xmm1\n"
#define COPY16_SHIFTL(x) \
    "psllw "x", %%xmm1\n"

#define COPY16_S(dstp, srcp, load, store, shiftstr) \
    asm volatile ( \
        load " 0(%[src]), %%xmm1\n" \
        shiftstr \
        store " %%xmm1, 0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")

#define COPY64_SHIFTR(x) \
    "psrlw "x", %%xmm1\n" \
    "psrlw "x", %%xmm2\n" \
    "psrlw "x", %%xmm3\n" \
    "psrlw "x", %%xmm4\n"
#define COPY64_SHIFTL(x) \
    "psllw "x", %%xmm1\n" \
    "psllw "x", %%xmm2\n" \
    "psllw "x", %%xmm3\n" \
    "psllw "x", %%xmm4\n"

#define COPY64_S(dstp, srcp, load, store, shiftstr) \
    asm volatile ( \
        load " 0(%[src]), %%xmm1\n" \
        load " 16(%[src]), %%xmm2\n" \
        load " 32(%[src]), %%xmm3\n" \
        load " 48(%[src]), %%xmm4\n" \
        shiftstr \
        store " %%xmm1, 0(%[dst])\n" \
        store " %%xmm2, 16(%[dst])\n" \
        store " %%xmm3, 32(%[dst])\n" \
        store " %%xmm4, 48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

#define COPY64(dstp, srcp, load, store) \
    COPY64_S(dstp, srcp, load, store, "")

#ifdef COPY_TEST_NOOPTIM
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
# undef vlc_CPU_SSE3
# define vlc_CPU_SSE3() (0)
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() (0)
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() (0)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available. */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height, int bitshift)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

#define SSE_USWC_COPY(shiftstr16, shiftstr64) \
    for (unsigned y = 0; y < height; y++) { \
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
        unsigned x = unaligned; \
        if (vlc_CPU_SSE4_1()) { \
            if (!unaligned) { \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
            } else { \
                COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
            } \
        } else { \
            if (!unaligned) { \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
            } else { \
                COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
            } \
        } \
        /* The following should not happen since buffers are generally well aligned */ \
        if (x < width) \
            CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
        src += src_pitch; \
        dst += dst_pitch; \
    }

    switch (bitshift)
    {
        case 0:
            SSE_USWC_COPY("", "")
            break;
        case -6:
            SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
            break;
        case 6:
            SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
            break;
        case 2:
            SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
            break;
        case -2:
            SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
            break;
        case 4:
            SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
            break;
        case -4:
            SSE_USWC_COPY(COPY16_SHIFTL("$4"), COPY64_SHIFTL("$4"))
            break;
        default:
            vlc_assert_unreachable();
    }
#undef SSE_USWC_COPY

    asm volatile ("mfence");
}

VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
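
/* Interleave two chroma planes into one semi-planar UV plane (e.g. I420 to
 * NV12).  For 8-bit samples the bytes are woven as:
 *
 *     srcu: U0 U1 U2 U3 ...    srcv: V0 V1 V2 V3 ...
 *     dst : U0 V0 U1 V1 U2 V2 U3 V3 ...
 *
 * The SSSE3 path does the weave with one pshufb per 16 output bytes; the
 * plain SSE2 fallback uses movhlps + punpcklbw and handles 8-bit samples
 * only (see the assert below). */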
VLC_SSE
static void SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
                             uint8_t *srcu, size_t srcu_pitch,
                             uint8_t *srcv, size_t srcv_pitch,
                             unsigned int width, unsigned int height,
                             uint8_t pixel_size)
{
    assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f)
        && !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));

    static const uint8_t shuffle_8[] = { 0, 8, 1, 9, 2, 10, 3, 11,
                                         4, 12, 5, 13, 6, 14, 7, 15 };
    static const uint8_t shuffle_16[] = { 0, 1, 8, 9, 2, 3, 10, 11,
                                          4, 5, 12, 13, 6, 7, 14, 15 };
    const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;

    for (unsigned int y = 0; y < height; ++y)
    {
        unsigned int x;

#define LOAD2X32 \
    "movhpd 0x00(%[src2]), %%xmm0\n" \
    "movlpd 0x00(%[src1]), %%xmm0\n" \
    \
    "movhpd 0x08(%[src2]), %%xmm1\n" \
    "movlpd 0x08(%[src1]), %%xmm1\n" \
    \
    "movhpd 0x10(%[src2]), %%xmm2\n" \
    "movlpd 0x10(%[src1]), %%xmm2\n" \
    \
    "movhpd 0x18(%[src2]), %%xmm3\n" \
    "movlpd 0x18(%[src1]), %%xmm3\n"

#define STORE64 \
    "movdqu %%xmm0, 0x00(%[dst])\n" \
    "movdqu %%xmm1, 0x10(%[dst])\n" \
    "movdqu %%xmm2, 0x20(%[dst])\n" \
    "movdqu %%xmm3, 0x30(%[dst])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD2X32
                    "pshufb %%xmm7, %%xmm0\n"
                    "pshufb %%xmm7, %%xmm1\n"
                    "pshufb %%xmm7, %%xmm2\n"
                    "pshufb %%xmm7, %%xmm3\n"
                    STORE64
                    : : [dst]"r"(dst+2*x), [src1]"r"(srcu+x), [src2]"r"(srcv+x),
                        [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
                );
        else
#endif
        {
            assert(pixel_size == 1);
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile (
                    LOAD2X32
                    "movhlps   %%xmm0, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm0\n"
                    "movhlps   %%xmm1, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm1\n"
                    "movhlps   %%xmm2, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm2\n"
                    "movhlps   %%xmm3, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm3\n"
                    STORE64
                    : : [dst]"r"(dst+2*x), [src1]"r"(srcu+x), [src2]"r"(srcv+x)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
                );
        }
#undef LOAD2X32
#undef STORE64

        if (pixel_size == 1)
        {
            for (; x < width; x++)
            {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcv[x];
            }
        }
        else
        {
            for (; x < width; x+= 2)
            {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcu[x + 1];
                dst[2*x+2] = srcv[x];
                dst[2*x+3] = srcv[x + 1];
            }
        }
        srcu += srcu_pitch;
        srcv += srcv_pitch;
        dst  += dst_pitch;
    }
}

VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, uint8_t pixel_size)
{
    assert(pixel_size == 1 || pixel_size == 2);
    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
    if (vlc_CPU_SSSE3())
    {
        static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                             1, 3, 5, 7, 9, 11, 13, 15 };
        static const uint8_t shuffle_16[] = { 0, 1, 4, 5, 8, 9, 12, 13,
                                              2, 3, 6, 7, 10, 11, 14, 15 };
        const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;

        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb %%xmm7, %%xmm0\n"
                    "pshufb %%xmm7, %%xmm1\n"
                    "pshufb %%xmm7, %%xmm2\n"
                    "pshufb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
            if (pixel_size == 1)
            {
                for (; x < width; x++) {
                    dstu[x] = src[2*x+0];
                    dstv[x] = src[2*x+1];
                }
            }
            else
            {
                for (; x < width; x+= 2) {
                    dstu[x]   = src[2*x+0];
                    dstu[x+1] = src[2*x+1];
                    dstv[x]   = src[2*x+2];
                    dstv[x+1] = src[2*x+3];
                }
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
    else
#endif
    {
        assert(pixel_size == 1);
        static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                        0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [mask]"r"(mask)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64
}
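
/* Copy a single plane out of USWC memory by bouncing through the cache:
 * CopyFromUswc() streams a block of lines into the cacheable buffer, then
 * Copy2d() writes that block to the destination with regular (optionally
 * non-temporal) stores.  The block height is hstep = cache_size / w16 lines;
 * for example, with the default 16 KiB cache and a 1920-byte pitch that is
 * 16384 / 1920 = 8 lines per iteration. */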
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned height, int bitshift)
{
    const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
    const unsigned w16 = (copy_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    const unsigned cache_width = __MIN(src_pitch, cache_size);
    assert(hstep > 0);

    /* If SSE4.1: CopyFromUswc is faster than memcpy */
    if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch)
        memcpy(dst, src, copy_pitch * height);
    else
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch, cache, w16, copy_pitch, hblock);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
}

static void SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
                                 const uint8_t *srcu, size_t srcu_pitch,
                                 const uint8_t *srcv, size_t srcv_pitch,
                                 uint8_t *cache, size_t cache_size,
                                 unsigned int height, uint8_t pixel_size,
                                 int bitshift)
{
    assert(srcu_pitch == srcv_pitch);
    size_t copy_pitch = __MIN(dst_pitch / 2, srcu_pitch);
    unsigned int const w16 = (srcu_pitch+15) & ~15;
    unsigned int const hstep = (cache_size) / (2*w16);
    const unsigned cacheu_width = __MIN(srcu_pitch, cache_size);
    const unsigned cachev_width = __MIN(srcv_pitch, cache_size);
    assert(hstep > 0);

    for (unsigned int y = 0; y < height; y += hstep)
    {
        unsigned int const hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, srcu, srcu_pitch, cacheu_width, hblock, bitshift);
        CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch, cachev_width,
                     hblock, bitshift);

        /* Copy from our cache to the destination */
        SSE_InterleaveUV(dst, dst_pitch, cache, w16, cache + w16 * hblock, w16,
                         copy_pitch, hblock, pixel_size);

        /* */
        srcu += hblock * srcu_pitch;
        srcv += hblock * srcv_pitch;
        dst  += hblock * dst_pitch;
    }
}
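
/* Split a semi-planar UV plane into separate U and V planes (e.g. NV12 or
 * P010 to planar).  The interleaved source lines are first streamed into the
 * cache with CopyFromUswc(), then deinterleaved straight out of the cache
 * with SSE_SplitUV(), again hstep lines at a time. */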
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned height, uint8_t pixel_size, int bitshift)
{
    size_t copy_pitch = __MIN(__MIN(src_pitch / 2, dstu_pitch), dstv_pitch);
    const unsigned w16 = (src_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    const unsigned cache_width = __MIN(src_pitch, cache_size);
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, copy_pitch, hblock, pixel_size);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
}

static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
                               const size_t src_pitch[static 3],
                               unsigned height, const copy_cache_t *cache)
{
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n], cache->buffer, cache->size,
                      (height+d-1)/d, 0);
    }
    asm volatile ("emms");
}

static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
                                 const size_t src_pitch[static 2],
                                 unsigned height, const copy_cache_t *cache)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0], cache->buffer, cache->size,
                  height, 0);
    SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
                  src[1], src_pitch[1], cache->buffer, cache->size,
                  (height+1) / 2, 0);
    asm volatile ("emms");
}

static void SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
                                const size_t src_pitch[static 2],
                                unsigned int height, uint8_t pixel_size,
                                int bitshift, const copy_cache_t *cache)
{
    SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
                  src[0], src_pitch[0], cache->buffer, cache->size,
                  height, bitshift);
    SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
                    dest->p[2].p_pixels, dest->p[2].i_pitch,
                    src[1], src_pitch[1], cache->buffer, cache->size,
                    (height+1) / 2, pixel_size, bitshift);
    asm volatile ("emms");
}

static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                                const size_t src_pitch[static 3],
                                unsigned height, uint8_t pixel_size,
                                int bitshift, const copy_cache_t *cache)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0], cache->buffer, cache->size,
                  height, bitshift);
    SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
                         src[U_PLANE], src_pitch[U_PLANE],
                         src[V_PLANE], src_pitch[V_PLANE],
                         cache->buffer, cache->size,
                         (height+1) / 2, pixel_size, bitshift);
    asm volatile ("emms");
}
#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift)
{
    const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
    if (bitshift != 0)
    {
        for (unsigned y = 0; y < height; y++)
        {
            uint16_t *dst16 = (uint16_t *) dst;
            const uint16_t *src16 = (const uint16_t *) src;

            if (bitshift > 0)
                for (unsigned x = 0; x < (copy_pitch / 2); x++)
                    *dst16++ = (*src16++) >> (bitshift & 0xf);
            else
                for (unsigned x = 0; x < (copy_pitch / 2); x++)
                    *dst16++ = (*src16++) << ((-bitshift) & 0xf);
            src += src_pitch;
            dst += dst_pitch;
        }
    }
    else if (src_pitch == dst_pitch)
        memcpy(dst, src, copy_pitch * height);
    else
        for (unsigned y = 0; y < height; y++) {
            memcpy(dst, src, copy_pitch);
            src += src_pitch;
            dst += dst_pitch;
        }
}
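
/* Public entry point for a single packed plane.  The USWC bounce copy is only
 * worth taking when SSE4.1 (movntdqa) is available; otherwise this falls
 * through to the plain CopyPlane() above, which is just a per-line memcpy()
 * when no bit shift is requested. */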
void CopyPacked(picture_t *dst, const uint8_t *src, const size_t src_pitch,
                unsigned height, const copy_cache_t *cache)
{
    assert(dst);
    assert(src);
    assert(src_pitch);
    assert(height);

#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE4_1())
        return SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                             src, src_pitch, cache->buffer, cache->size,
                             height, 0);
#else
    (void) cache;
#endif
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch, height, 0);
}

void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
                      const size_t src_pitch[static 2], unsigned height,
                      const copy_cache_t *cache)
{
    ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], (height+1)/2, 0);
}

#define SPLIT_PLANES(type, pitch_den) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
            ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

#define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
            ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

#define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
            ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch, unsigned height)
{
    SPLIT_PLANES(uint8_t, 2);
}

static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
                          uint8_t *dstv, size_t dstv_pitch,
                          const uint8_t *src, size_t src_pitch,
                          unsigned height, int bitshift)
{
    if (bitshift == 0)
        SPLIT_PLANES(uint16_t, 4);
    else if (bitshift > 0)
        SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift & 0xf);
    else
        SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift) & 0xf);
}

void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
                     const size_t src_pitch[static 2], unsigned height,
                     const copy_cache_t *cache)
{
    ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
#else
    VLC_UNUSED(cache);
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
                dst->p[2].p_pixels, dst->p[2].i_pitch,
                src[1], src_pitch[1], (height+1)/2);
}
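
/* 16-bit variants: bitshift > 0 shifts every sample right, bitshift < 0
 * shifts left.  This is how P010 (10 significant bits stored in the top of
 * each 16-bit word) is converted to and from 10-bit planar formats: for
 * example, the 10-bit value 0x2A4 becomes the P010 sample 0x2A4 << 6 = 0xA900,
 * and shifting right by 6 undoes it. */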
void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
                        const size_t src_pitch[static 2], unsigned height,
                        int bitshift, const copy_cache_t *cache)
{
    ASSERT_2PLANES;
    assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));

#ifdef CAN_COMPILE_SSE3
    if (vlc_CPU_SSSE3())
        return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2,
                                   bitshift, cache);
#else
    VLC_UNUSED(cache);
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, bitshift);
    SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
                  dst->p[2].p_pixels, dst->p[2].i_pitch,
                  src[1], src_pitch[1], (height+1)/2, bitshift);
}

#define INTERLEAVE_UV() do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)

#define INTERLEAVE_UV_SHIFTR(bitshift) do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = (*srcU++) >> (bitshift); \
            *dstUV++ = (*srcV++) >> (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)

#define INTERLEAVE_UV_SHIFTL(bitshift) do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = (*srcU++) << (bitshift); \
            *dstUV++ = (*srcV++) << (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)

void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                     const size_t src_pitch[static 3], unsigned height,
                     const copy_cache_t *cache)
{
    ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
#else
    (void) cache;
#endif
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);

    const unsigned copy_lines = (height+1) / 2;
    const unsigned copy_pitch = __MIN(src_pitch[1], dst->p[1].i_pitch / 2);

    const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;

    uint8_t *dstUV = dst->p[1].p_pixels;
    const uint8_t *srcU = src[U_PLANE];
    const uint8_t *srcV = src[V_PLANE];
    INTERLEAVE_UV();
}

void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                        const size_t src_pitch[static 3], unsigned height,
                        int bitshift, const copy_cache_t *cache)
{
    ASSERT_3PLANES;
    assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));

#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSSE3())
        return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2,
                                   bitshift, cache);
#else
    (void) cache;
#endif
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, bitshift);

    const unsigned copy_lines = (height+1) / 2;
    const unsigned copy_pitch = src_pitch[1] / 2;

    const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;

    uint16_t *dstUV = (void*) dst->p[1].p_pixels;
    const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
    const uint16_t *srcV = (const uint16_t *) src[V_PLANE];

    if (bitshift == 0)
        INTERLEAVE_UV();
    else if (bitshift > 0)
        INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
    else
        INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
}
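
/* Plain C conversion from 10-bit planar YUV to P010: every luma and chroma
 * sample is widened with a left shift by 6 and the U/V planes are interleaved
 * on the fly.  The copy cache is not used here. */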
void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
                           const size_t src_pitch[static 3],
                           unsigned height, const copy_cache_t *cache)
{
    (void) cache;

    const int i_extra_pitch_dst_y = (dst->p[0].i_pitch  - src_pitch[0]) / 2;
    const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
    uint16_t *dstY = (uint16_t *) dst->p[0].p_pixels;
    const uint16_t *srcY = (const uint16_t *) src[Y_PLANE];

    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
            *dstY++ = *srcY++ << 6;
        }
        dstY += i_extra_pitch_dst_y;
        srcY += i_extra_pitch_src_y;
    }

    const unsigned copy_lines = height / 2;
    const unsigned copy_pitch = src_pitch[1] / 2;

    const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;

    uint16_t *dstUV = (uint16_t *) dst->p[1].p_pixels;
    const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
    const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
    for ( unsigned int line = 0; line < copy_lines; line++ ) {
        for ( unsigned int col = 0; col < copy_pitch; col++ ) {
            *dstUV++ = *srcU++ << 6;
            *dstUV++ = *srcV++ << 6;
        }
        dstUV += i_extra_pitch_uv;
        srcU  += i_extra_pitch_u;
        srcV  += i_extra_pitch_v;
    }
}

void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
                    const size_t src_pitch[static 3], unsigned height,
                    const copy_cache_t *cache)
{
    ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], (height+1) / 2, 0);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], (height+1) / 2, 0);
}

void picture_SwapUV(picture_t *picture)
{
    assert(picture->i_planes == 3);

    plane_t tmp_plane   = picture->p[U_PLANE];
    picture->p[U_PLANE] = picture->p[V_PLANE];
    picture->p[V_PLANE] = tmp_plane;
}

int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
{
    /* fill in buffer info in first plane */
    picture->p->p_pixels = data;
    picture->p->i_pitch  = pitch;
    picture->p->i_lines  = picture->format.i_height;
    assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
    assert(picture->p->i_visible_lines <= picture->p->i_lines);

    /* Fill chroma planes for biplanar YUV */
    if (picture->format.i_chroma == VLC_CODEC_NV12 ||
        picture->format.i_chroma == VLC_CODEC_NV21 ||
        picture->format.i_chroma == VLC_CODEC_P010) {

        for (int n = 1; n < picture->i_planes; n++) {
            const plane_t *o = &picture->p[n-1];
            plane_t *p = &picture->p[n];

            p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
            p->i_pitch  = pitch;
            p->i_lines  = picture->format.i_height / 2;
            assert(p->i_visible_pitch <= p->i_pitch);
            assert(p->i_visible_lines <= p->i_lines);
        }
        /* The dx/d3d buffer is always allocated as NV12 */
        if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma,
                                          VLC_CODEC_NV12)) {
            /* TODO : Swap NV21 UV planes to match NV12 */
            return VLC_EGENERIC;
        }
    }
    /* Fill chroma planes for planar YUV */
    else if (picture->format.i_chroma == VLC_CODEC_I420 ||
             picture->format.i_chroma == VLC_CODEC_J420 ||
             picture->format.i_chroma == VLC_CODEC_YV12) {

        for (int n = 1; n < picture->i_planes; n++) {
            const plane_t *o = &picture->p[n-1];
            plane_t *p = &picture->p[n];

            p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
            p->i_pitch  = pitch / 2;
            p->i_lines  = picture->format.i_height / 2;
        }
        /* The dx/d3d buffer is always allocated as YV12 */
        if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma,
                                          VLC_CODEC_YV12)) {
            uint8_t *p_tmp = picture->p[1].p_pixels;
            picture->p[1].p_pixels = picture->p[2].p_pixels;
            picture->p[2].p_pixels = p_tmp;
        }
    }
    return VLC_SUCCESS;
}
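
/* Stand-alone self-test, built when COPY_TEST is defined: source pictures are
 * filled with a known pattern, every converter is run over a range of sizes,
 * and each destination pixel is then checked.  Defining COPY_TEST_NOOPTIM
 * forces the plain C fall-back paths instead of the SSE ones. */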
#ifdef COPY_TEST

#include <unistd.h> /* alarm() */

struct test_dst
{
    vlc_fourcc_t chroma;
    int bitshift;
    union
    {
        void (*conv)(picture_t *, const uint8_t *[], const size_t [],
                     unsigned, const copy_cache_t *);
        void (*conv16)(picture_t *, const uint8_t *[], const size_t [],
                       unsigned, int, const copy_cache_t *);
    };
};

struct test_conv
{
    vlc_fourcc_t src_chroma;
    struct test_dst dsts[3];
};

static const struct test_conv convs[] = {
    { .src_chroma = VLC_CODEC_NV12,
      .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
                { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
    },
    { .src_chroma = VLC_CODEC_I420,
      .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
                { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
    },
    { .src_chroma = VLC_CODEC_P010,
      .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
    },
    { .src_chroma = VLC_CODEC_I420_10L,
      .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
    },
};
#define NB_CONVS ARRAY_SIZE(convs)

struct test_size
{
    int i_width;
    int i_height;
    int i_visible_width;
    int i_visible_height;
};

static const struct test_size sizes[] = {
    { 1, 1, 1, 1 },
    { 3, 3, 3, 3 },
    { 65, 39, 65, 39 },
    { 560, 369, 540, 350 },
    { 1274, 721, 1200, 720 },
    { 1920, 1088, 1920, 1080 },
    { 3840, 2160, 3840, 2160 },
#if 0 /* too long */
    { 8192, 8192, 8192, 8192 },
#endif
};
#define NB_SIZES ARRAY_SIZE(sizes)

static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
                     bool init)
{
#define ASSERT_COLOR(good) do { \
    fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", \
            i, x, y, *(--p), good); \
    assert(!"error: pixel doesn't match"); \
} while(0)

#define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
    for (int i = 0; i < pic->i_planes; ++i) \
    { \
        const struct plane_t *plane = &pic->p[i]; \
        for (int y = 0; y < plane->i_visible_lines; ++y) \
        { \
            if (pic->i_planes == 2 && i == 1) \
            { \
                type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
                    if (init) \
                        *(p++) = color_UV; \
                    else if (*(p++) != color_UV) \
                        ASSERT_COLOR(color_UV); \
            } \
            else \
            { \
                type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
                    if (init) \
                        *(p++) = colors_P[i]; \
                    else if (*(p++) != colors_P[i]) \
                        ASSERT_COLOR(colors_P[i]); \
            } \
        } \
    } \
} while (0)

    assert(pic->i_planes == 2 || pic->i_planes == 3);
    assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);

    if (dsc->pixel_size == 1)
    {
        const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
        const uint16_t color_8_UV = ntoh16(0xF136);

        PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
    }
    else
    {
        const unsigned mask = (1 << dsc->pixel_bits) - 1;
        uint16_t colors_16_P[3] = { 0x1042 & mask, 0xF114 & mask, 0x3645 & mask };
        switch (pic->format.i_chroma)
        {
            case VLC_CODEC_P010:
                for (size_t i = 0; i < 3; ++i)
                    colors_16_P[i] <<= 6;
                break;
            case VLC_CODEC_I420_10L:
                break;
            default:
                vlc_assert_unreachable();
        }
        uint32_t color_16_UV = (colors_16_P[2] << 16) | colors_16_P[1];

        PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
    }
}

static void pic_rsc_destroy(picture_t *pic)
{
    for (unsigned i = 0; i < 3; i++)
        free(pic->p[i].p_pixels);
    free(pic);
}

static picture_t *pic_new_unaligned(const video_format_t *fmt)
{
    /* Allocate a non-aligned picture in order to ease buffer overflow
     * detection from the source picture */
    const vlc_chroma_description_t *dsc =
        vlc_fourcc_GetChromaDescription(fmt->i_chroma);
    assert(dsc);
    picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
    for (unsigned i = 0; i < dsc->plane_count; i++)
    {
        rsc.p[i].i_lines = ((fmt->i_visible_height + (dsc->p[i].h.den - 1))
                            / dsc->p[i].h.den) * dsc->p[i].h.num;
        rsc.p[i].i_pitch = ((fmt->i_visible_width + (dsc->p[i].w.den - 1))
                            / dsc->p[i].w.den) * dsc->p[i].w.num * dsc->pixel_size;
        rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
        assert(rsc.p[i].p_pixels);
    }
    return picture_NewFromResource(fmt, &rsc);
}
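
/* Test driver: for each (source chroma, destination) pair and each size,
 * build a deliberately unaligned source picture, fill it with the reference
 * pattern, run the converter with a freshly initialised copy cache, then
 * verify the destination.  Exit code 77 marks the test as skipped when SSE2
 * is unavailable and the optimised paths cannot be exercised. */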
int main(void)
{
    alarm(10);

#ifndef COPY_TEST_NOOPTIM
    if (!vlc_CPU_SSE2())
    {
        fprintf(stderr, "WARNING: could not test SSE\n");
        return 77;
    }
#endif

    for (size_t i = 0; i < NB_CONVS; ++i)
    {
        const struct test_conv *conv = &convs[i];

        for (size_t j = 0; j < NB_SIZES; ++j)
        {
            const struct test_size *size = &sizes[j];

            const vlc_chroma_description_t *src_dsc =
                vlc_fourcc_GetChromaDescription(conv->src_chroma);
            assert(src_dsc);

            video_format_t fmt;
            video_format_Init(&fmt, 0);
            video_format_Setup(&fmt, conv->src_chroma,
                               size->i_width, size->i_height,
                               size->i_visible_width, size->i_visible_height,
                               1, 1);
            picture_t *src = pic_new_unaligned(&fmt);
            assert(src);

            piccheck(src, src_dsc, true);

            copy_cache_t cache;
            int ret = CopyInitCache(&cache, src->format.i_width
                                    * src_dsc->pixel_size);
            assert(ret == VLC_SUCCESS);

            for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
            {
                const struct test_dst *test_dst = &conv->dsts[f];
                const vlc_chroma_description_t *dst_dsc =
                    vlc_fourcc_GetChromaDescription(test_dst->chroma);
                assert(dst_dsc);

                fmt.i_chroma = test_dst->chroma;
                picture_t *dst = picture_NewFromFormat(&fmt);
                assert(dst);

                const uint8_t *src_planes[3] = { src->p[Y_PLANE].p_pixels,
                                                 src->p[U_PLANE].p_pixels,
                                                 src->p[V_PLANE].p_pixels };
                const size_t src_pitches[3] = { src->p[Y_PLANE].i_pitch,
                                                src->p[U_PLANE].i_pitch,
                                                src->p[V_PLANE].i_pitch };

                fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
                        size->i_width, size->i_height,
                        size->i_visible_width, size->i_visible_height,
                        (const char *) &src->format.i_chroma,
                        (const char *) &dst->format.i_chroma);

                if (test_dst->bitshift == 0)
                    test_dst->conv(dst, src_planes, src_pitches,
                                   src->format.i_visible_height, &cache);
                else
                    test_dst->conv16(dst, src_planes, src_pitches,
                                     src->format.i_visible_height,
                                     test_dst->bitshift, &cache);

                piccheck(dst, dst_dsc, false);
                picture_Release(dst);
            }
            picture_Release(src);
            CopyCleanCache(&cache);
        }
    }

    return 0;
}

#endif