Add ARM NEON (Advanced SIMD) motion compensation to libmpeg2.

Summary of the changes below:
  * configure.ac: on arm* hosts, test-compile one NEON instruction in inline
    asm and define ARCH_ARM_NEON when the compiler accepts it.
  * include/mpeg2.h: add the MPEG2_ACCEL_ARM_NEON acceleration flag (value 2).
  * libmpeg2/Makefile.am: build motion_comp_neon.c whenever ARCH_ARM is set;
    the file compiles to nothing unless ARCH_ARM_NEON is defined.
  * libmpeg2/motion_comp.c: pick mpeg2_mc_neon when MPEG2_ACCEL_ARM_NEON is
    set in the accel mask, ahead of the existing mpeg2_mc_arm fallback.
  * libmpeg2/motion_comp_neon.c: new file with the NEON routines.
  * libmpeg2/mpeg2_internal.h: declare mpeg2_mc_neon.

Notes for reviewers:
  * Paths are rooted at libmpeg2.orig/ and libmpeg2/, so this presumably
    applies with -p1 from the top of the libmpeg2 tree -- TODO confirm.
  * Leading whitespace on context lines was reconstructed and may not match
    the upstream files byte for byte; if a hunk is rejected, retry with
    "patch -l" (ignore whitespace).
  * Nothing in this patch sets MPEG2_ACCEL_ARM_NEON during auto-detection
    (cpu_accel.c is not touched here) -- TODO confirm that is handled
    elsewhere.

diff -ruN libmpeg2.orig//configure.ac libmpeg2/configure.ac
--- libmpeg2.orig//configure.ac	2011-11-06 22:25:07.273694621 -0500
+++ libmpeg2/configure.ac	2011-11-06 22:25:19.033752939 -0500
@@ -103,7 +103,14 @@
 	    AC_DEFINE([ARCH_ALPHA],,[alpha architecture]);;
 	arm*)
 	    arm_conditional=:
-	    AC_DEFINE([ARCH_ARM],,[ARM architecture]);;
+	    AC_DEFINE([ARCH_ARM],,[ARM architecture])
+	    AC_MSG_CHECKING([if inline ARM Advanced SIMD assembly is supported])
+	    AC_TRY_COMPILE([],
+		[asm ("vqmovun.s64 d0, q1":::"d0");],
+		[AC_DEFINE([ARCH_ARM_NEON],, [ARM Advanced SIMD assembly])
+		 AC_MSG_RESULT(yes)],
+		[AC_MSG_RESULT(no)])
+	    ;;
     esac
 elif test x"$CC" = x"tendracc"; then
     dnl TenDRA portability checking compiler
diff -ruN libmpeg2.orig//include/mpeg2.h libmpeg2/include/mpeg2.h
--- libmpeg2.orig//include/mpeg2.h	2011-11-06 22:25:07.297694741 -0500
+++ libmpeg2/include/mpeg2.h	2011-11-06 22:25:19.025752913 -0500
@@ -164,6 +164,7 @@
 #define MPEG2_ACCEL_SPARC_VIS 1
 #define MPEG2_ACCEL_SPARC_VIS2 2
 #define MPEG2_ACCEL_ARM 1
+#define MPEG2_ACCEL_ARM_NEON 2
 #define MPEG2_ACCEL_DETECT 0x80000000
 
 uint32_t mpeg2_accel (uint32_t accel);
diff -ruN libmpeg2.orig//libmpeg2/Makefile.am libmpeg2/libmpeg2/Makefile.am
--- libmpeg2.orig//libmpeg2/Makefile.am	2011-11-06 22:25:07.289694707 -0500
+++ libmpeg2/libmpeg2/Makefile.am	2011-11-06 22:25:19.033752939 -0500
@@ -14,7 +14,7 @@
 			  motion_comp_vis.c motion_comp_arm.c \
 			  cpu_accel.c cpu_state.c
 if ARCH_ARM
-libmpeg2arch_la_SOURCES += motion_comp_arm_s.S
+libmpeg2arch_la_SOURCES += motion_comp_arm_s.S motion_comp_neon.c
 endif
 
 libmpeg2arch_la_CFLAGS = $(OPT_CFLAGS) $(ARCH_OPT_CFLAGS) $(LIBMPEG2_CFLAGS)
diff -ruN libmpeg2.orig//libmpeg2/motion_comp.c libmpeg2/libmpeg2/motion_comp.c
--- libmpeg2.orig//libmpeg2/motion_comp.c	2011-11-06 22:25:07.289694707 -0500
+++ libmpeg2/libmpeg2/motion_comp.c	2011-11-06 22:25:19.029752924 -0500
@@ -58,6 +58,11 @@
     else
 #endif
 #ifdef ARCH_ARM
+#ifdef ARCH_ARM_NEON
+    if (accel & MPEG2_ACCEL_ARM_NEON)
+	mpeg2_mc = mpeg2_mc_neon;
+    else
+#endif
     if (accel & MPEG2_ACCEL_ARM) {
 	mpeg2_mc = mpeg2_mc_arm;
     } else
diff -ruN libmpeg2.orig//libmpeg2/motion_comp_neon.c libmpeg2/libmpeg2/motion_comp_neon.c
--- libmpeg2.orig//libmpeg2/motion_comp_neon.c	1969-12-31 19:00:00.000000000 -0500
+++ libmpeg2/libmpeg2/motion_comp_neon.c	2011-11-06 22:25:19.029752924 -0500
@@ -0,0 +1,302 @@
+/*
+ * motion_comp_neon.c
+ * Copyright (C) 2009 Rémi Denis-Courmont
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpeg2dec; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+
+#if defined(ARCH_ARM_NEON)
+
+#include <stdint.h>
+#include <string.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+/* dest = ref */
+static void MC_put_o_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    do {
+        memcpy (dest, ref, 16);
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_put_o_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    do {
+        memcpy (dest, ref, 8);
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+/* dest = (src1 + src2 + 1) / 2 */
+static void MC_avg_1_16_neon (uint8_t * dest, const uint8_t * src1,
+                              const uint8_t * src2,
+                              const int stride, unsigned height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {q0}, [%[src1]]\n"
+            "vld1.u8 {q1}, [%[src2]]\n"
+            "vrhadd.u8 q0, q0, q1\n"
+            /* XXX: three cycles stall */
+            "vst1.u8 {q0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+            : "memory", "q0", "q1");
+        src1 += stride;
+        src2 += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_1_8_neon (uint8_t * dest, const uint8_t * src1,
+                             const uint8_t * src2,
+                             const int stride, unsigned height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {d0}, [%[src1]]\n"
+            "vld1.u8 {d1}, [%[src2]]\n"
+            "vrhadd.u8 d0, d0, d1\n"
+            "vst1.u8 {d0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+            : "memory", "q0");
+
+        src1 += stride;
+        src2 += stride;
+        dest += stride;
+    } while (--height);
+}
+
+/* dest = (dest + ((src1 + src2 + 1) / 2) + 1) / 2 */
+static void MC_avg_2_16_neon (uint8_t * dest, const uint8_t * src1,
+                              const uint8_t * src2,
+                              const int stride, unsigned height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {q0}, [%[src1]]\n"
+            "vld1.u8 {q1}, [%[src2]]\n"
+            "vrhadd.u8 q0, q0, q1\n"
+            "vld1.u8 {q2}, [%[dest]]\n"
+            /* XXX: one cycle stall */
+            "vrhadd.u8 q0, q0, q2\n"
+            /* XXX: three cycles stall */
+            "vst1.u8 {q0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+            : "memory", "q0", "q1", "q2");
+        src1 += stride;
+        src2 += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_2_8_neon (uint8_t * dest, const uint8_t * src1,
+                             const uint8_t * src2,
+                             const int stride, unsigned height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {d0}, [%[src1]]\n"
+            "vld1.u8 {d1}, [%[src2]]\n"
+            "vrhadd.u8 d0, d0, d1\n"
+            "vld1.u8 {d2}, [%[dest]]\n"
+            "vrhadd.u8 d0, d0, d2\n"
+            "vst1.u8 {d0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+            : "memory", "q0", "d2");
+        src1 += stride;
+        src2 += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_o_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, dest, ref, stride, height);
+}
+
+static void MC_avg_o_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, dest, ref, stride, height);
+}
+
+static void MC_put_x_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, ref, ref + 1, stride, height);
+}
+
+static void MC_put_x_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, ref, ref + 1, stride, height);
+}
+
+static void MC_avg_x_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    MC_avg_2_16_neon (dest, ref, ref + 1, stride, height);
+}
+
+static void MC_avg_x_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    MC_avg_2_8_neon (dest, ref, ref + 1, stride, height);
+}
+
+static void MC_put_y_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, ref, ref + stride, stride, height);
+}
+static void MC_put_y_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, ref, ref + stride, stride, height);
+}
+
+static void MC_avg_y_16_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    MC_avg_2_16_neon (dest, ref, ref + stride, stride, height);
+}
+
+static void MC_avg_y_8_neon (uint8_t * dest, const uint8_t * ref,
+                             const int stride, int height)
+{
+    MC_avg_2_8_neon (dest, ref, ref + stride, stride, height);
+}
+
+static void MC_put_xy_16_neon (uint8_t * dest, const uint8_t * ref,
+                               const int stride, int height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {q0}, [%[ref]]\n"
+            "vld1.u8 {q1}, [%[refx]]\n"
+            "vrhadd.u8 q0, q0, q1\n"
+            "vld1.u8 {q2}, [%[refy]]\n"
+            "vld1.u8 {q3}, [%[refxy]]\n"
+            "vrhadd.u8 q2, q2, q3\n"
+            /* XXX: three cycles stall */
+            "vrhadd.u8 q0, q0, q2\n"
+            /* XXX: three cycles stall */
+            "vst1.u8 {q0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+              [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+            : "memory", "q0", "q1", "q2", "q3");
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_put_xy_8_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {d0}, [%[ref]]\n"
+            "vld1.u8 {d1}, [%[refx]]\n"
+            "vrhadd.u8 d0, d0, d1\n"
+            "vld1.u8 {d2}, [%[refy]]\n"
+            "vld1.u8 {d3}, [%[refxy]]\n"
+            "vrhadd.u8 d2, d2, d3\n"
+            /* XXX: three cycles stall */
+            "vrhadd.u8 d0, d0, d2\n"
+            /* XXX: three cycles stall */
+            "vst1.u8 {d0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+              [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+            : "memory", "q0", "q1");
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_xy_16_neon (uint8_t * dest, const uint8_t * ref,
+                               const int stride, int height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {q0}, [%[ref]]\n"
+            "vld1.u8 {q1}, [%[refx]]\n"
+            "vrhadd.u8 q0, q0, q1\n"
+            "vld1.u8 {q2}, [%[refy]]\n"
+            "vld1.u8 {q3}, [%[refxy]]\n"
+            "vrhadd.u8 q2, q2, q3\n"
+            "vld1.u8 {q4}, [%[dest]]\n"
+            /* XXX: one cycle stall */
+            "vrhadd.u8 q0, q0, q2\n"
+            /* XXX: three cycles stall */
+            "vrhadd.u8 q0, q4, q0\n"
+            "vst1.u8 {q0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+              [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+            : "memory", "q0", "q1", "q2", "q3", "q4");
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_xy_8_neon (uint8_t * dest, const uint8_t * ref,
+                              const int stride, int height)
+{
+    do {
+        asm volatile (
+            "vld1.u8 {d0}, [%[ref]]\n"
+            "vld1.u8 {d1}, [%[refx]]\n"
+            "vrhadd.u8 d0, d0, d1\n"
+            "vld1.u8 {d2}, [%[refy]]\n"
+            "vld1.u8 {d3}, [%[refxy]]\n"
+            "vrhadd.u8 d2, d2, d3\n"
+            "vld1.u8 {d4}, [%[dest]]\n"
+            /* XXX: one cycle stall */
+            "vrhadd.u8 d0, d0, d2\n"
+            /* XXX: three cycles stall */
+            "vrhadd.u8 d0, d4, d0\n"
+            "vst1.u8 {d0}, [%[dest]]\n"
+            :
+            : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+              [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+            : "memory", "q0", "q1", "d4");
+        ref += stride;
+        dest += stride;
+    } while (--height);
+}
+
+MPEG2_MC_EXTERN (neon)
+
+#endif /* ARCH_ARM_NEON */
diff -ruN libmpeg2.orig//libmpeg2/mpeg2_internal.h libmpeg2/libmpeg2/mpeg2_internal.h
--- libmpeg2.orig//libmpeg2/mpeg2_internal.h	2011-11-06 22:25:07.293694722 -0500
+++ libmpeg2/libmpeg2/mpeg2_internal.h	2011-11-06 22:25:19.029752924 -0500
@@ -313,5 +313,6 @@
 extern mpeg2_mc_t mpeg2_mc_alpha;
 extern mpeg2_mc_t mpeg2_mc_vis;
 extern mpeg2_mc_t mpeg2_mc_arm;
+extern mpeg2_mc_t mpeg2_mc_neon;
 
 #endif /* LIBMPEG2_MPEG2_INTERNAL_H */