avfilter/x86/vf_pp7: Port ff_pp7_dctB_mmx to SSE2
Unfortunately, this is a bit slower than the MMX version, because memory operands cannot be used with paddw here. The situation would be reversed if ff_pp7_dctB_mmx() had to issue emms.

dctB_c:    3.7 ( 1.00x)
dctB_mmx:  3.3 ( 1.13x)
dctB_sse2: 3.6 ( 1.03x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -27,7 +27,6 @@
|
||||
* project, and ported by Arwa Arif for FFmpeg.
|
||||
*/
|
||||
|
||||
#include "libavutil/emms.h"
|
||||
#include "libavutil/imgutils.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "libavutil/mem_internal.h"
|
||||
@@ -351,7 +350,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
|
||||
cw, ch, qp_table, qp_stride, 0);
|
||||
filter(pp7, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
|
||||
cw, ch, qp_table, qp_stride, 0);
|
||||
emms_c();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+26
-29
@@ -24,34 +24,31 @@
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_MMX mmx
|
||||
INIT_XMM sse2
|
||||
;void ff_pp7_dctB_sse2(int16_t *dst, const int16_t *src)
|
||||
cglobal pp7_dctB, 2, 2, 6, dst, src
|
||||
movq m0, [srcq+8*0]
|
||||
movq m5, [srcq+8*6]
|
||||
movq m3, [srcq+8*3]
|
||||
movq m1, [srcq+8*1]
|
||||
movq m4, [srcq+8*5]
|
||||
movq m2, [srcq+8*2]
|
||||
paddw m0, m5
|
||||
movq m5, [srcq+8*4]
|
||||
paddw m3, m3
|
||||
paddw m1, m4
|
||||
paddw m2, m5
|
||||
|
||||
;void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src)
|
||||
cglobal pp7_dctB, 2, 2, 0, dst, src
|
||||
movq m0, [srcq]
|
||||
movq m1, [srcq+mmsize*1]
|
||||
paddw m0, [srcq+mmsize*6]
|
||||
paddw m1, [srcq+mmsize*5]
|
||||
movq m2, [srcq+mmsize*2]
|
||||
movq m3, [srcq+mmsize*3]
|
||||
paddw m2, [srcq+mmsize*4]
|
||||
paddw m3, m3
|
||||
movq m4, m3
|
||||
psubw m3, m0
|
||||
paddw m4, m0
|
||||
movq m0, m2
|
||||
psubw m2, m1
|
||||
paddw m0, m1
|
||||
movq m1, m4
|
||||
psubw m4, m0
|
||||
paddw m1, m0
|
||||
movq m0, m3
|
||||
psubw m3, m2
|
||||
psubw m3, m2
|
||||
paddw m2, m0
|
||||
paddw m2, m0
|
||||
movq [dstq], m1
|
||||
movq [dstq+mmsize*2], m4
|
||||
movq [dstq+mmsize*1], m2
|
||||
movq [dstq+mmsize*3], m3
|
||||
SUMSUB_BA w, 0, 3, 4
|
||||
SUMSUB_BA w, 1, 2, 5
|
||||
|
||||
SUMSUB_BA w, 1, 0, 4
|
||||
movq [dstq], m1
|
||||
paddw m4, m2, m3
|
||||
paddw m2, m2
|
||||
movq [dstq+8*2], m0
|
||||
paddw m4, m3
|
||||
psubw m3, m2
|
||||
movq [dstq+8*1], m4
|
||||
movq [dstq+8*3], m3
|
||||
RET
|
||||
|
||||
@@ -23,12 +23,12 @@
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/vf_pp7dsp.h"
|
||||
|
||||
void ff_pp7_dctB_mmx(int16_t *restrict dst, const int16_t *restrict src);
|
||||
void ff_pp7_dctB_sse2(int16_t *restrict dst, const int16_t *restrict src);
|
||||
|
||||
av_cold void ff_pp7dsp_init_x86(PP7DSPContext *p)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags))
|
||||
p->dctB = ff_pp7_dctB_mmx;
|
||||
if (EXTERNAL_SSE2(cpu_flags))
|
||||
p->dctB = ff_pp7_dctB_sse2;
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
static void check_dctB(const PP7DSPContext *const pp7dsp)
|
||||
{
|
||||
declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int16_t *src);
|
||||
declare_func(void, int16_t *dst, const int16_t *src);
|
||||
|
||||
if (!check_func(pp7dsp->dctB, "dctB"))
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user