avfilter/x86/vf_pp7: Port ff_pp7_dctB_mmx to SSE2

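Unfortunately, this is a bit slower than the MMX version because
paddw cannot take the 8-byte rows as memory operands (with XMM
registers a memory operand is 16 bytes wide), so each row has to
be loaded with a separate movq first.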
The situation would reverse if ff_pp7_dctB_mmx() had
to issue emms.

dctB_c:                                                  3.7 ( 1.00x)
dctB_mmx:                                                3.3 ( 1.13x)
dctB_sse2:                                               3.6 ( 1.03x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-05-10 22:01:15 +02:00
parent fc9e63474f
commit 7971953d29
4 changed files with 30 additions and 35 deletions
-2
View File
@@ -27,7 +27,6 @@
* project, and ported by Arwa Arif for FFmpeg.
*/
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -351,7 +350,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
cw, ch, qp_table, qp_stride, 0);
filter(pp7, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
cw, ch, qp_table, qp_stride, 0);
emms_c();
}
}
+26 -29
View File
@@ -24,34 +24,31 @@
SECTION .text
INIT_MMX mmx
INIT_XMM sse2
;void ff_pp7_dctB_sse2(int16_t *dst, const int16_t *src)
cglobal pp7_dctB, 2, 2, 6, dst, src
movq m0, [srcq+8*0]
movq m5, [srcq+8*6]
movq m3, [srcq+8*3]
movq m1, [srcq+8*1]
movq m4, [srcq+8*5]
movq m2, [srcq+8*2]
paddw m0, m5
movq m5, [srcq+8*4]
paddw m3, m3
paddw m1, m4
paddw m2, m5
;void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src)
cglobal pp7_dctB, 2, 2, 0, dst, src
movq m0, [srcq]
movq m1, [srcq+mmsize*1]
paddw m0, [srcq+mmsize*6]
paddw m1, [srcq+mmsize*5]
movq m2, [srcq+mmsize*2]
movq m3, [srcq+mmsize*3]
paddw m2, [srcq+mmsize*4]
paddw m3, m3
movq m4, m3
psubw m3, m0
paddw m4, m0
movq m0, m2
psubw m2, m1
paddw m0, m1
movq m1, m4
psubw m4, m0
paddw m1, m0
movq m0, m3
psubw m3, m2
psubw m3, m2
paddw m2, m0
paddw m2, m0
movq [dstq], m1
movq [dstq+mmsize*2], m4
movq [dstq+mmsize*1], m2
movq [dstq+mmsize*3], m3
SUMSUB_BA w, 0, 3, 4
SUMSUB_BA w, 1, 2, 5
SUMSUB_BA w, 1, 0, 4
movq [dstq], m1
paddw m4, m2, m3
paddw m2, m2
movq [dstq+8*2], m0
paddw m4, m3
psubw m3, m2
movq [dstq+8*1], m4
movq [dstq+8*3], m3
RET
+3 -3
View File
@@ -23,12 +23,12 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_pp7dsp.h"
void ff_pp7_dctB_mmx(int16_t *restrict dst, const int16_t *restrict src);
void ff_pp7_dctB_sse2(int16_t *restrict dst, const int16_t *restrict src);
av_cold void ff_pp7dsp_init_x86(PP7DSPContext *p)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
p->dctB = ff_pp7_dctB_mmx;
if (EXTERNAL_SSE2(cpu_flags))
p->dctB = ff_pp7_dctB_sse2;
}
+1 -1
View File
@@ -35,7 +35,7 @@
static void check_dctB(const PP7DSPContext *const pp7dsp)
{
declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int16_t *src);
declare_func(void, int16_t *dst, const int16_t *src);
if (!check_func(pp7dsp->dctB, "dctB"))
return;