avcodec/x86/sbcdsp: Port MMX sbc_calc_scalefactors to SSE4

Besides giving a nice speedup over the MMX version,
it also avoids processing more input and touching
more output than necessary in the 2ch-4subbands case.

calc_scalefactors_1ch_4subbands_c:                     106.9 ( 1.00x)
calc_scalefactors_1ch_4subbands_mmx:                    46.7 ( 2.29x)
calc_scalefactors_1ch_4subbands_sse4:                   11.8 ( 9.05x)
calc_scalefactors_1ch_8subbands_c:                     220.5 ( 1.00x)
calc_scalefactors_1ch_8subbands_mmx:                    92.3 ( 2.39x)
calc_scalefactors_1ch_8subbands_sse4:                   23.8 ( 9.28x)
calc_scalefactors_2ch_4subbands_c:                     222.5 ( 1.00x)
calc_scalefactors_2ch_4subbands_mmx:                   139.3 ( 1.60x)
calc_scalefactors_2ch_4subbands_sse4:                   23.6 ( 9.41x)
calc_scalefactors_2ch_8subbands_c:                     440.3 ( 1.00x)
calc_scalefactors_2ch_8subbands_mmx:                   196.8 ( 2.24x)
calc_scalefactors_2ch_8subbands_sse4:                   46.5 ( 9.48x)

The MMX version has been removed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-03-24 23:06:05 +01:00
parent e4e5beb394
commit bb65b54f2f
2 changed files with 39 additions and 48 deletions
+33 -42
View File
@@ -26,10 +26,6 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
SECTION .text
%macro NIDN 3
@@ -127,50 +123,45 @@ cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
; uint32_t scale_factor[2][8],
; int blocks, int channels, int subbands)
;*******************************************************************
; NOTE(review): this span is a diff hunk whose +/- markers are not visible.
; It interleaves two implementations of sbc_calc_scalefactors:
;   [MMX]  the INIT_MMX version (movq/por/bsr based, ends in emms), which the
;          commit message says has been removed;
;   [SSE4] the INIT_XMM version (pabsd/pmaxud/cvtdq2ps based), which replaces it.
; The [MMX]/[SSE4] tags below are inferred from the instructions and register
; names each line uses -- confirm against the real diff before relying on them.
; Both versions compute, per subband, a scale factor from the maximum
; magnitude of the samples over all blocks.
; [MMX] declaration: 5 arguments, 7 GPRs (ptr and blk are scratch), 4 vector regs.
INIT_MMX mmx
cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
; subbands = 4 * subbands * channels
; NOTE(review): the code below computes 4 * subbands and adds a fixed 32 when
; channels >= 2, which differs from the formula above when subbands == 4.
; [MMX] m3 = scale_mask constant, used as the initial accumulator value.
movq m3, [scale_mask]
shl subbandsd, 2
cmp channelsd, 2
jl .loop_1
add subbandsd, 32
; [SSE4] declaration: 5 arguments, 6 GPRs (step is scratch), 5 vector regs.
INIT_XMM sse4
cglobal sbc_calc_scalefactors, 5, 6, 5, sb_sample_f, scale_factor, blocks, channels, subbands, step
; [SSE4] setup:
;   blocks   = -(blocks * 64), used as a negative byte offset from the end
;              of the sample data (sb_sample_f is advanced by blocks * 64);
;              64 matches one int32_t [2][8] block -- TODO confirm layout
;   m3       = all ones (-1 in every dword)
;   m4       = 127 in every dword
;   channels = channels * 32, counted down by step in the outer loop
;   m2       = 0
shl blocksd, 6
pcmpeqd m3, m3
shl subbandsd, 2
mov stepd, 48
add sb_sample_fq, blocksq
psrld m4, m3, 25 ; pd_127
neg blocksq
shl channelsd, 5
sub stepd, subbandsd ; step = subbands == 4 ? 32 : 16
pxor m2, m2
; Outer loop (label shared by both versions).
.loop_1:
; [MMX] step back by two dwords and point ptr at that pair of subbands.
sub subbandsq, 8
lea ptrq, [sb_sample_fq + subbandsq]
; [SSE4] the subbands register is reused as the running block offset,
; starting at the second block.
lea subbandsq, [blocksq+64]
; blk = (blocks - 1) * 64;
lea blkq, [blocksq - 1]
shl blkd, 6
; [MMX] start the accumulator from scale_mask.
movq m0, m3
; [SSE4] start the accumulator from |samples| of the first block.
pabsd m0, [sb_sample_fq+blocksq]
; Inner loop over blocks (label shared by both versions).
.loop_2:
; [MMX] the next six instructions leave in each dword of m1 the value
; |x| - 1 for x != 0 and 0 for x == 0.
movq m1, [ptrq+blkq]
pxor m2, m2
pcmpgtd m1, m2
paddd m1, [ptrq+blkq]
pcmpgtd m2, m1
pxor m1, m2
; [SSE4] accumulate the unsigned maximum of |x| and advance by 64 bytes;
; loop while the offset is still negative.
pabsd m1, [sb_sample_fq+subbandsq]
pmaxud m0, m1
add subbandsq, 64
js .loop_2
; [MMX] OR the per-block value into the accumulator.
por m0, m1
; [SSE4] adding m3 (all ones) subtracts 1; pmaxsd with m2 (zero) clamps at 0.
paddd m0, m3 ; max - 1, representable as signed value
pmaxsd m0, m2
; [MMX] walk the blocks from last to first.
sub blkq, 64
jns .loop_2
; We have to calculate log2(x|(1<<15))-15. This equals log2(x>>15) for x >= 2^15
; and x>>15 is exactly representable as a float, so one can get the log2
; by converting to float and subtracting 127 from the exponent.
; For x < 2^15 the result is correct when using saturated subtraction.
; [SSE4] the sample pointer is advanced by step in the middle of the
; conversion sequence; the four dword results are then stored at once.
psrld m0, 15
cvtdq2ps m0, m0
add sb_sample_fq, stepq
psrld m0, 23 ; exponent
psubusw m0, m4 ; same as saturated dword subtraction
mova [scale_factorq], m0
; [MMX] low dword: index of the highest set bit minus 15, stored as the
; first of the two scale factors.
movd blkd, m0
psrlq m0, 32
bsr blkd, blkd
sub blkd, 15 ; SCALE_OUT_BITS
mov [scale_factorq + subbandsq], blkd
; [SSE4] advance the output pointer and loop while channels * 32 bytes
; have not been consumed.
add scale_factorq, stepq
sub channelsd, stepd
jg .loop_1
; [MMX] high dword: same computation for the second scale factor.
movd blkd, m0
bsr blkd, blkd
sub blkd, 15 ; SCALE_OUT_BITS
mov [scale_factorq + subbandsq + 4], blkd
; [MMX] loop until the subband byte offset reaches zero.
cmp subbandsq, 0
jg .loop_1
; [MMX] emms follows the MMX register use above; the SSE4 side presumably
; has no counterpart for it -- confirm against the real diff.
emms
RET
+6 -6
View File
@@ -36,19 +36,19 @@
/* Prototypes of the x86 routines installed by ff_sbcdsp_init_x86() below;
 * presumably implemented in the accompanying assembly file -- confirm.
 * NOTE(review): this span is a diff hunk shown without +/- markers. The
 * commit message says the MMX version has been removed, so the _mmx
 * prototype is presumably the deleted side and the _sse4 prototype the
 * added side. */
void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
void ff_sbc_calc_scalefactors_sse4(const int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
/* Install x86-specific function pointers into the SBC DSP context @p s,
 * selected by the CPU flags returned from av_get_cpu_flags().
 * NOTE(review): this span is a diff hunk shown without +/- markers. The
 * commit message says the MMX version has been removed, so the EXTERNAL_MMX
 * branch is presumably the deleted side and the EXTERNAL_SSE4 branch the
 * added side -- confirm against the real diff. */
av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
/* MMX scale-factor routine. */
if (EXTERNAL_MMX(cpu_flags)) {
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
}
/* SSE2 analysis filters for the 4- and 8-subband cases. */
if (EXTERNAL_SSE2(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
}
/* SSE4 scale-factor routine. */
if (EXTERNAL_SSE4(cpu_flags)) {
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_sse4;
}
}