avcodec/x86/sbcdsp: Port MMX sbc_calc_scalefactors to SSE4
Besides giving a nice speedup over the MMX version, it also avoids processing more input and touching more output than necessary in the 2ch-4subbands case.

calc_scalefactors_1ch_4subbands_c:     106.9 ( 1.00x)
calc_scalefactors_1ch_4subbands_mmx:    46.7 ( 2.29x)
calc_scalefactors_1ch_4subbands_sse4:   11.8 ( 9.05x)
calc_scalefactors_1ch_8subbands_c:     220.5 ( 1.00x)
calc_scalefactors_1ch_8subbands_mmx:    92.3 ( 2.39x)
calc_scalefactors_1ch_8subbands_sse4:   23.8 ( 9.28x)
calc_scalefactors_2ch_4subbands_c:     222.5 ( 1.00x)
calc_scalefactors_2ch_4subbands_mmx:   139.3 ( 1.60x)
calc_scalefactors_2ch_4subbands_sse4:   23.6 ( 9.41x)
calc_scalefactors_2ch_8subbands_c:     440.3 ( 1.00x)
calc_scalefactors_2ch_8subbands_mmx:   196.8 ( 2.24x)
calc_scalefactors_2ch_8subbands_sse4:   46.5 ( 9.48x)

The MMX version has been removed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
+33
-42
@@ -26,10 +26,6 @@
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro NIDN 3
|
||||
@@ -127,50 +123,45 @@ cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
|
||||
; uint32_t scale_factor[2][8],
|
||||
; int blocks, int channels, int subbands)
|
||||
;*******************************************************************
|
||||
INIT_MMX mmx
|
||||
cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
|
||||
; subbands = 4 * subbands * channels
|
||||
movq m3, [scale_mask]
|
||||
shl subbandsd, 2
|
||||
cmp channelsd, 2
|
||||
jl .loop_1
|
||||
add subbandsd, 32
|
||||
INIT_XMM sse4
|
||||
cglobal sbc_calc_scalefactors, 5, 6, 5, sb_sample_f, scale_factor, blocks, channels, subbands, step
|
||||
shl blocksd, 6
|
||||
pcmpeqd m3, m3
|
||||
shl subbandsd, 2
|
||||
mov stepd, 48
|
||||
add sb_sample_fq, blocksq
|
||||
psrld m4, m3, 25 ; pd_127
|
||||
neg blocksq
|
||||
shl channelsd, 5
|
||||
sub stepd, subbandsd ; step = subbands == 4 ? 32 : 16
|
||||
pxor m2, m2
|
||||
|
||||
.loop_1:
|
||||
sub subbandsq, 8
|
||||
lea ptrq, [sb_sample_fq + subbandsq]
|
||||
lea subbandsq, [blocksq+64]
|
||||
|
||||
; blk = (blocks - 1) * 64;
|
||||
lea blkq, [blocksq - 1]
|
||||
shl blkd, 6
|
||||
|
||||
movq m0, m3
|
||||
pabsd m0, [sb_sample_fq+blocksq]
|
||||
.loop_2:
|
||||
movq m1, [ptrq+blkq]
|
||||
pxor m2, m2
|
||||
pcmpgtd m1, m2
|
||||
paddd m1, [ptrq+blkq]
|
||||
pcmpgtd m2, m1
|
||||
pxor m1, m2
|
||||
pabsd m1, [sb_sample_fq+subbandsq]
|
||||
pmaxud m0, m1
|
||||
add subbandsq, 64
|
||||
js .loop_2
|
||||
|
||||
por m0, m1
|
||||
paddd m0, m3 ; max - 1, representable as signed value
|
||||
pmaxsd m0, m2
|
||||
|
||||
sub blkq, 64
|
||||
jns .loop_2
|
||||
; We have to calculate log2(x|(1<<15))-15. This equals log2(x>>15) for x >= 2^15
|
||||
; and x>>15 is exactly representable as a float, so one can get the log2
|
||||
; by converting to float and subtracting 127 from the exponent.
|
||||
; For x < 2^15 the result is correct when using saturated subtraction.
|
||||
psrld m0, 15
|
||||
cvtdq2ps m0, m0
|
||||
add sb_sample_fq, stepq
|
||||
psrld m0, 23 ; exponent
|
||||
psubusw m0, m4 ; same as saturated dword subtraction
|
||||
mova [scale_factorq], m0
|
||||
|
||||
movd blkd, m0
|
||||
psrlq m0, 32
|
||||
bsr blkd, blkd
|
||||
sub blkd, 15 ; SCALE_OUT_BITS
|
||||
mov [scale_factorq + subbandsq], blkd
|
||||
add scale_factorq, stepq
|
||||
sub channelsd, stepd
|
||||
jg .loop_1
|
||||
|
||||
movd blkd, m0
|
||||
bsr blkd, blkd
|
||||
sub blkd, 15 ; SCALE_OUT_BITS
|
||||
mov [scale_factorq + subbandsq + 4], blkd
|
||||
|
||||
cmp subbandsq, 0
|
||||
jg .loop_1
|
||||
|
||||
emms
|
||||
RET
|
||||
|
||||
@@ -36,19 +36,19 @@
|
||||
|
||||
void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
|
||||
void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
|
||||
uint32_t scale_factor[2][8],
|
||||
int blocks, int channels, int subbands);
|
||||
void ff_sbc_calc_scalefactors_sse4(const int32_t sb_sample_f[16][2][8],
|
||||
uint32_t scale_factor[2][8],
|
||||
int blocks, int channels, int subbands);
|
||||
|
||||
av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
|
||||
s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
|
||||
}
|
||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_sse4;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user