Compare commits
19 Commits
n4.3.9
...
release/4.3
| Author | SHA1 | Date | |
|---|---|---|---|
| d91b3a16b5 | |||
| 257ab5a6ac | |||
| 58882aa298 | |||
| be1665dbec | |||
| a7f6ee19a8 | |||
| 6dc71760d2 | |||
| 01d5c40143 | |||
| 0ef8ce13b8 | |||
| 2d32a6611a | |||
| 415ed8bb09 | |||
| a3acba8949 | |||
| 55774a0f19 | |||
| c1a2e08c73 | |||
| 93d70c41e3 | |||
| a778fe8fe4 | |||
| 37403e0aee | |||
| 02e9b60b66 | |||
| b6bf959e12 | |||
| e1f5d0ff91 |
@@ -0,0 +1,23 @@
|
||||
exclude: ^tests/ref/
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: check-case-conflict
|
||||
- id: check-executables-have-shebangs
|
||||
- id: check-illegal-windows-names
|
||||
- id: check-shebang-scripts-are-executable
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: fix-byte-order-marker
|
||||
- id: mixed-line-ending
|
||||
- id: trailing-whitespace
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: aarch64-asm-indent
|
||||
name: fix aarch64 assembly indentation
|
||||
files: ^.*/aarch64/.*\.S$
|
||||
language: script
|
||||
entry: ./tools/check_arm_indent.sh --apply
|
||||
pass_filenames: false
|
||||
@@ -0,0 +1,29 @@
|
||||
name: Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- release/4.3
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Pre-Commit
|
||||
runs-on: utilities
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install pre-commit CI
|
||||
id: install
|
||||
run: |
|
||||
python3 -m venv ~/pre-commit
|
||||
~/pre-commit/bin/pip install --upgrade pip setuptools
|
||||
~/pre-commit/bin/pip install pre-commit
|
||||
echo "envhash=$({ python3 --version && cat .forgejo/pre-commit/config.yaml; } | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
|
||||
- name: Cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pre-commit
|
||||
key: pre-commit-${{ steps.install.outputs.envhash }}
|
||||
- name: Run pre-commit CI
|
||||
run: ~/pre-commit/bin/pre-commit run -c .forgejo/pre-commit/config.yaml --show-diff-on-failure --color=always --all-files
|
||||
@@ -0,0 +1,80 @@
|
||||
name: Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- release/4.3
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
run_fate:
|
||||
name: Fate (${{ matrix.runner }}, ${{ matrix.shared }}, ${{ matrix.bits }} bit)
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
runner: [linux-aarch64]
|
||||
shared: ['static']
|
||||
bits: ['64']
|
||||
include:
|
||||
- runner: linux-amd64
|
||||
shared: 'static'
|
||||
bits: '32'
|
||||
- runner: linux-amd64
|
||||
shared: 'shared'
|
||||
bits: '64'
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Configure
|
||||
run: |
|
||||
./configure --enable-gpl --enable-nonfree --enable-memory-poisoning --assert-level=2 \
|
||||
$([ "${{ matrix.bits }}" != "32" ] || echo --arch=x86_32 --extra-cflags=-m32 --extra-cxxflags=-m32 --extra-ldflags=-m32) \
|
||||
$([ "${{ matrix.shared }}" != "shared" ] || echo --enable-shared --disable-static) \
|
||||
|| CFGRES=$? && CFGRES=$?
|
||||
cat ffbuild/config.log
|
||||
exit $CFGRES
|
||||
- name: Build
|
||||
run: make -j$(nproc)
|
||||
- name: Restore Cached Fate-Suite
|
||||
id: cache
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: fate-suite
|
||||
key: fate-suite
|
||||
restore-keys: |
|
||||
fate-suite-
|
||||
- name: Sync Fate-Suite
|
||||
id: fate
|
||||
run: |
|
||||
make fate-rsync SAMPLES=$PWD/fate-suite
|
||||
echo "hash=$(find fate-suite -type f -printf "%P %s %T@\n" | sort | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
|
||||
- name: Cache Fate-Suite
|
||||
uses: actions/cache/save@v4
|
||||
if: ${{ format('fate-suite-{0}', steps.fate.outputs.hash) != steps.cache.outputs.cache-matched-key }}
|
||||
with:
|
||||
path: fate-suite
|
||||
key: fate-suite-${{ steps.fate.outputs.hash }}
|
||||
- name: Run Fate
|
||||
run: LD_LIBRARY_PATH="$(printf "%s:" "$PWD"/lib*)$PWD" make fate fate-build SAMPLES=$PWD/fate-suite -j$(nproc)
|
||||
compile_only:
|
||||
name: Fate (Win64, Build-Only)
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
image: ["ghcr.io/btbn/ffmpeg-builds/win64-gpl-4.3:latest"]
|
||||
runs-on: linux-amd64
|
||||
container: ${{ matrix.image }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Configure
|
||||
run: |
|
||||
./configure --pkg-config-flags="--static" $FFBUILD_TARGET_FLAGS $FF_CONFIGURE \
|
||||
--cc="$CC" --cxx="$CXX" --ar="$AR" --ranlib="$RANLIB" --nm="$NM" \
|
||||
--extra-cflags="$FF_CFLAGS" --extra-cxxflags="$FF_CXXFLAGS" \
|
||||
--extra-libs="$FF_LIBS" --extra-ldflags="$FF_LDFLAGS" --extra-ldexeflags="$FF_LDEXEFLAGS"
|
||||
- name: Build
|
||||
run: make -j$(nproc)
|
||||
- name: Run Fate
|
||||
run: make -j$(nproc) fate-build
|
||||
+9
-9
@@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
@@ -111,7 +111,7 @@ modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
@@ -158,7 +158,7 @@ Library.
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
@@ -216,7 +216,7 @@ instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
@@ -267,7 +267,7 @@ Library will still fall under Section 6.)
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
@@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
@@ -370,7 +370,7 @@ subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
@@ -422,7 +422,7 @@ conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
@@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
|
||||
@@ -63,4 +63,3 @@ make -j<num>
|
||||
make -k
|
||||
Continue build in case of errors, this is useful for the regression tests
|
||||
sometimes but note that it will still not run all reg tests.
|
||||
|
||||
|
||||
@@ -157,4 +157,3 @@ PFD[32] would for example be signed 32 bit little-endian IEEE float
|
||||
@item XVID @tab non-compliant MPEG-4 generated by old Xvid
|
||||
@item XVIX @tab non-compliant MPEG-4 generated by old Xvid with interlacing bug
|
||||
@end multitable
|
||||
|
||||
|
||||
Regular → Executable
Regular → Executable
@@ -44,4 +44,3 @@ a+b*c;
|
||||
here the reader knows that a,b,c are meant to be signed integers but for C
|
||||
standard compliance / to avoid undefined behavior they are stored in unsigned
|
||||
ints.
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#!/bin/sh
|
||||
|
||||
toupper(){
|
||||
echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
}
|
||||
|
||||
+1
-1
@@ -1174,7 +1174,7 @@ SKIPHEADERS-$(CONFIG_QSV) += qsv.h qsv_internal.h
|
||||
SKIPHEADERS-$(CONFIG_QSVDEC) += qsvdec.h
|
||||
SKIPHEADERS-$(CONFIG_QSVENC) += qsvenc.h
|
||||
SKIPHEADERS-$(CONFIG_XVMC) += xvmc.h
|
||||
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_decode.h vaapi_encode.h
|
||||
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_decode.h vaapi_hevc.h vaapi_encode.h
|
||||
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h vdpau_internal.h
|
||||
SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += videotoolbox.h vt_internal.h
|
||||
SKIPHEADERS-$(CONFIG_V4L2_M2M) += v4l2_buffers.h v4l2_context.h v4l2_m2m.h
|
||||
|
||||
+27
-13
@@ -173,6 +173,7 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
|
||||
sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
|
||||
const int sfb_len = sfb_end - sfb_start;
|
||||
const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
|
||||
const int n_filt = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
|
||||
|
||||
if (coef_len <= 0 || sfb_len <= 0) {
|
||||
sce->tns.present = 0;
|
||||
@@ -180,16 +181,30 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
|
||||
}
|
||||
|
||||
for (w = 0; w < sce->ics.num_windows; w++) {
|
||||
float en[2] = {0.0f, 0.0f};
|
||||
int oc_start = 0, os_start = 0;
|
||||
float en[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
int oc_start = 0;
|
||||
int coef_start = sce->ics.swb_offset[sfb_start];
|
||||
|
||||
for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
|
||||
FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
|
||||
if (g > sfb_start + (sfb_len/2))
|
||||
en[1] += band->energy;
|
||||
else
|
||||
en[0] += band->energy;
|
||||
if (n_filt == 2) {
|
||||
for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
|
||||
FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
|
||||
if (g > sfb_start + (sfb_len/2))
|
||||
en[1] += band->energy; /* End */
|
||||
else
|
||||
en[0] += band->energy; /* Start */
|
||||
}
|
||||
en[2] = en[0];
|
||||
} else {
|
||||
for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
|
||||
FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
|
||||
if (g > sfb_start + (sfb_len/2) + (sfb_len/4))
|
||||
en[2] += band->energy; /* End */
|
||||
else if (g > sfb_start + (sfb_len/2) - (sfb_len/4))
|
||||
en[1] += band->energy; /* Middle */
|
||||
else
|
||||
en[0] += band->energy; /* Start */
|
||||
}
|
||||
en[3] = en[0];
|
||||
}
|
||||
|
||||
/* LPC */
|
||||
@@ -199,15 +214,14 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
|
||||
if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
|
||||
continue;
|
||||
|
||||
tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
|
||||
tns->n_filt[w] = n_filt;
|
||||
for (g = 0; g < tns->n_filt[w]; g++) {
|
||||
tns->direction[w][g] = slant != 2 ? slant : en[g] < en[!g];
|
||||
tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start;
|
||||
tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
|
||||
tns->direction[w][g] = slant != 2 ? slant : en[g] < en[g + 1];
|
||||
tns->order[w][g] = order/tns->n_filt[w];
|
||||
tns->length[w][g] = sfb_len/tns->n_filt[w];
|
||||
quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
|
||||
tns->order[w][g], c_bits);
|
||||
oc_start += tns->order[w][g];
|
||||
os_start += tns->length[w][g];
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
+109
-109
@@ -19,130 +19,130 @@
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
function ff_ps_add_squares_neon, export=1
|
||||
1: ld1 {v0.4S,v1.4S}, [x1], #32
|
||||
fmul v0.4S, v0.4S, v0.4S
|
||||
fmul v1.4S, v1.4S, v1.4S
|
||||
faddp v2.4S, v0.4S, v1.4S
|
||||
ld1 {v3.4S}, [x0]
|
||||
fadd v3.4S, v3.4S, v2.4S
|
||||
st1 {v3.4S}, [x0], #16
|
||||
subs w2, w2, #4
|
||||
b.gt 1b
|
||||
1: ld1 {v0.4s,v1.4s}, [x1], #32
|
||||
fmul v0.4s, v0.4s, v0.4s
|
||||
fmul v1.4s, v1.4s, v1.4s
|
||||
faddp v2.4s, v0.4s, v1.4s
|
||||
ld1 {v3.4s}, [x0]
|
||||
fadd v3.4s, v3.4s, v2.4s
|
||||
st1 {v3.4s}, [x0], #16
|
||||
subs w2, w2, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_ps_mul_pair_single_neon, export=1
|
||||
1: ld1 {v0.4S,v1.4S}, [x1], #32
|
||||
ld1 {v2.4S}, [x2], #16
|
||||
zip1 v3.4S, v2.4S, v2.4S
|
||||
zip2 v4.4S, v2.4S, v2.4S
|
||||
fmul v0.4S, v0.4S, v3.4S
|
||||
fmul v1.4S, v1.4S, v4.4S
|
||||
st1 {v0.4S,v1.4S}, [x0], #32
|
||||
subs w3, w3, #4
|
||||
b.gt 1b
|
||||
1: ld1 {v0.4s,v1.4s}, [x1], #32
|
||||
ld1 {v2.4s}, [x2], #16
|
||||
zip1 v3.4s, v2.4s, v2.4s
|
||||
zip2 v4.4s, v2.4s, v2.4s
|
||||
fmul v0.4s, v0.4s, v3.4s
|
||||
fmul v1.4s, v1.4s, v4.4s
|
||||
st1 {v0.4s,v1.4s}, [x0], #32
|
||||
subs w3, w3, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_ps_stereo_interpolate_neon, export=1
|
||||
ld1 {v0.4S}, [x2]
|
||||
ld1 {v1.4S}, [x3]
|
||||
zip1 v4.4S, v0.4S, v0.4S
|
||||
zip2 v5.4S, v0.4S, v0.4S
|
||||
zip1 v6.4S, v1.4S, v1.4S
|
||||
zip2 v7.4S, v1.4S, v1.4S
|
||||
1: ld1 {v2.2S}, [x0]
|
||||
ld1 {v3.2S}, [x1]
|
||||
fadd v4.4S, v4.4S, v6.4S
|
||||
fadd v5.4S, v5.4S, v7.4S
|
||||
mov v2.D[1], v2.D[0]
|
||||
mov v3.D[1], v3.D[0]
|
||||
fmul v2.4S, v2.4S, v4.4S
|
||||
fmla v2.4S, v3.4S, v5.4S
|
||||
st1 {v2.D}[0], [x0], #8
|
||||
st1 {v2.D}[1], [x1], #8
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
ld1 {v0.4s}, [x2]
|
||||
ld1 {v1.4s}, [x3]
|
||||
zip1 v4.4s, v0.4s, v0.4s
|
||||
zip2 v5.4s, v0.4s, v0.4s
|
||||
zip1 v6.4s, v1.4s, v1.4s
|
||||
zip2 v7.4s, v1.4s, v1.4s
|
||||
1: ld1 {v2.2s}, [x0]
|
||||
ld1 {v3.2s}, [x1]
|
||||
fadd v4.4s, v4.4s, v6.4s
|
||||
fadd v5.4s, v5.4s, v7.4s
|
||||
mov v2.d[1], v2.d[0]
|
||||
mov v3.d[1], v3.d[0]
|
||||
fmul v2.4s, v2.4s, v4.4s
|
||||
fmla v2.4s, v3.4s, v5.4s
|
||||
st1 {v2.d}[0], [x0], #8
|
||||
st1 {v2.d}[1], [x1], #8
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
|
||||
ld1 {v0.4S,v1.4S}, [x2]
|
||||
ld1 {v6.4S,v7.4S}, [x3]
|
||||
fneg v2.4S, v1.4S
|
||||
fneg v3.4S, v7.4S
|
||||
zip1 v16.4S, v0.4S, v0.4S
|
||||
zip2 v17.4S, v0.4S, v0.4S
|
||||
zip1 v18.4S, v2.4S, v1.4S
|
||||
zip2 v19.4S, v2.4S, v1.4S
|
||||
zip1 v20.4S, v6.4S, v6.4S
|
||||
zip2 v21.4S, v6.4S, v6.4S
|
||||
zip1 v22.4S, v3.4S, v7.4S
|
||||
zip2 v23.4S, v3.4S, v7.4S
|
||||
1: ld1 {v2.2S}, [x0]
|
||||
ld1 {v3.2S}, [x1]
|
||||
fadd v16.4S, v16.4S, v20.4S
|
||||
fadd v17.4S, v17.4S, v21.4S
|
||||
mov v2.D[1], v2.D[0]
|
||||
mov v3.D[1], v3.D[0]
|
||||
fmul v4.4S, v2.4S, v16.4S
|
||||
fmla v4.4S, v3.4S, v17.4S
|
||||
fadd v18.4S, v18.4S, v22.4S
|
||||
fadd v19.4S, v19.4S, v23.4S
|
||||
ext v2.16B, v2.16B, v2.16B, #4
|
||||
ext v3.16B, v3.16B, v3.16B, #4
|
||||
fmla v4.4S, v2.4S, v18.4S
|
||||
fmla v4.4S, v3.4S, v19.4S
|
||||
st1 {v4.D}[0], [x0], #8
|
||||
st1 {v4.D}[1], [x1], #8
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
ld1 {v0.4s,v1.4s}, [x2]
|
||||
ld1 {v6.4s,v7.4s}, [x3]
|
||||
fneg v2.4s, v1.4s
|
||||
fneg v3.4s, v7.4s
|
||||
zip1 v16.4s, v0.4s, v0.4s
|
||||
zip2 v17.4s, v0.4s, v0.4s
|
||||
zip1 v18.4s, v2.4s, v1.4s
|
||||
zip2 v19.4s, v2.4s, v1.4s
|
||||
zip1 v20.4s, v6.4s, v6.4s
|
||||
zip2 v21.4s, v6.4s, v6.4s
|
||||
zip1 v22.4s, v3.4s, v7.4s
|
||||
zip2 v23.4s, v3.4s, v7.4s
|
||||
1: ld1 {v2.2s}, [x0]
|
||||
ld1 {v3.2s}, [x1]
|
||||
fadd v16.4s, v16.4s, v20.4s
|
||||
fadd v17.4s, v17.4s, v21.4s
|
||||
mov v2.d[1], v2.d[0]
|
||||
mov v3.d[1], v3.d[0]
|
||||
fmul v4.4s, v2.4s, v16.4s
|
||||
fmla v4.4s, v3.4s, v17.4s
|
||||
fadd v18.4s, v18.4s, v22.4s
|
||||
fadd v19.4s, v19.4s, v23.4s
|
||||
ext v2.16b, v2.16b, v2.16b, #4
|
||||
ext v3.16b, v3.16b, v3.16b, #4
|
||||
fmla v4.4s, v2.4s, v18.4s
|
||||
fmla v4.4s, v3.4s, v19.4s
|
||||
st1 {v4.d}[0], [x0], #8
|
||||
st1 {v4.d}[1], [x1], #8
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_ps_hybrid_analysis_neon, export=1
|
||||
lsl x3, x3, #3
|
||||
ld2 {v0.4S,v1.4S}, [x1], #32
|
||||
ld2 {v2.2S,v3.2S}, [x1], #16
|
||||
ld1 {v24.2S}, [x1], #8
|
||||
ld2 {v4.2S,v5.2S}, [x1], #16
|
||||
ld2 {v6.4S,v7.4S}, [x1]
|
||||
rev64 v6.4S, v6.4S
|
||||
rev64 v7.4S, v7.4S
|
||||
ext v6.16B, v6.16B, v6.16B, #8
|
||||
ext v7.16B, v7.16B, v7.16B, #8
|
||||
rev64 v4.2S, v4.2S
|
||||
rev64 v5.2S, v5.2S
|
||||
mov v2.D[1], v3.D[0]
|
||||
mov v4.D[1], v5.D[0]
|
||||
mov v5.D[1], v2.D[0]
|
||||
mov v3.D[1], v4.D[0]
|
||||
fadd v16.4S, v0.4S, v6.4S
|
||||
fadd v17.4S, v1.4S, v7.4S
|
||||
fsub v18.4S, v1.4S, v7.4S
|
||||
fsub v19.4S, v0.4S, v6.4S
|
||||
fadd v22.4S, v2.4S, v4.4S
|
||||
fsub v23.4S, v5.4S, v3.4S
|
||||
trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5}
|
||||
trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7}
|
||||
1: ld2 {v2.4S,v3.4S}, [x2], #32
|
||||
ld2 {v4.2S,v5.2S}, [x2], #16
|
||||
ld1 {v6.2S}, [x2], #8
|
||||
add x2, x2, #8
|
||||
mov v4.D[1], v5.D[0]
|
||||
mov v6.S[1], v6.S[0]
|
||||
fmul v6.2S, v6.2S, v24.2S
|
||||
fmul v0.4S, v2.4S, v16.4S
|
||||
fmul v1.4S, v2.4S, v17.4S
|
||||
fmls v0.4S, v3.4S, v18.4S
|
||||
fmla v1.4S, v3.4S, v19.4S
|
||||
fmla v0.4S, v4.4S, v20.4S
|
||||
fmla v1.4S, v4.4S, v21.4S
|
||||
faddp v0.4S, v0.4S, v1.4S
|
||||
faddp v0.4S, v0.4S, v0.4S
|
||||
fadd v0.2S, v0.2S, v6.2S
|
||||
st1 {v0.2S}, [x0], x3
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
lsl x3, x3, #3
|
||||
ld2 {v0.4s,v1.4s}, [x1], #32
|
||||
ld2 {v2.2s,v3.2s}, [x1], #16
|
||||
ld1 {v24.2s}, [x1], #8
|
||||
ld2 {v4.2s,v5.2s}, [x1], #16
|
||||
ld2 {v6.4s,v7.4s}, [x1]
|
||||
rev64 v6.4s, v6.4s
|
||||
rev64 v7.4s, v7.4s
|
||||
ext v6.16b, v6.16b, v6.16b, #8
|
||||
ext v7.16b, v7.16b, v7.16b, #8
|
||||
rev64 v4.2s, v4.2s
|
||||
rev64 v5.2s, v5.2s
|
||||
mov v2.d[1], v3.d[0]
|
||||
mov v4.d[1], v5.d[0]
|
||||
mov v5.d[1], v2.d[0]
|
||||
mov v3.d[1], v4.d[0]
|
||||
fadd v16.4s, v0.4s, v6.4s
|
||||
fadd v17.4s, v1.4s, v7.4s
|
||||
fsub v18.4s, v1.4s, v7.4s
|
||||
fsub v19.4s, v0.4s, v6.4s
|
||||
fadd v22.4s, v2.4s, v4.4s
|
||||
fsub v23.4s, v5.4s, v3.4s
|
||||
trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
|
||||
trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
|
||||
1: ld2 {v2.4s,v3.4s}, [x2], #32
|
||||
ld2 {v4.2s,v5.2s}, [x2], #16
|
||||
ld1 {v6.2s}, [x2], #8
|
||||
add x2, x2, #8
|
||||
mov v4.d[1], v5.d[0]
|
||||
mov v6.s[1], v6.s[0]
|
||||
fmul v6.2s, v6.2s, v24.2s
|
||||
fmul v0.4s, v2.4s, v16.4s
|
||||
fmul v1.4s, v2.4s, v17.4s
|
||||
fmls v0.4s, v3.4s, v18.4s
|
||||
fmla v1.4s, v3.4s, v19.4s
|
||||
fmla v0.4s, v4.4s, v20.4s
|
||||
fmla v1.4s, v4.4s, v21.4s
|
||||
faddp v0.4s, v0.4s, v1.4s
|
||||
faddp v0.4s, v0.4s, v0.4s
|
||||
fadd v0.2s, v0.2s, v6.2s
|
||||
st1 {v0.2s}, [x0], x3
|
||||
subs w4, w4, #1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -353,18 +353,18 @@ function fft\n\()_neon, align=6
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
|
||||
function ff_fft_calc_neon, export=1
|
||||
prfm pldl1keep, [x1]
|
||||
|
||||
+205
-205
@@ -36,11 +36,11 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
|
||||
lsl w9, w9, #3
|
||||
lsl w10, w10, #1
|
||||
add w9, w9, w10
|
||||
add x6, x6, w9, UXTW
|
||||
ld1r {v22.8H}, [x6]
|
||||
add x6, x6, w9, uxtw
|
||||
ld1r {v22.8h}, [x6]
|
||||
.endif
|
||||
.ifc \codec,vc1
|
||||
movi v22.8H, #28
|
||||
movi v22.8h, #28
|
||||
.endif
|
||||
mul w7, w4, w5
|
||||
lsl w14, w5, #3
|
||||
@@ -53,139 +53,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
|
||||
add w4, w4, #64
|
||||
b.eq 2f
|
||||
|
||||
dup v0.8B, w4
|
||||
dup v1.8B, w12
|
||||
ld1 {v4.8B, v5.8B}, [x1], x2
|
||||
dup v2.8B, w6
|
||||
dup v3.8B, w7
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
1: ld1 {v6.8B, v7.8B}, [x1], x2
|
||||
umull v16.8H, v4.8B, v0.8B
|
||||
umlal v16.8H, v5.8B, v1.8B
|
||||
ext v7.8B, v6.8B, v7.8B, #1
|
||||
ld1 {v4.8B, v5.8B}, [x1], x2
|
||||
umlal v16.8H, v6.8B, v2.8B
|
||||
dup v0.8b, w4
|
||||
dup v1.8b, w12
|
||||
ld1 {v4.8b, v5.8b}, [x1], x2
|
||||
dup v2.8b, w6
|
||||
dup v3.8b, w7
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
1: ld1 {v6.8b, v7.8b}, [x1], x2
|
||||
umull v16.8h, v4.8b, v0.8b
|
||||
umlal v16.8h, v5.8b, v1.8b
|
||||
ext v7.8b, v6.8b, v7.8b, #1
|
||||
ld1 {v4.8b, v5.8b}, [x1], x2
|
||||
umlal v16.8h, v6.8b, v2.8b
|
||||
prfm pldl1strm, [x1]
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
umlal v16.8H, v7.8B, v3.8B
|
||||
umull v17.8H, v6.8B, v0.8B
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
umlal v16.8h, v7.8b, v3.8b
|
||||
umull v17.8h, v6.8b, v0.8b
|
||||
subs w3, w3, #2
|
||||
umlal v17.8H, v7.8B, v1.8B
|
||||
umlal v17.8H, v4.8B, v2.8B
|
||||
umlal v17.8H, v5.8B, v3.8B
|
||||
umlal v17.8h, v7.8b, v1.8b
|
||||
umlal v17.8h, v4.8b, v2.8b
|
||||
umlal v17.8h, v5.8b, v3.8b
|
||||
prfm pldl1strm, [x1, x2]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v16.8H, #6
|
||||
rshrn v17.8B, v17.8H, #6
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
rshrn v17.8b, v17.8h, #6
|
||||
.else
|
||||
add v16.8H, v16.8H, v22.8H
|
||||
add v17.8H, v17.8H, v22.8H
|
||||
shrn v16.8B, v16.8H, #6
|
||||
shrn v17.8B, v17.8H, #6
|
||||
add v16.8h, v16.8h, v22.8h
|
||||
add v17.8h, v17.8h, v22.8h
|
||||
shrn v16.8b, v16.8h, #6
|
||||
shrn v17.8b, v17.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.8B}, [x8], x2
|
||||
ld1 {v21.8B}, [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
urhadd v17.8B, v17.8B, v21.8B
|
||||
ld1 {v20.8b}, [x8], x2
|
||||
ld1 {v21.8b}, [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
urhadd v17.8b, v17.8b, v21.8b
|
||||
.endif
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
b.gt 1b
|
||||
ret
|
||||
|
||||
2: adds w12, w12, w6
|
||||
dup v0.8B, w4
|
||||
dup v0.8b, w4
|
||||
b.eq 5f
|
||||
tst w6, w6
|
||||
dup v1.8B, w12
|
||||
dup v1.8b, w12
|
||||
b.eq 4f
|
||||
|
||||
ld1 {v4.8B}, [x1], x2
|
||||
3: ld1 {v6.8B}, [x1], x2
|
||||
umull v16.8H, v4.8B, v0.8B
|
||||
umlal v16.8H, v6.8B, v1.8B
|
||||
ld1 {v4.8B}, [x1], x2
|
||||
umull v17.8H, v6.8B, v0.8B
|
||||
umlal v17.8H, v4.8B, v1.8B
|
||||
ld1 {v4.8b}, [x1], x2
|
||||
3: ld1 {v6.8b}, [x1], x2
|
||||
umull v16.8h, v4.8b, v0.8b
|
||||
umlal v16.8h, v6.8b, v1.8b
|
||||
ld1 {v4.8b}, [x1], x2
|
||||
umull v17.8h, v6.8b, v0.8b
|
||||
umlal v17.8h, v4.8b, v1.8b
|
||||
prfm pldl1strm, [x1]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v16.8H, #6
|
||||
rshrn v17.8B, v17.8H, #6
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
rshrn v17.8b, v17.8h, #6
|
||||
.else
|
||||
add v16.8H, v16.8H, v22.8H
|
||||
add v17.8H, v17.8H, v22.8H
|
||||
shrn v16.8B, v16.8H, #6
|
||||
shrn v17.8B, v17.8H, #6
|
||||
add v16.8h, v16.8h, v22.8h
|
||||
add v17.8h, v17.8h, v22.8h
|
||||
shrn v16.8b, v16.8h, #6
|
||||
shrn v17.8b, v17.8h, #6
|
||||
.endif
|
||||
prfm pldl1strm, [x1, x2]
|
||||
.ifc \type,avg
|
||||
ld1 {v20.8B}, [x8], x2
|
||||
ld1 {v21.8B}, [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
urhadd v17.8B, v17.8B, v21.8B
|
||||
ld1 {v20.8b}, [x8], x2
|
||||
ld1 {v21.8b}, [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
urhadd v17.8b, v17.8b, v21.8b
|
||||
.endif
|
||||
subs w3, w3, #2
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
b.gt 3b
|
||||
ret
|
||||
|
||||
4: ld1 {v4.8B, v5.8B}, [x1], x2
|
||||
ld1 {v6.8B, v7.8B}, [x1], x2
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
ext v7.8B, v6.8B, v7.8B, #1
|
||||
4: ld1 {v4.8b, v5.8b}, [x1], x2
|
||||
ld1 {v6.8b, v7.8b}, [x1], x2
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
ext v7.8b, v6.8b, v7.8b, #1
|
||||
prfm pldl1strm, [x1]
|
||||
subs w3, w3, #2
|
||||
umull v16.8H, v4.8B, v0.8B
|
||||
umlal v16.8H, v5.8B, v1.8B
|
||||
umull v17.8H, v6.8B, v0.8B
|
||||
umlal v17.8H, v7.8B, v1.8B
|
||||
umull v16.8h, v4.8b, v0.8b
|
||||
umlal v16.8h, v5.8b, v1.8b
|
||||
umull v17.8h, v6.8b, v0.8b
|
||||
umlal v17.8h, v7.8b, v1.8b
|
||||
prfm pldl1strm, [x1, x2]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v16.8H, #6
|
||||
rshrn v17.8B, v17.8H, #6
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
rshrn v17.8b, v17.8h, #6
|
||||
.else
|
||||
add v16.8H, v16.8H, v22.8H
|
||||
add v17.8H, v17.8H, v22.8H
|
||||
shrn v16.8B, v16.8H, #6
|
||||
shrn v17.8B, v17.8H, #6
|
||||
add v16.8h, v16.8h, v22.8h
|
||||
add v17.8h, v17.8h, v22.8h
|
||||
shrn v16.8b, v16.8h, #6
|
||||
shrn v17.8b, v17.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.8B}, [x8], x2
|
||||
ld1 {v21.8B}, [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
urhadd v17.8B, v17.8B, v21.8B
|
||||
ld1 {v20.8b}, [x8], x2
|
||||
ld1 {v21.8b}, [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
urhadd v17.8b, v17.8b, v21.8b
|
||||
.endif
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
b.gt 4b
|
||||
ret
|
||||
|
||||
5: ld1 {v4.8B}, [x1], x2
|
||||
ld1 {v5.8B}, [x1], x2
|
||||
5: ld1 {v4.8b}, [x1], x2
|
||||
ld1 {v5.8b}, [x1], x2
|
||||
prfm pldl1strm, [x1]
|
||||
subs w3, w3, #2
|
||||
umull v16.8H, v4.8B, v0.8B
|
||||
umull v17.8H, v5.8B, v0.8B
|
||||
umull v16.8h, v4.8b, v0.8b
|
||||
umull v17.8h, v5.8b, v0.8b
|
||||
prfm pldl1strm, [x1, x2]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v16.8H, #6
|
||||
rshrn v17.8B, v17.8H, #6
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
rshrn v17.8b, v17.8h, #6
|
||||
.else
|
||||
add v16.8H, v16.8H, v22.8H
|
||||
add v17.8H, v17.8H, v22.8H
|
||||
shrn v16.8B, v16.8H, #6
|
||||
shrn v17.8B, v17.8H, #6
|
||||
add v16.8h, v16.8h, v22.8h
|
||||
add v17.8h, v17.8h, v22.8h
|
||||
shrn v16.8b, v16.8h, #6
|
||||
shrn v17.8b, v17.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.8B}, [x8], x2
|
||||
ld1 {v21.8B}, [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
urhadd v17.8B, v17.8B, v21.8B
|
||||
ld1 {v20.8b}, [x8], x2
|
||||
ld1 {v21.8b}, [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
urhadd v17.8b, v17.8b, v21.8b
|
||||
.endif
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
b.gt 5b
|
||||
ret
|
||||
endfunc
|
||||
@@ -206,11 +206,11 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
|
||||
lsl w9, w9, #3
|
||||
lsl w10, w10, #1
|
||||
add w9, w9, w10
|
||||
add x6, x6, w9, UXTW
|
||||
ld1r {v22.8H}, [x6]
|
||||
add x6, x6, w9, uxtw
|
||||
ld1r {v22.8h}, [x6]
|
||||
.endif
|
||||
.ifc \codec,vc1
|
||||
movi v22.8H, #28
|
||||
movi v22.8h, #28
|
||||
.endif
|
||||
mul w7, w4, w5
|
||||
lsl w14, w5, #3
|
||||
@@ -223,133 +223,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
|
||||
add w4, w4, #64
|
||||
b.eq 2f
|
||||
|
||||
dup v24.8B, w4
|
||||
dup v25.8B, w12
|
||||
ld1 {v4.8B}, [x1], x2
|
||||
dup v26.8B, w6
|
||||
dup v27.8B, w7
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
trn1 v0.2S, v24.2S, v25.2S
|
||||
trn1 v2.2S, v26.2S, v27.2S
|
||||
trn1 v4.2S, v4.2S, v5.2S
|
||||
1: ld1 {v6.8B}, [x1], x2
|
||||
ext v7.8B, v6.8B, v7.8B, #1
|
||||
trn1 v6.2S, v6.2S, v7.2S
|
||||
umull v18.8H, v4.8B, v0.8B
|
||||
umlal v18.8H, v6.8B, v2.8B
|
||||
ld1 {v4.8B}, [x1], x2
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
trn1 v4.2S, v4.2S, v5.2S
|
||||
dup v24.8b, w4
|
||||
dup v25.8b, w12
|
||||
ld1 {v4.8b}, [x1], x2
|
||||
dup v26.8b, w6
|
||||
dup v27.8b, w7
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
trn1 v0.2s, v24.2s, v25.2s
|
||||
trn1 v2.2s, v26.2s, v27.2s
|
||||
trn1 v4.2s, v4.2s, v5.2s
|
||||
1: ld1 {v6.8b}, [x1], x2
|
||||
ext v7.8b, v6.8b, v7.8b, #1
|
||||
trn1 v6.2s, v6.2s, v7.2s
|
||||
umull v18.8h, v4.8b, v0.8b
|
||||
umlal v18.8h, v6.8b, v2.8b
|
||||
ld1 {v4.8b}, [x1], x2
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
trn1 v4.2s, v4.2s, v5.2s
|
||||
prfm pldl1strm, [x1]
|
||||
umull v19.8H, v6.8B, v0.8B
|
||||
umlal v19.8H, v4.8B, v2.8B
|
||||
trn1 v30.2D, v18.2D, v19.2D
|
||||
trn2 v31.2D, v18.2D, v19.2D
|
||||
add v18.8H, v30.8H, v31.8H
|
||||
umull v19.8h, v6.8b, v0.8b
|
||||
umlal v19.8h, v4.8b, v2.8b
|
||||
trn1 v30.2d, v18.2d, v19.2d
|
||||
trn2 v31.2d, v18.2d, v19.2d
|
||||
add v18.8h, v30.8h, v31.8h
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v18.8H, #6
|
||||
rshrn v16.8b, v18.8h, #6
|
||||
.else
|
||||
add v18.8H, v18.8H, v22.8H
|
||||
shrn v16.8B, v18.8H, #6
|
||||
add v18.8h, v18.8h, v22.8h
|
||||
shrn v16.8b, v18.8h, #6
|
||||
.endif
|
||||
subs w3, w3, #2
|
||||
prfm pldl1strm, [x1, x2]
|
||||
.ifc \type,avg
|
||||
ld1 {v20.S}[0], [x8], x2
|
||||
ld1 {v20.S}[1], [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
ld1 {v20.s}[0], [x8], x2
|
||||
ld1 {v20.s}[1], [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
.endif
|
||||
st1 {v16.S}[0], [x0], x2
|
||||
st1 {v16.S}[1], [x0], x2
|
||||
st1 {v16.s}[0], [x0], x2
|
||||
st1 {v16.s}[1], [x0], x2
|
||||
b.gt 1b
|
||||
ret
|
||||
|
||||
2: adds w12, w12, w6
|
||||
dup v30.8B, w4
|
||||
dup v30.8b, w4
|
||||
b.eq 5f
|
||||
tst w6, w6
|
||||
dup v31.8B, w12
|
||||
trn1 v0.2S, v30.2S, v31.2S
|
||||
trn2 v1.2S, v30.2S, v31.2S
|
||||
dup v31.8b, w12
|
||||
trn1 v0.2s, v30.2s, v31.2s
|
||||
trn2 v1.2s, v30.2s, v31.2s
|
||||
b.eq 4f
|
||||
|
||||
ext v1.8B, v0.8B, v1.8B, #4
|
||||
ld1 {v4.S}[0], [x1], x2
|
||||
3: ld1 {v4.S}[1], [x1], x2
|
||||
umull v18.8H, v4.8B, v0.8B
|
||||
ld1 {v4.S}[0], [x1], x2
|
||||
umull v19.8H, v4.8B, v1.8B
|
||||
trn1 v30.2D, v18.2D, v19.2D
|
||||
trn2 v31.2D, v18.2D, v19.2D
|
||||
add v18.8H, v30.8H, v31.8H
|
||||
ext v1.8b, v0.8b, v1.8b, #4
|
||||
ld1 {v4.s}[0], [x1], x2
|
||||
3: ld1 {v4.s}[1], [x1], x2
|
||||
umull v18.8h, v4.8b, v0.8b
|
||||
ld1 {v4.s}[0], [x1], x2
|
||||
umull v19.8h, v4.8b, v1.8b
|
||||
trn1 v30.2d, v18.2d, v19.2d
|
||||
trn2 v31.2d, v18.2d, v19.2d
|
||||
add v18.8h, v30.8h, v31.8h
|
||||
prfm pldl1strm, [x1]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v18.8H, #6
|
||||
rshrn v16.8b, v18.8h, #6
|
||||
.else
|
||||
add v18.8H, v18.8H, v22.8H
|
||||
shrn v16.8B, v18.8H, #6
|
||||
add v18.8h, v18.8h, v22.8h
|
||||
shrn v16.8b, v18.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.S}[0], [x8], x2
|
||||
ld1 {v20.S}[1], [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
ld1 {v20.s}[0], [x8], x2
|
||||
ld1 {v20.s}[1], [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
.endif
|
||||
subs w3, w3, #2
|
||||
prfm pldl1strm, [x1, x2]
|
||||
st1 {v16.S}[0], [x0], x2
|
||||
st1 {v16.S}[1], [x0], x2
|
||||
st1 {v16.s}[0], [x0], x2
|
||||
st1 {v16.s}[1], [x0], x2
|
||||
b.gt 3b
|
||||
ret
|
||||
|
||||
4: ld1 {v4.8B}, [x1], x2
|
||||
ld1 {v6.8B}, [x1], x2
|
||||
ext v5.8B, v4.8B, v5.8B, #1
|
||||
ext v7.8B, v6.8B, v7.8B, #1
|
||||
trn1 v4.2S, v4.2S, v5.2S
|
||||
trn1 v6.2S, v6.2S, v7.2S
|
||||
umull v18.8H, v4.8B, v0.8B
|
||||
umull v19.8H, v6.8B, v0.8B
|
||||
4: ld1 {v4.8b}, [x1], x2
|
||||
ld1 {v6.8b}, [x1], x2
|
||||
ext v5.8b, v4.8b, v5.8b, #1
|
||||
ext v7.8b, v6.8b, v7.8b, #1
|
||||
trn1 v4.2s, v4.2s, v5.2s
|
||||
trn1 v6.2s, v6.2s, v7.2s
|
||||
umull v18.8h, v4.8b, v0.8b
|
||||
umull v19.8h, v6.8b, v0.8b
|
||||
subs w3, w3, #2
|
||||
trn1 v30.2D, v18.2D, v19.2D
|
||||
trn2 v31.2D, v18.2D, v19.2D
|
||||
add v18.8H, v30.8H, v31.8H
|
||||
trn1 v30.2d, v18.2d, v19.2d
|
||||
trn2 v31.2d, v18.2d, v19.2d
|
||||
add v18.8h, v30.8h, v31.8h
|
||||
prfm pldl1strm, [x1]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v18.8H, #6
|
||||
rshrn v16.8b, v18.8h, #6
|
||||
.else
|
||||
add v18.8H, v18.8H, v22.8H
|
||||
shrn v16.8B, v18.8H, #6
|
||||
add v18.8h, v18.8h, v22.8h
|
||||
shrn v16.8b, v18.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.S}[0], [x8], x2
|
||||
ld1 {v20.S}[1], [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
ld1 {v20.s}[0], [x8], x2
|
||||
ld1 {v20.s}[1], [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
.endif
|
||||
prfm pldl1strm, [x1]
|
||||
st1 {v16.S}[0], [x0], x2
|
||||
st1 {v16.S}[1], [x0], x2
|
||||
st1 {v16.s}[0], [x0], x2
|
||||
st1 {v16.s}[1], [x0], x2
|
||||
b.gt 4b
|
||||
ret
|
||||
|
||||
5: ld1 {v4.S}[0], [x1], x2
|
||||
ld1 {v4.S}[1], [x1], x2
|
||||
umull v18.8H, v4.8B, v30.8B
|
||||
5: ld1 {v4.s}[0], [x1], x2
|
||||
ld1 {v4.s}[1], [x1], x2
|
||||
umull v18.8h, v4.8b, v30.8b
|
||||
subs w3, w3, #2
|
||||
prfm pldl1strm, [x1]
|
||||
.ifc \codec,h264
|
||||
rshrn v16.8B, v18.8H, #6
|
||||
rshrn v16.8b, v18.8h, #6
|
||||
.else
|
||||
add v18.8H, v18.8H, v22.8H
|
||||
shrn v16.8B, v18.8H, #6
|
||||
add v18.8h, v18.8h, v22.8h
|
||||
shrn v16.8b, v18.8h, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
ld1 {v20.S}[0], [x8], x2
|
||||
ld1 {v20.S}[1], [x8], x2
|
||||
urhadd v16.8B, v16.8B, v20.8B
|
||||
ld1 {v20.s}[0], [x8], x2
|
||||
ld1 {v20.s}[1], [x8], x2
|
||||
urhadd v16.8b, v16.8b, v20.8b
|
||||
.endif
|
||||
prfm pldl1strm, [x1]
|
||||
st1 {v16.S}[0], [x0], x2
|
||||
st1 {v16.S}[1], [x0], x2
|
||||
st1 {v16.s}[0], [x0], x2
|
||||
st1 {v16.s}[1], [x0], x2
|
||||
b.gt 5b
|
||||
ret
|
||||
endfunc
|
||||
@@ -370,51 +370,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
|
||||
sub w4, w7, w13
|
||||
sub w4, w4, w14
|
||||
add w4, w4, #64
|
||||
dup v0.8B, w4
|
||||
dup v2.8B, w12
|
||||
dup v1.8B, w6
|
||||
dup v3.8B, w7
|
||||
trn1 v0.4H, v0.4H, v2.4H
|
||||
trn1 v1.4H, v1.4H, v3.4H
|
||||
dup v0.8b, w4
|
||||
dup v2.8b, w12
|
||||
dup v1.8b, w6
|
||||
dup v3.8b, w7
|
||||
trn1 v0.4h, v0.4h, v2.4h
|
||||
trn1 v1.4h, v1.4h, v3.4h
|
||||
1:
|
||||
ld1 {v4.S}[0], [x1], x2
|
||||
ld1 {v4.S}[1], [x1], x2
|
||||
rev64 v5.2S, v4.2S
|
||||
ld1 {v5.S}[1], [x1]
|
||||
ext v6.8B, v4.8B, v5.8B, #1
|
||||
ext v7.8B, v5.8B, v4.8B, #1
|
||||
trn1 v4.4H, v4.4H, v6.4H
|
||||
trn1 v5.4H, v5.4H, v7.4H
|
||||
umull v16.8H, v4.8B, v0.8B
|
||||
umlal v16.8H, v5.8B, v1.8B
|
||||
ld1 {v4.s}[0], [x1], x2
|
||||
ld1 {v4.s}[1], [x1], x2
|
||||
rev64 v5.2s, v4.2s
|
||||
ld1 {v5.s}[1], [x1]
|
||||
ext v6.8b, v4.8b, v5.8b, #1
|
||||
ext v7.8b, v5.8b, v4.8b, #1
|
||||
trn1 v4.4h, v4.4h, v6.4h
|
||||
trn1 v5.4h, v5.4h, v7.4h
|
||||
umull v16.8h, v4.8b, v0.8b
|
||||
umlal v16.8h, v5.8b, v1.8b
|
||||
.ifc \type,avg
|
||||
ld1 {v18.H}[0], [x0], x2
|
||||
ld1 {v18.H}[2], [x0]
|
||||
ld1 {v18.h}[0], [x0], x2
|
||||
ld1 {v18.h}[2], [x0]
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
rev64 v17.4S, v16.4S
|
||||
add v16.8H, v16.8H, v17.8H
|
||||
rshrn v16.8B, v16.8H, #6
|
||||
rev64 v17.4s, v16.4s
|
||||
add v16.8h, v16.8h, v17.8h
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
.ifc \type,avg
|
||||
urhadd v16.8B, v16.8B, v18.8B
|
||||
urhadd v16.8b, v16.8b, v18.8b
|
||||
.endif
|
||||
st1 {v16.H}[0], [x0], x2
|
||||
st1 {v16.H}[2], [x0], x2
|
||||
st1 {v16.h}[0], [x0], x2
|
||||
st1 {v16.h}[2], [x0], x2
|
||||
subs w3, w3, #2
|
||||
b.gt 1b
|
||||
ret
|
||||
|
||||
2:
|
||||
ld1 {v16.H}[0], [x1], x2
|
||||
ld1 {v16.H}[1], [x1], x2
|
||||
ld1 {v16.h}[0], [x1], x2
|
||||
ld1 {v16.h}[1], [x1], x2
|
||||
.ifc \type,avg
|
||||
ld1 {v18.H}[0], [x0], x2
|
||||
ld1 {v18.H}[1], [x0]
|
||||
ld1 {v18.h}[0], [x0], x2
|
||||
ld1 {v18.h}[1], [x0]
|
||||
sub x0, x0, x2
|
||||
urhadd v16.8B, v16.8B, v18.8B
|
||||
urhadd v16.8b, v16.8b, v18.8b
|
||||
.endif
|
||||
st1 {v16.H}[0], [x0], x2
|
||||
st1 {v16.H}[1], [x0], x2
|
||||
st1 {v16.h}[0], [x0], x2
|
||||
st1 {v16.h}[1], [x0], x2
|
||||
subs w3, w3, #2
|
||||
b.gt 2b
|
||||
ret
|
||||
|
||||
+531
-531
File diff suppressed because it is too large
Load Diff
+272
-272
@@ -27,114 +27,114 @@
|
||||
.macro lowpass_const r
|
||||
movz \r, #20, lsl #16
|
||||
movk \r, #5
|
||||
mov v6.S[0], \r
|
||||
mov v6.s[0], \r
|
||||
.endm
|
||||
|
||||
//trashes v0-v5
|
||||
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
|
||||
ext v2.8B, \r0\().8B, \r1\().8B, #2
|
||||
ext v3.8B, \r0\().8B, \r1\().8B, #3
|
||||
uaddl v2.8H, v2.8B, v3.8B
|
||||
ext v4.8B, \r0\().8B, \r1\().8B, #1
|
||||
ext v5.8B, \r0\().8B, \r1\().8B, #4
|
||||
uaddl v4.8H, v4.8B, v5.8B
|
||||
ext v1.8B, \r0\().8B, \r1\().8B, #5
|
||||
uaddl \d0\().8H, \r0\().8B, v1.8B
|
||||
ext v0.8B, \r2\().8B, \r3\().8B, #2
|
||||
mla \d0\().8H, v2.8H, v6.H[1]
|
||||
ext v1.8B, \r2\().8B, \r3\().8B, #3
|
||||
uaddl v0.8H, v0.8B, v1.8B
|
||||
ext v1.8B, \r2\().8B, \r3\().8B, #1
|
||||
mls \d0\().8H, v4.8H, v6.H[0]
|
||||
ext v3.8B, \r2\().8B, \r3\().8B, #4
|
||||
uaddl v1.8H, v1.8B, v3.8B
|
||||
ext v2.8B, \r2\().8B, \r3\().8B, #5
|
||||
uaddl \d1\().8H, \r2\().8B, v2.8B
|
||||
mla \d1\().8H, v0.8H, v6.H[1]
|
||||
mls \d1\().8H, v1.8H, v6.H[0]
|
||||
ext v2.8b, \r0\().8b, \r1\().8b, #2
|
||||
ext v3.8b, \r0\().8b, \r1\().8b, #3
|
||||
uaddl v2.8h, v2.8b, v3.8b
|
||||
ext v4.8b, \r0\().8b, \r1\().8b, #1
|
||||
ext v5.8b, \r0\().8b, \r1\().8b, #4
|
||||
uaddl v4.8h, v4.8b, v5.8b
|
||||
ext v1.8b, \r0\().8b, \r1\().8b, #5
|
||||
uaddl \d0\().8h, \r0\().8b, v1.8b
|
||||
ext v0.8b, \r2\().8b, \r3\().8b, #2
|
||||
mla \d0\().8h, v2.8h, v6.h[1]
|
||||
ext v1.8b, \r2\().8b, \r3\().8b, #3
|
||||
uaddl v0.8h, v0.8b, v1.8b
|
||||
ext v1.8b, \r2\().8b, \r3\().8b, #1
|
||||
mls \d0\().8h, v4.8h, v6.h[0]
|
||||
ext v3.8b, \r2\().8b, \r3\().8b, #4
|
||||
uaddl v1.8h, v1.8b, v3.8b
|
||||
ext v2.8b, \r2\().8b, \r3\().8b, #5
|
||||
uaddl \d1\().8h, \r2\().8b, v2.8b
|
||||
mla \d1\().8h, v0.8h, v6.h[1]
|
||||
mls \d1\().8h, v1.8h, v6.h[0]
|
||||
.if \narrow
|
||||
sqrshrun \d0\().8B, \d0\().8H, #5
|
||||
sqrshrun \d1\().8B, \d1\().8H, #5
|
||||
sqrshrun \d0\().8b, \d0\().8h, #5
|
||||
sqrshrun \d1\().8b, \d1\().8h, #5
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//trashes v0-v5, v7, v30-v31
|
||||
.macro lowpass_8H r0, r1
|
||||
ext v0.16B, \r0\().16B, \r0\().16B, #2
|
||||
ext v1.16B, \r0\().16B, \r0\().16B, #3
|
||||
uaddl v0.8H, v0.8B, v1.8B
|
||||
ext v2.16B, \r0\().16B, \r0\().16B, #1
|
||||
ext v3.16B, \r0\().16B, \r0\().16B, #4
|
||||
uaddl v2.8H, v2.8B, v3.8B
|
||||
ext v30.16B, \r0\().16B, \r0\().16B, #5
|
||||
uaddl \r0\().8H, \r0\().8B, v30.8B
|
||||
ext v4.16B, \r1\().16B, \r1\().16B, #2
|
||||
mla \r0\().8H, v0.8H, v6.H[1]
|
||||
ext v5.16B, \r1\().16B, \r1\().16B, #3
|
||||
uaddl v4.8H, v4.8B, v5.8B
|
||||
ext v7.16B, \r1\().16B, \r1\().16B, #1
|
||||
mls \r0\().8H, v2.8H, v6.H[0]
|
||||
ext v0.16B, \r1\().16B, \r1\().16B, #4
|
||||
uaddl v7.8H, v7.8B, v0.8B
|
||||
ext v31.16B, \r1\().16B, \r1\().16B, #5
|
||||
uaddl \r1\().8H, \r1\().8B, v31.8B
|
||||
mla \r1\().8H, v4.8H, v6.H[1]
|
||||
mls \r1\().8H, v7.8H, v6.H[0]
|
||||
ext v0.16b, \r0\().16b, \r0\().16b, #2
|
||||
ext v1.16b, \r0\().16b, \r0\().16b, #3
|
||||
uaddl v0.8h, v0.8b, v1.8b
|
||||
ext v2.16b, \r0\().16b, \r0\().16b, #1
|
||||
ext v3.16b, \r0\().16b, \r0\().16b, #4
|
||||
uaddl v2.8h, v2.8b, v3.8b
|
||||
ext v30.16b, \r0\().16b, \r0\().16b, #5
|
||||
uaddl \r0\().8h, \r0\().8b, v30.8b
|
||||
ext v4.16b, \r1\().16b, \r1\().16b, #2
|
||||
mla \r0\().8h, v0.8h, v6.h[1]
|
||||
ext v5.16b, \r1\().16b, \r1\().16b, #3
|
||||
uaddl v4.8h, v4.8b, v5.8b
|
||||
ext v7.16b, \r1\().16b, \r1\().16b, #1
|
||||
mls \r0\().8h, v2.8h, v6.h[0]
|
||||
ext v0.16b, \r1\().16b, \r1\().16b, #4
|
||||
uaddl v7.8h, v7.8b, v0.8b
|
||||
ext v31.16b, \r1\().16b, \r1\().16b, #5
|
||||
uaddl \r1\().8h, \r1\().8b, v31.8b
|
||||
mla \r1\().8h, v4.8h, v6.h[1]
|
||||
mls \r1\().8h, v7.8h, v6.h[0]
|
||||
.endm
|
||||
|
||||
// trashes v2-v5, v30
|
||||
.macro lowpass_8_1 r0, r1, d0, narrow=1
|
||||
ext v2.8B, \r0\().8B, \r1\().8B, #2
|
||||
ext v3.8B, \r0\().8B, \r1\().8B, #3
|
||||
uaddl v2.8H, v2.8B, v3.8B
|
||||
ext v4.8B, \r0\().8B, \r1\().8B, #1
|
||||
ext v5.8B, \r0\().8B, \r1\().8B, #4
|
||||
uaddl v4.8H, v4.8B, v5.8B
|
||||
ext v30.8B, \r0\().8B, \r1\().8B, #5
|
||||
uaddl \d0\().8H, \r0\().8B, v30.8B
|
||||
mla \d0\().8H, v2.8H, v6.H[1]
|
||||
mls \d0\().8H, v4.8H, v6.H[0]
|
||||
ext v2.8b, \r0\().8b, \r1\().8b, #2
|
||||
ext v3.8b, \r0\().8b, \r1\().8b, #3
|
||||
uaddl v2.8h, v2.8b, v3.8b
|
||||
ext v4.8b, \r0\().8b, \r1\().8b, #1
|
||||
ext v5.8b, \r0\().8b, \r1\().8b, #4
|
||||
uaddl v4.8h, v4.8b, v5.8b
|
||||
ext v30.8b, \r0\().8b, \r1\().8b, #5
|
||||
uaddl \d0\().8h, \r0\().8b, v30.8b
|
||||
mla \d0\().8h, v2.8h, v6.h[1]
|
||||
mls \d0\().8h, v4.8h, v6.h[0]
|
||||
.if \narrow
|
||||
sqrshrun \d0\().8B, \d0\().8H, #5
|
||||
sqrshrun \d0\().8b, \d0\().8h, #5
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// trashed v0-v7
|
||||
.macro lowpass_8.16 r0, r1, r2
|
||||
ext v1.16B, \r0\().16B, \r1\().16B, #4
|
||||
ext v0.16B, \r0\().16B, \r1\().16B, #6
|
||||
saddl v5.4S, v1.4H, v0.4H
|
||||
ext v2.16B, \r0\().16B, \r1\().16B, #2
|
||||
saddl2 v1.4S, v1.8H, v0.8H
|
||||
ext v3.16B, \r0\().16B, \r1\().16B, #8
|
||||
saddl v6.4S, v2.4H, v3.4H
|
||||
ext \r1\().16B, \r0\().16B, \r1\().16B, #10
|
||||
saddl2 v2.4S, v2.8H, v3.8H
|
||||
saddl v0.4S, \r0\().4H, \r1\().4H
|
||||
saddl2 v4.4S, \r0\().8H, \r1\().8H
|
||||
ext v1.16b, \r0\().16b, \r1\().16b, #4
|
||||
ext v0.16b, \r0\().16b, \r1\().16b, #6
|
||||
saddl v5.4s, v1.4h, v0.4h
|
||||
ext v2.16b, \r0\().16b, \r1\().16b, #2
|
||||
saddl2 v1.4s, v1.8h, v0.8h
|
||||
ext v3.16b, \r0\().16b, \r1\().16b, #8
|
||||
saddl v6.4s, v2.4h, v3.4h
|
||||
ext \r1\().16b, \r0\().16b, \r1\().16b, #10
|
||||
saddl2 v2.4s, v2.8h, v3.8h
|
||||
saddl v0.4s, \r0\().4h, \r1\().4h
|
||||
saddl2 v4.4s, \r0\().8h, \r1\().8h
|
||||
|
||||
shl v3.4S, v5.4S, #4
|
||||
shl v5.4S, v5.4S, #2
|
||||
shl v7.4S, v6.4S, #2
|
||||
add v5.4S, v5.4S, v3.4S
|
||||
add v6.4S, v6.4S, v7.4S
|
||||
shl v3.4s, v5.4s, #4
|
||||
shl v5.4s, v5.4s, #2
|
||||
shl v7.4s, v6.4s, #2
|
||||
add v5.4s, v5.4s, v3.4s
|
||||
add v6.4s, v6.4s, v7.4s
|
||||
|
||||
shl v3.4S, v1.4S, #4
|
||||
shl v1.4S, v1.4S, #2
|
||||
shl v7.4S, v2.4S, #2
|
||||
add v1.4S, v1.4S, v3.4S
|
||||
add v2.4S, v2.4S, v7.4S
|
||||
shl v3.4s, v1.4s, #4
|
||||
shl v1.4s, v1.4s, #2
|
||||
shl v7.4s, v2.4s, #2
|
||||
add v1.4s, v1.4s, v3.4s
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
|
||||
add v5.4S, v5.4S, v0.4S
|
||||
sub v5.4S, v5.4S, v6.4S
|
||||
add v5.4s, v5.4s, v0.4s
|
||||
sub v5.4s, v5.4s, v6.4s
|
||||
|
||||
add v1.4S, v1.4S, v4.4S
|
||||
sub v1.4S, v1.4S, v2.4S
|
||||
add v1.4s, v1.4s, v4.4s
|
||||
sub v1.4s, v1.4s, v2.4s
|
||||
|
||||
rshrn v5.4H, v5.4S, #10
|
||||
rshrn2 v5.8H, v1.4S, #10
|
||||
rshrn v5.4h, v5.4s, #10
|
||||
rshrn2 v5.8h, v1.4s, #10
|
||||
|
||||
sqxtun \r2\().8B, v5.8H
|
||||
sqxtun \r2\().8b, v5.8h
|
||||
.endm
|
||||
|
||||
function put_h264_qpel16_h_lowpass_neon_packed
|
||||
@@ -163,19 +163,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_h_lowpass_neon
|
||||
1: ld1 {v28.8B, v29.8B}, [x1], x2
|
||||
ld1 {v16.8B, v17.8B}, [x1], x2
|
||||
1: ld1 {v28.8b, v29.8b}, [x1], x2
|
||||
ld1 {v16.8b, v17.8b}, [x1], x2
|
||||
subs x12, x12, #2
|
||||
lowpass_8 v28, v29, v16, v17, v28, v16
|
||||
.ifc \type,avg
|
||||
ld1 {v2.8B}, [x0], x3
|
||||
urhadd v28.8B, v28.8B, v2.8B
|
||||
ld1 {v3.8B}, [x0]
|
||||
urhadd v16.8B, v16.8B, v3.8B
|
||||
ld1 {v2.8b}, [x0], x3
|
||||
urhadd v28.8b, v28.8b, v2.8b
|
||||
ld1 {v3.8b}, [x0]
|
||||
urhadd v16.8b, v16.8b, v3.8b
|
||||
sub x0, x0, x3
|
||||
.endif
|
||||
st1 {v28.8B}, [x0], x3
|
||||
st1 {v16.8B}, [x0], x3
|
||||
st1 {v28.8b}, [x0], x3
|
||||
st1 {v16.8b}, [x0], x3
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
@@ -200,23 +200,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_h_lowpass_l2_neon
|
||||
1: ld1 {v26.8B, v27.8B}, [x1], x2
|
||||
ld1 {v16.8B, v17.8B}, [x1], x2
|
||||
ld1 {v28.8B}, [x3], x2
|
||||
ld1 {v29.8B}, [x3], x2
|
||||
1: ld1 {v26.8b, v27.8b}, [x1], x2
|
||||
ld1 {v16.8b, v17.8b}, [x1], x2
|
||||
ld1 {v28.8b}, [x3], x2
|
||||
ld1 {v29.8b}, [x3], x2
|
||||
subs x12, x12, #2
|
||||
lowpass_8 v26, v27, v16, v17, v26, v27
|
||||
urhadd v26.8B, v26.8B, v28.8B
|
||||
urhadd v27.8B, v27.8B, v29.8B
|
||||
urhadd v26.8b, v26.8b, v28.8b
|
||||
urhadd v27.8b, v27.8b, v29.8b
|
||||
.ifc \type,avg
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
urhadd v26.8B, v26.8B, v2.8B
|
||||
ld1 {v3.8B}, [x0]
|
||||
urhadd v27.8B, v27.8B, v3.8B
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
urhadd v26.8b, v26.8b, v2.8b
|
||||
ld1 {v3.8b}, [x0]
|
||||
urhadd v27.8b, v27.8b, v3.8b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v26.8B}, [x0], x2
|
||||
st1 {v27.8B}, [x0], x2
|
||||
st1 {v26.8b}, [x0], x2
|
||||
st1 {v27.8b}, [x0], x2
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
@@ -257,19 +257,19 @@ function \type\()_h264_qpel16_v_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_v_lowpass_neon
|
||||
ld1 {v16.8B}, [x1], x3
|
||||
ld1 {v18.8B}, [x1], x3
|
||||
ld1 {v20.8B}, [x1], x3
|
||||
ld1 {v22.8B}, [x1], x3
|
||||
ld1 {v24.8B}, [x1], x3
|
||||
ld1 {v26.8B}, [x1], x3
|
||||
ld1 {v28.8B}, [x1], x3
|
||||
ld1 {v30.8B}, [x1], x3
|
||||
ld1 {v17.8B}, [x1], x3
|
||||
ld1 {v19.8B}, [x1], x3
|
||||
ld1 {v21.8B}, [x1], x3
|
||||
ld1 {v23.8B}, [x1], x3
|
||||
ld1 {v25.8B}, [x1]
|
||||
ld1 {v16.8b}, [x1], x3
|
||||
ld1 {v18.8b}, [x1], x3
|
||||
ld1 {v20.8b}, [x1], x3
|
||||
ld1 {v22.8b}, [x1], x3
|
||||
ld1 {v24.8b}, [x1], x3
|
||||
ld1 {v26.8b}, [x1], x3
|
||||
ld1 {v28.8b}, [x1], x3
|
||||
ld1 {v30.8b}, [x1], x3
|
||||
ld1 {v17.8b}, [x1], x3
|
||||
ld1 {v19.8b}, [x1], x3
|
||||
ld1 {v21.8b}, [x1], x3
|
||||
ld1 {v23.8b}, [x1], x3
|
||||
ld1 {v25.8b}, [x1]
|
||||
|
||||
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
|
||||
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
|
||||
@@ -280,33 +280,33 @@ function \type\()_h264_qpel8_v_lowpass_neon
|
||||
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
|
||||
.ifc \type,avg
|
||||
ld1 {v24.8B}, [x0], x2
|
||||
urhadd v16.8B, v16.8B, v24.8B
|
||||
ld1 {v25.8B}, [x0], x2
|
||||
urhadd v17.8B, v17.8B, v25.8B
|
||||
ld1 {v26.8B}, [x0], x2
|
||||
urhadd v18.8B, v18.8B, v26.8B
|
||||
ld1 {v27.8B}, [x0], x2
|
||||
urhadd v19.8B, v19.8B, v27.8B
|
||||
ld1 {v28.8B}, [x0], x2
|
||||
urhadd v20.8B, v20.8B, v28.8B
|
||||
ld1 {v29.8B}, [x0], x2
|
||||
urhadd v21.8B, v21.8B, v29.8B
|
||||
ld1 {v30.8B}, [x0], x2
|
||||
urhadd v22.8B, v22.8B, v30.8B
|
||||
ld1 {v31.8B}, [x0], x2
|
||||
urhadd v23.8B, v23.8B, v31.8B
|
||||
ld1 {v24.8b}, [x0], x2
|
||||
urhadd v16.8b, v16.8b, v24.8b
|
||||
ld1 {v25.8b}, [x0], x2
|
||||
urhadd v17.8b, v17.8b, v25.8b
|
||||
ld1 {v26.8b}, [x0], x2
|
||||
urhadd v18.8b, v18.8b, v26.8b
|
||||
ld1 {v27.8b}, [x0], x2
|
||||
urhadd v19.8b, v19.8b, v27.8b
|
||||
ld1 {v28.8b}, [x0], x2
|
||||
urhadd v20.8b, v20.8b, v28.8b
|
||||
ld1 {v29.8b}, [x0], x2
|
||||
urhadd v21.8b, v21.8b, v29.8b
|
||||
ld1 {v30.8b}, [x0], x2
|
||||
urhadd v22.8b, v22.8b, v30.8b
|
||||
ld1 {v31.8b}, [x0], x2
|
||||
urhadd v23.8b, v23.8b, v31.8b
|
||||
sub x0, x0, x2, lsl #3
|
||||
.endif
|
||||
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v18.8B}, [x0], x2
|
||||
st1 {v19.8B}, [x0], x2
|
||||
st1 {v20.8B}, [x0], x2
|
||||
st1 {v21.8B}, [x0], x2
|
||||
st1 {v22.8B}, [x0], x2
|
||||
st1 {v23.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
st1 {v18.8b}, [x0], x2
|
||||
st1 {v19.8b}, [x0], x2
|
||||
st1 {v20.8b}, [x0], x2
|
||||
st1 {v21.8b}, [x0], x2
|
||||
st1 {v22.8b}, [x0], x2
|
||||
st1 {v23.8b}, [x0], x2
|
||||
|
||||
ret
|
||||
endfunc
|
||||
@@ -334,19 +334,19 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
ld1 {v16.8B}, [x1], x3
|
||||
ld1 {v18.8B}, [x1], x3
|
||||
ld1 {v20.8B}, [x1], x3
|
||||
ld1 {v22.8B}, [x1], x3
|
||||
ld1 {v24.8B}, [x1], x3
|
||||
ld1 {v26.8B}, [x1], x3
|
||||
ld1 {v28.8B}, [x1], x3
|
||||
ld1 {v30.8B}, [x1], x3
|
||||
ld1 {v17.8B}, [x1], x3
|
||||
ld1 {v19.8B}, [x1], x3
|
||||
ld1 {v21.8B}, [x1], x3
|
||||
ld1 {v23.8B}, [x1], x3
|
||||
ld1 {v25.8B}, [x1]
|
||||
ld1 {v16.8b}, [x1], x3
|
||||
ld1 {v18.8b}, [x1], x3
|
||||
ld1 {v20.8b}, [x1], x3
|
||||
ld1 {v22.8b}, [x1], x3
|
||||
ld1 {v24.8b}, [x1], x3
|
||||
ld1 {v26.8b}, [x1], x3
|
||||
ld1 {v28.8b}, [x1], x3
|
||||
ld1 {v30.8b}, [x1], x3
|
||||
ld1 {v17.8b}, [x1], x3
|
||||
ld1 {v19.8b}, [x1], x3
|
||||
ld1 {v21.8b}, [x1], x3
|
||||
ld1 {v23.8b}, [x1], x3
|
||||
ld1 {v25.8b}, [x1]
|
||||
|
||||
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
|
||||
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
|
||||
@@ -356,51 +356,51 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
lowpass_8 v28, v29, v30, v31, v22, v23
|
||||
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
|
||||
ld1 {v24.8B}, [x12], x2
|
||||
ld1 {v25.8B}, [x12], x2
|
||||
ld1 {v26.8B}, [x12], x2
|
||||
ld1 {v27.8B}, [x12], x2
|
||||
ld1 {v28.8B}, [x12], x2
|
||||
urhadd v16.8B, v24.8B, v16.8B
|
||||
urhadd v17.8B, v25.8B, v17.8B
|
||||
ld1 {v29.8B}, [x12], x2
|
||||
urhadd v18.8B, v26.8B, v18.8B
|
||||
urhadd v19.8B, v27.8B, v19.8B
|
||||
ld1 {v30.8B}, [x12], x2
|
||||
urhadd v20.8B, v28.8B, v20.8B
|
||||
urhadd v21.8B, v29.8B, v21.8B
|
||||
ld1 {v31.8B}, [x12], x2
|
||||
urhadd v22.8B, v30.8B, v22.8B
|
||||
urhadd v23.8B, v31.8B, v23.8B
|
||||
ld1 {v24.8b}, [x12], x2
|
||||
ld1 {v25.8b}, [x12], x2
|
||||
ld1 {v26.8b}, [x12], x2
|
||||
ld1 {v27.8b}, [x12], x2
|
||||
ld1 {v28.8b}, [x12], x2
|
||||
urhadd v16.8b, v24.8b, v16.8b
|
||||
urhadd v17.8b, v25.8b, v17.8b
|
||||
ld1 {v29.8b}, [x12], x2
|
||||
urhadd v18.8b, v26.8b, v18.8b
|
||||
urhadd v19.8b, v27.8b, v19.8b
|
||||
ld1 {v30.8b}, [x12], x2
|
||||
urhadd v20.8b, v28.8b, v20.8b
|
||||
urhadd v21.8b, v29.8b, v21.8b
|
||||
ld1 {v31.8b}, [x12], x2
|
||||
urhadd v22.8b, v30.8b, v22.8b
|
||||
urhadd v23.8b, v31.8b, v23.8b
|
||||
|
||||
.ifc \type,avg
|
||||
ld1 {v24.8B}, [x0], x3
|
||||
urhadd v16.8B, v16.8B, v24.8B
|
||||
ld1 {v25.8B}, [x0], x3
|
||||
urhadd v17.8B, v17.8B, v25.8B
|
||||
ld1 {v26.8B}, [x0], x3
|
||||
urhadd v18.8B, v18.8B, v26.8B
|
||||
ld1 {v27.8B}, [x0], x3
|
||||
urhadd v19.8B, v19.8B, v27.8B
|
||||
ld1 {v28.8B}, [x0], x3
|
||||
urhadd v20.8B, v20.8B, v28.8B
|
||||
ld1 {v29.8B}, [x0], x3
|
||||
urhadd v21.8B, v21.8B, v29.8B
|
||||
ld1 {v30.8B}, [x0], x3
|
||||
urhadd v22.8B, v22.8B, v30.8B
|
||||
ld1 {v31.8B}, [x0], x3
|
||||
urhadd v23.8B, v23.8B, v31.8B
|
||||
ld1 {v24.8b}, [x0], x3
|
||||
urhadd v16.8b, v16.8b, v24.8b
|
||||
ld1 {v25.8b}, [x0], x3
|
||||
urhadd v17.8b, v17.8b, v25.8b
|
||||
ld1 {v26.8b}, [x0], x3
|
||||
urhadd v18.8b, v18.8b, v26.8b
|
||||
ld1 {v27.8b}, [x0], x3
|
||||
urhadd v19.8b, v19.8b, v27.8b
|
||||
ld1 {v28.8b}, [x0], x3
|
||||
urhadd v20.8b, v20.8b, v28.8b
|
||||
ld1 {v29.8b}, [x0], x3
|
||||
urhadd v21.8b, v21.8b, v29.8b
|
||||
ld1 {v30.8b}, [x0], x3
|
||||
urhadd v22.8b, v22.8b, v30.8b
|
||||
ld1 {v31.8b}, [x0], x3
|
||||
urhadd v23.8b, v23.8b, v31.8b
|
||||
sub x0, x0, x3, lsl #3
|
||||
.endif
|
||||
|
||||
st1 {v16.8B}, [x0], x3
|
||||
st1 {v17.8B}, [x0], x3
|
||||
st1 {v18.8B}, [x0], x3
|
||||
st1 {v19.8B}, [x0], x3
|
||||
st1 {v20.8B}, [x0], x3
|
||||
st1 {v21.8B}, [x0], x3
|
||||
st1 {v22.8B}, [x0], x3
|
||||
st1 {v23.8B}, [x0], x3
|
||||
st1 {v16.8b}, [x0], x3
|
||||
st1 {v17.8b}, [x0], x3
|
||||
st1 {v18.8b}, [x0], x3
|
||||
st1 {v19.8b}, [x0], x3
|
||||
st1 {v20.8b}, [x0], x3
|
||||
st1 {v21.8b}, [x0], x3
|
||||
st1 {v22.8b}, [x0], x3
|
||||
st1 {v23.8b}, [x0], x3
|
||||
|
||||
ret
|
||||
endfunc
|
||||
@@ -411,19 +411,19 @@ endfunc
|
||||
|
||||
function put_h264_qpel8_hv_lowpass_neon_top
|
||||
lowpass_const w12
|
||||
ld1 {v16.8H}, [x1], x3
|
||||
ld1 {v17.8H}, [x1], x3
|
||||
ld1 {v18.8H}, [x1], x3
|
||||
ld1 {v19.8H}, [x1], x3
|
||||
ld1 {v20.8H}, [x1], x3
|
||||
ld1 {v21.8H}, [x1], x3
|
||||
ld1 {v22.8H}, [x1], x3
|
||||
ld1 {v23.8H}, [x1], x3
|
||||
ld1 {v24.8H}, [x1], x3
|
||||
ld1 {v25.8H}, [x1], x3
|
||||
ld1 {v26.8H}, [x1], x3
|
||||
ld1 {v27.8H}, [x1], x3
|
||||
ld1 {v28.8H}, [x1]
|
||||
ld1 {v16.8h}, [x1], x3
|
||||
ld1 {v17.8h}, [x1], x3
|
||||
ld1 {v18.8h}, [x1], x3
|
||||
ld1 {v19.8h}, [x1], x3
|
||||
ld1 {v20.8h}, [x1], x3
|
||||
ld1 {v21.8h}, [x1], x3
|
||||
ld1 {v22.8h}, [x1], x3
|
||||
ld1 {v23.8h}, [x1], x3
|
||||
ld1 {v24.8h}, [x1], x3
|
||||
ld1 {v25.8h}, [x1], x3
|
||||
ld1 {v26.8h}, [x1], x3
|
||||
ld1 {v27.8h}, [x1], x3
|
||||
ld1 {v28.8h}, [x1]
|
||||
lowpass_8H v16, v17
|
||||
lowpass_8H v18, v19
|
||||
lowpass_8H v20, v21
|
||||
@@ -447,7 +447,7 @@ function put_h264_qpel8_hv_lowpass_neon_top
|
||||
lowpass_8.16 v22, v30, v22
|
||||
lowpass_8.16 v23, v31, v23
|
||||
|
||||
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
@@ -457,33 +457,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
|
||||
mov x10, x30
|
||||
bl put_h264_qpel8_hv_lowpass_neon_top
|
||||
.ifc \type,avg
|
||||
ld1 {v0.8B}, [x0], x2
|
||||
urhadd v16.8B, v16.8B, v0.8B
|
||||
ld1 {v1.8B}, [x0], x2
|
||||
urhadd v17.8B, v17.8B, v1.8B
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
urhadd v18.8B, v18.8B, v2.8B
|
||||
ld1 {v3.8B}, [x0], x2
|
||||
urhadd v19.8B, v19.8B, v3.8B
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
urhadd v20.8B, v20.8B, v4.8B
|
||||
ld1 {v5.8B}, [x0], x2
|
||||
urhadd v21.8B, v21.8B, v5.8B
|
||||
ld1 {v6.8B}, [x0], x2
|
||||
urhadd v22.8B, v22.8B, v6.8B
|
||||
ld1 {v7.8B}, [x0], x2
|
||||
urhadd v23.8B, v23.8B, v7.8B
|
||||
ld1 {v0.8b}, [x0], x2
|
||||
urhadd v16.8b, v16.8b, v0.8b
|
||||
ld1 {v1.8b}, [x0], x2
|
||||
urhadd v17.8b, v17.8b, v1.8b
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
urhadd v18.8b, v18.8b, v2.8b
|
||||
ld1 {v3.8b}, [x0], x2
|
||||
urhadd v19.8b, v19.8b, v3.8b
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
urhadd v20.8b, v20.8b, v4.8b
|
||||
ld1 {v5.8b}, [x0], x2
|
||||
urhadd v21.8b, v21.8b, v5.8b
|
||||
ld1 {v6.8b}, [x0], x2
|
||||
urhadd v22.8b, v22.8b, v6.8b
|
||||
ld1 {v7.8b}, [x0], x2
|
||||
urhadd v23.8b, v23.8b, v7.8b
|
||||
sub x0, x0, x2, lsl #3
|
||||
.endif
|
||||
|
||||
st1 {v16.8B}, [x0], x2
|
||||
st1 {v17.8B}, [x0], x2
|
||||
st1 {v18.8B}, [x0], x2
|
||||
st1 {v19.8B}, [x0], x2
|
||||
st1 {v20.8B}, [x0], x2
|
||||
st1 {v21.8B}, [x0], x2
|
||||
st1 {v22.8B}, [x0], x2
|
||||
st1 {v23.8B}, [x0], x2
|
||||
st1 {v16.8b}, [x0], x2
|
||||
st1 {v17.8b}, [x0], x2
|
||||
st1 {v18.8b}, [x0], x2
|
||||
st1 {v19.8b}, [x0], x2
|
||||
st1 {v20.8b}, [x0], x2
|
||||
st1 {v21.8b}, [x0], x2
|
||||
st1 {v22.8b}, [x0], x2
|
||||
st1 {v23.8b}, [x0], x2
|
||||
|
||||
ret x10
|
||||
endfunc
|
||||
@@ -497,45 +497,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
mov x10, x30
|
||||
bl put_h264_qpel8_hv_lowpass_neon_top
|
||||
|
||||
ld1 {v0.8B, v1.8B}, [x2], #16
|
||||
ld1 {v2.8B, v3.8B}, [x2], #16
|
||||
urhadd v0.8B, v0.8B, v16.8B
|
||||
urhadd v1.8B, v1.8B, v17.8B
|
||||
ld1 {v4.8B, v5.8B}, [x2], #16
|
||||
urhadd v2.8B, v2.8B, v18.8B
|
||||
urhadd v3.8B, v3.8B, v19.8B
|
||||
ld1 {v6.8B, v7.8B}, [x2], #16
|
||||
urhadd v4.8B, v4.8B, v20.8B
|
||||
urhadd v5.8B, v5.8B, v21.8B
|
||||
urhadd v6.8B, v6.8B, v22.8B
|
||||
urhadd v7.8B, v7.8B, v23.8B
|
||||
ld1 {v0.8b, v1.8b}, [x2], #16
|
||||
ld1 {v2.8b, v3.8b}, [x2], #16
|
||||
urhadd v0.8b, v0.8b, v16.8b
|
||||
urhadd v1.8b, v1.8b, v17.8b
|
||||
ld1 {v4.8b, v5.8b}, [x2], #16
|
||||
urhadd v2.8b, v2.8b, v18.8b
|
||||
urhadd v3.8b, v3.8b, v19.8b
|
||||
ld1 {v6.8b, v7.8b}, [x2], #16
|
||||
urhadd v4.8b, v4.8b, v20.8b
|
||||
urhadd v5.8b, v5.8b, v21.8b
|
||||
urhadd v6.8b, v6.8b, v22.8b
|
||||
urhadd v7.8b, v7.8b, v23.8b
|
||||
.ifc \type,avg
|
||||
ld1 {v16.8B}, [x0], x3
|
||||
urhadd v0.8B, v0.8B, v16.8B
|
||||
ld1 {v17.8B}, [x0], x3
|
||||
urhadd v1.8B, v1.8B, v17.8B
|
||||
ld1 {v18.8B}, [x0], x3
|
||||
urhadd v2.8B, v2.8B, v18.8B
|
||||
ld1 {v19.8B}, [x0], x3
|
||||
urhadd v3.8B, v3.8B, v19.8B
|
||||
ld1 {v20.8B}, [x0], x3
|
||||
urhadd v4.8B, v4.8B, v20.8B
|
||||
ld1 {v21.8B}, [x0], x3
|
||||
urhadd v5.8B, v5.8B, v21.8B
|
||||
ld1 {v22.8B}, [x0], x3
|
||||
urhadd v6.8B, v6.8B, v22.8B
|
||||
ld1 {v23.8B}, [x0], x3
|
||||
urhadd v7.8B, v7.8B, v23.8B
|
||||
ld1 {v16.8b}, [x0], x3
|
||||
urhadd v0.8b, v0.8b, v16.8b
|
||||
ld1 {v17.8b}, [x0], x3
|
||||
urhadd v1.8b, v1.8b, v17.8b
|
||||
ld1 {v18.8b}, [x0], x3
|
||||
urhadd v2.8b, v2.8b, v18.8b
|
||||
ld1 {v19.8b}, [x0], x3
|
||||
urhadd v3.8b, v3.8b, v19.8b
|
||||
ld1 {v20.8b}, [x0], x3
|
||||
urhadd v4.8b, v4.8b, v20.8b
|
||||
ld1 {v21.8b}, [x0], x3
|
||||
urhadd v5.8b, v5.8b, v21.8b
|
||||
ld1 {v22.8b}, [x0], x3
|
||||
urhadd v6.8b, v6.8b, v22.8b
|
||||
ld1 {v23.8b}, [x0], x3
|
||||
urhadd v7.8b, v7.8b, v23.8b
|
||||
sub x0, x0, x3, lsl #3
|
||||
.endif
|
||||
st1 {v0.8B}, [x0], x3
|
||||
st1 {v1.8B}, [x0], x3
|
||||
st1 {v2.8B}, [x0], x3
|
||||
st1 {v3.8B}, [x0], x3
|
||||
st1 {v4.8B}, [x0], x3
|
||||
st1 {v5.8B}, [x0], x3
|
||||
st1 {v6.8B}, [x0], x3
|
||||
st1 {v7.8B}, [x0], x3
|
||||
st1 {v0.8b}, [x0], x3
|
||||
st1 {v1.8b}, [x0], x3
|
||||
st1 {v2.8b}, [x0], x3
|
||||
st1 {v3.8b}, [x0], x3
|
||||
st1 {v4.8b}, [x0], x3
|
||||
st1 {v5.8b}, [x0], x3
|
||||
st1 {v6.8b}, [x0], x3
|
||||
st1 {v7.8b}, [x0], x3
|
||||
|
||||
ret x10
|
||||
endfunc
|
||||
@@ -579,8 +579,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel16_hv put
|
||||
h264_qpel16_hv avg
|
||||
h264_qpel16_hv put
|
||||
h264_qpel16_hv avg
|
||||
|
||||
.macro h264_qpel8 type
|
||||
function ff_\type\()_h264_qpel8_mc10_neon, export=1
|
||||
@@ -758,8 +758,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel8 put
|
||||
h264_qpel8 avg
|
||||
h264_qpel8 put
|
||||
h264_qpel8 avg
|
||||
|
||||
.macro h264_qpel16 type
|
||||
function ff_\type\()_h264_qpel16_mc10_neon, export=1
|
||||
@@ -930,5 +930,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel16 put
|
||||
h264_qpel16 avg
|
||||
h264_qpel16 put
|
||||
h264_qpel16 avg
|
||||
|
||||
+181
-181
@@ -26,295 +26,295 @@
|
||||
.if \avg
|
||||
mov x12, x0
|
||||
.endif
|
||||
1: ld1 {v0.16B}, [x1], x2
|
||||
ld1 {v1.16B}, [x1], x2
|
||||
ld1 {v2.16B}, [x1], x2
|
||||
ld1 {v3.16B}, [x1], x2
|
||||
1: ld1 {v0.16b}, [x1], x2
|
||||
ld1 {v1.16b}, [x1], x2
|
||||
ld1 {v2.16b}, [x1], x2
|
||||
ld1 {v3.16b}, [x1], x2
|
||||
.if \avg
|
||||
ld1 {v4.16B}, [x12], x2
|
||||
urhadd v0.16B, v0.16B, v4.16B
|
||||
ld1 {v5.16B}, [x12], x2
|
||||
urhadd v1.16B, v1.16B, v5.16B
|
||||
ld1 {v6.16B}, [x12], x2
|
||||
urhadd v2.16B, v2.16B, v6.16B
|
||||
ld1 {v7.16B}, [x12], x2
|
||||
urhadd v3.16B, v3.16B, v7.16B
|
||||
ld1 {v4.16b}, [x12], x2
|
||||
urhadd v0.16b, v0.16b, v4.16b
|
||||
ld1 {v5.16b}, [x12], x2
|
||||
urhadd v1.16b, v1.16b, v5.16b
|
||||
ld1 {v6.16b}, [x12], x2
|
||||
urhadd v2.16b, v2.16b, v6.16b
|
||||
ld1 {v7.16b}, [x12], x2
|
||||
urhadd v3.16b, v3.16b, v7.16b
|
||||
.endif
|
||||
subs w3, w3, #4
|
||||
st1 {v0.16B}, [x0], x2
|
||||
st1 {v1.16B}, [x0], x2
|
||||
st1 {v2.16B}, [x0], x2
|
||||
st1 {v3.16B}, [x0], x2
|
||||
st1 {v0.16b}, [x0], x2
|
||||
st1 {v1.16b}, [x0], x2
|
||||
st1 {v2.16b}, [x0], x2
|
||||
st1 {v3.16b}, [x0], x2
|
||||
b.ne 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels16_x2 rnd=1, avg=0
|
||||
1: ld1 {v0.16B, v1.16B}, [x1], x2
|
||||
ld1 {v2.16B, v3.16B}, [x1], x2
|
||||
1: ld1 {v0.16b, v1.16b}, [x1], x2
|
||||
ld1 {v2.16b, v3.16b}, [x1], x2
|
||||
subs w3, w3, #2
|
||||
ext v1.16B, v0.16B, v1.16B, #1
|
||||
avg v0.16B, v0.16B, v1.16B
|
||||
ext v3.16B, v2.16B, v3.16B, #1
|
||||
avg v2.16B, v2.16B, v3.16B
|
||||
ext v1.16b, v0.16b, v1.16b, #1
|
||||
avg v0.16b, v0.16b, v1.16b
|
||||
ext v3.16b, v2.16b, v3.16b, #1
|
||||
avg v2.16b, v2.16b, v3.16b
|
||||
.if \avg
|
||||
ld1 {v1.16B}, [x0], x2
|
||||
ld1 {v3.16B}, [x0]
|
||||
urhadd v0.16B, v0.16B, v1.16B
|
||||
urhadd v2.16B, v2.16B, v3.16B
|
||||
ld1 {v1.16b}, [x0], x2
|
||||
ld1 {v3.16b}, [x0]
|
||||
urhadd v0.16b, v0.16b, v1.16b
|
||||
urhadd v2.16b, v2.16b, v3.16b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v0.16B}, [x0], x2
|
||||
st1 {v2.16B}, [x0], x2
|
||||
st1 {v0.16b}, [x0], x2
|
||||
st1 {v2.16b}, [x0], x2
|
||||
b.ne 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels16_y2 rnd=1, avg=0
|
||||
sub w3, w3, #2
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
ld1 {v1.16B}, [x1], x2
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
ld1 {v1.16b}, [x1], x2
|
||||
1: subs w3, w3, #2
|
||||
avg v2.16B, v0.16B, v1.16B
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
avg v3.16B, v0.16B, v1.16B
|
||||
ld1 {v1.16B}, [x1], x2
|
||||
avg v2.16b, v0.16b, v1.16b
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
avg v3.16b, v0.16b, v1.16b
|
||||
ld1 {v1.16b}, [x1], x2
|
||||
.if \avg
|
||||
ld1 {v4.16B}, [x0], x2
|
||||
ld1 {v5.16B}, [x0]
|
||||
urhadd v2.16B, v2.16B, v4.16B
|
||||
urhadd v3.16B, v3.16B, v5.16B
|
||||
ld1 {v4.16b}, [x0], x2
|
||||
ld1 {v5.16b}, [x0]
|
||||
urhadd v2.16b, v2.16b, v4.16b
|
||||
urhadd v3.16b, v3.16b, v5.16b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v2.16B}, [x0], x2
|
||||
st1 {v3.16B}, [x0], x2
|
||||
st1 {v2.16b}, [x0], x2
|
||||
st1 {v3.16b}, [x0], x2
|
||||
b.ne 1b
|
||||
|
||||
avg v2.16B, v0.16B, v1.16B
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
avg v3.16B, v0.16B, v1.16B
|
||||
avg v2.16b, v0.16b, v1.16b
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
avg v3.16b, v0.16b, v1.16b
|
||||
.if \avg
|
||||
ld1 {v4.16B}, [x0], x2
|
||||
ld1 {v5.16B}, [x0]
|
||||
urhadd v2.16B, v2.16B, v4.16B
|
||||
urhadd v3.16B, v3.16B, v5.16B
|
||||
ld1 {v4.16b}, [x0], x2
|
||||
ld1 {v5.16b}, [x0]
|
||||
urhadd v2.16b, v2.16b, v4.16b
|
||||
urhadd v3.16b, v3.16b, v5.16b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v2.16B}, [x0], x2
|
||||
st1 {v3.16B}, [x0], x2
|
||||
st1 {v2.16b}, [x0], x2
|
||||
st1 {v3.16b}, [x0], x2
|
||||
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels16_xy2 rnd=1, avg=0
|
||||
sub w3, w3, #2
|
||||
ld1 {v0.16B, v1.16B}, [x1], x2
|
||||
ld1 {v4.16B, v5.16B}, [x1], x2
|
||||
ld1 {v0.16b, v1.16b}, [x1], x2
|
||||
ld1 {v4.16b, v5.16b}, [x1], x2
|
||||
NRND movi v26.8H, #1
|
||||
ext v1.16B, v0.16B, v1.16B, #1
|
||||
ext v5.16B, v4.16B, v5.16B, #1
|
||||
uaddl v16.8H, v0.8B, v1.8B
|
||||
uaddl2 v20.8H, v0.16B, v1.16B
|
||||
uaddl v18.8H, v4.8B, v5.8B
|
||||
uaddl2 v22.8H, v4.16B, v5.16B
|
||||
ext v1.16b, v0.16b, v1.16b, #1
|
||||
ext v5.16b, v4.16b, v5.16b, #1
|
||||
uaddl v16.8h, v0.8b, v1.8b
|
||||
uaddl2 v20.8h, v0.16b, v1.16b
|
||||
uaddl v18.8h, v4.8b, v5.8b
|
||||
uaddl2 v22.8h, v4.16b, v5.16b
|
||||
1: subs w3, w3, #2
|
||||
ld1 {v0.16B, v1.16B}, [x1], x2
|
||||
add v24.8H, v16.8H, v18.8H
|
||||
ld1 {v0.16b, v1.16b}, [x1], x2
|
||||
add v24.8h, v16.8h, v18.8h
|
||||
NRND add v24.8H, v24.8H, v26.8H
|
||||
ext v30.16B, v0.16B, v1.16B, #1
|
||||
add v1.8H, v20.8H, v22.8H
|
||||
mshrn v28.8B, v24.8H, #2
|
||||
ext v30.16b, v0.16b, v1.16b, #1
|
||||
add v1.8h, v20.8h, v22.8h
|
||||
mshrn v28.8b, v24.8h, #2
|
||||
NRND add v1.8H, v1.8H, v26.8H
|
||||
mshrn2 v28.16B, v1.8H, #2
|
||||
mshrn2 v28.16b, v1.8h, #2
|
||||
.if \avg
|
||||
ld1 {v16.16B}, [x0]
|
||||
urhadd v28.16B, v28.16B, v16.16B
|
||||
ld1 {v16.16b}, [x0]
|
||||
urhadd v28.16b, v28.16b, v16.16b
|
||||
.endif
|
||||
uaddl v16.8H, v0.8B, v30.8B
|
||||
ld1 {v2.16B, v3.16B}, [x1], x2
|
||||
uaddl2 v20.8H, v0.16B, v30.16B
|
||||
st1 {v28.16B}, [x0], x2
|
||||
add v24.8H, v16.8H, v18.8H
|
||||
uaddl v16.8h, v0.8b, v30.8b
|
||||
ld1 {v2.16b, v3.16b}, [x1], x2
|
||||
uaddl2 v20.8h, v0.16b, v30.16b
|
||||
st1 {v28.16b}, [x0], x2
|
||||
add v24.8h, v16.8h, v18.8h
|
||||
NRND add v24.8H, v24.8H, v26.8H
|
||||
ext v3.16B, v2.16B, v3.16B, #1
|
||||
add v0.8H, v20.8H, v22.8H
|
||||
mshrn v30.8B, v24.8H, #2
|
||||
ext v3.16b, v2.16b, v3.16b, #1
|
||||
add v0.8h, v20.8h, v22.8h
|
||||
mshrn v30.8b, v24.8h, #2
|
||||
NRND add v0.8H, v0.8H, v26.8H
|
||||
mshrn2 v30.16B, v0.8H, #2
|
||||
mshrn2 v30.16b, v0.8h, #2
|
||||
.if \avg
|
||||
ld1 {v18.16B}, [x0]
|
||||
urhadd v30.16B, v30.16B, v18.16B
|
||||
ld1 {v18.16b}, [x0]
|
||||
urhadd v30.16b, v30.16b, v18.16b
|
||||
.endif
|
||||
uaddl v18.8H, v2.8B, v3.8B
|
||||
uaddl2 v22.8H, v2.16B, v3.16B
|
||||
st1 {v30.16B}, [x0], x2
|
||||
uaddl v18.8h, v2.8b, v3.8b
|
||||
uaddl2 v22.8h, v2.16b, v3.16b
|
||||
st1 {v30.16b}, [x0], x2
|
||||
b.gt 1b
|
||||
|
||||
ld1 {v0.16B, v1.16B}, [x1], x2
|
||||
add v24.8H, v16.8H, v18.8H
|
||||
ld1 {v0.16b, v1.16b}, [x1], x2
|
||||
add v24.8h, v16.8h, v18.8h
|
||||
NRND add v24.8H, v24.8H, v26.8H
|
||||
ext v30.16B, v0.16B, v1.16B, #1
|
||||
add v1.8H, v20.8H, v22.8H
|
||||
mshrn v28.8B, v24.8H, #2
|
||||
ext v30.16b, v0.16b, v1.16b, #1
|
||||
add v1.8h, v20.8h, v22.8h
|
||||
mshrn v28.8b, v24.8h, #2
|
||||
NRND add v1.8H, v1.8H, v26.8H
|
||||
mshrn2 v28.16B, v1.8H, #2
|
||||
mshrn2 v28.16b, v1.8h, #2
|
||||
.if \avg
|
||||
ld1 {v16.16B}, [x0]
|
||||
urhadd v28.16B, v28.16B, v16.16B
|
||||
ld1 {v16.16b}, [x0]
|
||||
urhadd v28.16b, v28.16b, v16.16b
|
||||
.endif
|
||||
uaddl v16.8H, v0.8B, v30.8B
|
||||
uaddl2 v20.8H, v0.16B, v30.16B
|
||||
st1 {v28.16B}, [x0], x2
|
||||
add v24.8H, v16.8H, v18.8H
|
||||
uaddl v16.8h, v0.8b, v30.8b
|
||||
uaddl2 v20.8h, v0.16b, v30.16b
|
||||
st1 {v28.16b}, [x0], x2
|
||||
add v24.8h, v16.8h, v18.8h
|
||||
NRND add v24.8H, v24.8H, v26.8H
|
||||
add v0.8H, v20.8H, v22.8H
|
||||
mshrn v30.8B, v24.8H, #2
|
||||
add v0.8h, v20.8h, v22.8h
|
||||
mshrn v30.8b, v24.8h, #2
|
||||
NRND add v0.8H, v0.8H, v26.8H
|
||||
mshrn2 v30.16B, v0.8H, #2
|
||||
mshrn2 v30.16b, v0.8h, #2
|
||||
.if \avg
|
||||
ld1 {v18.16B}, [x0]
|
||||
urhadd v30.16B, v30.16B, v18.16B
|
||||
ld1 {v18.16b}, [x0]
|
||||
urhadd v30.16b, v30.16b, v18.16b
|
||||
.endif
|
||||
st1 {v30.16B}, [x0], x2
|
||||
st1 {v30.16b}, [x0], x2
|
||||
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels8 rnd=1, avg=0
|
||||
1: ld1 {v0.8B}, [x1], x2
|
||||
ld1 {v1.8B}, [x1], x2
|
||||
ld1 {v2.8B}, [x1], x2
|
||||
ld1 {v3.8B}, [x1], x2
|
||||
1: ld1 {v0.8b}, [x1], x2
|
||||
ld1 {v1.8b}, [x1], x2
|
||||
ld1 {v2.8b}, [x1], x2
|
||||
ld1 {v3.8b}, [x1], x2
|
||||
.if \avg
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
urhadd v0.8B, v0.8B, v4.8B
|
||||
ld1 {v5.8B}, [x0], x2
|
||||
urhadd v1.8B, v1.8B, v5.8B
|
||||
ld1 {v6.8B}, [x0], x2
|
||||
urhadd v2.8B, v2.8B, v6.8B
|
||||
ld1 {v7.8B}, [x0], x2
|
||||
urhadd v3.8B, v3.8B, v7.8B
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
urhadd v0.8b, v0.8b, v4.8b
|
||||
ld1 {v5.8b}, [x0], x2
|
||||
urhadd v1.8b, v1.8b, v5.8b
|
||||
ld1 {v6.8b}, [x0], x2
|
||||
urhadd v2.8b, v2.8b, v6.8b
|
||||
ld1 {v7.8b}, [x0], x2
|
||||
urhadd v3.8b, v3.8b, v7.8b
|
||||
sub x0, x0, x2, lsl #2
|
||||
.endif
|
||||
subs w3, w3, #4
|
||||
st1 {v0.8B}, [x0], x2
|
||||
st1 {v1.8B}, [x0], x2
|
||||
st1 {v2.8B}, [x0], x2
|
||||
st1 {v3.8B}, [x0], x2
|
||||
st1 {v0.8b}, [x0], x2
|
||||
st1 {v1.8b}, [x0], x2
|
||||
st1 {v2.8b}, [x0], x2
|
||||
st1 {v3.8b}, [x0], x2
|
||||
b.ne 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels8_x2 rnd=1, avg=0
|
||||
1: ld1 {v0.8B, v1.8B}, [x1], x2
|
||||
ext v1.8B, v0.8B, v1.8B, #1
|
||||
ld1 {v2.8B, v3.8B}, [x1], x2
|
||||
ext v3.8B, v2.8B, v3.8B, #1
|
||||
1: ld1 {v0.8b, v1.8b}, [x1], x2
|
||||
ext v1.8b, v0.8b, v1.8b, #1
|
||||
ld1 {v2.8b, v3.8b}, [x1], x2
|
||||
ext v3.8b, v2.8b, v3.8b, #1
|
||||
subs w3, w3, #2
|
||||
avg v0.8B, v0.8B, v1.8B
|
||||
avg v2.8B, v2.8B, v3.8B
|
||||
avg v0.8b, v0.8b, v1.8b
|
||||
avg v2.8b, v2.8b, v3.8b
|
||||
.if \avg
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
ld1 {v5.8B}, [x0]
|
||||
urhadd v0.8B, v0.8B, v4.8B
|
||||
urhadd v2.8B, v2.8B, v5.8B
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
ld1 {v5.8b}, [x0]
|
||||
urhadd v0.8b, v0.8b, v4.8b
|
||||
urhadd v2.8b, v2.8b, v5.8b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v0.8B}, [x0], x2
|
||||
st1 {v2.8B}, [x0], x2
|
||||
st1 {v0.8b}, [x0], x2
|
||||
st1 {v2.8b}, [x0], x2
|
||||
b.ne 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels8_y2 rnd=1, avg=0
|
||||
sub w3, w3, #2
|
||||
ld1 {v0.8B}, [x1], x2
|
||||
ld1 {v1.8B}, [x1], x2
|
||||
ld1 {v0.8b}, [x1], x2
|
||||
ld1 {v1.8b}, [x1], x2
|
||||
1: subs w3, w3, #2
|
||||
avg v4.8B, v0.8B, v1.8B
|
||||
ld1 {v0.8B}, [x1], x2
|
||||
avg v5.8B, v0.8B, v1.8B
|
||||
ld1 {v1.8B}, [x1], x2
|
||||
avg v4.8b, v0.8b, v1.8b
|
||||
ld1 {v0.8b}, [x1], x2
|
||||
avg v5.8b, v0.8b, v1.8b
|
||||
ld1 {v1.8b}, [x1], x2
|
||||
.if \avg
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
ld1 {v3.8B}, [x0]
|
||||
urhadd v4.8B, v4.8B, v2.8B
|
||||
urhadd v5.8B, v5.8B, v3.8B
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
ld1 {v3.8b}, [x0]
|
||||
urhadd v4.8b, v4.8b, v2.8b
|
||||
urhadd v5.8b, v5.8b, v3.8b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v4.8B}, [x0], x2
|
||||
st1 {v5.8B}, [x0], x2
|
||||
st1 {v4.8b}, [x0], x2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
b.ne 1b
|
||||
|
||||
avg v4.8B, v0.8B, v1.8B
|
||||
ld1 {v0.8B}, [x1], x2
|
||||
avg v5.8B, v0.8B, v1.8B
|
||||
avg v4.8b, v0.8b, v1.8b
|
||||
ld1 {v0.8b}, [x1], x2
|
||||
avg v5.8b, v0.8b, v1.8b
|
||||
.if \avg
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
ld1 {v3.8B}, [x0]
|
||||
urhadd v4.8B, v4.8B, v2.8B
|
||||
urhadd v5.8B, v5.8B, v3.8B
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
ld1 {v3.8b}, [x0]
|
||||
urhadd v4.8b, v4.8b, v2.8b
|
||||
urhadd v5.8b, v5.8b, v3.8b
|
||||
sub x0, x0, x2
|
||||
.endif
|
||||
st1 {v4.8B}, [x0], x2
|
||||
st1 {v5.8B}, [x0], x2
|
||||
st1 {v4.8b}, [x0], x2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
|
||||
ret
|
||||
.endm
|
||||
|
||||
.macro pixels8_xy2 rnd=1, avg=0
|
||||
sub w3, w3, #2
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
ld1 {v1.16B}, [x1], x2
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
ld1 {v1.16b}, [x1], x2
|
||||
NRND movi v19.8H, #1
|
||||
ext v4.16B, v0.16B, v4.16B, #1
|
||||
ext v6.16B, v1.16B, v6.16B, #1
|
||||
uaddl v16.8H, v0.8B, v4.8B
|
||||
uaddl v17.8H, v1.8B, v6.8B
|
||||
ext v4.16b, v0.16b, v4.16b, #1
|
||||
ext v6.16b, v1.16b, v6.16b, #1
|
||||
uaddl v16.8h, v0.8b, v4.8b
|
||||
uaddl v17.8h, v1.8b, v6.8b
|
||||
1: subs w3, w3, #2
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
add v18.8H, v16.8H, v17.8H
|
||||
ext v4.16B, v0.16B, v4.16B, #1
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
add v18.8h, v16.8h, v17.8h
|
||||
ext v4.16b, v0.16b, v4.16b, #1
|
||||
NRND add v18.8H, v18.8H, v19.8H
|
||||
uaddl v16.8H, v0.8B, v4.8B
|
||||
mshrn v5.8B, v18.8H, #2
|
||||
ld1 {v1.16B}, [x1], x2
|
||||
add v18.8H, v16.8H, v17.8H
|
||||
uaddl v16.8h, v0.8b, v4.8b
|
||||
mshrn v5.8b, v18.8h, #2
|
||||
ld1 {v1.16b}, [x1], x2
|
||||
add v18.8h, v16.8h, v17.8h
|
||||
.if \avg
|
||||
ld1 {v7.8B}, [x0]
|
||||
urhadd v5.8B, v5.8B, v7.8B
|
||||
ld1 {v7.8b}, [x0]
|
||||
urhadd v5.8b, v5.8b, v7.8b
|
||||
.endif
|
||||
NRND add v18.8H, v18.8H, v19.8H
|
||||
st1 {v5.8B}, [x0], x2
|
||||
mshrn v7.8B, v18.8H, #2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
mshrn v7.8b, v18.8h, #2
|
||||
.if \avg
|
||||
ld1 {v5.8B}, [x0]
|
||||
urhadd v7.8B, v7.8B, v5.8B
|
||||
ld1 {v5.8b}, [x0]
|
||||
urhadd v7.8b, v7.8b, v5.8b
|
||||
.endif
|
||||
ext v6.16B, v1.16B, v6.16B, #1
|
||||
uaddl v17.8H, v1.8B, v6.8B
|
||||
st1 {v7.8B}, [x0], x2
|
||||
ext v6.16b, v1.16b, v6.16b, #1
|
||||
uaddl v17.8h, v1.8b, v6.8b
|
||||
st1 {v7.8b}, [x0], x2
|
||||
b.gt 1b
|
||||
|
||||
ld1 {v0.16B}, [x1], x2
|
||||
add v18.8H, v16.8H, v17.8H
|
||||
ext v4.16B, v0.16B, v4.16B, #1
|
||||
ld1 {v0.16b}, [x1], x2
|
||||
add v18.8h, v16.8h, v17.8h
|
||||
ext v4.16b, v0.16b, v4.16b, #1
|
||||
NRND add v18.8H, v18.8H, v19.8H
|
||||
uaddl v16.8H, v0.8B, v4.8B
|
||||
mshrn v5.8B, v18.8H, #2
|
||||
add v18.8H, v16.8H, v17.8H
|
||||
uaddl v16.8h, v0.8b, v4.8b
|
||||
mshrn v5.8b, v18.8h, #2
|
||||
add v18.8h, v16.8h, v17.8h
|
||||
.if \avg
|
||||
ld1 {v7.8B}, [x0]
|
||||
urhadd v5.8B, v5.8B, v7.8B
|
||||
ld1 {v7.8b}, [x0]
|
||||
urhadd v5.8b, v5.8b, v7.8b
|
||||
.endif
|
||||
NRND add v18.8H, v18.8H, v19.8H
|
||||
st1 {v5.8B}, [x0], x2
|
||||
mshrn v7.8B, v18.8H, #2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
mshrn v7.8b, v18.8h, #2
|
||||
.if \avg
|
||||
ld1 {v5.8B}, [x0]
|
||||
urhadd v7.8B, v7.8B, v5.8B
|
||||
ld1 {v5.8b}, [x0]
|
||||
urhadd v7.8b, v7.8b, v5.8b
|
||||
.endif
|
||||
st1 {v7.8B}, [x0], x2
|
||||
st1 {v7.8b}, [x0], x2
|
||||
|
||||
ret
|
||||
.endm
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#ifndef AVCODEC_AARCH64_IDCT_H
|
||||
#define AVCODEC_AARCH64_IDCT_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
void ff_simple_idct_neon(int16_t *data);
|
||||
|
||||
+96
-96
@@ -17,133 +17,133 @@
|
||||
*/
|
||||
|
||||
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
trn1 \r8\().8B, \r0\().8B, \r1\().8B
|
||||
trn2 \r9\().8B, \r0\().8B, \r1\().8B
|
||||
trn1 \r1\().8B, \r2\().8B, \r3\().8B
|
||||
trn2 \r3\().8B, \r2\().8B, \r3\().8B
|
||||
trn1 \r0\().8B, \r4\().8B, \r5\().8B
|
||||
trn2 \r5\().8B, \r4\().8B, \r5\().8B
|
||||
trn1 \r2\().8B, \r6\().8B, \r7\().8B
|
||||
trn2 \r7\().8B, \r6\().8B, \r7\().8B
|
||||
trn1 \r8\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \r9\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \r1\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \r3\().8b, \r2\().8b, \r3\().8b
|
||||
trn1 \r0\().8b, \r4\().8b, \r5\().8b
|
||||
trn2 \r5\().8b, \r4\().8b, \r5\().8b
|
||||
trn1 \r2\().8b, \r6\().8b, \r7\().8b
|
||||
trn2 \r7\().8b, \r6\().8b, \r7\().8b
|
||||
|
||||
trn1 \r4\().4H, \r0\().4H, \r2\().4H
|
||||
trn2 \r2\().4H, \r0\().4H, \r2\().4H
|
||||
trn1 \r6\().4H, \r5\().4H, \r7\().4H
|
||||
trn2 \r7\().4H, \r5\().4H, \r7\().4H
|
||||
trn1 \r5\().4H, \r9\().4H, \r3\().4H
|
||||
trn2 \r9\().4H, \r9\().4H, \r3\().4H
|
||||
trn1 \r3\().4H, \r8\().4H, \r1\().4H
|
||||
trn2 \r8\().4H, \r8\().4H, \r1\().4H
|
||||
trn1 \r4\().4h, \r0\().4h, \r2\().4h
|
||||
trn2 \r2\().4h, \r0\().4h, \r2\().4h
|
||||
trn1 \r6\().4h, \r5\().4h, \r7\().4h
|
||||
trn2 \r7\().4h, \r5\().4h, \r7\().4h
|
||||
trn1 \r5\().4h, \r9\().4h, \r3\().4h
|
||||
trn2 \r9\().4h, \r9\().4h, \r3\().4h
|
||||
trn1 \r3\().4h, \r8\().4h, \r1\().4h
|
||||
trn2 \r8\().4h, \r8\().4h, \r1\().4h
|
||||
|
||||
trn1 \r0\().2S, \r3\().2S, \r4\().2S
|
||||
trn2 \r4\().2S, \r3\().2S, \r4\().2S
|
||||
trn1 \r0\().2s, \r3\().2s, \r4\().2s
|
||||
trn2 \r4\().2s, \r3\().2s, \r4\().2s
|
||||
|
||||
trn1 \r1\().2S, \r5\().2S, \r6\().2S
|
||||
trn2 \r5\().2S, \r5\().2S, \r6\().2S
|
||||
trn1 \r1\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r5\().2s, \r5\().2s, \r6\().2s
|
||||
|
||||
trn2 \r6\().2S, \r8\().2S, \r2\().2S
|
||||
trn1 \r2\().2S, \r8\().2S, \r2\().2S
|
||||
trn2 \r6\().2s, \r8\().2s, \r2\().2s
|
||||
trn1 \r2\().2s, \r8\().2s, \r2\().2s
|
||||
|
||||
trn1 \r3\().2S, \r9\().2S, \r7\().2S
|
||||
trn2 \r7\().2S, \r9\().2S, \r7\().2S
|
||||
trn1 \r3\().2s, \r9\().2s, \r7\().2s
|
||||
trn2 \r7\().2s, \r9\().2s, \r7\().2s
|
||||
.endm
|
||||
|
||||
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
|
||||
trn1 \t0\().16B, \r0\().16B, \r1\().16B
|
||||
trn2 \t1\().16B, \r0\().16B, \r1\().16B
|
||||
trn1 \r1\().16B, \r2\().16B, \r3\().16B
|
||||
trn2 \r3\().16B, \r2\().16B, \r3\().16B
|
||||
trn1 \r0\().16B, \r4\().16B, \r5\().16B
|
||||
trn2 \r5\().16B, \r4\().16B, \r5\().16B
|
||||
trn1 \r2\().16B, \r6\().16B, \r7\().16B
|
||||
trn2 \r7\().16B, \r6\().16B, \r7\().16B
|
||||
trn1 \t0\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t1\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \r1\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \r3\().16b, \r2\().16b, \r3\().16b
|
||||
trn1 \r0\().16b, \r4\().16b, \r5\().16b
|
||||
trn2 \r5\().16b, \r4\().16b, \r5\().16b
|
||||
trn1 \r2\().16b, \r6\().16b, \r7\().16b
|
||||
trn2 \r7\().16b, \r6\().16b, \r7\().16b
|
||||
|
||||
trn1 \r4\().8H, \r0\().8H, \r2\().8H
|
||||
trn2 \r2\().8H, \r0\().8H, \r2\().8H
|
||||
trn1 \r6\().8H, \r5\().8H, \r7\().8H
|
||||
trn2 \r7\().8H, \r5\().8H, \r7\().8H
|
||||
trn1 \r5\().8H, \t1\().8H, \r3\().8H
|
||||
trn2 \t1\().8H, \t1\().8H, \r3\().8H
|
||||
trn1 \r3\().8H, \t0\().8H, \r1\().8H
|
||||
trn2 \t0\().8H, \t0\().8H, \r1\().8H
|
||||
trn1 \r4\().8h, \r0\().8h, \r2\().8h
|
||||
trn2 \r2\().8h, \r0\().8h, \r2\().8h
|
||||
trn1 \r6\().8h, \r5\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r5\().8h, \r7\().8h
|
||||
trn1 \r5\().8h, \t1\().8h, \r3\().8h
|
||||
trn2 \t1\().8h, \t1\().8h, \r3\().8h
|
||||
trn1 \r3\().8h, \t0\().8h, \r1\().8h
|
||||
trn2 \t0\().8h, \t0\().8h, \r1\().8h
|
||||
|
||||
trn1 \r0\().4S, \r3\().4S, \r4\().4S
|
||||
trn2 \r4\().4S, \r3\().4S, \r4\().4S
|
||||
trn1 \r0\().4s, \r3\().4s, \r4\().4s
|
||||
trn2 \r4\().4s, \r3\().4s, \r4\().4s
|
||||
|
||||
trn1 \r1\().4S, \r5\().4S, \r6\().4S
|
||||
trn2 \r5\().4S, \r5\().4S, \r6\().4S
|
||||
trn1 \r1\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r5\().4s, \r5\().4s, \r6\().4s
|
||||
|
||||
trn2 \r6\().4S, \t0\().4S, \r2\().4S
|
||||
trn1 \r2\().4S, \t0\().4S, \r2\().4S
|
||||
trn2 \r6\().4s, \t0\().4s, \r2\().4s
|
||||
trn1 \r2\().4s, \t0\().4s, \r2\().4s
|
||||
|
||||
trn1 \r3\().4S, \t1\().4S, \r7\().4S
|
||||
trn2 \r7\().4S, \t1\().4S, \r7\().4S
|
||||
trn1 \r3\().4s, \t1\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \t1\().4s, \r7\().4s
|
||||
.endm
|
||||
|
||||
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().16B, \r0\().16B, \r1\().16B
|
||||
trn2 \t5\().16B, \r0\().16B, \r1\().16B
|
||||
trn1 \t6\().16B, \r2\().16B, \r3\().16B
|
||||
trn2 \t7\().16B, \r2\().16B, \r3\().16B
|
||||
trn1 \t4\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t5\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \t6\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \t7\().16b, \r2\().16b, \r3\().16b
|
||||
|
||||
trn1 \r0\().8H, \t4\().8H, \t6\().8H
|
||||
trn2 \r2\().8H, \t4\().8H, \t6\().8H
|
||||
trn1 \r1\().8H, \t5\().8H, \t7\().8H
|
||||
trn2 \r3\().8H, \t5\().8H, \t7\().8H
|
||||
trn1 \r0\().8h, \t4\().8h, \t6\().8h
|
||||
trn2 \r2\().8h, \t4\().8h, \t6\().8h
|
||||
trn1 \r1\().8h, \t5\().8h, \t7\().8h
|
||||
trn2 \r3\().8h, \t5\().8h, \t7\().8h
|
||||
.endm
|
||||
|
||||
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().8B, \r0\().8B, \r1\().8B
|
||||
trn2 \t5\().8B, \r0\().8B, \r1\().8B
|
||||
trn1 \t6\().8B, \r2\().8B, \r3\().8B
|
||||
trn2 \t7\().8B, \r2\().8B, \r3\().8B
|
||||
trn1 \t4\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \t5\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \t6\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \t7\().8b, \r2\().8b, \r3\().8b
|
||||
|
||||
trn1 \r0\().4H, \t4\().4H, \t6\().4H
|
||||
trn2 \r2\().4H, \t4\().4H, \t6\().4H
|
||||
trn1 \r1\().4H, \t5\().4H, \t7\().4H
|
||||
trn2 \r3\().4H, \t5\().4H, \t7\().4H
|
||||
trn1 \r0\().4h, \t4\().4h, \t6\().4h
|
||||
trn2 \r2\().4h, \t4\().4h, \t6\().4h
|
||||
trn1 \r1\().4h, \t5\().4h, \t7\().4h
|
||||
trn2 \r3\().4h, \t5\().4h, \t7\().4h
|
||||
.endm
|
||||
|
||||
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
|
||||
trn1 \r4\().4H, \r0\().4H, \r1\().4H
|
||||
trn2 \r5\().4H, \r0\().4H, \r1\().4H
|
||||
trn1 \r6\().4H, \r2\().4H, \r3\().4H
|
||||
trn2 \r7\().4H, \r2\().4H, \r3\().4H
|
||||
trn1 \r0\().2S, \r4\().2S, \r6\().2S
|
||||
trn2 \r2\().2S, \r4\().2S, \r6\().2S
|
||||
trn1 \r1\().2S, \r5\().2S, \r7\().2S
|
||||
trn2 \r3\().2S, \r5\().2S, \r7\().2S
|
||||
trn1 \r4\().4h, \r0\().4h, \r1\().4h
|
||||
trn2 \r5\().4h, \r0\().4h, \r1\().4h
|
||||
trn1 \r6\().4h, \r2\().4h, \r3\().4h
|
||||
trn2 \r7\().4h, \r2\().4h, \r3\().4h
|
||||
trn1 \r0\().2s, \r4\().2s, \r6\().2s
|
||||
trn2 \r2\().2s, \r4\().2s, \r6\().2s
|
||||
trn1 \r1\().2s, \r5\().2s, \r7\().2s
|
||||
trn2 \r3\().2s, \r5\().2s, \r7\().2s
|
||||
.endm
|
||||
|
||||
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
trn1 \r8\().8H, \r0\().8H, \r1\().8H
|
||||
trn2 \r9\().8H, \r0\().8H, \r1\().8H
|
||||
trn1 \r1\().8H, \r2\().8H, \r3\().8H
|
||||
trn2 \r3\().8H, \r2\().8H, \r3\().8H
|
||||
trn1 \r0\().8H, \r4\().8H, \r5\().8H
|
||||
trn2 \r5\().8H, \r4\().8H, \r5\().8H
|
||||
trn1 \r2\().8H, \r6\().8H, \r7\().8H
|
||||
trn2 \r7\().8H, \r6\().8H, \r7\().8H
|
||||
trn1 \r8\().8h, \r0\().8h, \r1\().8h
|
||||
trn2 \r9\().8h, \r0\().8h, \r1\().8h
|
||||
trn1 \r1\().8h, \r2\().8h, \r3\().8h
|
||||
trn2 \r3\().8h, \r2\().8h, \r3\().8h
|
||||
trn1 \r0\().8h, \r4\().8h, \r5\().8h
|
||||
trn2 \r5\().8h, \r4\().8h, \r5\().8h
|
||||
trn1 \r2\().8h, \r6\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r6\().8h, \r7\().8h
|
||||
|
||||
trn1 \r4\().4S, \r0\().4S, \r2\().4S
|
||||
trn2 \r2\().4S, \r0\().4S, \r2\().4S
|
||||
trn1 \r6\().4S, \r5\().4S, \r7\().4S
|
||||
trn2 \r7\().4S, \r5\().4S, \r7\().4S
|
||||
trn1 \r5\().4S, \r9\().4S, \r3\().4S
|
||||
trn2 \r9\().4S, \r9\().4S, \r3\().4S
|
||||
trn1 \r3\().4S, \r8\().4S, \r1\().4S
|
||||
trn2 \r8\().4S, \r8\().4S, \r1\().4S
|
||||
trn1 \r4\().4s, \r0\().4s, \r2\().4s
|
||||
trn2 \r2\().4s, \r0\().4s, \r2\().4s
|
||||
trn1 \r6\().4s, \r5\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \r5\().4s, \r7\().4s
|
||||
trn1 \r5\().4s, \r9\().4s, \r3\().4s
|
||||
trn2 \r9\().4s, \r9\().4s, \r3\().4s
|
||||
trn1 \r3\().4s, \r8\().4s, \r1\().4s
|
||||
trn2 \r8\().4s, \r8\().4s, \r1\().4s
|
||||
|
||||
trn1 \r0\().2D, \r3\().2D, \r4\().2D
|
||||
trn2 \r4\().2D, \r3\().2D, \r4\().2D
|
||||
trn1 \r0\().2d, \r3\().2d, \r4\().2d
|
||||
trn2 \r4\().2d, \r3\().2d, \r4\().2d
|
||||
|
||||
trn1 \r1\().2D, \r5\().2D, \r6\().2D
|
||||
trn2 \r5\().2D, \r5\().2D, \r6\().2D
|
||||
trn1 \r1\().2d, \r5\().2d, \r6\().2d
|
||||
trn2 \r5\().2d, \r5\().2d, \r6\().2d
|
||||
|
||||
trn2 \r6\().2D, \r8\().2D, \r2\().2D
|
||||
trn1 \r2\().2D, \r8\().2D, \r2\().2D
|
||||
trn2 \r6\().2d, \r8\().2d, \r2\().2d
|
||||
trn1 \r2\().2d, \r8\().2d, \r2\().2d
|
||||
|
||||
trn1 \r3\().2D, \r9\().2D, \r7\().2D
|
||||
trn2 \r7\().2D, \r9\().2D, \r7\().2D
|
||||
trn1 \r3\().2d, \r9\().2d, \r7\().2d
|
||||
trn2 \r7\().2d, \r9\().2d, \r7\().2d
|
||||
|
||||
.endm
|
||||
|
||||
@@ -33,81 +33,81 @@ const tab_x2, align=4
|
||||
endconst
|
||||
|
||||
function ff_opus_deemphasis_neon, export=1
|
||||
movrel x4, tab_st
|
||||
ld1 {v4.4s}, [x4]
|
||||
movrel x4, tab_x0
|
||||
ld1 {v5.4s}, [x4]
|
||||
movrel x4, tab_x1
|
||||
ld1 {v6.4s}, [x4]
|
||||
movrel x4, tab_x2
|
||||
ld1 {v7.4s}, [x4]
|
||||
movrel x4, tab_st
|
||||
ld1 {v4.4s}, [x4]
|
||||
movrel x4, tab_x0
|
||||
ld1 {v5.4s}, [x4]
|
||||
movrel x4, tab_x1
|
||||
ld1 {v6.4s}, [x4]
|
||||
movrel x4, tab_x2
|
||||
ld1 {v7.4s}, [x4]
|
||||
|
||||
fmul v0.4s, v4.4s, v0.s[0]
|
||||
fmul v0.4s, v4.4s, v0.s[0]
|
||||
|
||||
1: ld1 {v1.4s, v2.4s}, [x1], #32
|
||||
1: ld1 {v1.4s, v2.4s}, [x1], #32
|
||||
|
||||
fmla v0.4s, v5.4s, v1.s[0]
|
||||
fmul v3.4s, v7.4s, v2.s[2]
|
||||
fmla v0.4s, v5.4s, v1.s[0]
|
||||
fmul v3.4s, v7.4s, v2.s[2]
|
||||
|
||||
fmla v0.4s, v6.4s, v1.s[1]
|
||||
fmla v3.4s, v6.4s, v2.s[1]
|
||||
fmla v0.4s, v6.4s, v1.s[1]
|
||||
fmla v3.4s, v6.4s, v2.s[1]
|
||||
|
||||
fmla v0.4s, v7.4s, v1.s[2]
|
||||
fmla v3.4s, v5.4s, v2.s[0]
|
||||
fmla v0.4s, v7.4s, v1.s[2]
|
||||
fmla v3.4s, v5.4s, v2.s[0]
|
||||
|
||||
fadd v1.4s, v1.4s, v0.4s
|
||||
fadd v2.4s, v2.4s, v3.4s
|
||||
fadd v1.4s, v1.4s, v0.4s
|
||||
fadd v2.4s, v2.4s, v3.4s
|
||||
|
||||
fmla v2.4s, v4.4s, v1.s[3]
|
||||
fmla v2.4s, v4.4s, v1.s[3]
|
||||
|
||||
st1 {v1.4s, v2.4s}, [x0], #32
|
||||
fmul v0.4s, v4.4s, v2.s[3]
|
||||
st1 {v1.4s, v2.4s}, [x0], #32
|
||||
fmul v0.4s, v4.4s, v2.s[3]
|
||||
|
||||
subs w2, w2, #8
|
||||
b.gt 1b
|
||||
subs w2, w2, #8
|
||||
b.gt 1b
|
||||
|
||||
mov s0, v2.s[3]
|
||||
mov s0, v2.s[3]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_opus_postfilter_neon, export=1
|
||||
ld1 {v0.4s}, [x2]
|
||||
dup v1.4s, v0.s[1]
|
||||
dup v2.4s, v0.s[2]
|
||||
dup v0.4s, v0.s[0]
|
||||
ld1 {v0.4s}, [x2]
|
||||
dup v1.4s, v0.s[1]
|
||||
dup v2.4s, v0.s[2]
|
||||
dup v0.4s, v0.s[0]
|
||||
|
||||
add w1, w1, #2
|
||||
sub x1, x0, x1, lsl #2
|
||||
add w1, w1, #2
|
||||
sub x1, x0, x1, lsl #2
|
||||
|
||||
ld1 {v3.4s}, [x1]
|
||||
fmul v3.4s, v3.4s, v2.4s
|
||||
ld1 {v3.4s}, [x1]
|
||||
fmul v3.4s, v3.4s, v2.4s
|
||||
|
||||
1: add x1, x1, #4
|
||||
ld1 {v4.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v5.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v6.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v7.4s}, [x1]
|
||||
1: add x1, x1, #4
|
||||
ld1 {v4.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v5.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v6.4s}, [x1]
|
||||
add x1, x1, #4
|
||||
ld1 {v7.4s}, [x1]
|
||||
|
||||
fmla v3.4s, v7.4s, v2.4s
|
||||
fadd v6.4s, v6.4s, v4.4s
|
||||
fmla v3.4s, v7.4s, v2.4s
|
||||
fadd v6.4s, v6.4s, v4.4s
|
||||
|
||||
ld1 {v4.4s}, [x0]
|
||||
fmla v4.4s, v5.4s, v0.4s
|
||||
ld1 {v4.4s}, [x0]
|
||||
fmla v4.4s, v5.4s, v0.4s
|
||||
|
||||
fmul v6.4s, v6.4s, v1.4s
|
||||
fadd v6.4s, v6.4s, v3.4s
|
||||
fmul v6.4s, v6.4s, v1.4s
|
||||
fadd v6.4s, v6.4s, v3.4s
|
||||
|
||||
fadd v4.4s, v4.4s, v6.4s
|
||||
fmul v3.4s, v7.4s, v2.4s
|
||||
fadd v4.4s, v4.4s, v6.4s
|
||||
fmul v3.4s, v7.4s, v2.4s
|
||||
|
||||
st1 {v4.4s}, [x0], #16
|
||||
st1 {v4.4s}, [x0], #16
|
||||
|
||||
subs w3, w3, #4
|
||||
b.gt 1b
|
||||
subs w3, w3, #4
|
||||
b.gt 1b
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
+147
-147
@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
|
||||
add x3, x0, #192*4
|
||||
add x4, x0, #256*4
|
||||
mov x5, #64
|
||||
1: ld1 {v0.4S}, [x0]
|
||||
ld1 {v1.4S}, [x1], #16
|
||||
fadd v0.4S, v0.4S, v1.4S
|
||||
ld1 {v2.4S}, [x2], #16
|
||||
fadd v0.4S, v0.4S, v2.4S
|
||||
ld1 {v3.4S}, [x3], #16
|
||||
fadd v0.4S, v0.4S, v3.4S
|
||||
ld1 {v4.4S}, [x4], #16
|
||||
fadd v0.4S, v0.4S, v4.4S
|
||||
st1 {v0.4S}, [x0], #16
|
||||
1: ld1 {v0.4s}, [x0]
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
fadd v0.4s, v0.4s, v1.4s
|
||||
ld1 {v2.4s}, [x2], #16
|
||||
fadd v0.4s, v0.4s, v2.4s
|
||||
ld1 {v3.4s}, [x3], #16
|
||||
fadd v0.4s, v0.4s, v3.4s
|
||||
ld1 {v4.4s}, [x4], #16
|
||||
fadd v0.4s, v0.4s, v4.4s
|
||||
st1 {v0.4s}, [x0], #16
|
||||
subs x5, x5, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_sbr_sum_square_neon, export=1
|
||||
movi v0.4S, #0
|
||||
1: ld1 {v1.4S}, [x0], #16
|
||||
fmla v0.4S, v1.4S, v1.4S
|
||||
movi v0.4s, #0
|
||||
1: ld1 {v1.4s}, [x0], #16
|
||||
fmla v0.4s, v1.4s, v1.4s
|
||||
subs w1, w1, #2
|
||||
b.gt 1b
|
||||
faddp v0.4S, v0.4S, v0.4S
|
||||
faddp v0.4S, v0.4S, v0.4S
|
||||
faddp v0.4s, v0.4s, v0.4s
|
||||
faddp v0.4s, v0.4s, v0.4s
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_sbr_neg_odd_64_neon, export=1
|
||||
mov x1, x0
|
||||
movi v5.4S, #1<<7, lsl #24
|
||||
ld2 {v0.4S, v1.4S}, [x0], #32
|
||||
eor v1.16B, v1.16B, v5.16B
|
||||
ld2 {v2.4S, v3.4S}, [x0], #32
|
||||
movi v5.4s, #1<<7, lsl #24
|
||||
ld2 {v0.4s, v1.4s}, [x0], #32
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
ld2 {v2.4s, v3.4s}, [x0], #32
|
||||
.rept 3
|
||||
st2 {v0.4S, v1.4S}, [x1], #32
|
||||
eor v3.16B, v3.16B, v5.16B
|
||||
ld2 {v0.4S, v1.4S}, [x0], #32
|
||||
st2 {v2.4S, v3.4S}, [x1], #32
|
||||
eor v1.16B, v1.16B, v5.16B
|
||||
ld2 {v2.4S, v3.4S}, [x0], #32
|
||||
st2 {v0.4s, v1.4s}, [x1], #32
|
||||
eor v3.16b, v3.16b, v5.16b
|
||||
ld2 {v0.4s, v1.4s}, [x0], #32
|
||||
st2 {v2.4s, v3.4s}, [x1], #32
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
ld2 {v2.4s, v3.4s}, [x0], #32
|
||||
.endr
|
||||
eor v3.16B, v3.16B, v5.16B
|
||||
st2 {v0.4S, v1.4S}, [x1], #32
|
||||
st2 {v2.4S, v3.4S}, [x1], #32
|
||||
eor v3.16b, v3.16b, v5.16b
|
||||
st2 {v0.4s, v1.4s}, [x1], #32
|
||||
st2 {v2.4s, v3.4s}, [x1], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
|
||||
add x2, x0, #64*4
|
||||
mov x3, #-16
|
||||
mov x4, #-4
|
||||
movi v6.4S, #1<<7, lsl #24
|
||||
ld1 {v0.2S}, [x0], #8
|
||||
st1 {v0.2S}, [x2], #8
|
||||
movi v6.4s, #1<<7, lsl #24
|
||||
ld1 {v0.2s}, [x0], #8
|
||||
st1 {v0.2s}, [x2], #8
|
||||
.rept 7
|
||||
ld1 {v1.4S}, [x1], x3
|
||||
ld1 {v2.4S}, [x0], #16
|
||||
eor v1.16B, v1.16B, v6.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
ext v1.16B, v1.16B, v1.16B, #8
|
||||
st2 {v1.4S, v2.4S}, [x2], #32
|
||||
ld1 {v1.4s}, [x1], x3
|
||||
ld1 {v2.4s}, [x0], #16
|
||||
eor v1.16b, v1.16b, v6.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
ext v1.16b, v1.16b, v1.16b, #8
|
||||
st2 {v1.4s, v2.4s}, [x2], #32
|
||||
.endr
|
||||
add x1, x1, #8
|
||||
ld1 {v1.2S}, [x1], x4
|
||||
ld1 {v2.2S}, [x0], #8
|
||||
ld1 {v1.S}[3], [x1]
|
||||
ld1 {v2.S}[2], [x0]
|
||||
eor v1.16B, v1.16B, v6.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
st2 {v1.2S, v2.2S}, [x2], #16
|
||||
st2 {v1.S, v2.S}[2], [x2]
|
||||
ld1 {v1.2s}, [x1], x4
|
||||
ld1 {v2.2s}, [x0], #8
|
||||
ld1 {v1.s}[3], [x1]
|
||||
ld1 {v2.s}[2], [x0]
|
||||
eor v1.16b, v1.16b, v6.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
st2 {v1.2s, v2.2s}, [x2], #16
|
||||
st2 {v1.s, v2.s}[2], [x2]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
|
||||
add x2, x1, #60*4
|
||||
mov x3, #-16
|
||||
mov x4, #32
|
||||
movi v6.4S, #1<<7, lsl #24
|
||||
1: ld1 {v0.4S}, [x2], x3
|
||||
ld1 {v1.4S}, [x1], #16
|
||||
eor v0.16B, v0.16B, v6.16B
|
||||
rev64 v0.4S, v0.4S
|
||||
ext v0.16B, v0.16B, v0.16B, #8
|
||||
st2 {v0.4S, v1.4S}, [x0], #32
|
||||
movi v6.4s, #1<<7, lsl #24
|
||||
1: ld1 {v0.4s}, [x2], x3
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
eor v0.16b, v0.16b, v6.16b
|
||||
rev64 v0.4s, v0.4s
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
st2 {v0.4s, v1.4s}, [x0], #32
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
|
||||
add x2, x0, #60*4
|
||||
mov x3, #-32
|
||||
mov x4, #32
|
||||
movi v2.4S, #1<<7, lsl #24
|
||||
1: ld2 {v0.4S, v1.4S}, [x1], x3
|
||||
eor v0.16B, v0.16B, v2.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
ext v1.16B, v1.16B, v1.16B, #8
|
||||
st1 {v0.4S}, [x2]
|
||||
st1 {v1.4S}, [x0], #16
|
||||
movi v2.4s, #1<<7, lsl #24
|
||||
1: ld2 {v0.4s, v1.4s}, [x1], x3
|
||||
eor v0.16b, v0.16b, v2.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
ext v1.16b, v1.16b, v1.16b, #8
|
||||
st1 {v0.4s}, [x2]
|
||||
st1 {v1.4s}, [x0], #16
|
||||
sub x2, x2, #16
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
|
||||
add x3, x0, #124*4
|
||||
mov x4, #64
|
||||
mov x5, #-16
|
||||
1: ld1 {v0.4S}, [x1], #16
|
||||
ld1 {v1.4S}, [x2], x5
|
||||
rev64 v2.4S, v0.4S
|
||||
ext v2.16B, v2.16B, v2.16B, #8
|
||||
rev64 v3.4S, v1.4S
|
||||
ext v3.16B, v3.16B, v3.16B, #8
|
||||
fadd v1.4S, v1.4S, v2.4S
|
||||
fsub v0.4S, v0.4S, v3.4S
|
||||
st1 {v0.4S}, [x0], #16
|
||||
st1 {v1.4S}, [x3], x5
|
||||
1: ld1 {v0.4s}, [x1], #16
|
||||
ld1 {v1.4s}, [x2], x5
|
||||
rev64 v2.4s, v0.4s
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
rev64 v3.4s, v1.4s
|
||||
ext v3.16b, v3.16b, v3.16b, #8
|
||||
fadd v1.4s, v1.4s, v2.4s
|
||||
fsub v0.4s, v0.4s, v3.4s
|
||||
st1 {v0.4s}, [x0], #16
|
||||
st1 {v1.4s}, [x3], x5
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
|
||||
sxtw x4, w4
|
||||
sxtw x5, w5
|
||||
movrel x6, factors
|
||||
ld1 {v7.4S}, [x6]
|
||||
dup v1.4S, v0.S[0]
|
||||
mov v2.8B, v1.8B
|
||||
mov v2.S[2], v7.S[0]
|
||||
mov v2.S[3], v7.S[0]
|
||||
fmul v1.4S, v1.4S, v2.4S
|
||||
ld1 {v0.D}[0], [x3]
|
||||
ld1 {v0.D}[1], [x2]
|
||||
fmul v0.4S, v0.4S, v1.4S
|
||||
fmul v1.4S, v0.4S, v7.4S
|
||||
rev64 v0.4S, v0.4S
|
||||
ld1 {v7.4s}, [x6]
|
||||
dup v1.4s, v0.s[0]
|
||||
mov v2.8b, v1.8b
|
||||
mov v2.s[2], v7.s[0]
|
||||
mov v2.s[3], v7.s[0]
|
||||
fmul v1.4s, v1.4s, v2.4s
|
||||
ld1 {v0.d}[0], [x3]
|
||||
ld1 {v0.d}[1], [x2]
|
||||
fmul v0.4s, v0.4s, v1.4s
|
||||
fmul v1.4s, v0.4s, v7.4s
|
||||
rev64 v0.4s, v0.4s
|
||||
sub x7, x5, x4
|
||||
add x0, x0, x4, lsl #3
|
||||
add x1, x1, x4, lsl #3
|
||||
sub x1, x1, #16
|
||||
1: ld1 {v2.4S}, [x1], #16
|
||||
ld1 {v3.2S}, [x1]
|
||||
fmul v4.4S, v2.4S, v1.4S
|
||||
fmul v5.4S, v2.4S, v0.4S
|
||||
faddp v4.4S, v4.4S, v4.4S
|
||||
faddp v5.4S, v5.4S, v5.4S
|
||||
faddp v4.4S, v4.4S, v4.4S
|
||||
faddp v5.4S, v5.4S, v5.4S
|
||||
mov v4.S[1], v5.S[0]
|
||||
fadd v4.2S, v4.2S, v3.2S
|
||||
st1 {v4.2S}, [x0], #8
|
||||
1: ld1 {v2.4s}, [x1], #16
|
||||
ld1 {v3.2s}, [x1]
|
||||
fmul v4.4s, v2.4s, v1.4s
|
||||
fmul v5.4s, v2.4s, v0.4s
|
||||
faddp v4.4s, v4.4s, v4.4s
|
||||
faddp v5.4s, v5.4s, v5.4s
|
||||
faddp v4.4s, v4.4s, v4.4s
|
||||
faddp v5.4s, v5.4s, v5.4s
|
||||
mov v4.s[1], v5.s[0]
|
||||
fadd v4.2s, v4.2s, v3.2s
|
||||
st1 {v4.2s}, [x0], #8
|
||||
sub x1, x1, #8
|
||||
subs x7, x7, #1
|
||||
b.gt 1b
|
||||
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
|
||||
sxtw x4, w4
|
||||
mov x5, #40*2*4
|
||||
add x1, x1, x4, lsl #3
|
||||
1: ld1 {v0.2S}, [x1], x5
|
||||
ld1 {v1.S}[0], [x2], #4
|
||||
fmul v2.4S, v0.4S, v1.S[0]
|
||||
st1 {v2.2S}, [x0], #8
|
||||
1: ld1 {v0.2s}, [x1], x5
|
||||
ld1 {v1.s}[0], [x2], #4
|
||||
fmul v2.4s, v0.4s, v1.s[0]
|
||||
st1 {v2.2s}, [x0], #8
|
||||
subs x3, x3, #1
|
||||
b.gt 1b
|
||||
ret
|
||||
@@ -227,46 +227,46 @@ endfunc
|
||||
function ff_sbr_autocorrelate_neon, export=1
|
||||
mov x2, #38
|
||||
movrel x3, factors
|
||||
ld1 {v0.4S}, [x3]
|
||||
movi v1.4S, #0
|
||||
movi v2.4S, #0
|
||||
movi v3.4S, #0
|
||||
ld1 {v4.2S}, [x0], #8
|
||||
ld1 {v5.2S}, [x0], #8
|
||||
fmul v16.2S, v4.2S, v4.2S
|
||||
fmul v17.2S, v5.2S, v4.S[0]
|
||||
fmul v18.2S, v5.2S, v4.S[1]
|
||||
1: ld1 {v5.D}[1], [x0], #8
|
||||
fmla v1.2S, v4.2S, v4.2S
|
||||
fmla v2.4S, v5.4S, v4.S[0]
|
||||
fmla v3.4S, v5.4S, v4.S[1]
|
||||
mov v4.D[0], v5.D[0]
|
||||
mov v5.D[0], v5.D[1]
|
||||
ld1 {v0.4s}, [x3]
|
||||
movi v1.4s, #0
|
||||
movi v2.4s, #0
|
||||
movi v3.4s, #0
|
||||
ld1 {v4.2s}, [x0], #8
|
||||
ld1 {v5.2s}, [x0], #8
|
||||
fmul v16.2s, v4.2s, v4.2s
|
||||
fmul v17.2s, v5.2s, v4.s[0]
|
||||
fmul v18.2s, v5.2s, v4.s[1]
|
||||
1: ld1 {v5.d}[1], [x0], #8
|
||||
fmla v1.2s, v4.2s, v4.2s
|
||||
fmla v2.4s, v5.4s, v4.s[0]
|
||||
fmla v3.4s, v5.4s, v4.s[1]
|
||||
mov v4.d[0], v5.d[0]
|
||||
mov v5.d[0], v5.d[1]
|
||||
subs x2, x2, #1
|
||||
b.gt 1b
|
||||
fmul v19.2S, v4.2S, v4.2S
|
||||
fmul v20.2S, v5.2S, v4.S[0]
|
||||
fmul v21.2S, v5.2S, v4.S[1]
|
||||
fadd v22.4S, v2.4S, v20.4S
|
||||
fsub v22.4S, v22.4S, v17.4S
|
||||
fadd v23.4S, v3.4S, v21.4S
|
||||
fsub v23.4S, v23.4S, v18.4S
|
||||
rev64 v23.4S, v23.4S
|
||||
fmul v23.4S, v23.4S, v0.4S
|
||||
fadd v22.4S, v22.4S, v23.4S
|
||||
st1 {v22.4S}, [x1], #16
|
||||
fadd v23.2S, v1.2S, v19.2S
|
||||
fsub v23.2S, v23.2S, v16.2S
|
||||
faddp v23.2S, v23.2S, v23.2S
|
||||
st1 {v23.S}[0], [x1]
|
||||
fmul v19.2s, v4.2s, v4.2s
|
||||
fmul v20.2s, v5.2s, v4.s[0]
|
||||
fmul v21.2s, v5.2s, v4.s[1]
|
||||
fadd v22.4s, v2.4s, v20.4s
|
||||
fsub v22.4s, v22.4s, v17.4s
|
||||
fadd v23.4s, v3.4s, v21.4s
|
||||
fsub v23.4s, v23.4s, v18.4s
|
||||
rev64 v23.4s, v23.4s
|
||||
fmul v23.4s, v23.4s, v0.4s
|
||||
fadd v22.4s, v22.4s, v23.4s
|
||||
st1 {v22.4s}, [x1], #16
|
||||
fadd v23.2s, v1.2s, v19.2s
|
||||
fsub v23.2s, v23.2s, v16.2s
|
||||
faddp v23.2s, v23.2s, v23.2s
|
||||
st1 {v23.s}[0], [x1]
|
||||
add x1, x1, #8
|
||||
rev64 v3.2S, v3.2S
|
||||
fmul v3.2S, v3.2S, v0.2S
|
||||
fadd v2.2S, v2.2S, v3.2S
|
||||
st1 {v2.2S}, [x1]
|
||||
rev64 v3.2s, v3.2s
|
||||
fmul v3.2s, v3.2s, v0.2s
|
||||
fadd v2.2s, v2.2s, v3.2s
|
||||
st1 {v2.2s}, [x1]
|
||||
add x1, x1, #16
|
||||
faddp v1.2S, v1.2S, v1.2S
|
||||
st1 {v1.S}[0], [x1]
|
||||
faddp v1.2s, v1.2s, v1.2s
|
||||
st1 {v1.s}[0], [x1]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -278,25 +278,25 @@ endfunc
|
||||
1: and x3, x3, #0x1ff
|
||||
add x8, x7, x3, lsl #3
|
||||
add x3, x3, #2
|
||||
ld1 {v2.4S}, [x0]
|
||||
ld1 {v3.2S}, [x1], #8
|
||||
ld1 {v4.2S}, [x2], #8
|
||||
ld1 {v5.4S}, [x8]
|
||||
mov v6.16B, v2.16B
|
||||
zip1 v3.4S, v3.4S, v3.4S
|
||||
zip1 v4.4S, v4.4S, v4.4S
|
||||
fmla v6.4S, v1.4S, v3.4S
|
||||
fmla v2.4S, v5.4S, v4.4S
|
||||
fcmeq v7.4S, v3.4S, #0
|
||||
bif v2.16B, v6.16B, v7.16B
|
||||
st1 {v2.4S}, [x0], #16
|
||||
ld1 {v2.4s}, [x0]
|
||||
ld1 {v3.2s}, [x1], #8
|
||||
ld1 {v4.2s}, [x2], #8
|
||||
ld1 {v5.4s}, [x8]
|
||||
mov v6.16b, v2.16b
|
||||
zip1 v3.4s, v3.4s, v3.4s
|
||||
zip1 v4.4s, v4.4s, v4.4s
|
||||
fmla v6.4s, v1.4s, v3.4s
|
||||
fmla v2.4s, v5.4s, v4.4s
|
||||
fcmeq v7.4s, v3.4s, #0
|
||||
bif v2.16b, v6.16b, v7.16b
|
||||
st1 {v2.4s}, [x0], #16
|
||||
subs x5, x5, #2
|
||||
b.gt 1b
|
||||
.endm
|
||||
|
||||
function ff_sbr_hf_apply_noise_0_neon, export=1
|
||||
movrel x9, phi_noise_0
|
||||
ld1 {v1.4S}, [x9]
|
||||
ld1 {v1.4s}, [x9]
|
||||
apply_noise_common
|
||||
ret
|
||||
endfunc
|
||||
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
|
||||
movrel x9, phi_noise_1
|
||||
and x4, x4, #1
|
||||
add x9, x9, x4, lsl #4
|
||||
ld1 {v1.4S}, [x9]
|
||||
ld1 {v1.4s}, [x9]
|
||||
apply_noise_common
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_sbr_hf_apply_noise_2_neon, export=1
|
||||
movrel x9, phi_noise_2
|
||||
ld1 {v1.4S}, [x9]
|
||||
ld1 {v1.4s}, [x9]
|
||||
apply_noise_common
|
||||
ret
|
||||
endfunc
|
||||
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
|
||||
movrel x9, phi_noise_3
|
||||
and x4, x4, #1
|
||||
add x9, x9, x4, lsl #4
|
||||
ld1 {v1.4S}, [x9]
|
||||
ld1 {v1.4s}, [x9]
|
||||
apply_noise_common
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -54,7 +54,7 @@ endconst
|
||||
prfm pldl1keep, [\data]
|
||||
mov x10, x30
|
||||
movrel x3, idct_coeff_neon
|
||||
ld1 {v0.2D}, [x3]
|
||||
ld1 {v0.2d}, [x3]
|
||||
.endm
|
||||
|
||||
.macro idct_end
|
||||
@@ -74,146 +74,146 @@ endconst
|
||||
.endm
|
||||
|
||||
.macro idct_col4_top y1, y2, y3, y4, i, l
|
||||
smull\i v7.4S, \y3\l, z2
|
||||
smull\i v16.4S, \y3\l, z6
|
||||
smull\i v17.4S, \y2\l, z1
|
||||
add v19.4S, v23.4S, v7.4S
|
||||
smull\i v18.4S, \y2\l, z3
|
||||
add v20.4S, v23.4S, v16.4S
|
||||
smull\i v5.4S, \y2\l, z5
|
||||
sub v21.4S, v23.4S, v16.4S
|
||||
smull\i v6.4S, \y2\l, z7
|
||||
sub v22.4S, v23.4S, v7.4S
|
||||
smull\i v7.4s, \y3\l, z2
|
||||
smull\i v16.4s, \y3\l, z6
|
||||
smull\i v17.4s, \y2\l, z1
|
||||
add v19.4s, v23.4s, v7.4s
|
||||
smull\i v18.4s, \y2\l, z3
|
||||
add v20.4s, v23.4s, v16.4s
|
||||
smull\i v5.4s, \y2\l, z5
|
||||
sub v21.4s, v23.4s, v16.4s
|
||||
smull\i v6.4s, \y2\l, z7
|
||||
sub v22.4s, v23.4s, v7.4s
|
||||
|
||||
smlal\i v17.4S, \y4\l, z3
|
||||
smlsl\i v18.4S, \y4\l, z7
|
||||
smlsl\i v5.4S, \y4\l, z1
|
||||
smlsl\i v6.4S, \y4\l, z5
|
||||
smlal\i v17.4s, \y4\l, z3
|
||||
smlsl\i v18.4s, \y4\l, z7
|
||||
smlsl\i v5.4s, \y4\l, z1
|
||||
smlsl\i v6.4s, \y4\l, z5
|
||||
.endm
|
||||
|
||||
.macro idct_row4_neon y1, y2, y3, y4, pass
|
||||
ld1 {\y1\().2D,\y2\().2D}, [x2], #32
|
||||
movi v23.4S, #1<<2, lsl #8
|
||||
orr v5.16B, \y1\().16B, \y2\().16B
|
||||
ld1 {\y3\().2D,\y4\().2D}, [x2], #32
|
||||
orr v6.16B, \y3\().16B, \y4\().16B
|
||||
orr v5.16B, v5.16B, v6.16B
|
||||
mov x3, v5.D[1]
|
||||
smlal v23.4S, \y1\().4H, z4
|
||||
ld1 {\y1\().2d,\y2\().2d}, [x2], #32
|
||||
movi v23.4s, #1<<2, lsl #8
|
||||
orr v5.16b, \y1\().16b, \y2\().16b
|
||||
ld1 {\y3\().2d,\y4\().2d}, [x2], #32
|
||||
orr v6.16b, \y3\().16b, \y4\().16b
|
||||
orr v5.16b, v5.16b, v6.16b
|
||||
mov x3, v5.d[1]
|
||||
smlal v23.4s, \y1\().4h, z4
|
||||
|
||||
idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
|
||||
idct_col4_top \y1, \y2, \y3, \y4, 1, .4h
|
||||
|
||||
cmp x3, #0
|
||||
b.eq \pass\()f
|
||||
|
||||
smull2 v7.4S, \y1\().8H, z4
|
||||
smlal2 v17.4S, \y2\().8H, z5
|
||||
smlsl2 v18.4S, \y2\().8H, z1
|
||||
smull2 v16.4S, \y3\().8H, z2
|
||||
smlal2 v5.4S, \y2\().8H, z7
|
||||
add v19.4S, v19.4S, v7.4S
|
||||
sub v20.4S, v20.4S, v7.4S
|
||||
sub v21.4S, v21.4S, v7.4S
|
||||
add v22.4S, v22.4S, v7.4S
|
||||
smlal2 v6.4S, \y2\().8H, z3
|
||||
smull2 v7.4S, \y3\().8H, z6
|
||||
smlal2 v17.4S, \y4\().8H, z7
|
||||
smlsl2 v18.4S, \y4\().8H, z5
|
||||
smlal2 v5.4S, \y4\().8H, z3
|
||||
smlsl2 v6.4S, \y4\().8H, z1
|
||||
add v19.4S, v19.4S, v7.4S
|
||||
sub v20.4S, v20.4S, v16.4S
|
||||
add v21.4S, v21.4S, v16.4S
|
||||
sub v22.4S, v22.4S, v7.4S
|
||||
smull2 v7.4s, \y1\().8h, z4
|
||||
smlal2 v17.4s, \y2\().8h, z5
|
||||
smlsl2 v18.4s, \y2\().8h, z1
|
||||
smull2 v16.4s, \y3\().8h, z2
|
||||
smlal2 v5.4s, \y2\().8h, z7
|
||||
add v19.4s, v19.4s, v7.4s
|
||||
sub v20.4s, v20.4s, v7.4s
|
||||
sub v21.4s, v21.4s, v7.4s
|
||||
add v22.4s, v22.4s, v7.4s
|
||||
smlal2 v6.4s, \y2\().8h, z3
|
||||
smull2 v7.4s, \y3\().8h, z6
|
||||
smlal2 v17.4s, \y4\().8h, z7
|
||||
smlsl2 v18.4s, \y4\().8h, z5
|
||||
smlal2 v5.4s, \y4\().8h, z3
|
||||
smlsl2 v6.4s, \y4\().8h, z1
|
||||
add v19.4s, v19.4s, v7.4s
|
||||
sub v20.4s, v20.4s, v16.4s
|
||||
add v21.4s, v21.4s, v16.4s
|
||||
sub v22.4s, v22.4s, v7.4s
|
||||
|
||||
\pass: add \y3\().4S, v19.4S, v17.4S
|
||||
add \y4\().4S, v20.4S, v18.4S
|
||||
shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
|
||||
shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
|
||||
add v7.4S, v21.4S, v5.4S
|
||||
add v16.4S, v22.4S, v6.4S
|
||||
shrn \y3\().4H, v7.4S, #ROW_SHIFT
|
||||
shrn \y4\().4H, v16.4S, #ROW_SHIFT
|
||||
sub v22.4S, v22.4S, v6.4S
|
||||
sub v19.4S, v19.4S, v17.4S
|
||||
sub v21.4S, v21.4S, v5.4S
|
||||
shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
|
||||
sub v20.4S, v20.4S, v18.4S
|
||||
shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
|
||||
shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
|
||||
shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
|
||||
add \y4\().4s, v20.4s, v18.4s
|
||||
shrn \y1\().4h, \y3\().4s, #ROW_SHIFT
|
||||
shrn \y2\().4h, \y4\().4s, #ROW_SHIFT
|
||||
add v7.4s, v21.4s, v5.4s
|
||||
add v16.4s, v22.4s, v6.4s
|
||||
shrn \y3\().4h, v7.4s, #ROW_SHIFT
|
||||
shrn \y4\().4h, v16.4s, #ROW_SHIFT
|
||||
sub v22.4s, v22.4s, v6.4s
|
||||
sub v19.4s, v19.4s, v17.4s
|
||||
sub v21.4s, v21.4s, v5.4s
|
||||
shrn2 \y1\().8h, v22.4s, #ROW_SHIFT
|
||||
sub v20.4s, v20.4s, v18.4s
|
||||
shrn2 \y2\().8h, v21.4s, #ROW_SHIFT
|
||||
shrn2 \y3\().8h, v20.4s, #ROW_SHIFT
|
||||
shrn2 \y4\().8h, v19.4s, #ROW_SHIFT
|
||||
|
||||
trn1 v16.8H, \y1\().8H, \y2\().8H
|
||||
trn2 v17.8H, \y1\().8H, \y2\().8H
|
||||
trn1 v18.8H, \y3\().8H, \y4\().8H
|
||||
trn2 v19.8H, \y3\().8H, \y4\().8H
|
||||
trn1 \y1\().4S, v16.4S, v18.4S
|
||||
trn1 \y2\().4S, v17.4S, v19.4S
|
||||
trn2 \y3\().4S, v16.4S, v18.4S
|
||||
trn2 \y4\().4S, v17.4S, v19.4S
|
||||
trn1 v16.8h, \y1\().8h, \y2\().8h
|
||||
trn2 v17.8h, \y1\().8h, \y2\().8h
|
||||
trn1 v18.8h, \y3\().8h, \y4\().8h
|
||||
trn2 v19.8h, \y3\().8h, \y4\().8h
|
||||
trn1 \y1\().4s, v16.4s, v18.4s
|
||||
trn1 \y2\().4s, v17.4s, v19.4s
|
||||
trn2 \y3\().4s, v16.4s, v18.4s
|
||||
trn2 \y4\().4s, v17.4s, v19.4s
|
||||
.endm
|
||||
|
||||
.macro declare_idct_col4_neon i, l
|
||||
function idct_col4_neon\i
|
||||
dup v23.4H, z4c
|
||||
dup v23.4h, z4c
|
||||
.if \i == 1
|
||||
add v23.4H, v23.4H, v24.4H
|
||||
add v23.4h, v23.4h, v24.4h
|
||||
.else
|
||||
mov v5.D[0], v24.D[1]
|
||||
add v23.4H, v23.4H, v5.4H
|
||||
mov v5.d[0], v24.d[1]
|
||||
add v23.4h, v23.4h, v5.4h
|
||||
.endif
|
||||
smull v23.4S, v23.4H, z4
|
||||
smull v23.4s, v23.4h, z4
|
||||
|
||||
idct_col4_top v24, v25, v26, v27, \i, \l
|
||||
|
||||
mov x4, v28.D[\i - 1]
|
||||
mov x5, v29.D[\i - 1]
|
||||
mov x4, v28.d[\i - 1]
|
||||
mov x5, v29.d[\i - 1]
|
||||
cmp x4, #0
|
||||
b.eq 1f
|
||||
|
||||
smull\i v7.4S, v28\l, z4
|
||||
add v19.4S, v19.4S, v7.4S
|
||||
sub v20.4S, v20.4S, v7.4S
|
||||
sub v21.4S, v21.4S, v7.4S
|
||||
add v22.4S, v22.4S, v7.4S
|
||||
smull\i v7.4s, v28\l, z4
|
||||
add v19.4s, v19.4s, v7.4s
|
||||
sub v20.4s, v20.4s, v7.4s
|
||||
sub v21.4s, v21.4s, v7.4s
|
||||
add v22.4s, v22.4s, v7.4s
|
||||
|
||||
1: mov x4, v30.D[\i - 1]
|
||||
1: mov x4, v30.d[\i - 1]
|
||||
cmp x5, #0
|
||||
b.eq 2f
|
||||
|
||||
smlal\i v17.4S, v29\l, z5
|
||||
smlsl\i v18.4S, v29\l, z1
|
||||
smlal\i v5.4S, v29\l, z7
|
||||
smlal\i v6.4S, v29\l, z3
|
||||
smlal\i v17.4s, v29\l, z5
|
||||
smlsl\i v18.4s, v29\l, z1
|
||||
smlal\i v5.4s, v29\l, z7
|
||||
smlal\i v6.4s, v29\l, z3
|
||||
|
||||
2: mov x5, v31.D[\i - 1]
|
||||
2: mov x5, v31.d[\i - 1]
|
||||
cmp x4, #0
|
||||
b.eq 3f
|
||||
|
||||
smull\i v7.4S, v30\l, z6
|
||||
smull\i v16.4S, v30\l, z2
|
||||
add v19.4S, v19.4S, v7.4S
|
||||
sub v22.4S, v22.4S, v7.4S
|
||||
sub v20.4S, v20.4S, v16.4S
|
||||
add v21.4S, v21.4S, v16.4S
|
||||
smull\i v7.4s, v30\l, z6
|
||||
smull\i v16.4s, v30\l, z2
|
||||
add v19.4s, v19.4s, v7.4s
|
||||
sub v22.4s, v22.4s, v7.4s
|
||||
sub v20.4s, v20.4s, v16.4s
|
||||
add v21.4s, v21.4s, v16.4s
|
||||
|
||||
3: cmp x5, #0
|
||||
b.eq 4f
|
||||
|
||||
smlal\i v17.4S, v31\l, z7
|
||||
smlsl\i v18.4S, v31\l, z5
|
||||
smlal\i v5.4S, v31\l, z3
|
||||
smlsl\i v6.4S, v31\l, z1
|
||||
smlal\i v17.4s, v31\l, z7
|
||||
smlsl\i v18.4s, v31\l, z5
|
||||
smlal\i v5.4s, v31\l, z3
|
||||
smlsl\i v6.4s, v31\l, z1
|
||||
|
||||
4: addhn v7.4H, v19.4S, v17.4S
|
||||
addhn2 v7.8H, v20.4S, v18.4S
|
||||
subhn v18.4H, v20.4S, v18.4S
|
||||
subhn2 v18.8H, v19.4S, v17.4S
|
||||
4: addhn v7.4h, v19.4s, v17.4s
|
||||
addhn2 v7.8h, v20.4s, v18.4s
|
||||
subhn v18.4h, v20.4s, v18.4s
|
||||
subhn2 v18.8h, v19.4s, v17.4s
|
||||
|
||||
addhn v16.4H, v21.4S, v5.4S
|
||||
addhn2 v16.8H, v22.4S, v6.4S
|
||||
subhn v17.4H, v22.4S, v6.4S
|
||||
subhn2 v17.8H, v21.4S, v5.4S
|
||||
addhn v16.4h, v21.4s, v5.4s
|
||||
addhn2 v16.8h, v22.4s, v6.4s
|
||||
subhn v17.4h, v22.4s, v6.4s
|
||||
subhn2 v17.8h, v21.4s, v5.4s
|
||||
|
||||
ret
|
||||
endfunc
|
||||
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
bl idct_col4_neon1
|
||||
|
||||
sqshrun v1.8B, v7.8H, #COL_SHIFT-16
|
||||
sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
|
||||
sqshrun v3.8B, v17.8H, #COL_SHIFT-16
|
||||
sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
|
||||
sqshrun v1.8b, v7.8h, #COL_SHIFT-16
|
||||
sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16
|
||||
sqshrun v3.8b, v17.8h, #COL_SHIFT-16
|
||||
sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sqshrun v2.8B, v7.8H, #COL_SHIFT-16
|
||||
sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
|
||||
sqshrun v4.8B, v17.8H, #COL_SHIFT-16
|
||||
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
|
||||
sqshrun v2.8b, v7.8h, #COL_SHIFT-16
|
||||
sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16
|
||||
sqshrun v4.8b, v17.8h, #COL_SHIFT-16
|
||||
sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16
|
||||
|
||||
zip1 v16.4S, v1.4S, v2.4S
|
||||
zip2 v17.4S, v1.4S, v2.4S
|
||||
zip1 v16.4s, v1.4s, v2.4s
|
||||
zip2 v17.4s, v1.4s, v2.4s
|
||||
|
||||
st1 {v16.D}[0], [x0], x1
|
||||
st1 {v16.D}[1], [x0], x1
|
||||
st1 {v16.d}[0], [x0], x1
|
||||
st1 {v16.d}[1], [x0], x1
|
||||
|
||||
zip1 v18.4S, v3.4S, v4.4S
|
||||
zip2 v19.4S, v3.4S, v4.4S
|
||||
zip1 v18.4s, v3.4s, v4.4s
|
||||
zip2 v19.4s, v3.4s, v4.4s
|
||||
|
||||
st1 {v17.D}[0], [x0], x1
|
||||
st1 {v17.D}[1], [x0], x1
|
||||
st1 {v18.D}[0], [x0], x1
|
||||
st1 {v18.D}[1], [x0], x1
|
||||
st1 {v19.D}[0], [x0], x1
|
||||
st1 {v19.D}[1], [x0], x1
|
||||
st1 {v17.d}[0], [x0], x1
|
||||
st1 {v17.d}[1], [x0], x1
|
||||
st1 {v18.d}[0], [x0], x1
|
||||
st1 {v18.d}[1], [x0], x1
|
||||
st1 {v19.d}[0], [x0], x1
|
||||
st1 {v19.d}[1], [x0], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
sshr v1.8h, v7.8h, #COL_SHIFT-16
|
||||
sshr v2.8h, v16.8h, #COL_SHIFT-16
|
||||
sshr v3.8h, v17.8h, #COL_SHIFT-16
|
||||
sshr v4.8h, v18.8h, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
sshr v7.8h, v7.8h, #COL_SHIFT-16
|
||||
sshr v16.8h, v16.8h, #COL_SHIFT-16
|
||||
sshr v17.8h, v17.8h, #COL_SHIFT-16
|
||||
sshr v18.8h, v18.8h, #COL_SHIFT-16
|
||||
|
||||
mov x9, x0
|
||||
ld1 {v19.D}[0], [x0], x1
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
ld1 {v19.D}[1], [x0], x1
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
ld1 {v20.D}[0], [x0], x1
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
ld1 {v20.D}[1], [x0], x1
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
ld1 {v21.D}[0], [x0], x1
|
||||
uaddw v23.8H, v23.8H, v19.8B
|
||||
uaddw2 v24.8H, v24.8H, v19.16B
|
||||
ld1 {v21.D}[1], [x0], x1
|
||||
sqxtun v23.8B, v23.8H
|
||||
sqxtun2 v23.16B, v24.8H
|
||||
ld1 {v22.D}[0], [x0], x1
|
||||
uaddw v24.8H, v25.8H, v20.8B
|
||||
uaddw2 v25.8H, v26.8H, v20.16B
|
||||
ld1 {v22.D}[1], [x0], x1
|
||||
sqxtun v24.8B, v24.8H
|
||||
sqxtun2 v24.16B, v25.8H
|
||||
st1 {v23.D}[0], [x9], x1
|
||||
uaddw v25.8H, v27.8H, v21.8B
|
||||
uaddw2 v26.8H, v28.8H, v21.16B
|
||||
st1 {v23.D}[1], [x9], x1
|
||||
sqxtun v25.8B, v25.8H
|
||||
sqxtun2 v25.16B, v26.8H
|
||||
st1 {v24.D}[0], [x9], x1
|
||||
uaddw v26.8H, v29.8H, v22.8B
|
||||
uaddw2 v27.8H, v30.8H, v22.16B
|
||||
st1 {v24.D}[1], [x9], x1
|
||||
sqxtun v26.8B, v26.8H
|
||||
sqxtun2 v26.16B, v27.8H
|
||||
st1 {v25.D}[0], [x9], x1
|
||||
st1 {v25.D}[1], [x9], x1
|
||||
st1 {v26.D}[0], [x9], x1
|
||||
st1 {v26.D}[1], [x9], x1
|
||||
ld1 {v19.d}[0], [x0], x1
|
||||
zip1 v23.2d, v1.2d, v7.2d
|
||||
zip2 v24.2d, v1.2d, v7.2d
|
||||
ld1 {v19.d}[1], [x0], x1
|
||||
zip1 v25.2d, v2.2d, v16.2d
|
||||
zip2 v26.2d, v2.2d, v16.2d
|
||||
ld1 {v20.d}[0], [x0], x1
|
||||
zip1 v27.2d, v3.2d, v17.2d
|
||||
zip2 v28.2d, v3.2d, v17.2d
|
||||
ld1 {v20.d}[1], [x0], x1
|
||||
zip1 v29.2d, v4.2d, v18.2d
|
||||
zip2 v30.2d, v4.2d, v18.2d
|
||||
ld1 {v21.d}[0], [x0], x1
|
||||
uaddw v23.8h, v23.8h, v19.8b
|
||||
uaddw2 v24.8h, v24.8h, v19.16b
|
||||
ld1 {v21.d}[1], [x0], x1
|
||||
sqxtun v23.8b, v23.8h
|
||||
sqxtun2 v23.16b, v24.8h
|
||||
ld1 {v22.d}[0], [x0], x1
|
||||
uaddw v24.8h, v25.8h, v20.8b
|
||||
uaddw2 v25.8h, v26.8h, v20.16b
|
||||
ld1 {v22.d}[1], [x0], x1
|
||||
sqxtun v24.8b, v24.8h
|
||||
sqxtun2 v24.16b, v25.8h
|
||||
st1 {v23.d}[0], [x9], x1
|
||||
uaddw v25.8h, v27.8h, v21.8b
|
||||
uaddw2 v26.8h, v28.8h, v21.16b
|
||||
st1 {v23.d}[1], [x9], x1
|
||||
sqxtun v25.8b, v25.8h
|
||||
sqxtun2 v25.16b, v26.8h
|
||||
st1 {v24.d}[0], [x9], x1
|
||||
uaddw v26.8h, v29.8h, v22.8b
|
||||
uaddw2 v27.8h, v30.8h, v22.16b
|
||||
st1 {v24.d}[1], [x9], x1
|
||||
sqxtun v26.8b, v26.8h
|
||||
sqxtun2 v26.16b, v27.8h
|
||||
st1 {v25.d}[0], [x9], x1
|
||||
st1 {v25.d}[1], [x9], x1
|
||||
st1 {v26.d}[0], [x9], x1
|
||||
st1 {v26.d}[1], [x9], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
|
||||
sub x2, x2, #128
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
sshr v1.8h, v7.8h, #COL_SHIFT-16
|
||||
sshr v2.8h, v16.8h, #COL_SHIFT-16
|
||||
sshr v3.8h, v17.8h, #COL_SHIFT-16
|
||||
sshr v4.8h, v18.8h, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
sshr v7.8h, v7.8h, #COL_SHIFT-16
|
||||
sshr v16.8h, v16.8h, #COL_SHIFT-16
|
||||
sshr v17.8h, v17.8h, #COL_SHIFT-16
|
||||
sshr v18.8h, v18.8h, #COL_SHIFT-16
|
||||
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
st1 {v23.2D,v24.2D}, [x2], #32
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
st1 {v25.2D,v26.2D}, [x2], #32
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
st1 {v27.2D,v28.2D}, [x2], #32
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
st1 {v29.2D,v30.2D}, [x2], #32
|
||||
zip1 v23.2d, v1.2d, v7.2d
|
||||
zip2 v24.2d, v1.2d, v7.2d
|
||||
st1 {v23.2d,v24.2d}, [x2], #32
|
||||
zip1 v25.2d, v2.2d, v16.2d
|
||||
zip2 v26.2d, v2.2d, v16.2d
|
||||
st1 {v25.2d,v26.2d}, [x2], #32
|
||||
zip1 v27.2d, v3.2d, v17.2d
|
||||
zip2 v28.2d, v3.2d, v17.2d
|
||||
st1 {v27.2d,v28.2d}, [x2], #32
|
||||
zip1 v29.2d, v4.2d, v18.2d
|
||||
zip2 v30.2d, v4.2d, v18.2d
|
||||
st1 {v29.2d,v30.2d}, [x2], #32
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
|
||||
+151
-151
@@ -330,32 +330,32 @@ endfunc
|
||||
// v17: hev
|
||||
|
||||
// convert to signed value:
|
||||
eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
|
||||
eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
|
||||
eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
|
||||
eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
|
||||
|
||||
movi v20.8h, #3
|
||||
ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
|
||||
ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
|
||||
eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
|
||||
mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
|
||||
mul v19.8h, v19.8h, v20.8h
|
||||
movi v20.8h, #3
|
||||
ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
|
||||
ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
|
||||
eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
|
||||
mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
|
||||
mul v19.8h, v19.8h, v20.8h
|
||||
|
||||
sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
|
||||
movi v22.16b, #4
|
||||
movi v23.16b, #3
|
||||
sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
|
||||
movi v22.16b, #4
|
||||
movi v23.16b, #3
|
||||
.if \inner
|
||||
and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
|
||||
and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
|
||||
.endif
|
||||
saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
|
||||
saddw2 v19.8h, v19.8h, v20.16b
|
||||
sqxtn v18.8b, v18.8h // narrow result back into v18
|
||||
sqxtn2 v18.16b, v19.8h
|
||||
saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
|
||||
saddw2 v19.8h, v19.8h, v20.16b
|
||||
sqxtn v18.8b, v18.8h // narrow result back into v18
|
||||
sqxtn2 v18.16b, v19.8h
|
||||
.if !\inner && !\simple
|
||||
eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
|
||||
eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
|
||||
eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
|
||||
eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
|
||||
.endif
|
||||
and v18.16b, v18.16b, v16.16b // w &= normal_limit
|
||||
and v18.16b, v18.16b, v16.16b // w &= normal_limit
|
||||
|
||||
// registers used at this point..
|
||||
// v0 -> P3 (don't corrupt)
|
||||
@@ -375,44 +375,44 @@ endfunc
|
||||
// P0 = s2u(PS0 + c2);
|
||||
|
||||
.if \simple
|
||||
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
.elseif \inner
|
||||
// the !is4tap case of filter_common, only used for inner blocks
|
||||
// c3 = ((c1&~hev) + 1) >> 1;
|
||||
// Q1 = s2u(QS1 - c3);
|
||||
// P1 = s2u(PS1 + c3);
|
||||
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
bic v19.16b, v19.16b, v17.16b // c1 & ~hev
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
srshr v19.16b, v19.16b, #1 // c3 >>= 1
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
|
||||
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
bic v19.16b, v19.16b, v17.16b // c1 & ~hev
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
srshr v19.16b, v19.16b, #1 // c3 >>= 1
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
|
||||
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
.else
|
||||
and v20.16b, v18.16b, v17.16b // w & hev
|
||||
sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
bic v18.16b, v18.16b, v17.16b // w &= ~hev
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
and v20.16b, v18.16b, v17.16b // w & hev
|
||||
sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
|
||||
sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
|
||||
sshr v19.16b, v19.16b, #3 // c1 >>= 3
|
||||
sshr v20.16b, v20.16b, #3 // c2 >>= 3
|
||||
bic v18.16b, v18.16b, v17.16b // w &= ~hev
|
||||
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
|
||||
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
|
||||
|
||||
// filter_mbedge:
|
||||
// a = clamp((27*w + 63) >> 7);
|
||||
@@ -424,35 +424,35 @@ endfunc
|
||||
// a = clamp((9*w + 63) >> 7);
|
||||
// Q2 = s2u(QS2 - a);
|
||||
// P2 = s2u(PS2 + a);
|
||||
movi v17.8h, #63
|
||||
sshll v22.8h, v18.8b, #3
|
||||
sshll2 v23.8h, v18.16b, #3
|
||||
saddw v22.8h, v22.8h, v18.8b
|
||||
saddw2 v23.8h, v23.8h, v18.16b
|
||||
add v16.8h, v17.8h, v22.8h
|
||||
add v17.8h, v17.8h, v23.8h // 9*w + 63
|
||||
add v19.8h, v16.8h, v22.8h
|
||||
add v20.8h, v17.8h, v23.8h // 18*w + 63
|
||||
add v22.8h, v19.8h, v22.8h
|
||||
add v23.8h, v20.8h, v23.8h // 27*w + 63
|
||||
sqshrn v16.8b, v16.8h, #7
|
||||
sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
|
||||
sqshrn v19.8b, v19.8h, #7
|
||||
sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
|
||||
sqshrn v22.8b, v22.8h, #7
|
||||
sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
|
||||
sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
|
||||
sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
|
||||
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
|
||||
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
|
||||
sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
|
||||
sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
|
||||
eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
|
||||
movi v17.8h, #63
|
||||
sshll v22.8h, v18.8b, #3
|
||||
sshll2 v23.8h, v18.16b, #3
|
||||
saddw v22.8h, v22.8h, v18.8b
|
||||
saddw2 v23.8h, v23.8h, v18.16b
|
||||
add v16.8h, v17.8h, v22.8h
|
||||
add v17.8h, v17.8h, v23.8h // 9*w + 63
|
||||
add v19.8h, v16.8h, v22.8h
|
||||
add v20.8h, v17.8h, v23.8h // 18*w + 63
|
||||
add v22.8h, v19.8h, v22.8h
|
||||
add v23.8h, v20.8h, v23.8h // 27*w + 63
|
||||
sqshrn v16.8b, v16.8h, #7
|
||||
sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
|
||||
sqshrn v19.8b, v19.8h, #7
|
||||
sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
|
||||
sqshrn v22.8b, v22.8h, #7
|
||||
sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
|
||||
sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
|
||||
sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
|
||||
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
|
||||
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
|
||||
sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
|
||||
sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
|
||||
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
|
||||
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
|
||||
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
|
||||
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
|
||||
eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
|
||||
eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
|
||||
.endif
|
||||
.endm
|
||||
|
||||
@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
|
||||
sub x0, x0, x2, lsl #2
|
||||
sub x1, x1, x2, lsl #2
|
||||
// Load pixels:
|
||||
ld1 {v0.d}[0], [x0], x2 // P3
|
||||
ld1 {v0.d}[1], [x1], x2 // P3
|
||||
ld1 {v1.d}[0], [x0], x2 // P2
|
||||
ld1 {v1.d}[1], [x1], x2 // P2
|
||||
ld1 {v2.d}[0], [x0], x2 // P1
|
||||
ld1 {v2.d}[1], [x1], x2 // P1
|
||||
ld1 {v3.d}[0], [x0], x2 // P0
|
||||
ld1 {v3.d}[1], [x1], x2 // P0
|
||||
ld1 {v4.d}[0], [x0], x2 // Q0
|
||||
ld1 {v4.d}[1], [x1], x2 // Q0
|
||||
ld1 {v5.d}[0], [x0], x2 // Q1
|
||||
ld1 {v5.d}[1], [x1], x2 // Q1
|
||||
ld1 {v6.d}[0], [x0], x2 // Q2
|
||||
ld1 {v6.d}[1], [x1], x2 // Q2
|
||||
ld1 {v7.d}[0], [x0] // Q3
|
||||
ld1 {v7.d}[1], [x1] // Q3
|
||||
ld1 {v0.d}[0], [x0], x2 // P3
|
||||
ld1 {v0.d}[1], [x1], x2 // P3
|
||||
ld1 {v1.d}[0], [x0], x2 // P2
|
||||
ld1 {v1.d}[1], [x1], x2 // P2
|
||||
ld1 {v2.d}[0], [x0], x2 // P1
|
||||
ld1 {v2.d}[1], [x1], x2 // P1
|
||||
ld1 {v3.d}[0], [x0], x2 // P0
|
||||
ld1 {v3.d}[1], [x1], x2 // P0
|
||||
ld1 {v4.d}[0], [x0], x2 // Q0
|
||||
ld1 {v4.d}[1], [x1], x2 // Q0
|
||||
ld1 {v5.d}[0], [x0], x2 // Q1
|
||||
ld1 {v5.d}[1], [x1], x2 // Q1
|
||||
ld1 {v6.d}[0], [x0], x2 // Q2
|
||||
ld1 {v6.d}[1], [x1], x2 // Q2
|
||||
ld1 {v7.d}[0], [x0] // Q3
|
||||
ld1 {v7.d}[1], [x1] // Q3
|
||||
|
||||
dup v22.16b, w3 // flim_E
|
||||
dup v23.16b, w4 // flim_I
|
||||
dup v22.16b, w3 // flim_E
|
||||
dup v23.16b, w4 // flim_I
|
||||
|
||||
vp8_loop_filter inner=\inner, hev_thresh=w5
|
||||
|
||||
// back up to P2: u,v -= stride * 6
|
||||
sub x0, x0, x2, lsl #2
|
||||
sub x1, x1, x2, lsl #2
|
||||
sub x0, x0, x2, lsl #1
|
||||
sub x1, x1, x2, lsl #1
|
||||
sub x0, x0, x2, lsl #2
|
||||
sub x1, x1, x2, lsl #2
|
||||
sub x0, x0, x2, lsl #1
|
||||
sub x1, x1, x2, lsl #1
|
||||
|
||||
// Store pixels:
|
||||
|
||||
st1 {v1.d}[0], [x0], x2 // P2
|
||||
st1 {v1.d}[1], [x1], x2 // P2
|
||||
st1 {v2.d}[0], [x0], x2 // P1
|
||||
st1 {v2.d}[1], [x1], x2 // P1
|
||||
st1 {v3.d}[0], [x0], x2 // P0
|
||||
st1 {v3.d}[1], [x1], x2 // P0
|
||||
st1 {v4.d}[0], [x0], x2 // Q0
|
||||
st1 {v4.d}[1], [x1], x2 // Q0
|
||||
st1 {v5.d}[0], [x0], x2 // Q1
|
||||
st1 {v5.d}[1], [x1], x2 // Q1
|
||||
st1 {v6.d}[0], [x0] // Q2
|
||||
st1 {v6.d}[1], [x1] // Q2
|
||||
st1 {v1.d}[0], [x0], x2 // P2
|
||||
st1 {v1.d}[1], [x1], x2 // P2
|
||||
st1 {v2.d}[0], [x0], x2 // P1
|
||||
st1 {v2.d}[1], [x1], x2 // P1
|
||||
st1 {v3.d}[0], [x0], x2 // P0
|
||||
st1 {v3.d}[1], [x1], x2 // P0
|
||||
st1 {v4.d}[0], [x0], x2 // Q0
|
||||
st1 {v4.d}[1], [x1], x2 // Q0
|
||||
st1 {v5.d}[0], [x0], x2 // Q1
|
||||
st1 {v5.d}[1], [x1], x2 // Q1
|
||||
st1 {v6.d}[0], [x0] // Q2
|
||||
st1 {v6.d}[1], [x1] // Q2
|
||||
|
||||
ret
|
||||
endfunc
|
||||
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
|
||||
ld1 {v6.d}[1], [x0], x1
|
||||
ld1 {v7.d}[1], [x0], x1
|
||||
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
|
||||
dup v22.16b, w2 // flim_E
|
||||
.if !\simple
|
||||
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
|
||||
|
||||
sub x0, x0, x1, lsl #4 // backup 16 rows
|
||||
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
|
||||
// Store pixels:
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
|
||||
sub x1, x1, #4
|
||||
|
||||
// Load pixels:
|
||||
ld1 {v0.d}[0], [x0], x2 // load u
|
||||
ld1 {v0.d}[1], [x1], x2 // load v
|
||||
ld1 {v1.d}[0], [x0], x2
|
||||
ld1 {v1.d}[1], [x1], x2
|
||||
ld1 {v2.d}[0], [x0], x2
|
||||
ld1 {v2.d}[1], [x1], x2
|
||||
ld1 {v3.d}[0], [x0], x2
|
||||
ld1 {v3.d}[1], [x1], x2
|
||||
ld1 {v4.d}[0], [x0], x2
|
||||
ld1 {v4.d}[1], [x1], x2
|
||||
ld1 {v5.d}[0], [x0], x2
|
||||
ld1 {v5.d}[1], [x1], x2
|
||||
ld1 {v6.d}[0], [x0], x2
|
||||
ld1 {v6.d}[1], [x1], x2
|
||||
ld1 {v7.d}[0], [x0], x2
|
||||
ld1 {v7.d}[1], [x1], x2
|
||||
ld1 {v0.d}[0], [x0], x2 // load u
|
||||
ld1 {v0.d}[1], [x1], x2 // load v
|
||||
ld1 {v1.d}[0], [x0], x2
|
||||
ld1 {v1.d}[1], [x1], x2
|
||||
ld1 {v2.d}[0], [x0], x2
|
||||
ld1 {v2.d}[1], [x1], x2
|
||||
ld1 {v3.d}[0], [x0], x2
|
||||
ld1 {v3.d}[1], [x1], x2
|
||||
ld1 {v4.d}[0], [x0], x2
|
||||
ld1 {v4.d}[1], [x1], x2
|
||||
ld1 {v5.d}[0], [x0], x2
|
||||
ld1 {v5.d}[1], [x1], x2
|
||||
ld1 {v6.d}[0], [x0], x2
|
||||
ld1 {v6.d}[1], [x1], x2
|
||||
ld1 {v7.d}[0], [x0], x2
|
||||
ld1 {v7.d}[1], [x1], x2
|
||||
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
|
||||
dup v22.16b, w3 // flim_E
|
||||
dup v23.16b, w4 // flim_I
|
||||
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
|
||||
sub x0, x0, x2, lsl #3 // backup u 8 rows
|
||||
sub x1, x1, x2, lsl #3 // backup v 8 rows
|
||||
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
|
||||
// Store pixels:
|
||||
st1 {v0.d}[0], [x0], x2 // load u
|
||||
st1 {v0.d}[1], [x1], x2 // load v
|
||||
st1 {v1.d}[0], [x0], x2
|
||||
st1 {v1.d}[1], [x1], x2
|
||||
st1 {v2.d}[0], [x0], x2
|
||||
st1 {v2.d}[1], [x1], x2
|
||||
st1 {v3.d}[0], [x0], x2
|
||||
st1 {v3.d}[1], [x1], x2
|
||||
st1 {v4.d}[0], [x0], x2
|
||||
st1 {v4.d}[1], [x1], x2
|
||||
st1 {v5.d}[0], [x0], x2
|
||||
st1 {v5.d}[1], [x1], x2
|
||||
st1 {v6.d}[0], [x0], x2
|
||||
st1 {v6.d}[1], [x1], x2
|
||||
st1 {v7.d}[0], [x0]
|
||||
st1 {v7.d}[1], [x1]
|
||||
st1 {v0.d}[0], [x0], x2 // load u
|
||||
st1 {v0.d}[1], [x1], x2 // load v
|
||||
st1 {v1.d}[0], [x0], x2
|
||||
st1 {v1.d}[1], [x1], x2
|
||||
st1 {v2.d}[0], [x0], x2
|
||||
st1 {v2.d}[1], [x1], x2
|
||||
st1 {v3.d}[0], [x0], x2
|
||||
st1 {v3.d}[1], [x1], x2
|
||||
st1 {v4.d}[0], [x0], x2
|
||||
st1 {v4.d}[1], [x1], x2
|
||||
st1 {v5.d}[0], [x0], x2
|
||||
st1 {v5.d}[1], [x1], x2
|
||||
st1 {v6.d}[0], [x0], x2
|
||||
st1 {v6.d}[1], [x1], x2
|
||||
st1 {v7.d}[0], [x0]
|
||||
st1 {v7.d}[1], [x1]
|
||||
|
||||
ret
|
||||
|
||||
|
||||
+6
-2
@@ -2116,8 +2116,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
|
||||
ctx->nbits = av_malloc_array(ctx->cur_frame_length, sizeof(*ctx->nbits));
|
||||
ctx->mlz = av_mallocz(sizeof(*ctx->mlz));
|
||||
|
||||
if (!ctx->mlz || !ctx->acf || !ctx->shift_value || !ctx->last_shift_value
|
||||
|| !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
|
||||
if (!ctx->larray || !ctx->nbits || !ctx->mlz || !ctx->acf || !ctx->shift_value
|
||||
|| !ctx->last_shift_value || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
|
||||
av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
|
||||
ret = AVERROR(ENOMEM);
|
||||
goto fail;
|
||||
@@ -2128,6 +2128,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
|
||||
|
||||
for (c = 0; c < avctx->channels; ++c) {
|
||||
ctx->raw_mantissa[c] = av_mallocz_array(ctx->cur_frame_length, sizeof(**ctx->raw_mantissa));
|
||||
if (!ctx->raw_mantissa[c]) {
|
||||
av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
|
||||
return AVERROR(ENOMEM);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,4 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
|
||||
vmov.32 r0, d3[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
|
||||
@@ -91,4 +91,3 @@ AVCodec ff_cljr_decoder = {
|
||||
.decode = decode_frame,
|
||||
.capabilities = AV_CODEC_CAP_DR1,
|
||||
};
|
||||
|
||||
|
||||
@@ -259,4 +259,3 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -338,4 +338,3 @@ const AVDVProfile *av_dv_codec_profile2(int width, int height,
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
@@ -50,4 +50,3 @@ static inline int RENAME(get_context)(PlaneContext *p, TYPE *src,
|
||||
p->quant_table[1][(LT - T) & 0xFF] +
|
||||
p->quant_table[2][(T - RT) & 0xFF];
|
||||
}
|
||||
|
||||
|
||||
@@ -199,4 +199,3 @@ static int RENAME(encode_rgb_frame)(FFV1Context *s, const uint8_t *src[4],
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -162,4 +162,3 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
|
||||
if (USES_LIST(mb_type, 1))
|
||||
prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
|
||||
}
|
||||
|
||||
|
||||
@@ -1554,4 +1554,3 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
|
||||
case 0: lc->pu.mvd.y = 0; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -340,6 +340,25 @@ static int get_siz(Jpeg2000DecoderContext *s)
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
|
||||
for (i = 0; i < s->ncomponents; i++) {
|
||||
if (s->cdef[i] < 0) {
|
||||
for (i = 0; i < s->ncomponents; i++) {
|
||||
s->cdef[i] = i + 1;
|
||||
}
|
||||
if ((s->ncomponents & 1) == 0)
|
||||
s->cdef[s->ncomponents-1] = 0;
|
||||
}
|
||||
}
|
||||
// after here we no longer have to consider negative cdef
|
||||
|
||||
int cdef_used = 0;
|
||||
for (i = 0; i < s->ncomponents; i++)
|
||||
cdef_used |= 1<<s->cdef[i];
|
||||
|
||||
// Check that the channels we have are what we expect for the number of components
|
||||
if (cdef_used != ((int[]){0,2,3,14,15})[s->ncomponents])
|
||||
return AVERROR_INVALIDDATA;
|
||||
|
||||
for (i = 0; i < s->ncomponents; i++) { // Ssiz_i XRsiz_i, YRsiz_i
|
||||
uint8_t x = bytestream2_get_byteu(&s->g);
|
||||
s->cbps[i] = (x & 0x7f) + 1;
|
||||
@@ -352,7 +371,9 @@ static int get_siz(Jpeg2000DecoderContext *s)
|
||||
av_log(s->avctx, AV_LOG_ERROR, "Invalid sample separation %d/%d\n", s->cdx[i], s->cdy[i]);
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
log2_chroma_wh |= s->cdy[i] >> 1 << i * 4 | s->cdx[i] >> 1 << i * 4 + 2;
|
||||
int i_remapped = s->cdef[i] ? s->cdef[i]-1 : (s->ncomponents-1);
|
||||
|
||||
log2_chroma_wh |= s->cdy[i] >> 1 << i_remapped * 4 | s->cdx[i] >> 1 << i_remapped * 4 + 2;
|
||||
}
|
||||
|
||||
s->numXtiles = ff_jpeg2000_ceildiv(s->width - s->tile_offset_x, s->tile_width);
|
||||
@@ -1198,6 +1219,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
|
||||
|
||||
bytestream2_get_bufferu(&s->g, cblk->data + cblk->length, cblk->lengthinc[cwsno]);
|
||||
cblk->length += cblk->lengthinc[cwsno];
|
||||
memset(cblk->data + cblk->length, 0, 4);
|
||||
cblk->lengthinc[cwsno] = 0;
|
||||
if (cblk->nb_terminationsinc) {
|
||||
cblk->nb_terminationsinc--;
|
||||
|
||||
@@ -398,4 +398,3 @@ void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in,
|
||||
out++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -782,4 +782,3 @@ static const AVCodecDefault mp2_defaults[] = {
|
||||
{ "b", "0" },
|
||||
{ NULL },
|
||||
};
|
||||
|
||||
|
||||
@@ -334,4 +334,3 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
|
||||
*dc_val_ptr = &dc_val[0];
|
||||
return pred;
|
||||
}
|
||||
|
||||
|
||||
@@ -856,5 +856,3 @@ av_cold void ff_dwt_init(SnowDWTContext *c)
|
||||
if (HAVE_MMX)
|
||||
ff_dwt_init_x86(c);
|
||||
}
|
||||
|
||||
|
||||
|
||||
+11
-7
@@ -1566,6 +1566,8 @@ static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
|
||||
av_log(avctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
ff_thread_await_progress(&s->s.refs[ref], INT_MAX, 0);
|
||||
|
||||
if ((ret = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
|
||||
return ret;
|
||||
((AVFrame *)frame)->pts = pkt->pts;
|
||||
@@ -1732,10 +1734,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
|
||||
#endif
|
||||
{
|
||||
ret = decode_tiles(avctx, data, size);
|
||||
if (ret < 0) {
|
||||
ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
|
||||
return ret;
|
||||
}
|
||||
if (ret < 0)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Sum all counts fields into td[0].counts for tile threading
|
||||
@@ -1749,20 +1749,21 @@ FF_ENABLE_DEPRECATION_WARNINGS
|
||||
ff_thread_finish_setup(avctx);
|
||||
}
|
||||
} while (s->pass++ == 1);
|
||||
ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
|
||||
|
||||
if (s->td->error_info < 0) {
|
||||
av_log(avctx, AV_LOG_ERROR, "Failed to decode tile data\n");
|
||||
s->td->error_info = 0;
|
||||
return AVERROR_INVALIDDATA;
|
||||
ret = AVERROR_INVALIDDATA;
|
||||
goto fail;
|
||||
}
|
||||
if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS) {
|
||||
ret = vp9_export_enc_params(s, &s->s.frames[CUR_FRAME]);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
finish:
|
||||
ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
|
||||
// ref frame setup
|
||||
for (i = 0; i < 8; i++) {
|
||||
if (s->s.refs[i].f->buf[0])
|
||||
@@ -1779,6 +1780,9 @@ finish:
|
||||
}
|
||||
|
||||
return pkt->size;
|
||||
fail:
|
||||
ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void vp9_decode_flush(AVCodecContext *avctx)
|
||||
|
||||
@@ -121,4 +121,3 @@ INIT_XMM sse
|
||||
INT32_TO_FLOAT_FMUL_ARRAY8
|
||||
INIT_XMM sse2
|
||||
INT32_TO_FLOAT_FMUL_ARRAY8
|
||||
|
||||
|
||||
@@ -151,4 +151,3 @@ INIT_MMX mmx
|
||||
PIX_NORM1 0, 16
|
||||
INIT_XMM sse2
|
||||
PIX_NORM1 6, 8
|
||||
|
||||
|
||||
@@ -163,6 +163,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
|
||||
dec cntrq
|
||||
jge .bpp_loop
|
||||
POP dstq
|
||||
emms
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
|
||||
@@ -22,52 +22,52 @@
|
||||
|
||||
// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
|
||||
.macro acc_sum_store x, xb
|
||||
dup v24.4S, v24.S[3] // ...X -> XXXX
|
||||
ext v25.16B, v26.16B, \xb, #12 // ext(0000,ABCD,12)=0ABC
|
||||
add v24.4S, v24.4S, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
|
||||
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
|
||||
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,0ABC,12)=00AB
|
||||
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
|
||||
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,00AB,12)=000A
|
||||
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
|
||||
st1 {v24.4S}, [x0], #16 // write 4x32-bit final values
|
||||
dup v24.4s, v24.s[3] // ...X -> XXXX
|
||||
ext v25.16b, v26.16b, \xb, #12 // ext(0000,ABCD,12)=0ABC
|
||||
add v24.4s, v24.4s, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
|
||||
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
|
||||
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,0ABC,12)=00AB
|
||||
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
|
||||
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,00AB,12)=000A
|
||||
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
|
||||
st1 {v24.4s}, [x0], #16 // write 4x32-bit final values
|
||||
.endm
|
||||
|
||||
function ff_compute_safe_ssd_integral_image_neon, export=1
|
||||
movi v26.4S, #0 // used as zero for the "rotations" in acc_sum_store
|
||||
sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w)
|
||||
sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w)
|
||||
sub x9, x0, w1, UXTW #2 // dst_top
|
||||
sub x1, x1, w6, UXTW // dst padding (dst_linesize_32 - w)
|
||||
movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
|
||||
sub x3, x3, w6, uxtw // s1 padding (s1_linesize - w)
|
||||
sub x5, x5, w6, uxtw // s2 padding (s2_linesize - w)
|
||||
sub x9, x0, w1, uxtw #2 // dst_top
|
||||
sub x1, x1, w6, uxtw // dst padding (dst_linesize_32 - w)
|
||||
lsl x1, x1, #2 // dst padding expressed in bytes
|
||||
1: mov w10, w6 // width copy for each line
|
||||
sub x0, x0, #16 // beginning of the dst line minus 4 sums
|
||||
sub x8, x9, #4 // dst_top-1
|
||||
ld1 {v24.4S}, [x0], #16 // load ...X (contextual last sums)
|
||||
2: ld1 {v0.16B}, [x2], #16 // s1[x + 0..15]
|
||||
ld1 {v1.16B}, [x4], #16 // s2[x + 0..15]
|
||||
ld1 {v16.4S,v17.4S}, [x8], #32 // dst_top[x + 0..7 - 1]
|
||||
usubl v2.8H, v0.8B, v1.8B // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
|
||||
usubl2 v3.8H, v0.16B, v1.16B // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
|
||||
ld1 {v18.4S,v19.4S}, [x8], #32 // dst_top[x + 8..15 - 1]
|
||||
smull v4.4S, v2.4H, v2.4H // d[x + 0..3]^2
|
||||
smull2 v5.4S, v2.8H, v2.8H // d[x + 4..7]^2
|
||||
ld1 {v20.4S,v21.4S}, [x9], #32 // dst_top[x + 0..7]
|
||||
smull v6.4S, v3.4H, v3.4H // d[x + 8..11]^2
|
||||
smull2 v7.4S, v3.8H, v3.8H // d[x + 12..15]^2
|
||||
ld1 {v22.4S,v23.4S}, [x9], #32 // dst_top[x + 8..15]
|
||||
sub v0.4S, v20.4S, v16.4S // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
|
||||
sub v1.4S, v21.4S, v17.4S // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
|
||||
add v0.4S, v0.4S, v4.4S // + d[x + 0..3]^2
|
||||
add v1.4S, v1.4S, v5.4S // + d[x + 4..7]^2
|
||||
sub v2.4S, v22.4S, v18.4S // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
|
||||
sub v3.4S, v23.4S, v19.4S // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
|
||||
add v2.4S, v2.4S, v6.4S // + d[x + 8..11]^2
|
||||
add v3.4S, v3.4S, v7.4S // + d[x + 12..15]^2
|
||||
acc_sum_store v0.4S, v0.16B // accumulate and store dst[ 0..3]
|
||||
acc_sum_store v1.4S, v1.16B // accumulate and store dst[ 4..7]
|
||||
acc_sum_store v2.4S, v2.16B // accumulate and store dst[ 8..11]
|
||||
acc_sum_store v3.4S, v3.16B // accumulate and store dst[12..15]
|
||||
ld1 {v24.4s}, [x0], #16 // load ...X (contextual last sums)
|
||||
2: ld1 {v0.16b}, [x2], #16 // s1[x + 0..15]
|
||||
ld1 {v1.16b}, [x4], #16 // s2[x + 0..15]
|
||||
ld1 {v16.4s,v17.4s}, [x8], #32 // dst_top[x + 0..7 - 1]
|
||||
usubl v2.8h, v0.8b, v1.8b // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
|
||||
usubl2 v3.8h, v0.16b, v1.16b // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
|
||||
ld1 {v18.4s,v19.4s}, [x8], #32 // dst_top[x + 8..15 - 1]
|
||||
smull v4.4s, v2.4h, v2.4h // d[x + 0..3]^2
|
||||
smull2 v5.4s, v2.8h, v2.8h // d[x + 4..7]^2
|
||||
ld1 {v20.4s,v21.4s}, [x9], #32 // dst_top[x + 0..7]
|
||||
smull v6.4s, v3.4h, v3.4h // d[x + 8..11]^2
|
||||
smull2 v7.4s, v3.8h, v3.8h // d[x + 12..15]^2
|
||||
ld1 {v22.4s,v23.4s}, [x9], #32 // dst_top[x + 8..15]
|
||||
sub v0.4s, v20.4s, v16.4s // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
|
||||
sub v1.4s, v21.4s, v17.4s // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
|
||||
add v0.4s, v0.4s, v4.4s // + d[x + 0..3]^2
|
||||
add v1.4s, v1.4s, v5.4s // + d[x + 4..7]^2
|
||||
sub v2.4s, v22.4s, v18.4s // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
|
||||
sub v3.4s, v23.4s, v19.4s // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
|
||||
add v2.4s, v2.4s, v6.4s // + d[x + 8..11]^2
|
||||
add v3.4s, v3.4s, v7.4s // + d[x + 12..15]^2
|
||||
acc_sum_store v0.4s, v0.16b // accumulate and store dst[ 0..3]
|
||||
acc_sum_store v1.4s, v1.16b // accumulate and store dst[ 4..7]
|
||||
acc_sum_store v2.4s, v2.16b // accumulate and store dst[ 8..11]
|
||||
acc_sum_store v3.4s, v3.16b // accumulate and store dst[12..15]
|
||||
subs w10, w10, #16 // width dec
|
||||
b.ne 2b // loop til next line
|
||||
add x2, x2, x3 // skip to next line (s1)
|
||||
|
||||
@@ -822,6 +822,8 @@ static int config_input(AVFilterLink *inlink)
|
||||
if (s->dumpfile) {
|
||||
s->analysis_rdft = av_rdft_init(rdft_bits, DFT_R2C);
|
||||
s->dump_buf = av_malloc_array(s->analysis_rdft_len, sizeof(*s->dump_buf));
|
||||
if (!s->dump_buf)
|
||||
return AVERROR(ENOMEM);
|
||||
}
|
||||
|
||||
s->analysis_buf = av_malloc_array(s->analysis_rdft_len, sizeof(*s->analysis_buf));
|
||||
|
||||
@@ -69,4 +69,3 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
|
||||
}
|
||||
return sad;
|
||||
}
|
||||
|
||||
|
||||
@@ -51,4 +51,3 @@ __global__ void Overlay_Cuda(
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -77,4 +77,3 @@ const AVCodecTag ff_codec_caf_tags[] = {
|
||||
{ AV_CODEC_ID_PCM_F64BE, MKTAG('l','p','c','m') },
|
||||
{ AV_CODEC_ID_NONE, 0 },
|
||||
};
|
||||
|
||||
|
||||
@@ -153,5 +153,3 @@ void ff_dash_fill_tmpl_params(char *dst, size_t buffer_size,
|
||||
t_cur = t_next;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -149,4 +149,3 @@ AVOutputFormat ff_fifo_test_muxer = {
|
||||
.priv_class = &failing_muxer_class,
|
||||
.flags = AVFMT_NOFILE | AVFMT_ALLOW_FLUSH,
|
||||
};
|
||||
|
||||
|
||||
@@ -102,4 +102,3 @@ AVInputFormat ff_g726le_demuxer = {
|
||||
.raw_codec_id = AV_CODEC_ID_ADPCM_G726LE,
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@@ -886,6 +886,10 @@ static int parse_playlist(HLSContext *c, const char *url,
|
||||
ff_parse_key_value(ptr, (ff_parse_key_val_cb) handle_init_section_args,
|
||||
&info);
|
||||
cur_init_section = new_init_section(pls, &info, url);
|
||||
if (!cur_init_section) {
|
||||
ret = AVERROR(ENOMEM);
|
||||
goto fail;
|
||||
}
|
||||
cur_init_section->key_type = key_type;
|
||||
if (has_iv) {
|
||||
memcpy(cur_init_section->iv, iv, sizeof(iv));
|
||||
|
||||
@@ -192,4 +192,3 @@ void ff_hls_write_end_list(AVIOContext *out)
|
||||
return;
|
||||
avio_printf(out, "#EXT-X-ENDLIST\n");
|
||||
}
|
||||
|
||||
|
||||
@@ -91,4 +91,3 @@ AVInputFormat ff_truehd_demuxer = {
|
||||
.priv_class = &truehd_demuxer_class,
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@@ -392,5 +392,3 @@ AVInputFormat ff_mpjpeg_demuxer = {
|
||||
.priv_class = &mpjpeg_demuxer_class,
|
||||
.flags = AVFMT_NOTIMESTAMPS,
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -571,4 +571,3 @@ RDT_HANDLER(live_video, "x-pn-multirate-realvideo-live", AVMEDIA_TYPE_VIDEO);
|
||||
RDT_HANDLER(live_audio, "x-pn-multirate-realaudio-live", AVMEDIA_TYPE_AUDIO);
|
||||
RDT_HANDLER(video, "x-pn-realvideo", AVMEDIA_TYPE_VIDEO);
|
||||
RDT_HANDLER(audio, "x-pn-realaudio", AVMEDIA_TYPE_AUDIO);
|
||||
|
||||
|
||||
+100
-100
@@ -25,16 +25,16 @@
|
||||
|
||||
function ff_vector_fmul_neon, export=1
|
||||
1: subs w3, w3, #16
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
ld1 {v2.4S, v3.4S}, [x1], #32
|
||||
ld1 {v4.4S, v5.4S}, [x2], #32
|
||||
ld1 {v6.4S, v7.4S}, [x2], #32
|
||||
fmul v16.4S, v0.4S, v4.4S
|
||||
fmul v17.4S, v1.4S, v5.4S
|
||||
fmul v18.4S, v2.4S, v6.4S
|
||||
fmul v19.4S, v3.4S, v7.4S
|
||||
st1 {v16.4S, v17.4S}, [x0], #32
|
||||
st1 {v18.4S, v19.4S}, [x0], #32
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
ld1 {v4.4s, v5.4s}, [x2], #32
|
||||
ld1 {v6.4s, v7.4s}, [x2], #32
|
||||
fmul v16.4s, v0.4s, v4.4s
|
||||
fmul v17.4s, v1.4s, v5.4s
|
||||
fmul v18.4s, v2.4s, v6.4s
|
||||
fmul v19.4s, v3.4s, v7.4s
|
||||
st1 {v16.4s, v17.4s}, [x0], #32
|
||||
st1 {v18.4s, v19.4s}, [x0], #32
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
@@ -42,16 +42,16 @@ endfunc
|
||||
function ff_vector_fmac_scalar_neon, export=1
|
||||
mov x3, #-32
|
||||
1: subs w2, w2, #16
|
||||
ld1 {v16.4S, v17.4S}, [x0], #32
|
||||
ld1 {v18.4S, v19.4S}, [x0], x3
|
||||
ld1 {v4.4S, v5.4S}, [x1], #32
|
||||
ld1 {v6.4S, v7.4S}, [x1], #32
|
||||
fmla v16.4S, v4.4S, v0.S[0]
|
||||
fmla v17.4S, v5.4S, v0.S[0]
|
||||
fmla v18.4S, v6.4S, v0.S[0]
|
||||
fmla v19.4S, v7.4S, v0.S[0]
|
||||
st1 {v16.4S, v17.4S}, [x0], #32
|
||||
st1 {v18.4S, v19.4S}, [x0], #32
|
||||
ld1 {v16.4s, v17.4s}, [x0], #32
|
||||
ld1 {v18.4s, v19.4s}, [x0], x3
|
||||
ld1 {v4.4s, v5.4s}, [x1], #32
|
||||
ld1 {v6.4s, v7.4s}, [x1], #32
|
||||
fmla v16.4s, v4.4s, v0.s[0]
|
||||
fmla v17.4s, v5.4s, v0.s[0]
|
||||
fmla v18.4s, v6.4s, v0.s[0]
|
||||
fmla v19.4s, v7.4s, v0.s[0]
|
||||
st1 {v16.4s, v17.4s}, [x0], #32
|
||||
st1 {v18.4s, v19.4s}, [x0], #32
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
@@ -59,43 +59,43 @@ endfunc
|
||||
function ff_vector_fmul_scalar_neon, export=1
|
||||
mov w4, #15
|
||||
bics w3, w2, w4
|
||||
dup v16.4S, v0.S[0]
|
||||
dup v16.4s, v0.s[0]
|
||||
b.eq 3f
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
1: subs w3, w3, #16
|
||||
fmul v0.4S, v0.4S, v16.4S
|
||||
ld1 {v2.4S, v3.4S}, [x1], #32
|
||||
fmul v1.4S, v1.4S, v16.4S
|
||||
fmul v2.4S, v2.4S, v16.4S
|
||||
st1 {v0.4S, v1.4S}, [x0], #32
|
||||
fmul v3.4S, v3.4S, v16.4S
|
||||
fmul v0.4s, v0.4s, v16.4s
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
fmul v1.4s, v1.4s, v16.4s
|
||||
fmul v2.4s, v2.4s, v16.4s
|
||||
st1 {v0.4s, v1.4s}, [x0], #32
|
||||
fmul v3.4s, v3.4s, v16.4s
|
||||
b.eq 2f
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
st1 {v2.4S, v3.4S}, [x0], #32
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
st1 {v2.4s, v3.4s}, [x0], #32
|
||||
b 1b
|
||||
2: ands w2, w2, #15
|
||||
st1 {v2.4S, v3.4S}, [x0], #32
|
||||
st1 {v2.4s, v3.4s}, [x0], #32
|
||||
b.eq 4f
|
||||
3: ld1 {v0.4S}, [x1], #16
|
||||
fmul v0.4S, v0.4S, v16.4S
|
||||
st1 {v0.4S}, [x0], #16
|
||||
3: ld1 {v0.4s}, [x1], #16
|
||||
fmul v0.4s, v0.4s, v16.4s
|
||||
st1 {v0.4s}, [x0], #16
|
||||
subs w2, w2, #4
|
||||
b.gt 3b
|
||||
4: ret
|
||||
endfunc
|
||||
|
||||
function ff_vector_dmul_scalar_neon, export=1
|
||||
dup v16.2D, v0.D[0]
|
||||
ld1 {v0.2D, v1.2D}, [x1], #32
|
||||
dup v16.2d, v0.d[0]
|
||||
ld1 {v0.2d, v1.2d}, [x1], #32
|
||||
1: subs w2, w2, #8
|
||||
fmul v0.2D, v0.2D, v16.2D
|
||||
ld1 {v2.2D, v3.2D}, [x1], #32
|
||||
fmul v1.2D, v1.2D, v16.2D
|
||||
fmul v2.2D, v2.2D, v16.2D
|
||||
st1 {v0.2D, v1.2D}, [x0], #32
|
||||
fmul v3.2D, v3.2D, v16.2D
|
||||
ld1 {v0.2D, v1.2D}, [x1], #32
|
||||
st1 {v2.2D, v3.2D}, [x0], #32
|
||||
fmul v0.2d, v0.2d, v16.2d
|
||||
ld1 {v2.2d, v3.2d}, [x1], #32
|
||||
fmul v1.2d, v1.2d, v16.2d
|
||||
fmul v2.2d, v2.2d, v16.2d
|
||||
st1 {v0.2d, v1.2d}, [x0], #32
|
||||
fmul v3.2d, v3.2d, v16.2d
|
||||
ld1 {v0.2d, v1.2d}, [x1], #32
|
||||
st1 {v2.2d, v3.2d}, [x0], #32
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
@@ -108,49 +108,49 @@ function ff_vector_fmul_window_neon, export=1
|
||||
add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
|
||||
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
|
||||
mov x7, #-16
|
||||
ld1 {v0.4S}, [x1], #16 // s0
|
||||
ld1 {v2.4S}, [x3], #16 // wi
|
||||
ld1 {v1.4S}, [x2], x7 // s1
|
||||
1: ld1 {v3.4S}, [x6], x7 // wj
|
||||
ld1 {v0.4s}, [x1], #16 // s0
|
||||
ld1 {v2.4s}, [x3], #16 // wi
|
||||
ld1 {v1.4s}, [x2], x7 // s1
|
||||
1: ld1 {v3.4s}, [x6], x7 // wj
|
||||
subs x4, x4, #4
|
||||
fmul v17.4S, v0.4S, v2.4S // s0 * wi
|
||||
rev64 v4.4S, v1.4S
|
||||
rev64 v5.4S, v3.4S
|
||||
rev64 v17.4S, v17.4S
|
||||
ext v4.16B, v4.16B, v4.16B, #8 // s1_r
|
||||
ext v5.16B, v5.16B, v5.16B, #8 // wj_r
|
||||
ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
|
||||
fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
|
||||
fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
|
||||
fmul v17.4s, v0.4s, v2.4s // s0 * wi
|
||||
rev64 v4.4s, v1.4s
|
||||
rev64 v5.4s, v3.4s
|
||||
rev64 v17.4s, v17.4s
|
||||
ext v4.16b, v4.16b, v4.16b, #8 // s1_r
|
||||
ext v5.16b, v5.16b, v5.16b, #8 // wj_r
|
||||
ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
|
||||
fmul v16.4s, v0.4s, v5.4s // s0 * wj_r
|
||||
fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj
|
||||
b.eq 2f
|
||||
ld1 {v0.4S}, [x1], #16
|
||||
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
|
||||
st1 {v17.4S}, [x5], x7
|
||||
ld1 {v2.4S}, [x3], #16
|
||||
ld1 {v1.4S}, [x2], x7
|
||||
st1 {v16.4S}, [x0], #16
|
||||
ld1 {v0.4s}, [x1], #16
|
||||
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
|
||||
st1 {v17.4s}, [x5], x7
|
||||
ld1 {v2.4s}, [x3], #16
|
||||
ld1 {v1.4s}, [x2], x7
|
||||
st1 {v16.4s}, [x0], #16
|
||||
b 1b
|
||||
2:
|
||||
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
|
||||
st1 {v17.4S}, [x5], x7
|
||||
st1 {v16.4S}, [x0], #16
|
||||
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
|
||||
st1 {v17.4s}, [x5], x7
|
||||
st1 {v16.4s}, [x0], #16
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vector_fmul_add_neon, export=1
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
ld1 {v2.4S, v3.4S}, [x2], #32
|
||||
ld1 {v4.4S, v5.4S}, [x3], #32
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
ld1 {v2.4s, v3.4s}, [x2], #32
|
||||
ld1 {v4.4s, v5.4s}, [x3], #32
|
||||
1: subs w4, w4, #8
|
||||
fmla v4.4S, v0.4S, v2.4S
|
||||
fmla v5.4S, v1.4S, v3.4S
|
||||
fmla v4.4s, v0.4s, v2.4s
|
||||
fmla v5.4s, v1.4s, v3.4s
|
||||
b.eq 2f
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
ld1 {v2.4S, v3.4S}, [x2], #32
|
||||
st1 {v4.4S, v5.4S}, [x0], #32
|
||||
ld1 {v4.4S, v5.4S}, [x3], #32
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
ld1 {v2.4s, v3.4s}, [x2], #32
|
||||
st1 {v4.4s, v5.4s}, [x0], #32
|
||||
ld1 {v4.4s, v5.4s}, [x3], #32
|
||||
b 1b
|
||||
2: st1 {v4.4S, v5.4S}, [x0], #32
|
||||
2: st1 {v4.4s, v5.4s}, [x0], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -159,44 +159,44 @@ function ff_vector_fmul_reverse_neon, export=1
|
||||
add x2, x2, x3, lsl #2
|
||||
sub x2, x2, #32
|
||||
mov x4, #-32
|
||||
ld1 {v2.4S, v3.4S}, [x2], x4
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
ld1 {v2.4s, v3.4s}, [x2], x4
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
1: subs x3, x3, #8
|
||||
rev64 v3.4S, v3.4S
|
||||
rev64 v2.4S, v2.4S
|
||||
ext v3.16B, v3.16B, v3.16B, #8
|
||||
ext v2.16B, v2.16B, v2.16B, #8
|
||||
fmul v16.4S, v0.4S, v3.4S
|
||||
fmul v17.4S, v1.4S, v2.4S
|
||||
rev64 v3.4s, v3.4s
|
||||
rev64 v2.4s, v2.4s
|
||||
ext v3.16b, v3.16b, v3.16b, #8
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
fmul v16.4s, v0.4s, v3.4s
|
||||
fmul v17.4s, v1.4s, v2.4s
|
||||
b.eq 2f
|
||||
ld1 {v2.4S, v3.4S}, [x2], x4
|
||||
ld1 {v0.4S, v1.4S}, [x1], #32
|
||||
st1 {v16.4S, v17.4S}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x2], x4
|
||||
ld1 {v0.4s, v1.4s}, [x1], #32
|
||||
st1 {v16.4s, v17.4s}, [x0], #32
|
||||
b 1b
|
||||
2: st1 {v16.4S, v17.4S}, [x0], #32
|
||||
2: st1 {v16.4s, v17.4s}, [x0], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_butterflies_float_neon, export=1
|
||||
1: ld1 {v0.4S}, [x0]
|
||||
ld1 {v1.4S}, [x1]
|
||||
1: ld1 {v0.4s}, [x0]
|
||||
ld1 {v1.4s}, [x1]
|
||||
subs w2, w2, #4
|
||||
fsub v2.4S, v0.4S, v1.4S
|
||||
fadd v3.4S, v0.4S, v1.4S
|
||||
st1 {v2.4S}, [x1], #16
|
||||
st1 {v3.4S}, [x0], #16
|
||||
fsub v2.4s, v0.4s, v1.4s
|
||||
fadd v3.4s, v0.4s, v1.4s
|
||||
st1 {v2.4s}, [x1], #16
|
||||
st1 {v3.4s}, [x0], #16
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_scalarproduct_float_neon, export=1
|
||||
movi v2.4S, #0
|
||||
1: ld1 {v0.4S}, [x0], #16
|
||||
ld1 {v1.4S}, [x1], #16
|
||||
movi v2.4s, #0
|
||||
1: ld1 {v0.4s}, [x0], #16
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
subs w2, w2, #4
|
||||
fmla v2.4S, v0.4S, v1.4S
|
||||
fmla v2.4s, v0.4s, v1.4s
|
||||
b.gt 1b
|
||||
faddp v0.4S, v2.4S, v2.4S
|
||||
faddp s0, v0.2S
|
||||
faddp v0.4s, v2.4s, v2.4s
|
||||
faddp s0, v0.2s
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -265,4 +265,3 @@ int av_aes_init(AVAES *a, const uint8_t *key, int key_bits, int decrypt)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -36,4 +36,3 @@ struct AVCUDADeviceContextInternal {
|
||||
};
|
||||
|
||||
#endif /* AVUTIL_HWCONTEXT_CUDA_INTERNAL_H */
|
||||
|
||||
|
||||
@@ -50,4 +50,3 @@ typedef struct AVQSVFramesContext {
|
||||
} AVQSVFramesContext;
|
||||
|
||||
#endif /* AVUTIL_HWCONTEXT_QSV_H */
|
||||
|
||||
|
||||
@@ -191,4 +191,3 @@ int main(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -21,57 +21,57 @@
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
function ff_resample_common_apply_filter_x4_float_neon, export=1
|
||||
movi v0.4S, #0 // accumulator
|
||||
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
|
||||
ld1 {v2.4S}, [x2], #16 // filter[0..3]
|
||||
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
|
||||
subs w3, w3, #4 // filter_length -= 4
|
||||
b.gt 1b // loop until filter_length
|
||||
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.S}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
movi v0.4s, #0 // accumulator
|
||||
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
|
||||
ld1 {v2.4s}, [x2], #16 // filter[0..3]
|
||||
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
|
||||
subs w3, w3, #4 // filter_length -= 4
|
||||
b.gt 1b // loop until filter_length
|
||||
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.s}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_resample_common_apply_filter_x8_float_neon, export=1
|
||||
movi v0.4S, #0 // accumulator
|
||||
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
|
||||
ld1 {v2.4S}, [x2], #16 // filter[0..3]
|
||||
ld1 {v3.4S}, [x1], #16 // src[4..7]
|
||||
ld1 {v4.4S}, [x2], #16 // filter[4..7]
|
||||
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
|
||||
fmla v0.4S, v3.4S, v4.4S // accumulator += src[4..7] * filter[4..7]
|
||||
subs w3, w3, #8 // filter_length -= 8
|
||||
b.gt 1b // loop until filter_length
|
||||
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.S}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
movi v0.4s, #0 // accumulator
|
||||
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
|
||||
ld1 {v2.4s}, [x2], #16 // filter[0..3]
|
||||
ld1 {v3.4s}, [x1], #16 // src[4..7]
|
||||
ld1 {v4.4s}, [x2], #16 // filter[4..7]
|
||||
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
|
||||
fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
|
||||
subs w3, w3, #8 // filter_length -= 8
|
||||
b.gt 1b // loop until filter_length
|
||||
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.s}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_resample_common_apply_filter_x4_s16_neon, export=1
|
||||
movi v0.4S, #0 // accumulator
|
||||
1: ld1 {v1.4H}, [x1], #8 // src[0..3]
|
||||
ld1 {v2.4H}, [x2], #8 // filter[0..3]
|
||||
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
|
||||
subs w3, w3, #4 // filter_length -= 4
|
||||
b.gt 1b // loop until filter_length
|
||||
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.S}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
movi v0.4s, #0 // accumulator
|
||||
1: ld1 {v1.4h}, [x1], #8 // src[0..3]
|
||||
ld1 {v2.4h}, [x2], #8 // filter[0..3]
|
||||
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
|
||||
subs w3, w3, #4 // filter_length -= 4
|
||||
b.gt 1b // loop until filter_length
|
||||
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.s}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_resample_common_apply_filter_x8_s16_neon, export=1
|
||||
movi v0.4S, #0 // accumulator
|
||||
1: ld1 {v1.8H}, [x1], #16 // src[0..7]
|
||||
ld1 {v2.8H}, [x2], #16 // filter[0..7]
|
||||
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
|
||||
smlal2 v0.4S, v1.8H, v2.8H // accumulator += src[4..7] * filter[4..7]
|
||||
subs w3, w3, #8 // filter_length -= 8
|
||||
b.gt 1b // loop until filter_length
|
||||
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.S}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
movi v0.4s, #0 // accumulator
|
||||
1: ld1 {v1.8h}, [x1], #16 // src[0..7]
|
||||
ld1 {v2.8h}, [x2], #16 // filter[0..7]
|
||||
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
|
||||
smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
|
||||
subs w3, w3, #8 // filter_length -= 8
|
||||
b.gt 1b // loop until filter_length
|
||||
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
|
||||
st1 {v0.s}[0], [x0], #4 // write accumulator
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@@ -127,4 +127,3 @@ struct Resampler const swri_soxr_resampler={
|
||||
create, destroy, process, flush, NULL /* set_compensation */, get_delay,
|
||||
invert_initial_buffer, get_out_samples
|
||||
};
|
||||
|
||||
|
||||
@@ -156,4 +156,3 @@ int swr_convert_frame(SwrContext *s,
|
||||
|
||||
return convert_frame(s, out, in);
|
||||
}
|
||||
|
||||
|
||||
+55
-55
@@ -21,60 +21,60 @@
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
function ff_hscale_8_to_15_neon, export=1
|
||||
sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
|
||||
1: ldr w8, [x5], #4 // filterPos[idx]
|
||||
ldr w0, [x5], #4 // filterPos[idx + 1]
|
||||
ldr w11, [x5], #4 // filterPos[idx + 2]
|
||||
ldr w9, [x5], #4 // filterPos[idx + 3]
|
||||
mov x16, x4 // filter0 = filter
|
||||
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
||||
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||
movi v0.2D, #0 // val sum part 1 (for dst[0])
|
||||
movi v1.2D, #0 // val sum part 2 (for dst[1])
|
||||
movi v2.2D, #0 // val sum part 3 (for dst[2])
|
||||
movi v3.2D, #0 // val sum part 4 (for dst[3])
|
||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
||||
add x8, x3, w0, UXTW // srcp + filterPos[1]
|
||||
add x0, x3, w11, UXTW // srcp + filterPos[2]
|
||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
|
||||
mov w15, w6 // filterSize counter
|
||||
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
||||
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
|
||||
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
|
||||
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
|
||||
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
|
||||
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
|
||||
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
|
||||
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
|
||||
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
|
||||
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
|
||||
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
|
||||
subs w15, w15, #8 // j -= 8: processed 8/filterSize
|
||||
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
|
||||
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
|
||||
b.gt 2b // inner loop if filterSize not consumed completely
|
||||
addp v0.4S, v0.4S, v0.4S // part0 horizontal pair adding
|
||||
addp v1.4S, v1.4S, v1.4S // part1 horizontal pair adding
|
||||
addp v2.4S, v2.4S, v2.4S // part2 horizontal pair adding
|
||||
addp v3.4S, v3.4S, v3.4S // part3 horizontal pair adding
|
||||
addp v0.4S, v0.4S, v0.4S // part0 horizontal pair adding
|
||||
addp v1.4S, v1.4S, v1.4S // part1 horizontal pair adding
|
||||
addp v2.4S, v2.4S, v2.4S // part2 horizontal pair adding
|
||||
addp v3.4S, v3.4S, v3.4S // part3 horizontal pair adding
|
||||
zip1 v0.4S, v0.4S, v1.4S // part01 = zip values from part0 and part1
|
||||
zip1 v2.4S, v2.4S, v3.4S // part23 = zip values from part2 and part3
|
||||
mov v0.d[1], v2.d[0] // part0123 = zip values from part01 and part23
|
||||
subs w2, w2, #4 // dstW -= 4
|
||||
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4H}, [x1], #8 // write to destination part0123
|
||||
b.gt 1b // loop until end of line
|
||||
sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
|
||||
1: ldr w8, [x5], #4 // filterPos[idx]
|
||||
ldr w0, [x5], #4 // filterPos[idx + 1]
|
||||
ldr w11, [x5], #4 // filterPos[idx + 2]
|
||||
ldr w9, [x5], #4 // filterPos[idx + 3]
|
||||
mov x16, x4 // filter0 = filter
|
||||
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
||||
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
||||
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
||||
add x11, x3, w9, uxtw // srcp + filterPos[3]
|
||||
mov w15, w6 // filterSize counter
|
||||
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
||||
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||
ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
|
||||
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
|
||||
uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
|
||||
smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
|
||||
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
|
||||
uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
|
||||
smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
|
||||
uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
|
||||
smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
|
||||
smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
|
||||
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
|
||||
subs w15, w15, #8 // j -= 8: processed 8/filterSize
|
||||
uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
|
||||
smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
|
||||
smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
|
||||
b.gt 2b // inner loop if filterSize not consumed completely
|
||||
addp v0.4s, v0.4s, v0.4s // part0 horizontal pair adding
|
||||
addp v1.4s, v1.4s, v1.4s // part1 horizontal pair adding
|
||||
addp v2.4s, v2.4s, v2.4s // part2 horizontal pair adding
|
||||
addp v3.4s, v3.4s, v3.4s // part3 horizontal pair adding
|
||||
addp v0.4s, v0.4s, v0.4s // part0 horizontal pair adding
|
||||
addp v1.4s, v1.4s, v1.4s // part1 horizontal pair adding
|
||||
addp v2.4s, v2.4s, v2.4s // part2 horizontal pair adding
|
||||
addp v3.4s, v3.4s, v3.4s // part3 horizontal pair adding
|
||||
zip1 v0.4s, v0.4s, v1.4s // part01 = zip values from part0 and part1
|
||||
zip1 v2.4s, v2.4s, v3.4s // part23 = zip values from part2 and part3
|
||||
mov v0.d[1], v2.d[0] // part0123 = zip values from part01 and part23
|
||||
subs w2, w2, #4 // dstW -= 4
|
||||
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4h}, [x1], #8 // write to destination part0123
|
||||
b.gt 1b // loop until end of line
|
||||
ret
|
||||
endfunc
|
||||
|
||||
+32
-32
@@ -21,38 +21,38 @@
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
function ff_yuv2planeX_8_neon, export=1
|
||||
ld1 {v0.8B}, [x5] // load 8x8-bit dither
|
||||
cbz w6, 1f // check if offsetting present
|
||||
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
|
||||
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
|
||||
ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1)
|
||||
ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2)
|
||||
mov x7, #0 // i = 0
|
||||
2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
|
||||
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
|
||||
mov w8, w1 // tmpfilterSize = filterSize
|
||||
mov x9, x2 // srcp = src
|
||||
mov x10, x0 // filterp = filter
|
||||
3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
|
||||
add x11, x11, x7, lsl #1 // &src[j ][i]
|
||||
add x12, x12, x7, lsl #1 // &src[j+1][i]
|
||||
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
|
||||
ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
|
||||
ld1r {v7.8H}, [x10], #2 // read 1x16-bit coeff X at filter[j ] and duplicate across lanes
|
||||
ld1r {v16.8H}, [x10], #2 // read 1x16-bit coeff Y at filter[j+1] and duplicate across lanes
|
||||
smlal v3.4S, v5.4H, v7.4H // val0 += {A,B,C,D} * X
|
||||
smlal2 v4.4S, v5.8H, v7.8H // val1 += {E,F,G,H} * X
|
||||
smlal v3.4S, v6.4H, v16.4H // val0 += {I,J,K,L} * Y
|
||||
smlal2 v4.4S, v6.8H, v16.8H // val1 += {M,N,O,P} * Y
|
||||
subs w8, w8, #2 // tmpfilterSize -= 2
|
||||
b.gt 3b // loop until filterSize consumed
|
||||
ld1 {v0.8b}, [x5] // load 8x8-bit dither
|
||||
cbz w6, 1f // check if offsetting present
|
||||
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
|
||||
1: uxtl v0.8h, v0.8b // extend dither to 16-bit
|
||||
ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
|
||||
ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
|
||||
mov x7, #0 // i = 0
|
||||
2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
|
||||
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
|
||||
mov w8, w1 // tmpfilterSize = filterSize
|
||||
mov x9, x2 // srcp = src
|
||||
mov x10, x0 // filterp = filter
|
||||
3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
|
||||
add x11, x11, x7, lsl #1 // &src[j ][i]
|
||||
add x12, x12, x7, lsl #1 // &src[j+1][i]
|
||||
ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
|
||||
ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
|
||||
ld1r {v7.8h}, [x10], #2 // read 1x16-bit coeff X at filter[j ] and duplicate across lanes
|
||||
ld1r {v16.8h}, [x10], #2 // read 1x16-bit coeff Y at filter[j+1] and duplicate across lanes
|
||||
smlal v3.4s, v5.4h, v7.4h // val0 += {A,B,C,D} * X
|
||||
smlal2 v4.4s, v5.8h, v7.8h // val1 += {E,F,G,H} * X
|
||||
smlal v3.4s, v6.4h, v16.4h // val0 += {I,J,K,L} * Y
|
||||
smlal2 v4.4s, v6.8h, v16.8h // val1 += {M,N,O,P} * Y
|
||||
subs w8, w8, #2 // tmpfilterSize -= 2
|
||||
b.gt 3b // loop until filterSize consumed
|
||||
|
||||
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
|
||||
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
|
||||
uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
|
||||
st1 {v3.8b}, [x3], #8 // write to destination
|
||||
subs w4, w4, #8 // dstW -= 8
|
||||
add x7, x7, #8 // i += 8
|
||||
b.gt 2b // loop until width consumed
|
||||
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
|
||||
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
|
||||
uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
|
||||
st1 {v3.8b}, [x3], #8 // write to destination
|
||||
subs w4, w4, #8 // dstW -= 8
|
||||
add x7, x7, #8 // i += 8
|
||||
b.gt 2b // loop until width consumed
|
||||
ret
|
||||
endfunc
|
||||
|
||||
+115
-115
@@ -23,185 +23,185 @@
|
||||
|
||||
.macro load_yoff_ycoeff yoff ycoeff
|
||||
#if defined(__APPLE__)
|
||||
ldp w9, w10, [sp, #\yoff]
|
||||
ldp w9, w10, [sp, #\yoff]
|
||||
#else
|
||||
ldr w9, [sp, #\yoff]
|
||||
ldr w10, [sp, #\ycoeff]
|
||||
ldr w9, [sp, #\yoff]
|
||||
ldr w10, [sp, #\ycoeff]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro load_args_nv12
|
||||
ldr x8, [sp] // table
|
||||
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
|
||||
ld1 {v1.1D}, [x8]
|
||||
dup v0.8H, w10
|
||||
dup v3.8H, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
|
||||
neg w11, w0
|
||||
ldr x8, [sp] // table
|
||||
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
|
||||
neg w11, w0
|
||||
.endm
|
||||
|
||||
.macro load_args_nv21
|
||||
load_args_nv12
|
||||
load_args_nv12
|
||||
.endm
|
||||
|
||||
.macro load_args_yuv420p
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
|
||||
ld1 {v1.1D}, [x8]
|
||||
dup v0.8H, w10
|
||||
dup v3.8H, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
lsr w11, w0, #1
|
||||
neg w11, w11
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
lsr w11, w0, #1
|
||||
neg w11, w11
|
||||
.endm
|
||||
|
||||
.macro load_args_yuv422p
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
|
||||
ld1 {v1.1D}, [x8]
|
||||
dup v0.8H, w10
|
||||
dup v3.8H, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
.endm
|
||||
|
||||
.macro load_chroma_nv12
|
||||
ld2 {v16.8B, v17.8B}, [x6], #16
|
||||
ushll v18.8H, v16.8B, #3
|
||||
ushll v19.8H, v17.8B, #3
|
||||
ld2 {v16.8b, v17.8b}, [x6], #16
|
||||
ushll v18.8h, v16.8b, #3
|
||||
ushll v19.8h, v17.8b, #3
|
||||
.endm
|
||||
|
||||
.macro load_chroma_nv21
|
||||
ld2 {v16.8B, v17.8B}, [x6], #16
|
||||
ushll v19.8H, v16.8B, #3
|
||||
ushll v18.8H, v17.8B, #3
|
||||
ld2 {v16.8b, v17.8b}, [x6], #16
|
||||
ushll v19.8h, v16.8b, #3
|
||||
ushll v18.8h, v17.8b, #3
|
||||
.endm
|
||||
|
||||
.macro load_chroma_yuv420p
|
||||
ld1 {v16.8B}, [ x6], #8
|
||||
ld1 {v17.8B}, [x13], #8
|
||||
ushll v18.8H, v16.8B, #3
|
||||
ushll v19.8H, v17.8B, #3
|
||||
ld1 {v16.8b}, [ x6], #8
|
||||
ld1 {v17.8b}, [x13], #8
|
||||
ushll v18.8h, v16.8b, #3
|
||||
ushll v19.8h, v17.8b, #3
|
||||
.endm
|
||||
|
||||
.macro load_chroma_yuv422p
|
||||
load_chroma_yuv420p
|
||||
load_chroma_yuv420p
|
||||
.endm
|
||||
|
||||
.macro increment_nv12
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width
|
||||
add x6, x6, w16, SXTW // srcC += incC
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width
|
||||
add x6, x6, w16, sxtw // srcC += incC
|
||||
.endm
|
||||
|
||||
.macro increment_nv21
|
||||
increment_nv12
|
||||
increment_nv12
|
||||
.endm
|
||||
|
||||
.macro increment_yuv420p
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2
|
||||
csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2
|
||||
add x6, x6, w16, SXTW // srcU += incU
|
||||
add x13, x13, w17, SXTW // srcV += incV
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2
|
||||
csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2
|
||||
add x6, x6, w16, sxtw // srcU += incU
|
||||
add x13, x13, w17, sxtw // srcV += incV
|
||||
.endm
|
||||
|
||||
.macro increment_yuv422p
|
||||
add x6, x6, w7, SXTW // srcU += incU
|
||||
add x13, x13, w14, SXTW // srcV += incV
|
||||
add x6, x6, w7, sxtw // srcU += incU
|
||||
add x13, x13, w14, sxtw // srcV += incV
|
||||
.endm
|
||||
|
||||
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
||||
add v20.8H, v26.8H, v20.8H // Y1 + R1
|
||||
add v21.8H, v27.8H, v21.8H // Y2 + R2
|
||||
add v22.8H, v26.8H, v22.8H // Y1 + G1
|
||||
add v23.8H, v27.8H, v23.8H // Y2 + G2
|
||||
add v24.8H, v26.8H, v24.8H // Y1 + B1
|
||||
add v25.8H, v27.8H, v25.8H // Y2 + B2
|
||||
sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1)
|
||||
sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R1) >> 1)
|
||||
sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1)
|
||||
sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G1) >> 1)
|
||||
sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1)
|
||||
sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B1) >> 1)
|
||||
movi \a1, #255
|
||||
movi \a2, #255
|
||||
add v20.8h, v26.8h, v20.8h // Y1 + R1
|
||||
add v21.8h, v27.8h, v21.8h // Y2 + R2
|
||||
add v22.8h, v26.8h, v22.8h // Y1 + G1
|
||||
add v23.8h, v27.8h, v23.8h // Y2 + G2
|
||||
add v24.8h, v26.8h, v24.8h // Y1 + B1
|
||||
add v25.8h, v27.8h, v25.8h // Y2 + B2
|
||||
sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
|
||||
sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1)
|
||||
sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
|
||||
sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1)
|
||||
sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
|
||||
sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1)
|
||||
movi \a1, #255
|
||||
movi \a2, #255
|
||||
.endm
|
||||
|
||||
.macro declare_func ifmt ofmt
|
||||
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
load_args_\ifmt
|
||||
load_args_\ifmt
|
||||
1:
|
||||
mov w8, w0 // w8 = width
|
||||
mov w8, w0 // w8 = width
|
||||
2:
|
||||
movi v5.8H, #4, lsl #8 // 128 * (1<<3)
|
||||
load_chroma_\ifmt
|
||||
sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3)
|
||||
sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3)
|
||||
sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R)
|
||||
sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g
|
||||
sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g
|
||||
add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G)
|
||||
sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B)
|
||||
zip2 v21.8H, v20.8H, v20.8H // R2
|
||||
zip1 v20.8H, v20.8H, v20.8H // R1
|
||||
zip2 v23.8H, v22.8H, v22.8H // G2
|
||||
zip1 v22.8H, v22.8H, v22.8H // G1
|
||||
zip2 v25.8H, v24.8H, v24.8H // B2
|
||||
zip1 v24.8H, v24.8H, v24.8H // B1
|
||||
ld1 {v2.16B}, [x4], #16 // load luma
|
||||
ushll v26.8H, v2.8B, #3 // Y1*(1<<3)
|
||||
ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3)
|
||||
sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset
|
||||
sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset
|
||||
sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
|
||||
sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
|
||||
movi v5.8h, #4, lsl #8 // 128 * (1<<3)
|
||||
load_chroma_\ifmt
|
||||
sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
|
||||
sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
|
||||
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
|
||||
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
|
||||
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
|
||||
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
|
||||
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
|
||||
zip2 v21.8h, v20.8h, v20.8h // R2
|
||||
zip1 v20.8h, v20.8h, v20.8h // R1
|
||||
zip2 v23.8h, v22.8h, v22.8h // G2
|
||||
zip1 v22.8h, v22.8h, v22.8h // G1
|
||||
zip2 v25.8h, v24.8h, v24.8h // B2
|
||||
zip1 v24.8h, v24.8h, v24.8h // B1
|
||||
ld1 {v2.16b}, [x4], #16 // load luma
|
||||
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
|
||||
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
|
||||
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
|
||||
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
|
||||
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
|
||||
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
|
||||
|
||||
.ifc \ofmt,argb // 1 2 3 0
|
||||
compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
|
||||
compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,rgba // 0 1 2 3
|
||||
compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
|
||||
compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,abgr // 3 2 1 0
|
||||
compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
|
||||
compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,bgra // 2 1 0 3
|
||||
compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
|
||||
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
|
||||
.endif
|
||||
|
||||
st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
|
||||
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
|
||||
subs w8, w8, #16 // width -= 16
|
||||
b.gt 2b
|
||||
add x2, x2, w3, SXTW // dst += padding
|
||||
add x4, x4, w5, SXTW // srcY += paddingY
|
||||
increment_\ifmt
|
||||
subs w1, w1, #1 // height -= 1
|
||||
b.gt 1b
|
||||
ret
|
||||
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
|
||||
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
|
||||
subs w8, w8, #16 // width -= 16
|
||||
b.gt 2b
|
||||
add x2, x2, w3, sxtw // dst += padding
|
||||
add x4, x4, w5, sxtw // srcY += paddingY
|
||||
increment_\ifmt
|
||||
subs w1, w1, #1 // height -= 1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro declare_rgb_funcs ifmt
|
||||
declare_func \ifmt, argb
|
||||
declare_func \ifmt, rgba
|
||||
declare_func \ifmt, abgr
|
||||
declare_func \ifmt, bgra
|
||||
declare_func \ifmt, argb
|
||||
declare_func \ifmt, rgba
|
||||
declare_func \ifmt, abgr
|
||||
declare_func \ifmt, bgra
|
||||
.endm
|
||||
|
||||
declare_rgb_funcs nv12
|
||||
|
||||
@@ -69,4 +69,3 @@ int ff_init_gamma_convert(SwsFilterDescriptor *desc, SwsSlice * src, uint16_t *t
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
+2
-2
@@ -909,7 +909,7 @@ yuv2ya16_X_c_template(SwsContext *c, const int16_t *lumFilter,
|
||||
int A = 0xffff;
|
||||
|
||||
for (j = 0; j < lumFilterSize; j++)
|
||||
Y += lumSrc[j][i] * lumFilter[j];
|
||||
Y += lumSrc[j][i] * (unsigned)lumFilter[j];
|
||||
|
||||
Y >>= 15;
|
||||
Y += (1<<3) + 0x8000;
|
||||
@@ -918,7 +918,7 @@ yuv2ya16_X_c_template(SwsContext *c, const int16_t *lumFilter,
|
||||
if (hasAlpha) {
|
||||
A = -0x40000000 + (1<<14);
|
||||
for (j = 0; j < lumFilterSize; j++)
|
||||
A += alpSrc[j][i] * lumFilter[j];
|
||||
A += alpSrc[j][i] * (unsigned)lumFilter[j];
|
||||
|
||||
A >>= 15;
|
||||
A += 0x8000;
|
||||
|
||||
@@ -318,5 +318,3 @@ void ff_init_vscale_pfn(SwsContext *c,
|
||||
lumCtx->pfn.yuv2anyX = yuv2anyX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -26,6 +26,23 @@
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
|
||||
#undef EMMS_IF_MMX
|
||||
|
||||
#if defined(COMPILE_TEMPLATE_MMX) || defined(COMPILE_TEMPLATE_MMXEXT)
|
||||
// Don't use emms_c() directly as it may entail an av_get_cpu_flags() call.
|
||||
#if HAVE_MMX_INLINE
|
||||
# define EMMS_IF_MMX __asm__ volatile ("emms" ::: "memory");
|
||||
#elif HAVE_MM_EMPTY
|
||||
# include <mmintrin.h>
|
||||
# define EMMS_IF_MMX _mm_empty();
|
||||
#else
|
||||
# include "libavutil/x86/emms.h"
|
||||
# define EMMS_IF_MMX emms_c();
|
||||
#endif
|
||||
#else
|
||||
#define EMMS_IF_MMX
|
||||
#endif
|
||||
|
||||
#define YUV2RGB_LOOP(depth) \
|
||||
h_size = (c->dstW + 7) & ~7; \
|
||||
if (h_size * depth > FFABS(dstStride[0])) \
|
||||
@@ -84,6 +101,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -104,6 +122,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -118,6 +137,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -132,6 +152,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -146,6 +167,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
|
||||
const uint8_t *pa = src[3] + y * srcStride[3];
|
||||
RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -161,6 +183,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
|
||||
const uint8_t *pa = src[3] + y * srcStride[3];
|
||||
RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
#endif
|
||||
@@ -176,6 +199,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
@@ -190,6 +214,6 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
|
||||
|
||||
RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
|
||||
}
|
||||
EMMS_IF_MMX
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
|
||||
@@ -111,4 +111,3 @@ outpoint 00:00.40
|
||||
|
||||
file %SRCFILE%
|
||||
inpoint 00:00.40
|
||||
|
||||
|
||||
@@ -32,4 +32,3 @@ fate-ffprobe_xml: CMD = run $(FFPROBE_COMMAND) -of xml
|
||||
FATE_FFPROBE += $(FATE_FFPROBE-yes)
|
||||
|
||||
fate-ffprobe: $(FATE_FFPROBE)
|
||||
|
||||
|
||||
@@ -7,5 +7,3 @@ tests/data/add_keyframe_index.flv: ffmpeg$(PROGSSUF)$(EXESUF) | tests/data
|
||||
FATE_AFILTER-$(call ALLYES, FLV_MUXER FLV_DEMUXER AVDEVICE TESTSRC_FILTER LAVFI_INDEV FLV_ENCODER) += fate-flv-add_keyframe_index
|
||||
fate-flv-add_keyframe_index: tests/data/add_keyframe_index.flv
|
||||
fate-flv-add_keyframe_index: CMD = ffmetadata -flags +bitexact -i $(TARGET_PATH)/tests/data/add_keyframe_index.flv
|
||||
|
||||
|
||||
|
||||
@@ -30,4 +30,3 @@ FATE_SAMPLES_LOSSLESS_AUDIO += $(FATE_SAMPLES_LOSSLESS_AUDIO-yes)
|
||||
|
||||
FATE_SAMPLES_FFMPEG += $(FATE_SAMPLES_LOSSLESS_AUDIO)
|
||||
fate-lossless-audio: $(FATE_SAMPLES_LOSSLESS_AUDIO)
|
||||
|
||||
|
||||
@@ -9,4 +9,3 @@ file %SRCFILE%
|
||||
inpoint 00:00.20
|
||||
outpoint 00:00.40
|
||||
file_packet_metadata dummy=1
|
||||
|
||||
|
||||
@@ -16,4 +16,3 @@ inpoint 00:02.20
|
||||
file %SRCFILE%
|
||||
inpoint 00:01.80
|
||||
outpoint 00:02.00
|
||||
|
||||
|
||||
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Copyright (c) 2025 Martin Storsjo
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# Run from the source tree root (the parent of the directory holding this
# script), so that the globs and the ./tools path below resolve regardless
# of where the script was invoked from.
cd "$(dirname "$0")/.." || exit 1

# With --apply, misformatted files are rewritten in place; otherwise a diff
# of the required changes is printed. Initialize the flag explicitly so that
# an "apply" variable inherited from the environment can't switch the mode.
apply=
if [ "$1" = "--apply" ]; then
    apply=1
fi

# Exit status: nonzero if the formatter failed or any file needed changes.
ret=0

for i in */aarch64/*.S */aarch64/*/*.S; do
    # A glob that matched nothing is left as the literal pattern; skip it.
    if ! [ -f "$i" ]; then
        continue
    fi
    case $i in
        libavcodec/aarch64/h264idct_neon.S|libavcodec/aarch64/h26x/epel_neon.S|libavcodec/aarch64/h26x/qpel_neon.S|libavcodec/aarch64/vc1dsp_neon.S)
            # Skip files with known (and tolerated) deviations from the tool.
            continue
            ;;
    esac
    # If the formatter itself fails, tmp.S may be empty or truncated; record
    # the failure and move on instead of diffing against it or, with
    # --apply, overwriting the source file with it.
    ./tools/indent_arm_assembly.pl < "$i" > tmp.S || { ret=$?; continue; }
    if ! git diff --quiet --no-index "$i" tmp.S; then
        if [ -n "$apply" ]; then
            mv tmp.S "$i"
        else
            git --no-pager diff --no-index "$i" tmp.S
        fi
        ret=1
    fi
done

rm -f tmp.S

exit $ret
|
||||
Executable
+243
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2025 Martin Storsjo
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# A script for reformatting ARM/AArch64 assembly according to the following
|
||||
# style:
|
||||
# - Instructions start after 8 columns, operands start after 24 columns
|
||||
# - Vector register layouts and modifiers like "uxtw" are written in lowercase
|
||||
# - Optionally align operand columns vertically according to their
|
||||
# maximum width (accommodating for e.g. x0 vs x10, or v0.8b vs v16.16b).
|
||||
#
|
||||
# The script can be executed as "indent_arm_assembly.pl file [outfile]".
|
||||
# If no outfile is specified, the given file is overwritten in place.
|
||||
#
|
||||
# Alternatively, if no file parameters are given, the script reads input
|
||||
# code on stdin, and outputs the reformatted code on stdout.
|
||||
|
||||
use strict;
|
||||
|
||||
# Formatting settings, overridable from the command line.
my $indent_operands = 0;    # -operands: realign the operand columns
my $instr_indent = 8;       # -indent <n>: column where instructions start
my $operand_indent = 24;    # -operand-indent <n>: column where operands start
my $match_indent = 0;       # -match-indent: normalize to the existing indent
my $file;                   # first non-option argument: the input file
my $outfile;                # second non-option argument: the output file

# Fetch the value following the option named $_[0] from @ARGV, insisting
# that it is present and a non-negative integer. Without this check, a
# missing or misspelled value would silently be treated as 0.
sub option_value {
    my ($name) = @_;
    my $value = shift @ARGV;
    if (!defined($value) || $value !~ /^\d+$/) {
        die "Option $name requires a non-negative integer argument\n";
    }
    return $value;
}

while (@ARGV) {
    my $opt = shift @ARGV;

    if ($opt eq "-operands") {
        $indent_operands = 1;
    } elsif ($opt eq "-indent") {
        $instr_indent = option_value($opt);
    } elsif ($opt eq "-operand-indent") {
        $operand_indent = option_value($opt);
    } elsif ($opt eq "-match-indent") {
        $match_indent = 1;
    } elsif (!$file) {
        # Anything that isn't a known option is taken as a file name; the
        # first one is the input, the second one the output.
        $file = $opt;
    } elsif (!$outfile) {
        $outfile = $opt;
    } else {
        die "Unrecognized parameter $opt\n";
    }
}

# The operands come after the instruction, so their column can't be to the
# left of the instruction column.
if ($operand_indent < $instr_indent) {
    die "Can't indent operands to $operand_indent while indenting " .
        "instructions to $instr_indent\n";
}
|
||||
|
||||
# Build a run of blanks: the result is made up of as many space characters
# as the first argument says.
sub spaces {
    my ($count) = @_;
    return join("", (" ") x $count);
}
|
||||
|
||||
# Rebuild a comma separated operand list so that each operand occupies a
# column of fixed width.
#
# Parameters:
#   $_[0] - the operand part of an instruction line, e.g. "x0, x1, x2"
#   $_[1] - the column width, i.e. the widest operand to make room for
#
# Returns the operands with surrounding whitespace trimmed, separated by a
# comma and a space, and with every operand but the last padded to $_[1]
# characters. If an operand that is followed by another one is wider than
# $_[1], the input is returned untouched.
sub indentcolumns {
    my ($input, $chars) = @_;
    my @operands = split(/,/, $input);
    my $ret = "";
    for my $i (0 .. $#operands) {
        my $cur = $operands[$i];
        # Trim out leading/trailing whitespace
        $cur =~ s/^\s+|\s+$//g;
        $ret .= $cur;

        # Nothing follows the last operand, so it needs no separator.
        next if $i == $#operands;

        my $len = length($cur);
        if ($len > $chars) {
            # If this operand was too wide for the intended column width,
            # don't try to realign the line at all, just return the input
            # untouched.
            return $input;
        }
        my $pad = $chars - $len;
        if ($operands[$i + 1] =~ /[su]xt[bhw]|[la]s[lr]/i) {
            # If the next item isn't a regular operand, but a modifier,
            # don't try to align that. E.g. "add x0, x0, w1, uxtw #1".
            # The match ignores case, as this runs before modifiers are
            # converted to lowercase; "UXTW" must be treated like "uxtw"
            # for the result to be stable when the script is run again.
            $pad = 0;
        }
        # Add a comma and whitespace to align the next operand.
        $ret .= "," . (" " x (1 + $pad));
    }
    return $ret;
}
|
||||
|
||||
# Realign the operands part of an instruction line, making each operand
# take up the maximum width for that kind of operand.
#
# Parameters:
#   $_[0] - the operand part of an instruction line
#
# Returns the realigned operands, or the input unchanged for lines that
# are deliberately left alone (no commas, register ranges, memory operands).
sub columns {
    my $rest = $_[0];
    if ($rest !~ /,/) {
        # No commas, no operands to split and align
        return $rest;
    }
    if ($rest =~ /\{|\W\[/) {
        # Check for instructions that use register ranges, like {v0.8b,v1.8b}
        # or mem address operands, like "ldr x0, [sp]" - we skip trying to
        # realign these.
        return $rest;
    }
    if ($rest =~ /v[0-9]+\.[0-9]+[bhsd]/i) {
        # If we have references to aarch64 style vector registers, like
        # v0.8b, then align all operands to the maximum width of such
        # operands - v16.16b.
        #
        # The match ignores case, as this runs before the layout is
        # converted to lowercase; "v0.8B" needs the same column width as
        # "v0.8b".
        #
        # TODO: Ideally, we'd handle mixed operand types individually.
        return indentcolumns($rest, 7);
    }
    # Indent operands according to the maximum width of regular registers,
    # like x10.
    return indentcolumns($rest, 3);
}
|
||||
|
||||
# Handles for the input and output streams, plus the name of the temporary
# file used when the input file is rewritten in place.
my $in;
my $out;
my $tempfile;

if ($file) {
    # Use three-argument open with lexical handles, so that characters in
    # the file names can never be interpreted as an open mode or a pipe.
    open($in, "<", $file) or die "Unable to open $file: $!";
    if ($outfile) {
        open($out, ">", $outfile) or die "Unable to open $outfile: $!";
    } else {
        # No separate output file; write to a temporary file that replaces
        # the input file once everything has been written.
        $tempfile = "$file.tmp";
        open($out, ">", $tempfile) or die "Unable to open $tempfile: $!";
    }
} else {
    $in = *STDIN;
    $out = *STDOUT;
}

while (<$in>) {
    # Strip the line terminator; it is added back when printing.
    chomp;
    if (/^([\.\w\d]+:)?(\s+)([\w\\][\w\\\.]*)(?:(\s+)(.*)|$)/) {
        # The line is an optional label, whitespace, an instruction name
        # and optionally whitespace followed by the rest of the line.
        # Groups that didn't take part in the match are undefined; treat
        # them as empty strings.
        my $label = defined($1) ? $1 : "";
        my $indent = $2;
        my $instr = $3;
        my $origspace = defined($4) ? $4 : "";
        my $rest = defined($5) ? $5 : "";

        # The column where the operands start in the original line.
        my $orig_operand_indent = length($label) + length($indent) +
                                  length($instr) + length($origspace);

        if ($indent_operands) {
            $rest = columns($rest);
        }

        my $size = $instr_indent;
        if ($match_indent) {
            # Try to check the current attempted indent size and normalize
            # to it; match existing indent sizes of 4, 8, 10 and 12 columns.
            my $cur_indent = length($label) + length($indent);
            if ($cur_indent >= 3 && $cur_indent <= 5) {
                $size = 4;
            } elsif ($cur_indent >= 7 && $cur_indent <= 9) {
                $size = 8;
            } elsif ($cur_indent == 10 || $cur_indent == 12) {
                $size = $cur_indent;
            }
        }
        if (length($label) >= $size) {
            # Not enough space for the label; just add a space between the label
            # and the instruction.
            $indent = " ";
        } else {
            $indent = spaces($size - length($label));
        }

        my $instr_end = length($label) + length($indent) + length($instr);
        # The column the operands should start at. This is kept separate
        # from the amount of padding, so that the end of the instruction
        # is subtracted exactly once, also when -match-indent doesn't
        # recognize the existing operand column.
        my $operand_column = $operand_indent;
        if ($match_indent) {
            # Check how the operands currently seem to be indented.
            my $cur_indent = $orig_operand_indent;
            if ($cur_indent >= 11 && $cur_indent <= 13) {
                $operand_column = 12;
            } elsif ($cur_indent >= 14 && $cur_indent <= 17) {
                $operand_column = 16;
            } elsif ($cur_indent >= 18 && $cur_indent <= 22) {
                $operand_column = 20;
            } elsif ($cur_indent >= 23 && $cur_indent <= 27) {
                $operand_column = 24;
            }
        }
        $size = $operand_column - $instr_end;
        # If the instruction already extends past the operand column,
        # separate the operands with a single space.
        my $operand_space = " ";
        if ($size > 0) {
            $operand_space = spaces($size);
        }

        # Lowercase the aarch64 vector layout description, .8B -> .8b
        $rest =~ s/(\.[84216]*[BHSD])/lc($1)/ge;
        # Lowercase modifiers like "uxtw" or "lsl"
        $rest =~ s/([SU]XT[BWH]|[LA]S[LR])/lc($1)/ge;

        # Reassemble the line
        if ($rest eq "") {
            $_ = $label . $indent . $instr;
        } else {
            $_ = $label . $indent . $instr . $operand_space . $rest;
        }
    }
    # Lines that didn't match are printed back unchanged.
    print $out $_ . "\n";
}

if ($file) {
    close($in);
    # Buffered write errors only surface when closing, so check the result
    # before the output is allowed to replace anything.
    my $written = $outfile ? $outfile : $tempfile;
    close($out) or die "Unable to write $written: $!";
}
if ($tempfile) {
    rename($tempfile, $file)
        or die "Unable to rename $tempfile to $file: $!";
}
|
||||
Reference in New Issue
Block a user