Bump minor versions after release branch

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2021-03-20 01:02:11 +01:00
694 changed files with 6002 additions and 10759 deletions
@@ -1,23 +0,0 @@
-exclude: ^tests/ref/
-
-repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v5.0.0
-  hooks:
-    - id: check-case-conflict
-    - id: check-executables-have-shebangs
-    - id: check-illegal-windows-names
-    - id: check-shebang-scripts-are-executable
-    - id: check-yaml
-    - id: end-of-file-fixer
-    - id: fix-byte-order-marker
-    - id: mixed-line-ending
-    - id: trailing-whitespace
- repo: local
-  hooks:
-    - id: aarch64-asm-indent
-      name: fix aarch64 assembly indentation
-      files: ^.*/aarch64/.*\.S$
-      language: script
-      entry: ./tools/check_arm_indent.sh --apply
-      pass_filenames: false
@@ -1,29 +0,0 @@
-name: Lint
-
-on:
-  push:
-    branches:
-      - release/4.4
-  pull_request:
-
-jobs:
-  lint:
-    name: Pre-Commit
-    runs-on: utilities
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install pre-commit CI
-        id: install
-        run: |
-            python3 -m venv ~/pre-commit
-            ~/pre-commit/bin/pip install --upgrade pip setuptools
-            ~/pre-commit/bin/pip install pre-commit
-            echo "envhash=$({ python3 --version && cat .forgejo/pre-commit/config.yaml; } | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
-      - name: Cache
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pre-commit
-          key: pre-commit-${{ steps.install.outputs.envhash }}
-      - name: Run pre-commit CI
-        run: ~/pre-commit/bin/pre-commit run -c .forgejo/pre-commit/config.yaml --show-diff-on-failure --color=always --all-files
@@ -1,80 +0,0 @@
-name: Test
-
-on:
-  push:
-    branches:
-      - release/4.4
-  pull_request:
-
-jobs:
-  run_fate:
-    name: Fate (${{ matrix.runner }}, ${{ matrix.shared }}, ${{ matrix.bits }} bit)
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux-aarch64]
-        shared: ['static']
-        bits: ['64']
-        include:
-          - runner: linux-amd64
-            shared: 'static'
-            bits: '32'
-          - runner: linux-amd64
-            shared: 'shared'
-            bits: '64'
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Configure
-        run: |
-          ./configure --enable-gpl --enable-nonfree --enable-memory-poisoning --assert-level=2 \
-              $([ "${{ matrix.bits }}" != "32" ] || echo --arch=x86_32 --extra-cflags=-m32 --extra-cxxflags=-m32 --extra-ldflags=-m32) \
-              $([ "${{ matrix.shared }}" != "shared" ] || echo --enable-shared --disable-static) \
-              && CFGRES=$? || CFGRES=$?  # keep configure's real status: the '||' capture must come last, otherwise a later assignment resets it to 0
-          cat ffbuild/config.log
-          exit $CFGRES
-      - name: Build
-        run: make -j$(nproc)
-      - name: Restore Cached Fate-Suite
-        id: cache
-        uses: actions/cache/restore@v4
-        with:
-          path: fate-suite
-          key: fate-suite
-          restore-keys: |
-            fate-suite-
-      - name: Sync Fate-Suite
-        id: fate
-        run: |
-          make fate-rsync SAMPLES=$PWD/fate-suite
-          echo "hash=$(find fate-suite -type f -printf "%P %s %T@\n" | sort | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
-      - name: Cache Fate-Suite
-        uses: actions/cache/save@v4
-        if: ${{ format('fate-suite-{0}', steps.fate.outputs.hash) != steps.cache.outputs.cache-matched-key }}
-        with:
-          path: fate-suite
-          key: fate-suite-${{ steps.fate.outputs.hash }}
-      - name: Run Fate
-        run: LD_LIBRARY_PATH="$(printf "%s:" "$PWD"/lib*)$PWD" make fate fate-build SAMPLES=$PWD/fate-suite -j$(nproc)
-  compile_only:
-    name: Fate (Win64, Build-Only)
-    strategy:
-      fail-fast: false
-      matrix:
-        image: ["ghcr.io/btbn/ffmpeg-builds/win64-gpl-4.4:latest"]
-    runs-on: linux-amd64
-    container: ${{ matrix.image }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Configure
-        run: |
-          ./configure --pkg-config-flags="--static" $FFBUILD_TARGET_FLAGS $FF_CONFIGURE \
-              --cc="$CC" --cxx="$CXX" --ar="$AR" --ranlib="$RANLIB" --nm="$NM" \
-              --extra-cflags="$FF_CFLAGS" --extra-cxxflags="$FF_CXXFLAGS" \
-              --extra-libs="$FF_LIBS" --extra-ldflags="$FF_LDFLAGS" --extra-ldexeflags="$FF_LDEXEFLAGS"
-      - name: Build
-        run: make -j$(nproc)
-      - name: Run Fate
-        run: make -j$(nproc) fate-build
@@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
-
+
  Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
@@ -111,7 +111,7 @@ modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
-
+
                  GNU LESSER GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

@@ -158,7 +158,7 @@ Library.
  You may charge a fee for the physical act of transferring a copy,
 and you may at your option offer warranty protection in exchange for a
 fee.
-
+
  2. You may modify your copy or copies of the Library or any portion
 of it, thus forming a work based on the Library, and copy and
 distribute such modifications or work under the terms of Section 1
@@ -216,7 +216,7 @@ instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
-
+
  Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
@@ -267,7 +267,7 @@ Library will still fall under Section 6.)
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
-
+
  6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
@@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
-
+
  7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
@@ -370,7 +370,7 @@ subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
-
+
  11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
@@ -422,7 +422,7 @@ conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
-
+
  14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
@@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.

                     END OF TERMS AND CONDITIONS
-
+
           How to Apply These Terms to Your New Libraries

  If you develop a new library, and you want it to be of the greatest
@@ -1,6 +1,6 @@
-See the Git history of the project (https://git.ffmpeg.org/ffmpeg) to
+See the Git history of the project (git://source.ffmpeg.org/ffmpeg) to
 get the names of people who have contributed to FFmpeg.

 To check the log, you can type the command "git log" in the FFmpeg
 source directory, or browse the online repository at
-https://git.ffmpeg.org/ffmpeg
+http://source.ffmpeg.org.
@@ -15,11 +15,3 @@ NOTICE
 ------

 - Non system dependencies (e.g. libx264, libvpx) are disabled by default.
-
-NOTICE for Package Maintainers
------------------------------
-
- - It is recommended to build FFmpeg twice: first with minimal external dependencies, so
-   that 3rd party packages which depend on FFmpeg's libavutil/libavfilter/libavcodec/libavformat
-   can then be built, and then again with full dependencies (which may in turn depend on
-   some of these 3rd party packages). This avoids circular dependencies during build.
@@ -583,12 +583,10 @@ wm4
 Releases
 ========

-7.0                                     Michael Niedermayer
-6.1                                     Michael Niedermayer
-5.1                                     Michael Niedermayer
-4.4                                     Michael Niedermayer
-3.4                                     Michael Niedermayer
 2.8                                     Michael Niedermayer
+2.7                                     Michael Niedermayer
+2.6                                     Michael Niedermayer
+2.5                                     Michael Niedermayer

 If you want to maintain an older release, please contact us

@@ -617,7 +615,6 @@ Jean Delvare                  7CA6 9F44 60F1 BDC4 1FD2 C858 A552 6B9B B3CD 4E6A
 Loren Merritt                 ABD9 08F4 C920 3F65 D8BE 35D7 1540 DAA7 060F 56DE
 Lynne                         FE50 139C 6805 72CA FD52 1F8D A2FE A5F0 3F03 4464
 Michael Niedermayer           9FF2 128B 147E F673 0BAD F133 611E C787 040B 0FAB
-                              DD1E C9E8 DE08 5C62 9B3E 1846 B18E 8928 B394 8D64
 Nicolas George                24CE 01CE 9ACC 5CEB 74D8 8D9D B063 D997 36E5 4C93
 Nikolay Aleksandrov           8978 1D8C FB71 588E 4B27 EAA8 C4F0 B5FC E011 13B1
 Panagiotis Issaris            6571 13A3 33D9 3726 F728 AA98 F643 B12E ECF3 E029
@@ -1 +1 @@
-4.4.7
+4.4.git
@@ -1,15 +0,0 @@
-
-              ┌────────────────────────────────────┐
-              │ RELEASE NOTES for FFmpeg 4.4 "Rao" │
-              └────────────────────────────────────┘
-
-   The FFmpeg Project proudly presents FFmpeg 4.4 "Rao", about 10
-   months after the release of FFmpeg 4.3.
-
-   A complete Changelog is available at the root of the project, and the
-   complete Git history on https://git.ffmpeg.org/gitweb/ffmpeg.git
-
-   We hope you will like this release as much as we enjoyed working on it, and
-   as usual, if you have any questions about it, or any FFmpeg related topic,
-   feel free to join us on the #ffmpeg IRC channel (on irc.libera.chat) or ask
-   on the mailing-lists.
@@ -416,9 +416,7 @@ Advanced options (experts only):
  --enable-hardcoded-tables use hardcoded tables instead of runtime generation
  --disable-safe-bitstream-reader
                           disable buffer boundary checking in bitreaders
-                           (This disables some security checks and can cause undefined behavior,
-                            crashes and arbitrary code execution, it may be faster, but
-                            should only be used with trusted input)
+                           (faster, but may crash)
  --sws-max-filter-size=N  the max filter size swscale uses [$sws_max_filter_size_default]

 Optimization options (experts only):
@@ -538,7 +536,7 @@ die(){

 If you think configure made a mistake, make sure you are using the latest
 version from Git.  If the latest version fails, report the problem to the
-ffmpeg-user@ffmpeg.org mailing list or IRC #ffmpeg on irc.libera.chat.
+ffmpeg-user@ffmpeg.org mailing list or IRC #ffmpeg on irc.freenode.net.
 EOF
    if disabled logging; then
        cat <<EOF
@@ -1737,6 +1735,7 @@ EXTERNAL_LIBRARY_GPL_LIST="
 EXTERNAL_LIBRARY_NONFREE_LIST="
    decklink
    libfdk_aac
+    openssl
    libtls
 "

@@ -1828,7 +1827,6 @@ EXTERNAL_LIBRARY_LIST="
    mediacodec
    openal
    opengl
-    openssl
    pocketsphinx
    vapoursynth
 "
@@ -2062,7 +2060,6 @@ ARCH_EXT_LIST_PPC="
    ldbrx
    power8
    ppc4xx
-    vec_xl
    vsx
 "

@@ -2343,7 +2340,6 @@ HAVE_LIST="
    opencl_vaapi_intel_media
    perl
    pod2man
-    posix_ioctl
    texi2html
 "

@@ -2553,7 +2549,6 @@ altivec_deps="ppc"
 dcbzl_deps="ppc"
 ldbrx_deps="ppc"
 ppc4xx_deps="ppc"
-vec_xl_deps="altivec"
 vsx_deps="altivec"
 power8_deps="vsx"

@@ -2766,7 +2761,6 @@ indeo3_decoder_select="hpeldsp"
 indeo4_decoder_select="ividsp"
 indeo5_decoder_select="ividsp"
 interplay_video_decoder_select="hpeldsp"
-ipu_decoder_select="mpegvideo"
 jpegls_decoder_select="mjpeg_decoder"
 jv_decoder_select="blockdsp"
 lagarith_decoder_select="llviddsp"
@@ -3273,7 +3267,7 @@ librav1e_encoder_deps="librav1e"
 librav1e_encoder_select="extract_extradata_bsf"
 librsvg_decoder_deps="librsvg"
 libshine_encoder_deps="libshine"
-libshine_encoder_select="audio_frame_queue mpegaudioheader"
+libshine_encoder_select="audio_frame_queue"
 libspeex_decoder_deps="libspeex"
 libspeex_encoder_deps="libspeex"
 libspeex_encoder_select="audio_frame_queue"
@@ -3370,7 +3364,6 @@ opus_muxer_select="ogg_muxer"
 psp_muxer_select="mov_muxer"
 rtp_demuxer_select="sdp_demuxer"
 rtp_muxer_select="golomb jpegtables"
-rtp_mpegts_muxer_select="mpegts_muxer rtp_muxer"
 rtpdec_select="asf_demuxer jpegtables mov_demuxer mpegts_demuxer rm_demuxer rtp_protocol srtp"
 rtsp_demuxer_select="http_protocol rtpdec"
 rtsp_muxer_select="rtp_muxer http_protocol rtp_protocol rtpenc_chain"
@@ -3711,23 +3704,23 @@ cws2fws_extralibs="zlib_extralibs"

 # libraries, in any order
 avcodec_deps="avutil"
-avcodec_suggest="libm stdatomic"
+avcodec_suggest="libm"
 avcodec_select="null_bsf"
 avdevice_deps="avformat avcodec avutil"
-avdevice_suggest="libm stdatomic"
+avdevice_suggest="libm"
 avfilter_deps="avutil"
-avfilter_suggest="libm stdatomic"
+avfilter_suggest="libm"
 avformat_deps="avcodec avutil"
-avformat_suggest="libm network zlib stdatomic"
+avformat_suggest="libm network zlib"
 avresample_deps="avutil"
 avresample_suggest="libm"
-avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt stdatomic"
+avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt"
 postproc_deps="avutil gpl"
-postproc_suggest="libm stdatomic"
+postproc_suggest="libm"
 swresample_deps="avutil"
-swresample_suggest="libm libsoxr stdatomic"
+swresample_suggest="libm libsoxr"
 swscale_deps="avutil"
-swscale_suggest="libm stdatomic"
+swscale_suggest="libm"

 avcodec_extralibs="pthreads_extralibs iconv_extralibs dxva2_extralibs"
 avfilter_extralibs="pthreads_extralibs"
@@ -5374,7 +5367,6 @@ case $target_os in
        ;;
    netbsd)
        disable symver
-        enable section_data_rel_ro
        oss_indev_extralibs="-lossaudio"
        oss_outdev_extralibs="-lossaudio"
        enabled gcc || check_ldflags -Wl,-zmuldefs
@@ -5393,7 +5385,6 @@ case $target_os in
        disable symver
        ;;
    freebsd)
-        enable section_data_rel_ro
        ;;
    bsd/os)
        add_extralibs -lpoll -lgnugetopt
@@ -5970,11 +5961,6 @@ elif enabled ppc; then
        check_cpp_condition power8 "altivec.h" "defined(_ARCH_PWR8)"
    fi

-    if enabled altivec; then
-        check_cc vec_xl altivec.h "const unsigned char *y1i = { 0 };
-                                   vector unsigned char y0 = vec_xl(0, y1i);"
-    fi
-
 elif enabled x86; then

    check_builtin rdtsc    intrin.h   "__rdtsc()"
@@ -6198,14 +6184,7 @@ check_headers asm/types.h
 # it seems there are versions of clang in some distros that try to use the
 # gcc headers, which explodes for stdatomic
 # so we also check that atomics actually work here
-#
-# some configurations also require linking to libatomic, so try
-# both with -latomic and without
-for LATOMIC in "-latomic" ""; do
-    check_builtin stdatomic stdatomic.h                                                 \
-        "atomic_int foo, bar = ATOMIC_VAR_INIT(-1); atomic_store(&foo, 0); foo += bar"  \
-        $LATOMIC && eval stdatomic_extralibs="\$LATOMIC" && break
-done
+check_builtin stdatomic stdatomic.h "atomic_int foo, bar = ATOMIC_VAR_INIT(-1); atomic_store(&foo, 0); foo += bar"

 check_lib advapi32 "windows.h"            RegCloseKey          -ladvapi32
 check_lib bcrypt   "windows.h bcrypt.h"   BCryptGenRandom      -lbcrypt &&
@@ -6543,10 +6522,7 @@ enabled omx_rpi           && { test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoR
                               die "ERROR: OpenMAX IL headers from raspberrypi/firmware not found"; } &&
                             enable omx
 enabled omx               && require_headers OMX_Core.h
-enabled openssl           && { { check_pkg_config openssl "openssl >= 3.0.0" openssl/ssl.h OPENSSL_init_ssl &&
-                                 { enabled gplv3 || ! enabled gpl || enabled nonfree || die "ERROR: OpenSSL >=3.0.0 requires --enable-version3"; }; } ||
-                               { enabled gpl && ! enabled nonfree && die "ERROR: OpenSSL <3.0.0 is incompatible with the gpl"; } ||
-                               check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl ||
+enabled openssl           && { check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl ||
                               check_pkg_config openssl openssl openssl/ssl.h SSL_library_init ||
                               check_lib openssl openssl/ssl.h OPENSSL_init_ssl -lssl -lcrypto ||
                               check_lib openssl openssl/ssl.h SSL_library_init -lssl -lcrypto ||
@@ -6577,7 +6553,7 @@ fi

 if enabled sdl2; then
    SDL2_CONFIG="${cross_prefix}sdl2-config"
-    test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 3.0.0" SDL_events.h SDL_PollEvent
+    test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent
    if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then
        sdl2_cflags=$("${SDL2_CONFIG}" --cflags)
        sdl2_extralibs=$("${SDL2_CONFIG}" --libs)
@@ -6621,15 +6597,13 @@ enabled makeinfo \
 disabled makeinfo_html && texi2html --help 2> /dev/null | grep -q 'init-file' && enable texi2html || disable texi2html
 perl -v            > /dev/null 2>&1 && enable perl      || disable perl
 pod2man --help     > /dev/null 2>&1 && enable pod2man   || disable pod2man
-rsync --help 2> /dev/null | grep -q 'contimeout=' && enable rsync_contimeout || disable rsync_contimeout
-
-check_headers linux/fb.h
-check_headers linux/videodev2.h
-test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
-test_code cc sys/ioctl.h "int ioctl(int, int, ...)" && enable posix_ioctl
+rsync --help 2> /dev/null | grep -q 'contimeout' && enable rsync_contimeout || disable rsync_contimeout

 # check V4L2 codecs available in the API
 if enabled v4l2_m2m; then
+    check_headers linux/fb.h
+    check_headers linux/videodev2.h
+    test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
    check_cc v4l2_m2m linux/videodev2.h "int i = V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M | V4L2_BUF_FLAG_LAST;"
    check_cc vc1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VC1_ANNEX_G;"
    check_cc mpeg1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG1;"
@@ -6674,7 +6648,7 @@ enabled alsa && { check_pkg_config alsa alsa "alsa/asoundlib.h" snd_pcm_htimesta
 enabled libjack &&
    require_pkg_config libjack jack jack/jack.h jack_port_get_latency_range

-enabled sndio && check_pkg_config sndio sndio sndio.h sio_open
+enabled sndio && check_lib sndio sndio.h sio_open -lsndio

 if enabled libcdio; then
    check_pkg_config libcdio libcdio_paranoia "cdio/cdda.h cdio/paranoia.h" cdio_cddap_open ||
@@ -6775,7 +6749,7 @@ enabled vulkan &&

 if enabled x86; then
    case $target_os in
-        freebsd|mingw32*|mingw64*|win32|win64|linux|cygwin*)
+        mingw32*|mingw64*|win32|win64|linux|cygwin*)
            ;;
        *)
            disable ffnvcodec cuvid nvdec nvenc
@@ -7374,7 +7348,6 @@ if enabled ppc; then
    echo "POWER8 enabled            ${power8-no}"
    echo "PPC 4xx optimizations     ${ppc4xx-no}"
    echo "dcbzl available           ${dcbzl-no}"
-    echo "vec_xl available          ${vec_xl-no}"
 fi
 echo "debug symbols             ${debug-no}"
 echo "strip symbols             ${stripping-no}"
@@ -7527,6 +7500,7 @@ LD_LIB=$LD_LIB
 LD_PATH=$LD_PATH
 DLLTOOL=$dlltool
 WINDRES=$windres
+DEPWINDRES=$dep_cc
 DOXYGEN=$doxygen
 LDFLAGS=$LDFLAGS
 LDEXEFLAGS=$LDEXEFLAGS
@@ -7609,7 +7583,7 @@ cat > $TMPH <<EOF
 #define FFMPEG_CONFIG_H
 #define FFMPEG_CONFIGURATION "$(c_escape $FFMPEG_CONFIGURATION)"
 #define FFMPEG_LICENSE "$(c_escape $license)"
-#define CONFIG_THIS_YEAR 2026
+#define CONFIG_THIS_YEAR 2021
 #define FFMPEG_DATADIR "$(eval c_escape $datadir)"
 #define AVCONV_DATADIR "$(eval c_escape $datadir)"
 #define CC_IDENT "$(c_escape ${cc_ident:-Unknown compiler})"
@@ -38,7 +38,7 @@ PROJECT_NAME           = FFmpeg
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 4.4.7
+PROJECT_NUMBER         =

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -3,9 +3,9 @@
 The FFmpeg developers.

 For details about the authorship, see the Git history of the project
-(https://git.ffmpeg.org/ffmpeg), e.g. by typing the command
+(git://source.ffmpeg.org/ffmpeg), e.g. by typing the command
@command{git log} in the FFmpeg source directory, or browsing the
-online repository at @url{https://git.ffmpeg.org/ffmpeg}.
+online repository at @url{http://source.ffmpeg.org}.

 Maintainers for the specific components are listed in the file
@file{MAINTAINERS} in the source code tree.
@@ -63,3 +63,4 @@ make -j<num>
 make -k
    Continue build in case of errors, this is useful for the regression tests
    sometimes but note that it will still not run all reg tests.
+
@@ -327,13 +327,6 @@ segment index to start live streams at (negative values are from the end).
@item allowed_extensions
 ',' separated list of file extensions that hls is allowed to access.

-@item extension_picky
-This blocks disallowed extensions from probing.
-It also requires all available segments to have extensions matching the format,
-except mpegts, which is always allowed.
-It is recommended to set the whitelists correctly instead of depending on extensions.
-Enabled by default.
-
@item max_reload
 Maximum number of times an insufficient list is attempted to be reloaded.
 Default value is 1000.
@@ -88,3 +88,4 @@ lead to this decision.

 The decisions from the TC are final, until the matters are reopened after
 no less than one year.
+
@@ -762,25 +762,6 @@ In case you need finer control over how valgrind is invoked, use the
@code{--target-exec='valgrind <your_custom_valgrind_options>} option in
 your configure line instead.

-@anchor{Maintenance}
-@chapter Maintenance process
-
-@anchor{MAINTAINERS}
-@section MAINTAINERS
-
-The developers maintaining each part of the codebase are listed in @file{MAINTAINERS}.
-Being listed in @file{MAINTAINERS} gives one the right to have git write access to
-the specific repository.
-
-@anchor{Becoming a maintainer}
-@section Becoming a maintainer
-
-People add themselves to @file{MAINTAINERS} by sending a patch like any other code
-change. These get reviewed by the community like any other patch. It is expected
-that, if someone has an objection to a new maintainer, she is willing to object
-in public with her full name and is willing to take over maintainership for the area.
-
-
@anchor{Release process}
@chapter Release process

@@ -96,7 +96,6 @@ int main(int argc, char *argv[])
    avio_ctx = avio_alloc_context(avio_ctx_buffer, avio_ctx_buffer_size,
                                  0, &bd, &read_packet, NULL, NULL);
    if (!avio_ctx) {
-        av_freep(&avio_ctx_buffer);
        ret = AVERROR(ENOMEM);
        goto end;
    }
@@ -127,10 +127,6 @@ int main(int argc, char **argv)
    outfilename = argv[2];

    pkt = av_packet_alloc();
-    if (!pkt) {
-        fprintf(stderr, "Could not allocate AVPacket\n");
-        exit(1); /* or proper cleanup and returning */
-    }

    /* find the MPEG audio decoder */
    codec = avcodec_find_decoder(AV_CODEC_ID_MP2);
@@ -164,7 +160,7 @@ int main(int argc, char **argv)
    }
    outfile = fopen(outfilename, "wb");
    if (!outfile) {
-        fprintf(stderr, "Could not open %s\n", outfilename);
+        av_free(c);
        exit(1);
    }

@@ -137,9 +137,11 @@ static int decode_packet(AVCodecContext *dec, const AVPacket *pkt)
            ret = output_audio_frame(frame);

        av_frame_unref(frame);
+        if (ret < 0)
+            return ret;
    }

-    return ret;
+    return 0;
 }

 static int open_codec_context(int *stream_idx,
@@ -350,7 +350,8 @@ static int write_audio_frame(AVFormatContext *oc, OutputStream *ost)
    if (frame) {
        /* convert samples from native format to destination codec format, using the resampler */
        /* compute destination number of samples */
-        dst_nb_samples = swr_get_delay(ost->swr_ctx, c->sample_rate) + frame->nb_samples;
+        dst_nb_samples = av_rescale_rnd(swr_get_delay(ost->swr_ctx, c->sample_rate) + frame->nb_samples,
+                                        c->sample_rate, c->sample_rate, AV_ROUND_UP);
        av_assert0(dst_nb_samples == frame->nb_samples);

        /* when we pass a frame to the encoder, it may keep a reference to it
@@ -91,10 +91,6 @@ static int encode_write(AVCodecContext *avctx, AVFrame *frame, FILE *fout)
        enc_pkt->stream_index = 0;
        ret = fwrite(enc_pkt->data, enc_pkt->size, 1, fout);
        av_packet_unref(enc_pkt);
-        if (ret != enc_pkt->size) {
-            ret = AVERROR(errno);
-            break;
-        }
    }

 end:
@@ -218,8 +218,10 @@ static int dec_enc(AVPacket *pkt, AVCodec *enc_codec)

 fail:
        av_frame_free(&frame);
+        if (ret < 0)
+            return ret;
    }
-    return ret;
+    return 0;
 }

 int main(int argc, char **argv)
@@ -1,5 +1,5 @@
 slot=                                    # some unique identifier
-repo=https://git.ffmpeg.org/ffmpeg.git   # the source repository
+repo=git://source.ffmpeg.org/ffmpeg.git  # the source repository
 #branch=release/2.6                       # the branch to test
 samples=                                 # path to samples directory
 workdir=                                 # directory in which to do all the work
@@ -11,21 +11,16 @@ ignore_tests=
 # the following are optional and map to configure options
 arch=
 cpu=
-toolchain=
 cross_prefix=
 as=
 cc=
-cxx=
 ld=
-nm=
 target_os=
 sysroot=
 target_exec=
 target_path=
 target_samples=
 extra_cflags=
-extra_cxxflags=
-extra_objcflags=
 extra_ldflags=
 extra_libs=
 extra_conf=     # extra configure options not covered above
@@ -53,7 +53,7 @@ Most distribution and operating system provide a package for it.
@section Cloning the source tree

@example
-git clone https://git.ffmpeg.org/ffmpeg.git <target>
+git clone git://source.ffmpeg.org/ffmpeg <target>
@end example

 This will put the FFmpeg sources into the directory @var{<target>}.
@@ -143,7 +143,7 @@ git log <filename(s)>
@end example

 You may also use the graphical tools like @command{gitview} or @command{gitk}
-or the web interface available at @url{https://git.ffmpeg.org/ffmpeg.git}.
+or the web interface available at @url{http://source.ffmpeg.org/}.

@section Checking source tree status

@@ -187,18 +187,11 @@ to make sure you don't have untracked files or deletions.
 git add [-i|-p|-A] <filenames/dirnames>
@end example

-Make sure you have told Git your name, email address and GPG key
+Make sure you have told Git your name and email address

@example
 git config --global user.name "My Name"
 git config --global user.email my@@email.invalid
-git config --global user.signingkey ABCDEF0123245
-@end example
-
-Enable signing all commits or use -S
-
-@example
-git config --global commit.gpgsign true
@end example

 Use @option{--global} to set the global configuration for all your Git checkouts.
@@ -400,19 +393,6 @@ git checkout -b svn_23456 $SHA1
 where @var{$SHA1} is the commit hash from the @command{git log} output.


-@chapter gpg key generation
-
-If you have no gpg key yet, we recommend that you create an ed25519 based key, as it
-is small, fast and secure. In particular, it results in small signatures in git.
-
-@example
-gpg --default-new-key-algo "ed25519/cert,sign+cv25519/encr" --quick-generate-key "human@@server.com"
-@end example
-
-When generating a key, make sure the email specified matches the email used in git, as some sites like
-GitHub consider mismatches a reason to declare such commits unverified. After generating a key, you
-can add it to the MAINTAINERS file and upload it to a keyserver.
-
@chapter Pre-push checklist

 Once you have a set of commits that you feel are ready for pushing,
@@ -157,3 +157,4 @@ PFD[32]   would for example be signed 32 bit little-endian IEEE float
@item XVID @tab non-compliant MPEG-4 generated by old Xvid
@item XVIX @tab non-compliant MPEG-4 generated by old Xvid with interlacing bug
@end multitable
+
@@ -20,45 +20,8 @@
 # License along with FFmpeg; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

-# Texinfo 7.0 changed the syntax of various functions.
-# Provide a shim for older versions.
-sub ff_set_from_init_file($$) {
-    my $key = shift;
-    my $value = shift;
-    if (exists &{'texinfo_set_from_init_file'}) {
-        texinfo_set_from_init_file($key, $value);
-    } else {
-        set_from_init_file($key, $value);
-    }
-}
-
-sub ff_get_conf($) {
-    my $key = shift;
-    if (exists &{'texinfo_get_conf'}) {
-        texinfo_get_conf($key);
-    } else {
-        get_conf($key);
-    }
-}
-
-sub get_formatting_function($$) {
-    my $obj = shift;
-    my $func = shift;
-
-    my $sub = $obj->can('formatting_function');
-    if ($sub) {
-        return $obj->formatting_function($func);
-    } else {
-        return $obj->{$func};
-    }
-}
-
-# determine texinfo version
-my $program_version_num = version->declare(ff_get_conf('PACKAGE_VERSION'))->numify;
-my $program_version_6_8 = $program_version_num >= 6.008000;
-
 # no navigation elements
-ff_set_from_init_file('HEADERS', 0);
+set_from_init_file('HEADERS', 0);

 sub ffmpeg_heading_command($$$$$)
 {
@@ -92,7 +55,7 @@ sub ffmpeg_heading_command($$$$$)
        $element = $command->{'parent'};
    }
    if ($element) {
-        $result .= &{get_formatting_function($self, 'format_element_header')}($self, $cmdname,
+        $result .= &{$self->{'format_element_header'}}($self, $cmdname,
                                                       $command, $element);
    }

@@ -149,11 +112,7 @@ sub ffmpeg_heading_command($$$$$)
                $cmdname
                    = $Texinfo::Common::level_to_structuring_command{$cmdname}->[$heading_level];
            }
-            # format_heading_text expects an array of headings for texinfo >= 7.0
-            if ($program_version_num >= 7.000000) {
-                $heading = [$heading];
-            }
-            $result .= &{get_formatting_function($self,'format_heading_text')}(
+            $result .= &{$self->{'format_heading_text'}}(
                        $self, $cmdname, $heading,
                        $heading_level +
                        $self->get_conf('CHAPTER_HEADER_LEVEL') - 1, $command);
@@ -168,18 +127,14 @@ foreach my $command (keys(%Texinfo::Common::sectioning_commands), 'node') {
 }

 # print the TOC where @contents is used
-if ($program_version_6_8) {
-    ff_set_from_init_file('CONTENTS_OUTPUT_LOCATION', 'inline');
-} else {
-    ff_set_from_init_file('INLINE_CONTENTS', 1);
-}
+set_from_init_file('INLINE_CONTENTS', 1);

 # make chapters <h2>
-ff_set_from_init_file('CHAPTER_HEADER_LEVEL', 2);
+set_from_init_file('CHAPTER_HEADER_LEVEL', 2);

 # Do not add <hr>
-ff_set_from_init_file('DEFAULT_RULE', '');
-ff_set_from_init_file('BIG_RULE', '');
+set_from_init_file('DEFAULT_RULE', '');
+set_from_init_file('BIG_RULE', '');

 # Customized file beginning
 sub ffmpeg_begin_file($$$)
@@ -196,18 +151,7 @@ sub ffmpeg_begin_file($$$)
    my ($title, $description, $encoding, $date, $css_lines,
        $doctype, $bodytext, $copying_comment, $after_body_open,
        $extra_head, $program_and_version, $program_homepage,
-        $program, $generator);
-    if ($program_version_num >= 7.000000) {
-        ($title, $description, $encoding, $date, $css_lines,
-         $doctype, $bodytext, $copying_comment, $after_body_open,
-         $extra_head, $program_and_version, $program_homepage,
-         $program, $generator) = $self->_file_header_information($command);
-    } else {
-        ($title, $description, $encoding, $date, $css_lines,
-         $doctype, $bodytext, $copying_comment, $after_body_open,
-         $extra_head, $program_and_version, $program_homepage,
-         $program, $generator) = $self->_file_header_informations($command);
-    }
+        $program, $generator) = $self->_file_header_informations($command);

    my $links = $self->_get_links ($filename, $element);

@@ -240,11 +184,7 @@ EOT

    return $head1 . $head_title . $head2 . $head_title . $head3;
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_begin_file', \&ffmpeg_begin_file);
-} else {
-    texinfo_register_formatting_function('begin_file', \&ffmpeg_begin_file);
-}
+texinfo_register_formatting_function('begin_file', \&ffmpeg_begin_file);

 sub ffmpeg_program_string($)
 {
@@ -261,17 +201,13 @@ sub ffmpeg_program_string($)
      $self->gdt('This document was generated automatically.'));
  }
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_program_string', \&ffmpeg_program_string);
-} else {
-    texinfo_register_formatting_function('program_string', \&ffmpeg_program_string);
-}
+texinfo_register_formatting_function('program_string', \&ffmpeg_program_string);

 # Customized file ending
 sub ffmpeg_end_file($)
 {
    my $self = shift;
-    my $program_string = &{get_formatting_function($self,'format_program_string')}($self);
+    my $program_string = &{$self->{'format_program_string'}}($self);
    my $program_text = <<EOT;
      <p style="font-size: small;">
        $program_string
@@ -284,15 +220,11 @@ EOT
 EOT
    return $program_text . $footer;
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_end_file', \&ffmpeg_end_file);
-} else {
-    texinfo_register_formatting_function('end_file', \&ffmpeg_end_file);
-}
+texinfo_register_formatting_function('end_file', \&ffmpeg_end_file);

 # Dummy title command
 # Ignore title. Title is handled through ffmpeg_begin_file().
-ff_set_from_init_file('USE_TITLEPAGE_FOR_TITLE', 1);
+set_from_init_file('USE_TITLEPAGE_FOR_TITLE', 1);
 sub ffmpeg_title($$$$)
 {
    return '';
@@ -310,14 +242,8 @@ sub ffmpeg_float($$$$$)
    my $args = shift;
    my $content = shift;

-    my ($caption, $prepended);
-    if ($program_version_num >= 7.000000) {
-        ($caption, $prepended) = Texinfo::Convert::Converter::float_name_caption($self,
-                                                                                 $command);
-    } else {
-        ($caption, $prepended) = Texinfo::Common::float_name_caption($self,
-                                                                     $command);
-    }
+    my ($caption, $prepended) = Texinfo::Common::float_name_caption($self,
+                                                                $command);
    my $caption_text = '';
    my $prepended_text;
    my $prepended_save = '';
@@ -389,13 +315,8 @@ sub ffmpeg_float($$$$$)
            $caption->{'args'}->[0], 'float caption');
    }
    if ($prepended_text.$caption_text ne '') {
-        if ($program_version_num >= 7.000000) {
-            $prepended_text = $self->html_attribute_class('div',['float-caption']). '>'
-                    . $prepended_text;
-        } else {
-            $prepended_text = $self->_attribute_class('div','float-caption'). '>'
-                    . $prepended_text;
-        }
+        $prepended_text = $self->_attribute_class('div','float-caption'). '>'
+                . $prepended_text;
        $caption_text .= '</div>';
    }
    my $html_class = '';
@@ -408,13 +329,8 @@ sub ffmpeg_float($$$$$)
        $prepended_text = '';
        $caption_text   = '';
    }
-    if ($program_version_num >= 7.000000) {
-        return $self->html_attribute_class('div', [$html_class]). '>' . "\n" .
-            $prepended_text . $caption_text . $content . '</div>';
-    } else {
-        return $self->_attribute_class('div', $html_class). '>' . "\n" .
-            $prepended_text . $caption_text . $content . '</div>';
-    }
+    return $self->_attribute_class('div', $html_class). '>' . "\n" .
+        $prepended_text . $caption_text . $content . '</div>';
 }

 texinfo_register_command_formatting('float',
@@ -44,3 +44,4 @@ a+b*c;
 here the reader knows that a,b,c are meant to be signed integers but for C
 standard compliance / to avoid undefined behavior they are stored in unsigned
 ints.
+
@@ -418,4 +418,4 @@ done:

 When all of this is done, you can submit your patch to the ffmpeg-devel
 mailing-list for review.  If you need any help, feel free to come on our IRC
-channel, #ffmpeg-devel on irc.libera.chat.
+channel, #ffmpeg-devel on irc.freenode.net.
@@ -90,7 +90,7 @@ COMPILE_MSA = $(call COMPILE,CC,MSAFLAGS)
 	-$(if $(ASMSTRIPFLAGS), $(STRIP) $(ASMSTRIPFLAGS) $@)

 %.o: %.rc
-	$(WINDRES) $(IFLAGS) $(foreach ARG,$(CC_DEPFLAGS),--preprocessor-arg "$(ARG)") -o $@ $<
+	$(WINDRES) $(IFLAGS) --preprocessor "$(DEPWINDRES) -E -xc-header -DRC_INVOKED $(CC_DEPFLAGS)" -o $@ $<

 %.i: %.c
 	$(CC) $(CCFLAGS) $(CC_E) $<
@@ -1,5 +1,3 @@
-#!/bin/sh
-
 toupper(){
    echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
@@ -537,7 +537,7 @@ static const AVOption *opt_find(void *obj, const char *name, const char *unit,
    return o;
 }

-#define FLAGS ((o->type == AV_OPT_TYPE_FLAGS && (arg[0]=='-' || arg[0]=='+')) ? AV_DICT_APPEND : 0)
+#define FLAGS (o->type == AV_OPT_TYPE_FLAGS && (arg[0]=='-' || arg[0]=='+')) ? AV_DICT_APPEND : 0
 int opt_default(void *optctx, const char *opt, const char *arg)
 {
    const AVOption *o;
@@ -492,9 +492,8 @@ static int read_key(void)
        }
        //Read it
        if(nchars != 0) {
-            if (read(0, &ch, 1) == 1)
-                return ch;
-            return 0;
+            read(0, &ch, 1);
+            return ch;
        }else{
            return -1;
        }
@@ -1975,9 +1974,6 @@ static void flush_encoders(void)
            AVPacket *pkt = ost->pkt;
            int pkt_size;

-            if (!pkt)
-                break;
-
            switch (enc->codec_type) {
            case AVMEDIA_TYPE_AUDIO:
                desc   = "audio";
@@ -3467,7 +3463,12 @@ static int init_output_stream_encode(OutputStream *ost, AVFrame *frame)
            enc_ctx->bits_per_raw_sample = frame_bits_per_raw_sample;
        }

-        // Field order: autodetection
+        if (ost->top_field_first == 0) {
+            enc_ctx->field_order = AV_FIELD_BB;
+        } else if (ost->top_field_first == 1) {
+            enc_ctx->field_order = AV_FIELD_TT;
+        }
+
        if (frame) {
            if (enc_ctx->flags & (AV_CODEC_FLAG_INTERLACED_DCT | AV_CODEC_FLAG_INTERLACED_ME) &&
                ost->top_field_first >= 0)
@@ -3482,13 +3483,6 @@ static int init_output_stream_encode(OutputStream *ost, AVFrame *frame)
                enc_ctx->field_order = AV_FIELD_PROGRESSIVE;
        }

-        // Field order: override
-        if (ost->top_field_first == 0) {
-            enc_ctx->field_order = AV_FIELD_BB;
-        } else if (ost->top_field_first == 1) {
-            enc_ctx->field_order = AV_FIELD_TT;
-        }
-
        if (ost->forced_keyframes) {
            if (!strncmp(ost->forced_keyframes, "expr:", 5)) {
                ret = av_expr_parse(&ost->forced_keyframes_pexpr, ost->forced_keyframes+5,
@@ -3956,7 +3950,7 @@ static OutputStream *choose_output(void)
                ost->st->index, ost->st->id, ost->initialized, ost->inputs_done, ost->finished);

        if (!ost->initialized && !ost->inputs_done)
-            return ost->unavailable ? NULL : ost;
+            return ost;

        if (!ost->finished && opts < opts_min) {
            opts_min = opts;
@@ -93,7 +93,6 @@ typedef struct {

 typedef struct OptionsContext {
    OptionGroup *g;
-    int depth;

    /* input/output options */
    int64_t start_time;
@@ -414,8 +414,6 @@ static int opt_map(void *optctx, const char *opt, const char *arg)
            for (i = 0; i < o->nb_stream_maps; i++) {
                m = &o->stream_maps[i];
                if (file_idx == m->file_index &&
-                    m->stream_index >= 0 &&
-                    m->stream_index < input_files[m->file_index]->nb_streams &&
                    check_stream_specifier(input_files[m->file_index]->ctx,
                                           input_files[m->file_index]->ctx->streams[m->stream_index],
                                           *p == ':' ? p + 1 : p) > 0)
@@ -1918,7 +1916,6 @@ static OutputStream *new_audio_stream(OptionsContext *o, AVFormatContext *oc, in

    if (!ost->stream_copy) {
        char *sample_fmt = NULL;
-        const char *apad = NULL;

        MATCH_PER_STREAM_OPT(audio_channels, i, audio_enc->channels, oc, st);

@@ -1931,12 +1928,8 @@ static OutputStream *new_audio_stream(OptionsContext *o, AVFormatContext *oc, in

        MATCH_PER_STREAM_OPT(audio_sample_rate, i, audio_enc->sample_rate, oc, st);

-        MATCH_PER_STREAM_OPT(apad, str, apad, oc, st);
-        if (apad) {
-            ost->apad = av_strdup(apad);
-            if (!ost->apad)
-                exit_program(1);
-        }
+        MATCH_PER_STREAM_OPT(apad, str, ost->apad, oc, st);
+        ost->apad = av_strdup(ost->apad);

        ost->avfilter = get_ost_filters(o, oc, ost);
        if (!ost->avfilter)
@@ -3021,12 +3014,6 @@ static int opt_preset(void *optctx, const char *opt, const char *arg)
    FILE *f=NULL;
    char filename[1000], line[1000], tmp_line[1000];
    const char *codec_name = NULL;
-    int depth = o->depth;
-
-    if (depth > 2) {
-        av_log(NULL, AV_LOG_ERROR, "too deep recursion\n");
-        return AVERROR(EINVAL);
-    }

    tmp_line[0] = *opt;
    tmp_line[1] = 0;
@@ -3040,7 +3027,6 @@ static int opt_preset(void *optctx, const char *opt, const char *arg)
        exit_program(1);
    }

-    o->depth ++;
    while (fgets(line, sizeof(line), f)) {
        char *key = tmp_line, *value, *endptr;

@@ -3065,7 +3051,6 @@ static int opt_preset(void *optctx, const char *opt, const char *arg)
        }
    }

-    o->depth = depth;
    fclose(f);

    return 0;
@@ -131,8 +131,8 @@ static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
            u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
            v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
            memcpy(y, y_temp, sizeof(*y) * (width - x));
-            memcpy(u, u_temp, sizeof(*u) * ((width - x + 1) / 2));
-            memcpy(v, v_temp, sizeof(*v) * ((width - x + 1) / 2));
+            memcpy(u, u_temp, sizeof(*u) * (width - x + 1) / 2);
+            memcpy(v, v_temp, sizeof(*v) * (width - x + 1) / 2);
        }

        line_end += stride;
@@ -886,8 +886,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
        }

        if (i >= CFRAME_BUFFER_COUNT) {
-            if (free_index < 0)
-                return AVERROR_INVALIDDATA;
            i             = free_index;
            f->cfrm[i].id = id;
        }
@@ -70,9 +70,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
    unsigned char *planemap = c->planemap;
    int ret;

-    if (buf_size < planes * height *2)
-        return AVERROR_INVALIDDATA;
-
    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
        return ret;

@@ -132,6 +132,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC)            += mpegvideo_enc.o mpeg12data.o  \
                                          motion_est.o ratecontrol.o    \
                                          mpegvideoencdsp.o
 OBJS-$(CONFIG_MSS34DSP)                += mss34dsp.o
+OBJS-$(CONFIG_NVENC)                   += nvenc.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += pixblockdsp.o
 OBJS-$(CONFIG_QPELDSP)                 += qpeldsp.o
 OBJS-$(CONFIG_QSV)                     += qsv.o
@@ -374,9 +375,9 @@ OBJS-$(CONFIG_H264_CUVID_DECODER)      += cuviddec.o
 OBJS-$(CONFIG_H264_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_H264_MF_ENCODER)         += mfenc.o mf_utils.o
 OBJS-$(CONFIG_H264_MMAL_DECODER)       += mmaldec.o
-OBJS-$(CONFIG_H264_NVENC_ENCODER)      += nvenc.o nvenc_h264.o
-OBJS-$(CONFIG_NVENC_ENCODER)           += nvenc.o nvenc_h264.o
-OBJS-$(CONFIG_NVENC_H264_ENCODER)      += nvenc.o nvenc_h264.o
+OBJS-$(CONFIG_H264_NVENC_ENCODER)      += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_ENCODER)           += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_H264_ENCODER)      += nvenc_h264.o
 OBJS-$(CONFIG_H264_OMX_ENCODER)        += omx.o
 OBJS-$(CONFIG_H264_QSV_DECODER)        += qsvdec.o
 OBJS-$(CONFIG_H264_QSV_ENCODER)        += qsvenc_h264.o
@@ -396,8 +397,8 @@ OBJS-$(CONFIG_HEVC_AMF_ENCODER)        += amfenc_hevc.o
 OBJS-$(CONFIG_HEVC_CUVID_DECODER)      += cuviddec.o
 OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_HEVC_MF_ENCODER)         += mfenc.o mf_utils.o
-OBJS-$(CONFIG_HEVC_NVENC_ENCODER)      += nvenc.o nvenc_hevc.o
-OBJS-$(CONFIG_NVENC_HEVC_ENCODER)      += nvenc.o nvenc_hevc.o
+OBJS-$(CONFIG_HEVC_NVENC_ENCODER)      += nvenc_hevc.o
+OBJS-$(CONFIG_NVENC_HEVC_ENCODER)      += nvenc_hevc.o
 OBJS-$(CONFIG_HEVC_QSV_DECODER)        += qsvdec.o
 OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o       \
                                          hevc_data.o
@@ -874,7 +875,6 @@ OBJS-$(CONFIG_ADPCM_G726_ENCODER)         += g726.o
 OBJS-$(CONFIG_ADPCM_G726LE_DECODER)       += g726.o
 OBJS-$(CONFIG_ADPCM_G726LE_ENCODER)       += g726.o
 OBJS-$(CONFIG_ADPCM_IMA_AMV_DECODER)      += adpcm.o adpcm_data.o
-OBJS-$(CONFIG_ADPCM_IMA_AMV_ENCODER)      += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_ALP_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_ALP_ENCODER)      += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_APC_DECODER)      += adpcm.o adpcm_data.o
@@ -107,16 +107,13 @@ static void render_charset(AVCodecContext *avctx, uint8_t *charset,
    uint8_t pix;
    int lowdiff, highdiff;
    int *best_cb = c->mc_best_cb;
-    uint8_t index1[256];
-    uint8_t index2[256];
-    uint8_t dither[256];
+    static uint8_t index1[256];
+    static uint8_t index2[256];
+    static uint8_t dither[256];
    int i;
    int distance;

-    /* Generate lookup-tables for dither and index before looping.
-     * This code relies on c->mc_luma_vals[c->mc_pal_size - 1] being
-     * the maximum of all the mc_luma_vals values and on the minimum
-     * being zero; this ensures that dither is properly initialized. */
+    /* generate lookup-tables for dither and index before looping */
    i = 0;
    for (a=0; a < 256; a++) {
        if(i < c->mc_pal_size -1 && a == c->mc_luma_vals[i + 1]) {
@@ -843,25 +843,25 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
                                                    sce0->ics.swb_sizes[g],
                                                    sce0->sf_idx[w*16+g],
                                                    sce0->band_type[w*16+g],
-                                                    lambda / (band0->threshold + FLT_MIN), INFINITY, &b1, NULL, 0);
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                    R34,
                                                    sce1->ics.swb_sizes[g],
                                                    sce1->sf_idx[w*16+g],
                                                    sce1->band_type[w*16+g],
-                                                    lambda / (band1->threshold + FLT_MIN), INFINITY, &b2, NULL, 0);
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
                        dist2 += quantize_band_cost(s, M,
                                                    M34,
                                                    sce0->ics.swb_sizes[g],
                                                    mididx,
                                                    midcb,
-                                                    lambda / (minthr + FLT_MIN), INFINITY, &b3, NULL, 0);
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
                        dist2 += quantize_band_cost(s, S,
                                                    S34,
                                                    sce1->ics.swb_sizes[g],
                                                    sididx,
                                                    sidcb,
-                                                    mslambda / (minthr * bmax + FLT_MIN), INFINITY, &b4, NULL, 0);
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
                        B0 += b1+b2;
                        B1 += b3+b4;
                        dist1 -= b1+b2;
@@ -539,9 +539,6 @@ static int output_configure(AACContext *ac,
    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
    uint8_t type_counts[TYPE_END] = { 0 };

-    if (get_new_frame && !ac->frame)
-        return AVERROR_INVALIDDATA;
-
    if (ac->oc[1].layout_map != layout_map) {
        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
        ac->oc[1].layout_map_tags = tags;
@@ -1079,18 +1076,14 @@ static int decode_audio_specific_config_gb(AACContext *ac,
 {
    int i, ret;
    GetBitContext gbc = *gb;
-    MPEG4AudioConfig m4ac_bak = *m4ac;

-    if ((i = ff_mpeg4audio_get_config_gb(m4ac, &gbc, sync_extension, avctx)) < 0) {
-        *m4ac = m4ac_bak;
+    if ((i = ff_mpeg4audio_get_config_gb(m4ac, &gbc, sync_extension, avctx)) < 0)
        return AVERROR_INVALIDDATA;
-    }

    if (m4ac->sampling_index > 12) {
        av_log(avctx, AV_LOG_ERROR,
               "invalid sampling rate index %d\n",
               m4ac->sampling_index);
-        *m4ac = m4ac_bak;
        return AVERROR_INVALIDDATA;
    }
    if (m4ac->object_type == AOT_ER_AAC_LD &&
@@ -1098,7 +1091,6 @@ static int decode_audio_specific_config_gb(AACContext *ac,
        av_log(avctx, AV_LOG_ERROR,
               "invalid low delay sampling rate index %d\n",
               m4ac->sampling_index);
-        *m4ac = m4ac_bak;
        return AVERROR_INVALIDDATA;
    }

@@ -28,7 +28,6 @@
 *              TODOs:
 * add sane pulse detection
 ***********************************/
-#include <float.h>

 #include "libavutil/libm.h"
 #include "libavutil/float_dsp.h"
@@ -853,7 +852,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                /* Not so fast though */
                ratio = sqrtf(ratio);
            }
-            s->lambda = av_clipf(s->lambda * ratio, FLT_EPSILON, 65536.f);
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);

            /* Keep iterating if we must reduce and lambda is in the sky */
            if (ratio > 0.9f && ratio < 1.1f) {
@@ -898,7 +897,7 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
    AACEncContext *s = avctx->priv_data;

-    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_count ? s->lambda_sum / s->lambda_count : NAN);
+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);

    ff_mdct_end(&s->mdct1024);
    ff_mdct_end(&s->mdct128);
@@ -173,7 +173,6 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
                      sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
    const int sfb_len = sfb_end - sfb_start;
    const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
-    const int n_filt = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;

    if (coef_len <= 0 || sfb_len <= 0) {
        sce->tns.present = 0;
@@ -181,30 +180,16 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
    }

    for (w = 0; w < sce->ics.num_windows; w++) {
-        float en[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        int oc_start = 0;
+        float en[2] = {0.0f, 0.0f};
+        int oc_start = 0, os_start = 0;
        int coef_start = sce->ics.swb_offset[sfb_start];

-        if (n_filt == 2) {
-            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
-                    if (g > sfb_start + (sfb_len/2))
-                        en[1] += band->energy; /* End */
-                    else
-                        en[0] += band->energy; /* Start */
-            }
-            en[2] = en[0];
-        } else {
-            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
-                    if (g > sfb_start + (sfb_len/2) + (sfb_len/4))
-                        en[2] += band->energy; /* End */
-                    else if (g > sfb_start + (sfb_len/2) - (sfb_len/4))
-                        en[1] += band->energy; /* Middle */
-                    else
-                        en[0] += band->energy; /* Start */
-            }
-            en[3] = en[0];
+        for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+            FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+            if (g > sfb_start + (sfb_len/2))
+                en[1] += band->energy;
+            else
+                en[0] += band->energy;
        }

        /* LPC */
@@ -214,14 +199,15 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
        if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
            continue;

-        tns->n_filt[w] = n_filt;
+        tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
        for (g = 0; g < tns->n_filt[w]; g++) {
-            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[g + 1];
-            tns->order[w][g] = order/tns->n_filt[w];
-            tns->length[w][g] = sfb_len/tns->n_filt[w];
+            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[!g];
+            tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start;
+            tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
            quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
                            tns->order[w][g], c_bits);
            oc_start += tns->order[w][g];
+            os_start += tns->length[w][g];
        }
        count++;
    }
@@ -308,9 +308,6 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
    const int bandwidth    = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
    const float num_bark   = calc_bark((float)bandwidth);

-    if (bandwidth <= 0)
-        return AVERROR(EINVAL);
-
    ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
    if (!ctx->model_priv_data)
        return AVERROR(ENOMEM);
@@ -797,7 +794,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,

        if (pe < 1.15f * desired_pe) {
            /* 6.6.1.3.6 "Final threshold modification by linearization" */
-            norm_fac = norm_fac ? 1.0f / norm_fac : 0;
+            norm_fac = 1.0f / norm_fac;
            for (w = 0; w < wi->num_windows*16; w += 16) {
                for (g = 0; g < num_bands; g++) {
                    AacPsyBand *band = &pch->band[w+g];
@@ -588,7 +588,6 @@ static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)

    if (sbr->n_q > 5) {
        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
-        sbr->n_q = 1;
        return -1;
    }

@@ -1430,9 +1429,6 @@ static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2]
            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;

-            if (ilb >= 40)
-                return;
-
            for (m = 0; m < sbr->m[1]; m++) {
                AAC_FLOAT sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
 #if USE_FIXED
@@ -1451,9 +1447,6 @@ static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2]
            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;

-            if (ilb >= 40)
-                return;
-
            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
 #if USE_FIXED
                SoftFloat sum = FLOAT_0;
@@ -19,130 +19,130 @@
 #include "libavutil/aarch64/asm.S"

 function ff_ps_add_squares_neon, export=1
-1:      ld1             {v0.4s,v1.4s}, [x1], #32
-        fmul            v0.4s, v0.4s, v0.4s
-        fmul            v1.4s, v1.4s, v1.4s
-        faddp           v2.4s, v0.4s, v1.4s
-        ld1             {v3.4s}, [x0]
-        fadd            v3.4s, v3.4s, v2.4s
-        st1             {v3.4s}, [x0], #16
-        subs            w2, w2, #4
-        b.gt            1b
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        fmul        v0.4S, v0.4S, v0.4S
+        fmul        v1.4S, v1.4S, v1.4S
+        faddp       v2.4S, v0.4S, v1.4S
+        ld1         {v3.4S}, [x0]
+        fadd        v3.4S, v3.4S, v2.4S
+        st1         {v3.4S}, [x0], #16
+        subs        w2, w2, #4
+        b.gt        1b
        ret
 endfunc

 function ff_ps_mul_pair_single_neon, export=1
-1:      ld1             {v0.4s,v1.4s}, [x1], #32
-        ld1             {v2.4s},       [x2], #16
-        zip1            v3.4s, v2.4s, v2.4s
-        zip2            v4.4s, v2.4s, v2.4s
-        fmul            v0.4s, v0.4s, v3.4s
-        fmul            v1.4s, v1.4s, v4.4s
-        st1             {v0.4s,v1.4s}, [x0], #32
-        subs            w3, w3, #4
-        b.gt            1b
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        ld1         {v2.4S},       [x2], #16
+        zip1        v3.4S, v2.4S, v2.4S
+        zip2        v4.4S, v2.4S, v2.4S
+        fmul        v0.4S, v0.4S, v3.4S
+        fmul        v1.4S, v1.4S, v4.4S
+        st1         {v0.4S,v1.4S}, [x0], #32
+        subs        w3, w3, #4
+        b.gt        1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_neon, export=1
-        ld1             {v0.4s}, [x2]
-        ld1             {v1.4s}, [x3]
-        zip1            v4.4s, v0.4s, v0.4s
-        zip2            v5.4s, v0.4s, v0.4s
-        zip1            v6.4s, v1.4s, v1.4s
-        zip2            v7.4s, v1.4s, v1.4s
-1:      ld1             {v2.2s}, [x0]
-        ld1             {v3.2s}, [x1]
-        fadd            v4.4s, v4.4s, v6.4s
-        fadd            v5.4s, v5.4s, v7.4s
-        mov             v2.d[1], v2.d[0]
-        mov             v3.d[1], v3.d[0]
-        fmul            v2.4s, v2.4s, v4.4s
-        fmla            v2.4s, v3.4s, v5.4s
-        st1             {v2.d}[0], [x0], #8
-        st1             {v2.d}[1], [x1], #8
-        subs            w4, w4, #1
-        b.gt            1b
+        ld1         {v0.4S}, [x2]
+        ld1         {v1.4S}, [x3]
+        zip1        v4.4S, v0.4S, v0.4S
+        zip2        v5.4S, v0.4S, v0.4S
+        zip1        v6.4S, v1.4S, v1.4S
+        zip2        v7.4S, v1.4S, v1.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v4.4S, v4.4S, v6.4S
+        fadd        v5.4S, v5.4S, v7.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v2.4S, v2.4S, v4.4S
+        fmla        v2.4S, v3.4S, v5.4S
+        st1         {v2.D}[0], [x0], #8
+        st1         {v2.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
-        ld1             {v0.4s,v1.4s}, [x2]
-        ld1             {v6.4s,v7.4s}, [x3]
-        fneg            v2.4s, v1.4s
-        fneg            v3.4s, v7.4s
-        zip1            v16.4s, v0.4s, v0.4s
-        zip2            v17.4s, v0.4s, v0.4s
-        zip1            v18.4s, v2.4s, v1.4s
-        zip2            v19.4s, v2.4s, v1.4s
-        zip1            v20.4s, v6.4s, v6.4s
-        zip2            v21.4s, v6.4s, v6.4s
-        zip1            v22.4s, v3.4s, v7.4s
-        zip2            v23.4s, v3.4s, v7.4s
-1:      ld1             {v2.2s}, [x0]
-        ld1             {v3.2s}, [x1]
-        fadd            v16.4s, v16.4s, v20.4s
-        fadd            v17.4s, v17.4s, v21.4s
-        mov             v2.d[1], v2.d[0]
-        mov             v3.d[1], v3.d[0]
-        fmul            v4.4s, v2.4s, v16.4s
-        fmla            v4.4s, v3.4s, v17.4s
-        fadd            v18.4s, v18.4s, v22.4s
-        fadd            v19.4s, v19.4s, v23.4s
-        ext             v2.16b, v2.16b, v2.16b, #4
-        ext             v3.16b, v3.16b, v3.16b, #4
-        fmla            v4.4s, v2.4s, v18.4s
-        fmla            v4.4s, v3.4s, v19.4s
-        st1             {v4.d}[0], [x0], #8
-        st1             {v4.d}[1], [x1], #8
-        subs            w4, w4, #1
-        b.gt            1b
+        ld1         {v0.4S,v1.4S}, [x2]
+        ld1         {v6.4S,v7.4S}, [x3]
+        fneg        v2.4S, v1.4S
+        fneg        v3.4S, v7.4S
+        zip1        v16.4S, v0.4S, v0.4S
+        zip2        v17.4S, v0.4S, v0.4S
+        zip1        v18.4S, v2.4S, v1.4S
+        zip2        v19.4S, v2.4S, v1.4S
+        zip1        v20.4S, v6.4S, v6.4S
+        zip2        v21.4S, v6.4S, v6.4S
+        zip1        v22.4S, v3.4S, v7.4S
+        zip2        v23.4S, v3.4S, v7.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v16.4S, v16.4S, v20.4S
+        fadd        v17.4S, v17.4S, v21.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v4.4S, v2.4S, v16.4S
+        fmla        v4.4S, v3.4S, v17.4S
+        fadd        v18.4S, v18.4S, v22.4S
+        fadd        v19.4S, v19.4S, v23.4S
+        ext         v2.16B, v2.16B, v2.16B, #4
+        ext         v3.16B, v3.16B, v3.16B, #4
+        fmla        v4.4S, v2.4S, v18.4S
+        fmla        v4.4S, v3.4S, v19.4S
+        st1         {v4.D}[0], [x0], #8
+        st1         {v4.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
        ret
 endfunc

 function ff_ps_hybrid_analysis_neon, export=1
-        lsl             x3, x3, #3
-        ld2             {v0.4s,v1.4s}, [x1], #32
-        ld2             {v2.2s,v3.2s}, [x1], #16
-        ld1             {v24.2s},      [x1], #8
-        ld2             {v4.2s,v5.2s}, [x1], #16
-        ld2             {v6.4s,v7.4s}, [x1]
-        rev64           v6.4s, v6.4s
-        rev64           v7.4s, v7.4s
-        ext             v6.16b, v6.16b, v6.16b, #8
-        ext             v7.16b, v7.16b, v7.16b, #8
-        rev64           v4.2s, v4.2s
-        rev64           v5.2s, v5.2s
-        mov             v2.d[1], v3.d[0]
-        mov             v4.d[1], v5.d[0]
-        mov             v5.d[1], v2.d[0]
-        mov             v3.d[1], v4.d[0]
-        fadd            v16.4s, v0.4s, v6.4s
-        fadd            v17.4s, v1.4s, v7.4s
-        fsub            v18.4s, v1.4s, v7.4s
-        fsub            v19.4s, v0.4s, v6.4s
-        fadd            v22.4s, v2.4s, v4.4s
-        fsub            v23.4s, v5.4s, v3.4s
-        trn1            v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5}
-        trn2            v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7}
-1:      ld2             {v2.4s,v3.4s}, [x2], #32
-        ld2             {v4.2s,v5.2s}, [x2], #16
-        ld1             {v6.2s},       [x2], #8
-        add             x2, x2, #8
-        mov             v4.d[1], v5.d[0]
-        mov             v6.s[1], v6.s[0]
-        fmul            v6.2s, v6.2s, v24.2s
-        fmul            v0.4s, v2.4s, v16.4s
-        fmul            v1.4s, v2.4s, v17.4s
-        fmls            v0.4s, v3.4s, v18.4s
-        fmla            v1.4s, v3.4s, v19.4s
-        fmla            v0.4s, v4.4s, v20.4s
-        fmla            v1.4s, v4.4s, v21.4s
-        faddp           v0.4s, v0.4s, v1.4s
-        faddp           v0.4s, v0.4s, v0.4s
-        fadd            v0.2s, v0.2s, v6.2s
-        st1             {v0.2s}, [x0], x3
-        subs            w4, w4, #1
-        b.gt            1b
+        lsl         x3, x3, #3                  // x3 *= 8: byte step added to x0 after every store below
+        ld2         {v0.4S,v1.4S}, [x1], #32    // de-interleave pairs 0-3 from [x1]: v0 = re0..3, v1 = im0..3
+        ld2         {v2.2S,v3.2S}, [x1], #16    // pairs 4-5: v2 = {re4, re5}, v3 = {im4, im5}
+        ld1         {v24.2S},      [x1], #8     // pair 6 kept interleaved: v24 = {re6, im6}
+        ld2         {v4.2S,v5.2S}, [x1], #16    // pairs 7-8: v4 = {re7, re8}, v5 = {im7, im8}
+        ld2         {v6.4S,v7.4S}, [x1]         // pairs 9-12: v6 = re9..12, v7 = im9..12
+        rev64       v6.4S, v6.4S                // swap the two lanes inside each 64-bit half of v6
+        rev64       v7.4S, v7.4S                // same for v7
+        ext         v6.16B, v6.16B, v6.16B, #8  // swap the halves: v6 = {re12, re11, re10, re9}
+        ext         v7.16B, v7.16B, v7.16B, #8  // v7 = {im12, im11, im10, im9}
+        rev64       v4.2S, v4.2S                // v4 = {re8, re7}
+        rev64       v5.2S, v5.2S                // v5 = {im8, im7}
+        mov         v2.D[1], v3.D[0]            // v2 = {re4, re5, im4, im5}
+        mov         v4.D[1], v5.D[0]            // v4 = {re8, re7, im8, im7}
+        mov         v5.D[1], v2.D[0]            // v5 = {im8, im7, re4, re5}
+        mov         v3.D[1], v4.D[0]            // v3 = {im4, im5, re8, re7}
+        fadd        v16.4S, v0.4S, v6.4S        // v16[i] = re[i] + re[12-i], i = 0..3
+        fadd        v17.4S, v1.4S, v7.4S        // v17[i] = im[i] + im[12-i]
+        fsub        v18.4S, v1.4S, v7.4S        // v18[i] = im[i] - im[12-i]
+        fsub        v19.4S, v0.4S, v6.4S        // v19[i] = re[i] - re[12-i]
+        fadd        v22.4S, v2.4S, v4.4S        // v22 = {re4+re8, re5+re7, im4+im8, im5+im7}
+        fsub        v23.4S, v5.4S, v3.4S        // v23 = {im8-im4, im7-im5, re4-re8, re5-re7}
+        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2         {v2.4S,v3.4S}, [x2], #32    // loop body: de-interleave 4 pairs from [x2], v2 = even, v3 = odd elements
+        ld2         {v4.2S,v5.2S}, [x2], #16    // two more pairs: v4 = even, v5 = odd elements
+        ld1         {v6.2S},       [x2], #8     // seventh pair of the row into v6
+        add         x2, x2, #8                  // skip 8 more bytes; x2 advances 64 bytes per iteration in total
+        mov         v4.D[1], v5.D[0]            // v4 = {even4, even5, odd4, odd5}
+        mov         v6.S[1], v6.S[0]            // duplicate lane 0 of v6 into lane 1
+        fmul        v6.2S, v6.2S, v24.2S        // v6 = v6[0] * {re6, im6}
+        fmul        v0.4S, v2.4S, v16.4S        // v0  = v2 * v16
+        fmul        v1.4S, v2.4S, v17.4S        // v1  = v2 * v17
+        fmls        v0.4S, v3.4S, v18.4S        // v0 -= v3 * v18
+        fmla        v1.4S, v3.4S, v19.4S        // v1 += v3 * v19
+        fmla        v0.4S, v4.4S, v20.4S        // v0 += v4 * v20
+        fmla        v1.4S, v4.4S, v21.4S        // v1 += v4 * v21
+        faddp       v0.4S, v0.4S, v1.4S         // pairwise add: v0 = {v0[0]+v0[1], v0[2]+v0[3], v1[0]+v1[1], v1[2]+v1[3]}
+        faddp       v0.4S, v0.4S, v0.4S         // v0[0] = sum of the four v0 lanes, v0[1] = sum of the four v1 lanes
+        fadd        v0.2S, v0.2S, v6.2S         // add the pair-6 product computed above
+        st1         {v0.2S}, [x0], x3           // store two floats at [x0], then x0 += x3
+        subs        w4, w4, #1                  // one iteration done; sets flags for the branch
+        b.gt        1b                          // repeat while the decremented w4 is still > 0
        ret
 endfunc
@@ -353,18 +353,18 @@ function fft\n\()_neon, align=6
 endfunc
 .endm

-        def_fft         32,    16,     8
-        def_fft         64,    32,    16
-        def_fft         128,    64,    32
-        def_fft         256,   128,    64
-        def_fft         512,   256,   128
-        def_fft         1024,   512,   256
-        def_fft         2048,  1024,   512
-        def_fft         4096,  2048,  1024
-        def_fft         8192,  4096,  2048
-        def_fft         16384,  8192,  4096
-        def_fft         32768, 16384,  8192
-        def_fft         65536, 32768, 16384
+        def_fft    32,    16,     8             // every row passes N, N/2, N/4; N doubles from 32 up to 65536
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384

 function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
@@ -36,11 +36,11 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, uxtw
-        ld1r            {v22.8h}, [x6]
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8h,   #28
+        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -53,139 +53,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v0.8b,  w4
-        dup             v1.8b,  w12
-        ld1             {v4.8b, v5.8b}, [x1], x2
-        dup             v2.8b,  w6
-        dup             v3.8b,  w7
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-1:      ld1             {v6.8b, v7.8b}, [x1], x2
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v5.8b,  v1.8b
-        ext             v7.8b,  v6.8b,  v7.8b,  #1
-        ld1             {v4.8b, v5.8b}, [x1], x2
-        umlal           v16.8h, v6.8b,  v2.8b
+        dup             v0.8B,  w4
+        dup             v1.8B,  w12
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        dup             v2.8B,  w6
+        dup             v3.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+1:      ld1             {v6.8B, v7.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-        umlal           v16.8h, v7.8b,  v3.8b
-        umull           v17.8h, v6.8b,  v0.8b
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        umlal           v16.8H, v7.8B,  v3.8B
+        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
-        umlal           v17.8h, v7.8b, v1.8b
-        umlal           v17.8h, v4.8b, v2.8b
-        umlal           v17.8h, v5.8b, v3.8b
+        umlal           v17.8H, v7.8B, v1.8B
+        umlal           v17.8H, v4.8B, v2.8B
+        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v0.8b, w4
+        dup             v0.8B, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v1.8b, w12
+        dup             v1.8B, w12
        b.eq            4f

-        ld1             {v4.8b}, [x1], x2
-3:      ld1             {v6.8b}, [x1], x2
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v6.8b,  v1.8b
-        ld1             {v4.8b}, [x1], x2
-        umull           v17.8h, v6.8b,  v0.8b
-        umlal           v17.8h, v4.8b,  v1.8b
+        ld1             {v4.8B}, [x1], x2
+3:      ld1             {v6.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v6.8B,  v1.8B
+        ld1             {v4.8B}, [x1], x2
+        umull           v17.8H, v6.8B,  v0.8B
+        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8b, v5.8b}, [x1], x2
-        ld1             {v6.8b, v7.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-        ext             v7.8b,  v6.8b,  v7.8b,  #1
+4:      ld1             {v4.8B, v5.8B}, [x1], x2
+        ld1             {v6.8B, v7.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8h, v4.8b, v0.8b
-        umlal           v16.8h, v5.8b, v1.8b
-        umull           v17.8h, v6.8b, v0.8b
-        umlal           v17.8h, v7.8b, v1.8b
+        umull           v16.8H, v4.8B, v0.8B
+        umlal           v16.8H, v5.8B, v1.8B
+        umull           v17.8H, v6.8B, v0.8B
+        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.8b}, [x1], x2
-        ld1             {v5.8b}, [x1], x2
+5:      ld1             {v4.8B}, [x1], x2
+        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8h, v4.8b, v0.8b
-        umull           v17.8h, v5.8b, v0.8b
+        umull           v16.8H, v4.8B, v0.8B
+        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -206,11 +206,11 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, uxtw
-        ld1r            {v22.8h}, [x6]
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8h,   #28
+        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -223,133 +223,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v24.8b,  w4
-        dup             v25.8b,  w12
-        ld1             {v4.8b}, [x1], x2
-        dup             v26.8b,  w6
-        dup             v27.8b,  w7
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        trn1            v0.2s,  v24.2s, v25.2s
-        trn1            v2.2s,  v26.2s, v27.2s
-        trn1            v4.2s,  v4.2s,  v5.2s
-1:      ld1             {v6.8b}, [x1], x2
-        ext             v7.8b,  v6.8b,  v7.8b, #1
-        trn1            v6.2s,  v6.2s,  v7.2s
-        umull           v18.8h, v4.8b,  v0.8b
-        umlal           v18.8h, v6.8b,  v2.8b
-        ld1             {v4.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        trn1            v4.2s,  v4.2s,  v5.2s
+        dup             v24.8B,  w4
+        dup             v25.8B,  w12
+        ld1             {v4.8B}, [x1], x2
+        dup             v26.8B,  w6
+        dup             v27.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v0.2S,  v24.2S, v25.2S
+        trn1            v2.2S,  v26.2S, v27.2S
+        trn1            v4.2S,  v4.2S,  v5.2S
+1:      ld1             {v6.8B}, [x1], x2
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umlal           v18.8H, v6.8B,  v2.8B
+        ld1             {v4.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
-        umull           v19.8h, v6.8b,  v0.8b
-        umlal           v19.8h, v4.8b,  v2.8b
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        umull           v19.8H, v6.8B,  v0.8B
+        umlal           v19.8H, v4.8B,  v2.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v30.8b, w4
+        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v31.8b, w12
-        trn1            v0.2s,  v30.2s, v31.2s
-        trn2            v1.2s,  v30.2s, v31.2s
+        dup             v31.8B, w12
+        trn1            v0.2S,  v30.2S, v31.2S
+        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

-        ext             v1.8b,  v0.8b,  v1.8b, #4
-        ld1             {v4.s}[0], [x1], x2
-3:      ld1             {v4.s}[1], [x1], x2
-        umull           v18.8h, v4.8b,  v0.8b
-        ld1             {v4.s}[0], [x1], x2
-        umull           v19.8h, v4.8b,  v1.8b
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        ext             v1.8B,  v0.8B,  v1.8B, #4
+        ld1             {v4.S}[0], [x1], x2
+3:      ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v0.8B
+        ld1             {v4.S}[0], [x1], x2
+        umull           v19.8H, v4.8B,  v1.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8b}, [x1], x2
-        ld1             {v6.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        ext             v7.8b,  v6.8b,  v7.8b, #1
-        trn1            v4.2s,  v4.2s,  v5.2s
-        trn1            v6.2s,  v6.2s,  v7.2s
-        umull           v18.8h, v4.8b,  v0.8b
-        umull           v19.8h, v6.8b,  v0.8b
+4:      ld1             {v4.8B}, [x1], x2
+        ld1             {v6.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.s}[0], [x1], x2
-        ld1             {v4.s}[1], [x1], x2
-        umull           v18.8h, v4.8b,  v30.8b
+5:      ld1             {v4.S}[0], [x1], x2
+        ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -370,51 +370,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
-        dup             v0.8b,  w4
-        dup             v2.8b,  w12
-        dup             v1.8b,  w6
-        dup             v3.8b,  w7
-        trn1            v0.4h,  v0.4h,  v2.4h
-        trn1            v1.4h,  v1.4h,  v3.4h
+        dup             v0.8B,  w4
+        dup             v2.8B,  w12
+        dup             v1.8B,  w6
+        dup             v3.8B,  w7
+        trn1            v0.4H,  v0.4H,  v2.4H
+        trn1            v1.4H,  v1.4H,  v3.4H
 1:
-        ld1             {v4.s}[0],  [x1], x2
-        ld1             {v4.s}[1],  [x1], x2
-        rev64           v5.2s,  v4.2s
-        ld1             {v5.s}[1],  [x1]
-        ext             v6.8b,  v4.8b,  v5.8b,  #1
-        ext             v7.8b,  v5.8b,  v4.8b,  #1
-        trn1            v4.4h,  v4.4h,  v6.4h
-        trn1            v5.4h,  v5.4h,  v7.4h
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v5.8b,  v1.8b
+        ld1             {v4.S}[0],  [x1], x2
+        ld1             {v4.S}[1],  [x1], x2
+        rev64           v5.2S,  v4.2S
+        ld1             {v5.S}[1],  [x1]
+        ext             v6.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v5.8B,  v4.8B,  #1
+        trn1            v4.4H,  v4.4H,  v6.4H
+        trn1            v5.4H,  v5.4H,  v7.4H
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
-        ld1             {v18.h}[0], [x0], x2
-        ld1             {v18.h}[2], [x0]
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
-        rev64           v17.4s, v16.4s
-        add             v16.8h, v16.8h, v17.8h
-        rshrn           v16.8b, v16.8h, #6
+        rev64           v17.4S, v16.4S
+        add             v16.8H, v16.8H, v17.8H
+        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
-        urhadd          v16.8b, v16.8b, v18.8b
+        urhadd          v16.8B, v16.8B, v18.8B
  .endif
-        st1             {v16.h}[0], [x0], x2
-        st1             {v16.h}[2], [x0], x2
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

 2:
-        ld1             {v16.h}[0], [x1], x2
-        ld1             {v16.h}[1], [x1], x2
+        ld1             {v16.H}[0], [x1], x2
+        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
-        ld1             {v18.h}[0], [x0], x2
-        ld1             {v18.h}[1], [x0]
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
-        urhadd          v16.8b, v16.8b, v18.8b
+        urhadd          v16.8B, v16.8B, v18.8B
  .endif
-        st1             {v16.h}[0], [x0], x2
-        st1             {v16.h}[1], [x0], x2
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
@@ -27,114 +27,114 @@
 .macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
-        mov             v6.s[0], \r
+        mov             v6.S[0], \r
 .endm

 //trashes v0-v5
 .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
-        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
-        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
-        uaddl           v2.8h,      v2.8b,     v3.8b
-        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
-        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
-        uaddl           v4.8h,      v4.8b,     v5.8b
-        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
-        uaddl           \d0\().8h,  \r0\().8b, v1.8b
-        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
-        mla             \d0\().8h,  v2.8h,     v6.h[1]
-        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
-        uaddl           v0.8h,      v0.8b,     v1.8b
-        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
-        mls             \d0\().8h,  v4.8h,     v6.h[0]
-        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
-        uaddl           v1.8h,      v1.8b,     v3.8b
-        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
-        uaddl           \d1\().8h,  \r2\().8b, v2.8b
-        mla             \d1\().8h,  v0.8h,     v6.h[1]
-        mls             \d1\().8h,  v1.8h,     v6.h[0]
+        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,      v2.8B,     v3.8B
+        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,      v4.8B,     v5.8B
+        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H,  \r0\().8B, v1.8B
+        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
+        mla             \d0\().8H,  v2.8H,     v6.H[1]
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
+        uaddl           v0.8H,      v0.8B,     v1.8B
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
+        mls             \d0\().8H,  v4.8H,     v6.H[0]
+        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
+        uaddl           v1.8H,      v1.8B,     v3.8B
+        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
+        uaddl           \d1\().8H,  \r2\().8B, v2.8B
+        mla             \d1\().8H,  v0.8H,     v6.H[1]
+        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
-        sqrshrun        \d0\().8b,  \d0\().8h, #5
-        sqrshrun        \d1\().8b,  \d1\().8h, #5
+        sqrshrun        \d0\().8B,  \d0\().8H, #5
+        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
 .endm

 //trashes v0-v5, v7, v30-v31
 .macro  lowpass_8H      r0,  r1
-        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
-        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
-        uaddl           v0.8h,      v0.8b,      v1.8b
-        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
-        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
-        uaddl           v2.8h,      v2.8b,      v3.8b
-        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
-        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
-        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
-        mla             \r0\().8h,  v0.8h,      v6.h[1]
-        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
-        uaddl           v4.8h,      v4.8b,      v5.8b
-        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
-        mls             \r0\().8h,  v2.8h,      v6.h[0]
-        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
-        uaddl           v7.8h,      v7.8b,      v0.8b
-        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
-        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
-        mla             \r1\().8h,  v4.8h,      v6.h[1]
-        mls             \r1\().8h,  v7.8h,      v6.h[0]
+        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
+        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
+        uaddl           v0.8H,      v0.8B,      v1.8B
+        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
+        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
+        uaddl           v2.8H,      v2.8B,      v3.8B
+        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
+        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
+        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
+        mla             \r0\().8H,  v0.8H,      v6.H[1]
+        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
+        uaddl           v4.8H,      v4.8B,      v5.8B
+        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
+        mls             \r0\().8H,  v2.8H,      v6.H[0]
+        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
+        uaddl           v7.8H,      v7.8B,      v0.8B
+        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
+        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
+        mla             \r1\().8H,  v4.8H,      v6.H[1]
+        mls             \r1\().8H,  v7.8H,      v6.H[0]
 .endm

 // trashes v2-v5, v30
 .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
-        ext             v2.8b,     \r0\().8b, \r1\().8b, #2
-        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
-        uaddl           v2.8h,     v2.8b,     v3.8b
-        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
-        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
-        uaddl           v4.8h,     v4.8b,     v5.8b
-        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
-        uaddl           \d0\().8h, \r0\().8b, v30.8b
-        mla             \d0\().8h, v2.8h,     v6.h[1]
-        mls             \d0\().8h, v4.8h,     v6.h[0]
+        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,     v2.8B,     v3.8B
+        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,     v4.8B,     v5.8B
+        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H, \r0\().8B, v30.8B
+        mla             \d0\().8H, v2.8H,     v6.H[1]
+        mls             \d0\().8H, v4.8H,     v6.H[0]
  .if \narrow
-        sqrshrun        \d0\().8b, \d0\().8h, #5
+        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
 .endm

 // trashed v0-v7
 .macro  lowpass_8.16    r0,  r1,  r2
-        ext             v1.16b,     \r0\().16b, \r1\().16b, #4
-        ext             v0.16b,     \r0\().16b, \r1\().16b, #6
-        saddl           v5.4s,      v1.4h,      v0.4h
-        ext             v2.16b,     \r0\().16b, \r1\().16b, #2
-        saddl2          v1.4s,      v1.8h,      v0.8h
-        ext             v3.16b,     \r0\().16b, \r1\().16b, #8
-        saddl           v6.4s,      v2.4h,      v3.4h
-        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10
-        saddl2          v2.4s,      v2.8h,      v3.8h
-        saddl           v0.4s,      \r0\().4h,  \r1\().4h
-        saddl2          v4.4s,      \r0\().8h,  \r1\().8h
+        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
+        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
+        saddl           v5.4S,      v1.4H,      v0.4H
+        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
+        saddl2          v1.4S,      v1.8H,      v0.8H
+        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
+        saddl           v6.4S,      v2.4H,      v3.4H
+        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
+        saddl2          v2.4S,      v2.8H,      v3.8H
+        saddl           v0.4S,      \r0\().4H,  \r1\().4H
+        saddl2          v4.4S,      \r0\().8H,  \r1\().8H

-        shl             v3.4s,  v5.4s,  #4
-        shl             v5.4s,  v5.4s,  #2
-        shl             v7.4s,  v6.4s,  #2
-        add             v5.4s,  v5.4s,  v3.4s
-        add             v6.4s,  v6.4s,  v7.4s
+        shl             v3.4S,  v5.4S,  #4
+        shl             v5.4S,  v5.4S,  #2
+        shl             v7.4S,  v6.4S,  #2
+        add             v5.4S,  v5.4S,  v3.4S
+        add             v6.4S,  v6.4S,  v7.4S

-        shl             v3.4s,  v1.4s,  #4
-        shl             v1.4s,  v1.4s,  #2
-        shl             v7.4s,  v2.4s,  #2
-        add             v1.4s,  v1.4s,  v3.4s
-        add             v2.4s,  v2.4s,  v7.4s
+        shl             v3.4S,  v1.4S,  #4
+        shl             v1.4S,  v1.4S,  #2
+        shl             v7.4S,  v2.4S,  #2
+        add             v1.4S,  v1.4S,  v3.4S
+        add             v2.4S,  v2.4S,  v7.4S

-        add             v5.4s,  v5.4s,  v0.4s
-        sub             v5.4s,  v5.4s,  v6.4s
+        add             v5.4S,  v5.4S,  v0.4S
+        sub             v5.4S,  v5.4S,  v6.4S

-        add             v1.4s,  v1.4s,  v4.4s
-        sub             v1.4s,  v1.4s,  v2.4s
+        add             v1.4S,  v1.4S,  v4.4S
+        sub             v1.4S,  v1.4S,  v2.4S

-        rshrn           v5.4h,  v5.4s,  #10
-        rshrn2          v5.8h,  v1.4s,  #10
+        rshrn           v5.4H,  v5.4S,  #10
+        rshrn2          v5.8H,  v1.4S,  #10

-        sqxtun          \r2\().8b,  v5.8h
+        sqxtun          \r2\().8B,  v5.8H
 .endm

 function put_h264_qpel16_h_lowpass_neon_packed
@@ -163,19 +163,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
 endfunc

 function \type\()_h264_qpel8_h_lowpass_neon
-1:      ld1             {v28.8b, v29.8b}, [x1], x2
-        ld1             {v16.8b, v17.8b}, [x1], x2
+1:      ld1             {v28.8B, v29.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
-        ld1             {v2.8b},    [x0], x3
-        urhadd          v28.8b, v28.8b,  v2.8b
-        ld1             {v3.8b},    [x0]
-        urhadd          v16.8b, v16.8b, v3.8b
+        ld1             {v2.8B},    [x0], x3
+        urhadd          v28.8B, v28.8B,  v2.8B
+        ld1             {v3.8B},    [x0]
+        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3
  .endif
-        st1             {v28.8b},    [x0], x3
-        st1             {v16.8b},    [x0], x3
+        st1             {v28.8B},    [x0], x3
+        st1             {v16.8B},    [x0], x3
        b.ne            1b
        ret
 endfunc
@@ -200,23 +200,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
 endfunc

 function \type\()_h264_qpel8_h_lowpass_l2_neon
-1:      ld1             {v26.8b, v27.8b}, [x1], x2
-        ld1             {v16.8b, v17.8b}, [x1], x2
-        ld1             {v28.8b},     [x3], x2
-        ld1             {v29.8b},     [x3], x2
+1:      ld1             {v26.8B, v27.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
+        ld1             {v28.8B},     [x3], x2
+        ld1             {v29.8B},     [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
-        urhadd          v26.8b, v26.8b, v28.8b
-        urhadd          v27.8b, v27.8b, v29.8b
+        urhadd          v26.8B, v26.8B, v28.8B
+        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
-        ld1             {v2.8b},      [x0], x2
-        urhadd          v26.8b, v26.8b, v2.8b
-        ld1             {v3.8b},      [x0]
-        urhadd          v27.8b, v27.8b, v3.8b
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v26.8B, v26.8B, v2.8B
+        ld1             {v3.8B},      [x0]
+        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v26.8b},     [x0], x2
-        st1             {v27.8b},     [x0], x2
+        st1             {v26.8B},     [x0], x2
+        st1             {v27.8B},     [x0], x2
        b.ne            1b
        ret
 endfunc
@@ -257,19 +257,19 @@ function \type\()_h264_qpel16_v_lowpass_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_neon
-        ld1             {v16.8b}, [x1], x3
-        ld1             {v18.8b}, [x1], x3
-        ld1             {v20.8b}, [x1], x3
-        ld1             {v22.8b}, [x1], x3
-        ld1             {v24.8b}, [x1], x3
-        ld1             {v26.8b}, [x1], x3
-        ld1             {v28.8b}, [x1], x3
-        ld1             {v30.8b}, [x1], x3
-        ld1             {v17.8b}, [x1], x3
-        ld1             {v19.8b}, [x1], x3
-        ld1             {v21.8b}, [x1], x3
-        ld1             {v23.8b}, [x1], x3
-        ld1             {v25.8b}, [x1]
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -280,33 +280,33 @@ function \type\()_h264_qpel8_v_lowpass_neon
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
-        ld1             {v24.8b},  [x0], x2
-        urhadd          v16.8b, v16.8b, v24.8b
-        ld1             {v25.8b}, [x0], x2
-        urhadd          v17.8b, v17.8b, v25.8b
-        ld1             {v26.8b}, [x0], x2
-        urhadd          v18.8b, v18.8b, v26.8b
-        ld1             {v27.8b}, [x0], x2
-        urhadd          v19.8b, v19.8b, v27.8b
-        ld1             {v28.8b}, [x0], x2
-        urhadd          v20.8b, v20.8b, v28.8b
-        ld1             {v29.8b}, [x0], x2
-        urhadd          v21.8b, v21.8b, v29.8b
-        ld1             {v30.8b}, [x0], x2
-        urhadd          v22.8b, v22.8b, v30.8b
-        ld1             {v31.8b}, [x0], x2
-        urhadd          v23.8b, v23.8b, v31.8b
+        ld1             {v24.8B},  [x0], x2
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x2
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x2
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x2
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x2
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x2
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x2
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x2
+        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
-        st1             {v18.8b}, [x0], x2
-        st1             {v19.8b}, [x0], x2
-        st1             {v20.8b}, [x0], x2
-        st1             {v21.8b}, [x0], x2
-        st1             {v22.8b}, [x0], x2
-        st1             {v23.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        st1             {v18.8B}, [x0], x2
+        st1             {v19.8B}, [x0], x2
+        st1             {v20.8B}, [x0], x2
+        st1             {v21.8B}, [x0], x2
+        st1             {v22.8B}, [x0], x2
+        st1             {v23.8B}, [x0], x2

        ret
 endfunc
@@ -334,19 +334,19 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_l2_neon
-        ld1             {v16.8b}, [x1], x3
-        ld1             {v18.8b}, [x1], x3
-        ld1             {v20.8b}, [x1], x3
-        ld1             {v22.8b}, [x1], x3
-        ld1             {v24.8b}, [x1], x3
-        ld1             {v26.8b}, [x1], x3
-        ld1             {v28.8b}, [x1], x3
-        ld1             {v30.8b}, [x1], x3
-        ld1             {v17.8b}, [x1], x3
-        ld1             {v19.8b}, [x1], x3
-        ld1             {v21.8b}, [x1], x3
-        ld1             {v23.8b}, [x1], x3
-        ld1             {v25.8b}, [x1]
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -356,51 +356,51 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

-        ld1             {v24.8b},  [x12], x2
-        ld1             {v25.8b},  [x12], x2
-        ld1             {v26.8b},  [x12], x2
-        ld1             {v27.8b},  [x12], x2
-        ld1             {v28.8b},  [x12], x2
-        urhadd          v16.8b, v24.8b, v16.8b
-        urhadd          v17.8b, v25.8b, v17.8b
-        ld1             {v29.8b},  [x12], x2
-        urhadd          v18.8b, v26.8b, v18.8b
-        urhadd          v19.8b, v27.8b, v19.8b
-        ld1             {v30.8b}, [x12], x2
-        urhadd          v20.8b, v28.8b, v20.8b
-        urhadd          v21.8b, v29.8b, v21.8b
-        ld1             {v31.8b}, [x12], x2
-        urhadd          v22.8b, v30.8b, v22.8b
-        urhadd          v23.8b, v31.8b, v23.8b
+        ld1             {v24.8B},  [x12], x2
+        ld1             {v25.8B},  [x12], x2
+        ld1             {v26.8B},  [x12], x2
+        ld1             {v27.8B},  [x12], x2
+        ld1             {v28.8B},  [x12], x2
+        urhadd          v16.8B, v24.8B, v16.8B
+        urhadd          v17.8B, v25.8B, v17.8B
+        ld1             {v29.8B},  [x12], x2
+        urhadd          v18.8B, v26.8B, v18.8B
+        urhadd          v19.8B, v27.8B, v19.8B
+        ld1             {v30.8B}, [x12], x2
+        urhadd          v20.8B, v28.8B, v20.8B
+        urhadd          v21.8B, v29.8B, v21.8B
+        ld1             {v31.8B}, [x12], x2
+        urhadd          v22.8B, v30.8B, v22.8B
+        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
-        ld1             {v24.8b}, [x0], x3
-        urhadd          v16.8b, v16.8b, v24.8b
-        ld1             {v25.8b}, [x0], x3
-        urhadd          v17.8b, v17.8b, v25.8b
-        ld1             {v26.8b}, [x0], x3
-        urhadd          v18.8b, v18.8b, v26.8b
-        ld1             {v27.8b}, [x0], x3
-        urhadd          v19.8b, v19.8b, v27.8b
-        ld1             {v28.8b}, [x0], x3
-        urhadd          v20.8b, v20.8b, v28.8b
-        ld1             {v29.8b}, [x0], x3
-        urhadd          v21.8b, v21.8b, v29.8b
-        ld1             {v30.8b}, [x0], x3
-        urhadd          v22.8b, v22.8b, v30.8b
-        ld1             {v31.8b}, [x0], x3
-        urhadd          v23.8b, v23.8b, v31.8b
+        ld1             {v24.8B}, [x0], x3
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x3
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x3
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x3
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x3
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x3
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x3
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x3
+        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif

-        st1             {v16.8b}, [x0], x3
-        st1             {v17.8b}, [x0], x3
-        st1             {v18.8b}, [x0], x3
-        st1             {v19.8b}, [x0], x3
-        st1             {v20.8b}, [x0], x3
-        st1             {v21.8b}, [x0], x3
-        st1             {v22.8b}, [x0], x3
-        st1             {v23.8b}, [x0], x3
+        st1             {v16.8B}, [x0], x3
+        st1             {v17.8B}, [x0], x3
+        st1             {v18.8B}, [x0], x3
+        st1             {v19.8B}, [x0], x3
+        st1             {v20.8B}, [x0], x3
+        st1             {v21.8B}, [x0], x3
+        st1             {v22.8B}, [x0], x3
+        st1             {v23.8B}, [x0], x3

        ret
 endfunc
@@ -411,19 +411,19 @@ endfunc

 function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
-        ld1             {v16.8h}, [x1], x3
-        ld1             {v17.8h}, [x1], x3
-        ld1             {v18.8h}, [x1], x3
-        ld1             {v19.8h}, [x1], x3
-        ld1             {v20.8h}, [x1], x3
-        ld1             {v21.8h}, [x1], x3
-        ld1             {v22.8h}, [x1], x3
-        ld1             {v23.8h}, [x1], x3
-        ld1             {v24.8h}, [x1], x3
-        ld1             {v25.8h}, [x1], x3
-        ld1             {v26.8h}, [x1], x3
-        ld1             {v27.8h}, [x1], x3
-        ld1             {v28.8h}, [x1]
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v25.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v27.8H}, [x1], x3
+        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
@@ -447,7 +447,7 @@ function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

-        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
 endfunc
@@ -457,33 +457,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
-        ld1             {v0.8b},      [x0], x2
-        urhadd          v16.8b, v16.8b, v0.8b
-        ld1             {v1.8b},      [x0], x2
-        urhadd          v17.8b, v17.8b, v1.8b
-        ld1             {v2.8b},      [x0], x2
-        urhadd          v18.8b, v18.8b, v2.8b
-        ld1             {v3.8b},      [x0], x2
-        urhadd          v19.8b, v19.8b, v3.8b
-        ld1             {v4.8b},      [x0], x2
-        urhadd          v20.8b, v20.8b, v4.8b
-        ld1             {v5.8b},      [x0], x2
-        urhadd          v21.8b, v21.8b, v5.8b
-        ld1             {v6.8b},      [x0], x2
-        urhadd          v22.8b, v22.8b, v6.8b
-        ld1             {v7.8b},      [x0], x2
-        urhadd          v23.8b, v23.8b, v7.8b
+        ld1             {v0.8B},      [x0], x2
+        urhadd          v16.8B, v16.8B, v0.8B
+        ld1             {v1.8B},      [x0], x2
+        urhadd          v17.8B, v17.8B, v1.8B
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v18.8B, v18.8B, v2.8B
+        ld1             {v3.8B},      [x0], x2
+        urhadd          v19.8B, v19.8B, v3.8B
+        ld1             {v4.8B},      [x0], x2
+        urhadd          v20.8B, v20.8B, v4.8B
+        ld1             {v5.8B},      [x0], x2
+        urhadd          v21.8B, v21.8B, v5.8B
+        ld1             {v6.8B},      [x0], x2
+        urhadd          v22.8B, v22.8B, v6.8B
+        ld1             {v7.8B},      [x0], x2
+        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8b},     [x0], x2
-        st1             {v17.8b},     [x0], x2
-        st1             {v18.8b},     [x0], x2
-        st1             {v19.8b},     [x0], x2
-        st1             {v20.8b},     [x0], x2
-        st1             {v21.8b},     [x0], x2
-        st1             {v22.8b},     [x0], x2
-        st1             {v23.8b},     [x0], x2
+        st1             {v16.8B},     [x0], x2
+        st1             {v17.8B},     [x0], x2
+        st1             {v18.8B},     [x0], x2
+        st1             {v19.8B},     [x0], x2
+        st1             {v20.8B},     [x0], x2
+        st1             {v21.8B},     [x0], x2
+        st1             {v22.8B},     [x0], x2
+        st1             {v23.8B},     [x0], x2

        ret             x10
 endfunc
@@ -497,45 +497,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

-        ld1             {v0.8b, v1.8b},  [x2], #16
-        ld1             {v2.8b, v3.8b},  [x2], #16
-        urhadd          v0.8b,  v0.8b,  v16.8b
-        urhadd          v1.8b,  v1.8b,  v17.8b
-        ld1             {v4.8b, v5.8b},  [x2], #16
-        urhadd          v2.8b,  v2.8b,  v18.8b
-        urhadd          v3.8b,  v3.8b,  v19.8b
-        ld1             {v6.8b, v7.8b},  [x2], #16
-        urhadd          v4.8b,  v4.8b,  v20.8b
-        urhadd          v5.8b,  v5.8b,  v21.8b
-        urhadd          v6.8b,  v6.8b,  v22.8b
-        urhadd          v7.8b,  v7.8b,  v23.8b
+        ld1             {v0.8B, v1.8B},  [x2], #16
+        ld1             {v2.8B, v3.8B},  [x2], #16
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v4.8B, v5.8B},  [x2], #16
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v6.8B, v7.8B},  [x2], #16
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
-        ld1             {v16.8b},     [x0], x3
-        urhadd          v0.8b,  v0.8b,  v16.8b
-        ld1             {v17.8b},     [x0], x3
-        urhadd          v1.8b,  v1.8b,  v17.8b
-        ld1             {v18.8b},     [x0], x3
-        urhadd          v2.8b,  v2.8b,  v18.8b
-        ld1             {v19.8b},     [x0], x3
-        urhadd          v3.8b,  v3.8b,  v19.8b
-        ld1             {v20.8b},     [x0], x3
-        urhadd          v4.8b,  v4.8b,  v20.8b
-        ld1             {v21.8b},     [x0], x3
-        urhadd          v5.8b,  v5.8b,  v21.8b
-        ld1             {v22.8b},     [x0], x3
-        urhadd          v6.8b,  v6.8b,  v22.8b
-        ld1             {v23.8b},     [x0], x3
-        urhadd          v7.8b,  v7.8b,  v23.8b
+        ld1             {v16.8B},     [x0], x3
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        ld1             {v17.8B},     [x0], x3
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v18.8B},     [x0], x3
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        ld1             {v19.8B},     [x0], x3
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v20.8B},     [x0], x3
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        ld1             {v21.8B},     [x0], x3
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        ld1             {v22.8B},     [x0], x3
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        ld1             {v23.8B},     [x0], x3
+        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif
-        st1             {v0.8b},      [x0], x3
-        st1             {v1.8b},      [x0], x3
-        st1             {v2.8b},      [x0], x3
-        st1             {v3.8b},      [x0], x3
-        st1             {v4.8b},      [x0], x3
-        st1             {v5.8b},      [x0], x3
-        st1             {v6.8b},      [x0], x3
-        st1             {v7.8b},      [x0], x3
+        st1             {v0.8B},      [x0], x3
+        st1             {v1.8B},      [x0], x3
+        st1             {v2.8B},      [x0], x3
+        st1             {v3.8B},      [x0], x3
+        st1             {v4.8B},      [x0], x3
+        st1             {v5.8B},      [x0], x3
+        st1             {v6.8B},      [x0], x3
+        st1             {v7.8B},      [x0], x3

        ret             x10
 endfunc
@@ -579,8 +579,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm

-        h264_qpel16_hv  put
-        h264_qpel16_hv  avg
+        h264_qpel16_hv put
+        h264_qpel16_hv avg

 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -758,8 +758,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel8      put
-        h264_qpel8      avg
+        h264_qpel8 put
+        h264_qpel8 avg

 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -930,5 +930,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel16     put
-        h264_qpel16     avg
+        h264_qpel16 put
+        h264_qpel16 avg
@@ -38,10 +38,10 @@ const trans, align=4
 endconst

 .macro clip10 in1, in2, c1, c2
-        smax            \in1, \in1, \c1
-        smax            \in2, \in2, \c1
-        smin            \in1, \in1, \c2
-        smin            \in2, \in2, \c2
+        smax        \in1, \in1, \c1
+        smax        \in2, \in2, \c1
+        smin        \in1, \in1, \c2
+        smin        \in2, \in2, \c2
 .endm

 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
        ld1             {v2.s}[1], [x0], x2
        ld1             {v2.s}[2], [x0], x2
        ld1             {v2.s}[3], [x0], x2
-        sub             x0,  x0,  x2, lsl #2
-        uxtl            v6.8h,  v2.8b
-        uxtl2           v7.8h,  v2.16b
-        sqadd           v0.8h,  v0.8h, v6.8h
-        sqadd           v1.8h,  v1.8h, v7.8h
-        sqxtun          v0.8b,  v0.8h
-        sqxtun2         v0.16b, v1.8h
+        sub              x0,  x0,  x2, lsl #2
+        uxtl             v6.8h,  v2.8b
+        uxtl2            v7.8h,  v2.16b
+        sqadd            v0.8h,  v0.8h, v6.8h
+        sqadd            v1.8h,  v1.8h, v7.8h
+        sqxtun           v0.8b,  v0.8h
+        sqxtun2          v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[2], [x0], x2
@@ -70,12 +70,12 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
        ld1             {v2.d}[0], [x12], x2
        ld1             {v2.d}[1], [x12], x2
        ld1             {v3.d}[0], [x12], x2
-        sqadd           v0.8h, v0.8h, v2.8h
+        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.d}[1], [x12], x2
-        movi            v4.8h, #0
-        sqadd           v1.8h, v1.8h, v3.8h
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        movi             v4.8h, #0
+        sqadd            v1.8h, v1.8h, v3.8h
+        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
+        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.d}[0],  [x0], x2
        st1             {v0.d}[1],  [x0], x2
        st1             {v1.d}[0],  [x0], x2
@@ -85,48 +85,48 @@ endfunc

 function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12,  x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-1:      subs            x3,  x3, #2
+        add              x2,  x2, x2
+        mov              x3,  #8
+1:      subs             x3,  x3, #2
        ld1             {v2.d}[0],     [x0]
        ld1             {v2.d}[1],    [x12]
-        uxtl            v3.8h,  v2.8b
+        uxtl             v3.8h,  v2.8b
        ld1             {v0.8h-v1.8h}, [x1], #32
-        uxtl2           v2.8h,  v2.16b
-        sqadd           v0.8h,  v0.8h,   v3.8h
-        sqadd           v1.8h,  v1.8h,   v2.8h
-        sqxtun          v0.8b,  v0.8h
-        sqxtun2         v0.16b, v1.8h
+        uxtl2            v2.8h,  v2.16b
+        sqadd            v0.8h,  v0.8h,   v3.8h
+        sqadd            v1.8h,  v1.8h,   v2.8h
+        sqxtun           v0.8b,  v0.8h
+        sqxtun2          v0.16b, v1.8h
        st1             {v0.d}[0],     [x0], x2
        st1             {v0.d}[1],    [x12], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

 function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12,  x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-        movi            v4.8h, #0
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #2
+        add              x2,  x2, x2
+        mov              x3,  #8
+        movi             v4.8h, #0
+        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
+1:      subs             x3,  x3, #2
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h},       [x0]
-        sqadd           v0.8h, v0.8h, v2.8h
+        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.8h},      [x12]
-        sqadd           v1.8h, v1.8h, v3.8h
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        sqadd            v1.8h, v1.8h, v3.8h
+        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h},       [x0], x2
        st1             {v1.8h},      [x12], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

 function ff_hevc_add_residual_16x16_8_neon, export=1
-        mov             x3,  #16
+        mov              x3,  #16
        add             x12, x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
+        add              x2,  x2, x2
+1:      subs             x3,  x3, #2
        ld1             {v16.16b},     [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v19.16b},    [x12]
@@ -134,47 +134,47 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
        uxtl2           v18.8h, v16.16b
        uxtl            v20.8h, v19.8b
        uxtl2           v21.8h, v19.16b
-        sqadd           v0.8h,  v0.8h, v17.8h
-        sqadd           v1.8h,  v1.8h, v18.8h
-        sqadd           v2.8h,  v2.8h, v20.8h
-        sqadd           v3.8h,  v3.8h, v21.8h
-        sqxtun          v0.8b,  v0.8h
+        sqadd            v0.8h,  v0.8h, v17.8h
+        sqadd            v1.8h,  v1.8h, v18.8h
+        sqadd            v2.8h,  v2.8h, v20.8h
+        sqadd            v3.8h,  v3.8h, v21.8h
+        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
-        sqxtun          v1.8b,  v2.8h
+        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        st1             {v0.16b},     [x0], x2
        st1             {v1.16b},    [x12], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

 function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov             x3,  #16
+        mov              x3,  #16
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12,  x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
+        add              x2,  x2, x2
+1:      subs             x3,  x3, #2
        ld1             {v16.8h-v17.8h}, [x0]
        ld1             {v0.8h-v3.8h},  [x1], #64
-        sqadd           v0.8h, v0.8h, v16.8h
+        sqadd            v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        sqadd            v1.8h, v1.8h, v17.8h
+        sqadd            v2.8h, v2.8h, v18.8h
+        sqadd            v3.8h, v3.8h, v19.8h
+        clip10           v0.8h, v1.8h, v20.8h, v21.8h
+        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h},   [x0], x2
        st1             {v2.8h-v3.8h},  [x12], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

 function ff_hevc_add_residual_32x32_8_neon, export=1
        add             x12,  x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #32
-1:      subs            x3,  x3, #2
+        add              x2,  x2, x2
+        mov              x3,  #32
+1:      subs             x3,  x3, #2
        ld1             {v20.16b, v21.16b}, [x0]
        uxtl            v16.8h,  v20.8b
        uxtl2           v17.8h,  v20.16b
@@ -187,43 +187,43 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
        uxtl2           v21.8h,  v22.16b
        uxtl            v22.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
-        sqadd           v0.8h,  v0.8h,  v16.8h
-        sqadd           v1.8h,  v1.8h,  v17.8h
-        sqadd           v2.8h,  v2.8h,  v18.8h
-        sqadd           v3.8h,  v3.8h,  v19.8h
-        sqadd           v4.8h,  v4.8h,  v20.8h
-        sqadd           v5.8h,  v5.8h,  v21.8h
-        sqadd           v6.8h,  v6.8h,  v22.8h
-        sqadd           v7.8h,  v7.8h,  v23.8h
-        sqxtun          v0.8b,  v0.8h
+        sqadd            v0.8h,  v0.8h,  v16.8h
+        sqadd            v1.8h,  v1.8h,  v17.8h
+        sqadd            v2.8h,  v2.8h,  v18.8h
+        sqadd            v3.8h,  v3.8h,  v19.8h
+        sqadd            v4.8h,  v4.8h,  v20.8h
+        sqadd            v5.8h,  v5.8h,  v21.8h
+        sqadd            v6.8h,  v6.8h,  v22.8h
+        sqadd            v7.8h,  v7.8h,  v23.8h
+        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
-        sqxtun          v1.8b,  v2.8h
+        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
-        sqxtun          v2.8b,  v4.8h
+        sqxtun           v2.8b,  v4.8h
        sqxtun2         v2.16b,  v5.8h
        st1             {v0.16b, v1.16b},  [x0], x2
-        sqxtun          v3.8b,  v6.8h
+        sqxtun           v3.8b,  v6.8h
        sqxtun2         v3.16b,  v7.8h
        st1             {v2.16b, v3.16b}, [x12], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

 function ff_hevc_add_residual_32x32_10_neon, export=1
-        mov             x3,  #32
+        mov              x3,  #32
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #1
+1:      subs             x3,  x3, #1
        ld1             {v0.8h-v3.8h},   [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
-        sqadd           v0.8h, v0.8h, v16.8h
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        sqadd            v0.8h, v0.8h, v16.8h
+        sqadd            v1.8h, v1.8h, v17.8h
+        sqadd            v2.8h, v2.8h, v18.8h
+        sqadd            v3.8h, v3.8h, v19.8h
+        clip10           v0.8h, v1.8h, v20.8h, v21.8h
+        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h},   [x0], x2
-        bne             1b
+        bne              1b
        ret
 endfunc

@@ -246,19 +246,19 @@ endfunc

 // uses and clobbers v28-v31 as temp registers
 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
-        sshll\p1        v28.4s, \in0, #6
-        mov             v29.16b, v28.16b
-        smull\p1        v30.4s, \in1, v0.h[1]
-        smull\p1        v31.4s, \in1, v0.h[3]
-        smlal\p2        v28.4s, \in2, v0.h[0] //e0
-        smlsl\p2        v29.4s, \in2, v0.h[0] //e1
-        smlal\p2        v30.4s, \in3, v0.h[3] //o0
-        smlsl\p2        v31.4s, \in3, v0.h[1] //o1
+         sshll\p1       v28.4s, \in0, #6
+         mov            v29.16b, v28.16b
+         smull\p1       v30.4s, \in1, v0.h[1]
+         smull\p1       v31.4s, \in1, v0.h[3]
+         smlal\p2       v28.4s, \in2, v0.h[0] //e0
+         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
+         smlal\p2       v30.4s, \in3, v0.h[3] //o0
+         smlsl\p2       v31.4s, \in3, v0.h[1] //o1

-        add             \out0, v28.4s, v30.4s
-        add             \out1, v29.4s, v31.4s
-        sub             \out2, v29.4s, v31.4s
-        sub             \out3, v28.4s, v30.4s
+         add            \out0, v28.4s, v30.4s
+         add            \out1, v29.4s, v31.4s
+         sub            \out2, v29.4s, v31.4s
+         sub            \out3, v28.4s, v30.4s
 .endm

 .macro transpose8_4x4 r0, r1, r2, r3
@@ -325,11 +325,11 @@ endfunc
 .macro idct_8x8 bitdepth
 function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 //x0 - coeffs
-        mov             x1,  x0
+        mov              x1,  x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

-        movrel          x1, trans
+        movrel           x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@@ -342,7 +342,7 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

-        mov             x1,  x0
+        mov              x1,  x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

@@ -351,8 +351,8 @@ endfunc
 .endm

 .macro butterfly e, o, tmp_p, tmp_m
-        add             \tmp_p, \e, \o
-        sub             \tmp_m, \e, \o
+        add        \tmp_p, \e, \o
+        sub        \tmp_m, \e, \o
 .endm

 .macro tr16_8x4 in0, in1, in2, in3, offset
@@ -381,7 +381,7 @@ endfunc
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
-        add             x4,  sp,  #\offset
+        add              x4,  sp,  #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
 .endm
@@ -398,14 +398,14 @@ endfunc
 .endm

 .macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
-        sum_sub         v21.4s, \in, \t0, \op0, \p
-        sum_sub         v22.4s, \in, \t1, \op1, \p
-        sum_sub         v23.4s, \in, \t2, \op2, \p
-        sum_sub         v24.4s, \in, \t3, \op3, \p
-        sum_sub         v25.4s, \in, \t4, \op4, \p
-        sum_sub         v26.4s, \in, \t5, \op5, \p
-        sum_sub         v27.4s, \in, \t6, \op6, \p
-        sum_sub         v28.4s, \in, \t7, \op7, \p
+        sum_sub v21.4s, \in, \t0, \op0, \p
+        sum_sub v22.4s, \in, \t1, \op1, \p
+        sum_sub v23.4s, \in, \t2, \op2, \p
+        sum_sub v24.4s, \in, \t3, \op3, \p
+        sum_sub v25.4s, \in, \t4, \op4, \p
+        sum_sub v26.4s, \in, \t5, \op5, \p
+        sum_sub v27.4s, \in, \t6, \op6, \p
+        sum_sub v28.4s, \in, \t7, \op7, \p
 .endm

 .macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -473,20 +473,20 @@ endfunc

 .macro tr_16x4 name, shift, offset, step
 function func_tr_16x4_\name
-        mov             x1,  x5
-        add             x3,  x5, #(\step * 64)
-        mov             x2,  #(\step * 128)
+        mov              x1,  x5
+        add              x3,  x5, #(\step * 64)
+        mov              x2,  #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
-        movrel          x1,  trans
+        movrel           x1,  trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

-        add             x1,  x5, #(\step * 32)
-        add             x3,  x5, #(\step * 3 *32)
-        mov             x2,  #(\step * 128)
+        add              x1,  x5, #(\step * 32)
+        add              x3,  x5, #(\step * 3 *32)
+        mov              x2,  #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
-        movrel          x1, trans, 16
+        movrel           x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
@@ -505,16 +505,16 @@ function func_tr_16x4_\name
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

-        add             x4, sp, #\offset
+        add              x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64

        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24
-        mov             x1,  x6
-        add             x3,  x6, #(24 +3*32)
-        mov             x2, #32
-        mov             x4, #-32
+        mov              x1,  x6
+        add              x3,  x6, #(24 +3*32)
+        mov              x2, #32
+        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4

        add             x4, sp, #(\offset + 64)
@@ -523,10 +523,10 @@ function func_tr_16x4_\name
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20

-        add             x1,  x6, #8
-        add             x3,  x6, #(16 + 3 * 32)
-        mov             x2, #32
-        mov             x4, #-32
+        add              x1,  x6, #8
+        add              x3,  x6, #(16 + 3 * 32)
+        mov              x2, #32
+        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4

        ret
@@ -539,21 +539,21 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
        mov             x15, x30

        // allocate a temp buffer
-        sub             sp,  sp,  #640
+        sub              sp,  sp,  #640

 .irp i, 0, 1, 2, 3
-        add             x5,  x0, #(8 * \i)
-        add             x6,  sp, #(8 * \i * 16)
+        add              x5,  x0, #(8 * \i)
+        add              x6,  sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
 .endr

 .irp i, 0, 1, 2, 3
-        add             x5,  sp, #(8 * \i)
-        add             x6,  x0, #(8 * \i * 16)
+        add              x5,  sp, #(8 * \i)
+        add              x6,  x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
 .endr

-        add             sp,  sp,  #640
+        add              sp,  sp,  #640

        mov             x30, x15
        ret
@@ -573,35 +573,36 @@ idct_16x16 10
 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
-        ld1r            {v4.8h}, [x0]
-        srshr           v4.8h,  v4.8h,  #1
-        srshr           v0.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr           v1.8h,  v4.8h,  #(14 - \bitdepth)
+        movi          v1.8h,  #((1 << (14 - \bitdepth))+1)
+        ld1r         {v4.8h}, [x0]
+        add           v4.8h,  v4.8h,  v1.8h
+        sshr          v0.8h,  v4.8h,  #(15 - \bitdepth)
+        sshr          v1.8h,  v4.8h,  #(15 - \bitdepth)
 .if \size > 4
-        srshr           v2.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr           v3.8h,  v4.8h,  #(14 - \bitdepth)
+        sshr          v2.8h,  v4.8h,  #(15 - \bitdepth)
+        sshr          v3.8h,  v4.8h,  #(15 - \bitdepth)
 .if \size > 16 /* dc 32x32 */
-        mov             x2,  #4
+        mov              x2,  #4
 1:
-        subs            x2,  x2, #1
+        subs             x2,  x2, #1
 .endif
        add             x12,  x0, #64
        mov             x13,  #128
 .if \size > 8 /* dc 16x16 */
-        st1             {v0.8h-v3.8h},  [x0], x13
-        st1             {v0.8h-v3.8h}, [x12], x13
-        st1             {v0.8h-v3.8h},  [x0], x13
-        st1             {v0.8h-v3.8h}, [x12], x13
-        st1             {v0.8h-v3.8h},  [x0], x13
-        st1             {v0.8h-v3.8h}, [x12], x13
+        st1            {v0.8h-v3.8h},  [x0], x13
+        st1            {v0.8h-v3.8h}, [x12], x13
+        st1            {v0.8h-v3.8h},  [x0], x13
+        st1            {v0.8h-v3.8h}, [x12], x13
+        st1            {v0.8h-v3.8h},  [x0], x13
+        st1            {v0.8h-v3.8h}, [x12], x13
 .endif /* dc 8x8 */
-        st1             {v0.8h-v3.8h},  [x0], x13
-        st1             {v0.8h-v3.8h}, [x12], x13
+        st1            {v0.8h-v3.8h},  [x0], x13
+        st1            {v0.8h-v3.8h}, [x12], x13
 .if \size > 16 /* dc 32x32 */
        bne             1b
 .endif
 .else /* dc 4x4 */
-        st1             {v0.8h-v1.8h},  [x0]
+        st1            {v0.8h-v1.8h},  [x0]
 .endif
        ret
 endfunc
@@ -30,20 +30,20 @@
 //                      int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        sub             sp,  sp, #64
-        stp             xzr, xzr, [sp]
-        stp             xzr, xzr, [sp, #16]
-        stp             xzr, xzr, [sp, #32]
-        stp             xzr, xzr, [sp, #48]
+        stp            xzr, xzr, [sp]
+        stp            xzr, xzr, [sp, #16]
+        stp            xzr, xzr, [sp, #32]
+        stp            xzr, xzr, [sp, #48]
        mov             w8,  #4
 0:
        ldrsh           x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
        subs            w8,  w8,  #1
-        add             w10,  w8,  w5 // x10 = k + sao_left_class
-        and             w10, w10, #0x1F
+        add            w10,  w8,  w5 // x10 = k + sao_left_class
+        and            w10, w10, #0x1F
        strh            w9, [sp, x10, lsl #1]
        bne             0b
-        ld1             {v16.16b-v19.16b}, [sp], #64
-        movi            v20.8h,   #1
+        ld1            {v16.16b-v19.16b}, [sp], #64
+        movi           v20.8h,   #1
 1:      // beginning of line
        mov             w8,  w6
 2:
@@ -56,7 +56,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        // +----------------------------------->
        //    i-0     i-1     i-2     i-3
        // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        ld1             {v2.8b}, [x1]
+        ld1            {v2.8b}, [x1]
        // load src[x]
        uxtl            v0.8h,  v2.8b
        // >> shift
@@ -68,13 +68,13 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        // shift insert index to upper byte
        sli             v1.8h,  v3.8h, #8
        // table
-        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b
+        tbx            v2.16b, {v16.16b-v19.16b}, v1.16b
        // src[x] + table
        add             v1.8h,  v0.8h, v2.8h
        // clip + narrow
        sqxtun          v4.8b,  v1.8h
        // store
-        st1             {v4.8b}, [x0]
+        st1            {v4.8b}, [x0]
        // done 8 pixels
        subs            w8, w8,  #8
        bne             2b
@@ -26,297 +26,295 @@
  .if \avg
        mov             x12, x0
  .endif
-1:      ld1             {v0.16b},  [x1], x2
-        ld1             {v1.16b},  [x1], x2
-        ld1             {v2.16b},  [x1], x2
-        ld1             {v3.16b},  [x1], x2
+1:      ld1             {v0.16B},  [x1], x2
+        ld1             {v1.16B},  [x1], x2
+        ld1             {v2.16B},  [x1], x2
+        ld1             {v3.16B},  [x1], x2
  .if \avg
-        ld1             {v4.16b},  [x12], x2
-        urhadd          v0.16b,  v0.16b,  v4.16b
-        ld1             {v5.16b},  [x12], x2
-        urhadd          v1.16b,  v1.16b,  v5.16b
-        ld1             {v6.16b},  [x12], x2
-        urhadd          v2.16b,  v2.16b,  v6.16b
-        ld1             {v7.16b},  [x12], x2
-        urhadd          v3.16b,  v3.16b,  v7.16b
+        ld1             {v4.16B},  [x12], x2
+        urhadd          v0.16B,  v0.16B,  v4.16B
+        ld1             {v5.16B},  [x12], x2
+        urhadd          v1.16B,  v1.16B,  v5.16B
+        ld1             {v6.16B},  [x12], x2
+        urhadd          v2.16B,  v2.16B,  v6.16B
+        ld1             {v7.16B},  [x12], x2
+        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
-        st1             {v0.16b},  [x0], x2
-        st1             {v1.16b},  [x0], x2
-        st1             {v2.16b},  [x0], x2
-        st1             {v3.16b},  [x0], x2
+        st1             {v0.16B},  [x0], x2
+        st1             {v1.16B},  [x0], x2
+        st1             {v2.16B},  [x0], x2
+        st1             {v3.16B},  [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_x2     rnd=1, avg=0
-1:
-        ldur            q1, [x1, #1]
-        ld1             {v0.16b}, [x1], x2
+1:      ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
-        ldur            q3, [x1, #1]
-        ld1             {v2.16b}, [x1], x2
-        avg             v0.16b,  v0.16b,  v1.16b
-        avg             v2.16b,  v2.16b,  v3.16b
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        avg             v0.16B,  v0.16B,  v1.16B
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
-        ld1             {v1.16b}, [x0], x2
-        ld1             {v3.16b}, [x0]
-        urhadd          v0.16b,  v0.16b,  v1.16b
-        urhadd          v2.16b,  v2.16b,  v3.16b
+        ld1             {v1.16B}, [x0], x2
+        ld1             {v3.16B}, [x0]
+        urhadd          v0.16B,  v0.16B,  v1.16B
+        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.16b}, [x0], x2
-        st1             {v2.16b}, [x0], x2
+        st1             {v0.16B}, [x0], x2
+        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16b}, [x1], x2
-        ld1             {v1.16b}, [x1], x2
+        ld1             {v0.16B}, [x1], x2
+        ld1             {v1.16B}, [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v2.16b,  v0.16b,  v1.16b
-        ld1             {v0.16b}, [x1], x2
-        avg             v3.16b,  v0.16b,  v1.16b
-        ld1             {v1.16b}, [x1], x2
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
+        ld1             {v1.16B}, [x1], x2
  .if \avg
-        ld1             {v4.16b}, [x0], x2
-        ld1             {v5.16b}, [x0]
-        urhadd          v2.16b,  v2.16b,  v4.16b
-        urhadd          v3.16b,  v3.16b,  v5.16b
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16b}, [x0], x2
-        st1             {v3.16b}, [x0], x2
+        st1             {v2.16B}, [x0], x2
+        st1             {v3.16B}, [x0], x2
        b.ne            1b

-        avg             v2.16b,  v0.16b,  v1.16b
-        ld1             {v0.16b}, [x1], x2
-        avg             v3.16b,  v0.16b,  v1.16b
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
-        ld1             {v4.16b}, [x0], x2
-        ld1             {v5.16b}, [x0]
-        urhadd          v2.16b,  v2.16b,  v4.16b
-        urhadd          v3.16b,  v3.16b,  v5.16b
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16b},     [x0], x2
-        st1             {v3.16b},     [x0], x2
+        st1             {v2.16B},     [x0], x2
+        st1             {v3.16B},     [x0], x2

        ret
 .endm

 .macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
-        ldur            q1, [x1, #1]
-        ld1             {v0.16b}, [x1], x2
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v4.16B, v5.16B}, [x1], x2
 NRND    movi            v26.8H, #1
-        ldur            q5, [x1, #1]
-        ld1             {v4.16b}, [x1], x2
-        uaddl           v16.8h,  v0.8b,   v1.8b
-        uaddl2          v20.8h,  v0.16b,  v1.16b
-        uaddl           v18.8h,  v4.8b,   v5.8b
-        uaddl2          v22.8h,  v4.16b,  v5.16b
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        ext             v5.16B,  v4.16B,  v5.16B,  #1
+        uaddl           v16.8H,  v0.8B,   v1.8B
+        uaddl2          v20.8H,  v0.16B,  v1.16B
+        uaddl           v18.8H,  v4.8B,   v5.8B
+        uaddl2          v22.8H,  v4.16B,  v5.16B
 1:      subs            w3,  w3,  #2
-        ldur            q30, [x1, #1]
-        ld1             {v0.16b}, [x1], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v1.8h,   v20.8h,  v22.8h
-        mshrn           v28.8b,  v24.8h,  #2
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16b, v1.8h,   #2
+        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
-        ld1             {v16.16b},        [x0]
-        urhadd          v28.16b, v28.16b, v16.16b
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
  .endif
-        uaddl           v16.8h,  v0.8b,   v30.8b
-        ldur            q3, [x1, #1]
-        ld1             {v2.16b}, [x1], x2
-        uaddl2          v20.8h,  v0.16b,  v30.16b
-        st1             {v28.16b},        [x0], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        ld1             {v2.16B, v3.16B}, [x1], x2
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v0.8h,   v20.8h,  v22.8h
-        mshrn           v30.8b,  v24.8h,  #2
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16b, v0.8h,   #2
+        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
-        ld1             {v18.16b},        [x0]
-        urhadd          v30.16b, v30.16b, v18.16b
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
  .endif
-        uaddl           v18.8h,   v2.8b,  v3.8b
-        uaddl2          v22.8h,   v2.16b, v3.16b
-        st1             {v30.16b},        [x0], x2
+        uaddl           v18.8H,   v2.8B,  v3.8B
+        uaddl2          v22.8H,   v2.16B, v3.16B
+        st1             {v30.16B},        [x0], x2
        b.gt            1b

-        ldur            q30, [x1, #1]
-        ld1             {v0.16b}, [x1], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v1.8h,   v20.8h,  v22.8h
-        mshrn           v28.8b,  v24.8h,  #2
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16b, v1.8h,   #2
+        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
-        ld1             {v16.16b},        [x0]
-        urhadd          v28.16b, v28.16b, v16.16b
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
  .endif
-        uaddl           v16.8h,  v0.8b,   v30.8b
-        uaddl2          v20.8h,  v0.16b,  v30.16b
-        st1             {v28.16b},        [x0], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v0.8h,   v20.8h,  v22.8h
-        mshrn           v30.8b,  v24.8h,  #2
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16b, v0.8h,   #2
+        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
-        ld1             {v18.16b},        [x0]
-        urhadd          v30.16b, v30.16b, v18.16b
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
  .endif
-        st1             {v30.16b},        [x0], x2
+        st1             {v30.16B},        [x0], x2

        ret
 .endm

 .macro  pixels8         rnd=1, avg=0
-1:      ld1             {v0.8b}, [x1], x2
-        ld1             {v1.8b}, [x1], x2
-        ld1             {v2.8b}, [x1], x2
-        ld1             {v3.8b}, [x1], x2
+1:      ld1             {v0.8B}, [x1], x2
+        ld1             {v1.8B}, [x1], x2
+        ld1             {v2.8B}, [x1], x2
+        ld1             {v3.8B}, [x1], x2
  .if \avg
-        ld1             {v4.8b}, [x0], x2
-        urhadd          v0.8b,  v0.8b,  v4.8b
-        ld1             {v5.8b}, [x0], x2
-        urhadd          v1.8b,  v1.8b,  v5.8b
-        ld1             {v6.8b}, [x0], x2
-        urhadd          v2.8b,  v2.8b,  v6.8b
-        ld1             {v7.8b}, [x0], x2
-        urhadd          v3.8b,  v3.8b,  v7.8b
+        ld1             {v4.8B}, [x0], x2
+        urhadd          v0.8B,  v0.8B,  v4.8B
+        ld1             {v5.8B}, [x0], x2
+        urhadd          v1.8B,  v1.8B,  v5.8B
+        ld1             {v6.8B}, [x0], x2
+        urhadd          v2.8B,  v2.8B,  v6.8B
+        ld1             {v7.8B}, [x0], x2
+        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
-        st1             {v0.8b}, [x0], x2
-        st1             {v1.8b}, [x0], x2
-        st1             {v2.8b}, [x0], x2
-        st1             {v3.8b}, [x0], x2
+        st1             {v0.8B}, [x0], x2
+        st1             {v1.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
+        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels8_x2      rnd=1, avg=0
-1:
-        ldur            d1, [x1, #1]
-        ld1             {v0.8b}, [x1], x2
-        ldur            d3, [x1, #1]
-        ld1             {v2.8b}, [x1], x2
+1:      ld1             {v0.8B, v1.8B}, [x1], x2
+        ext             v1.8B,  v0.8B,  v1.8B,  #1
+        ld1             {v2.8B, v3.8B}, [x1], x2
+        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
-        avg             v0.8b,   v0.8b,   v1.8b
-        avg             v2.8b,   v2.8b,   v3.8b
+        avg             v0.8B,   v0.8B,   v1.8B
+        avg             v2.8B,   v2.8B,   v3.8B
  .if \avg
-        ld1             {v4.8b},     [x0], x2
-        ld1             {v5.8b},     [x0]
-        urhadd          v0.8b,   v0.8b,   v4.8b
-        urhadd          v2.8b,   v2.8b,   v5.8b
+        ld1             {v4.8B},     [x0], x2
+        ld1             {v5.8B},     [x0]
+        urhadd          v0.8B,   v0.8B,   v4.8B
+        urhadd          v2.8B,   v2.8B,   v5.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.8b}, [x0], x2
-        st1             {v2.8b}, [x0], x2
+        st1             {v0.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.8b},  [x1], x2
-        ld1             {v1.8b},  [x1], x2
+        ld1             {v0.8B},  [x1], x2
+        ld1             {v1.8B},  [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v4.8b,  v0.8b,  v1.8b
-        ld1             {v0.8b},  [x1], x2
-        avg             v5.8b,  v0.8b,  v1.8b
-        ld1             {v1.8b},  [x1], x2
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
+        ld1             {v1.8B},  [x1], x2
  .if \avg
-        ld1             {v2.8b},     [x0], x2
-        ld1             {v3.8b},     [x0]
-        urhadd          v4.8b,  v4.8b,  v2.8b
-        urhadd          v5.8b,  v5.8b,  v3.8b
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8b},     [x0], x2
-        st1             {v5.8b},     [x0], x2
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
        b.ne            1b

-        avg             v4.8b,  v0.8b,  v1.8b
-        ld1             {v0.8b},  [x1], x2
-        avg             v5.8b,  v0.8b,  v1.8b
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
-        ld1             {v2.8b},     [x0], x2
-        ld1             {v3.8b},     [x0]
-        urhadd          v4.8b,  v4.8b,  v2.8b
-        urhadd          v5.8b,  v5.8b,  v3.8b
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8b},     [x0], x2
-        st1             {v5.8b},     [x0], x2
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2

        ret
 .endm

 .macro  pixels8_xy2     rnd=1, avg=0
-        ldur            d4, [x1, #1]
        sub             w3,  w3,  #2
-        ld1             {v0.8b}, [x1], x2
+        ld1             {v0.16B},     [x1], x2
+        ld1             {v1.16B},     [x1], x2
 NRND    movi            v19.8H, #1
-        ldur            d6, [x1, #1]
-        ld1             {v1.8b}, [x1], x2
-        uaddl           v16.8h,  v0.8b,  v4.8b
-        uaddl           v17.8h,  v1.8b,  v6.8b
+        ext             v4.16B,  v0.16B,  v4.16B,  #1
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        uaddl           v17.8H,  v1.8B,  v6.8B
 1:      subs            w3,  w3,  #2
-        ldur            d4, [x1, #1]
-        ld1             {v0.8b}, [x1], x2
-        add             v18.8h, v16.8h,  v17.8h
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
+        ext             v4.16B,  v0.16B,  v4.16B,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8h,  v0.8b,  v4.8b
-        mshrn           v5.8b,  v18.8h, #2
-        ldur            d6, [x1, #1]
-        ld1             {v1.8b}, [x1], x2
-        add             v18.8h, v16.8h,  v17.8h
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        ld1             {v1.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
  .if \avg
-        ld1             {v7.8b},     [x0]
-        urhadd          v5.8b,  v5.8b,  v7.8b
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8b},     [x0], x2
-        mshrn           v7.8b,  v18.8h, #2
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
  .if \avg
-        ld1             {v5.8b},     [x0]
-        urhadd          v7.8b,  v7.8b,  v5.8b
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
-        uaddl           v17.8h,  v1.8b,   v6.8b
-        st1             {v7.8b},     [x0], x2
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v17.8H,  v1.8B,   v6.8B
+        st1             {v7.8B},     [x0], x2
        b.gt            1b

-        ldur            d4, [x1, #1]
-        ld1             {v0.8b}, [x1], x2
-        add             v18.8h, v16.8h, v17.8h
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H, v17.8H
+        ext             v4.16B, v0.16B, v4.16B,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8h,  v0.8b, v4.8b
-        mshrn           v5.8b,  v18.8h, #2
-        add             v18.8h, v16.8h, v17.8h
+        uaddl           v16.8H,  v0.8B, v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        add             v18.8H, v16.8H, v17.8H
  .if \avg
-        ld1             {v7.8b},     [x0]
-        urhadd          v5.8b,  v5.8b,  v7.8b
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8b},     [x0], x2
-        mshrn           v7.8b,  v18.8h, #2
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
  .if \avg
-        ld1             {v5.8b},     [x0]
-        urhadd          v7.8b,  v7.8b,  v5.8b
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
-        st1             {v7.8b},     [x0], x2
+        st1             {v7.8B},     [x0], x2

        ret
 .endm
@@ -19,7 +19,6 @@
 #ifndef AVCODEC_AARCH64_IDCT_H
 #define AVCODEC_AARCH64_IDCT_H

-#include <stddef.h>
 #include <stdint.h>

 void ff_simple_idct_neon(int16_t *data);
@@ -17,133 +17,133 @@
 */

 .macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
-        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
-        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
-        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
-        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
-        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
-        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
-        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
+        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
+        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
+        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
+        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B

-        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
-        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
-        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
-        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
-        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
-        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
-        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
-        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h
+        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
+        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
+        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
+        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
+        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
+        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
+        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
+        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H

-        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
-        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
+        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S

-        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
-        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
+        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S

-        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
-        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s
+        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
+        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S

-        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
-        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
+        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
+        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
 .endm

 .macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-        trn1            \t0\().16b, \r0\().16b, \r1\().16b
-        trn2            \t1\().16b, \r0\().16b, \r1\().16b
-        trn1            \r1\().16b, \r2\().16b, \r3\().16b
-        trn2            \r3\().16b, \r2\().16b, \r3\().16b
-        trn1            \r0\().16b, \r4\().16b, \r5\().16b
-        trn2            \r5\().16b, \r4\().16b, \r5\().16b
-        trn1            \r2\().16b, \r6\().16b, \r7\().16b
-        trn2            \r7\().16b, \r6\().16b, \r7\().16b
+        trn1            \t0\().16B, \r0\().16B, \r1\().16B
+        trn2            \t1\().16B, \r0\().16B, \r1\().16B
+        trn1            \r1\().16B, \r2\().16B, \r3\().16B
+        trn2            \r3\().16B, \r2\().16B, \r3\().16B
+        trn1            \r0\().16B, \r4\().16B, \r5\().16B
+        trn2            \r5\().16B, \r4\().16B, \r5\().16B
+        trn1            \r2\().16B, \r6\().16B, \r7\().16B
+        trn2            \r7\().16B, \r6\().16B, \r7\().16B

-        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
-        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
-        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
-        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
-        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
-        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
-        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
-        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h
+        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
+        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
+        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
+        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
+        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
+        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
+        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H

-        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
-        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
+        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
+        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S

-        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
-        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
+        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
+        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S

-        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
-        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s
+        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
+        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S

-        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
-        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
+        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
 .endm

 .macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().16b, \r0\().16b,  \r1\().16b
-        trn2            \t5\().16b, \r0\().16b,  \r1\().16b
-        trn1            \t6\().16b, \r2\().16b,  \r3\().16b
-        trn2            \t7\().16b, \r2\().16b,  \r3\().16b
+        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
+        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
+        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
+        trn2            \t7\().16B, \r2\().16B,  \r3\().16B

-        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
-        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
-        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
-        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
+        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
+        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
+        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
+        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
 .endm

 .macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
-        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
-        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
-        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B

-        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
-        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
-        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
-        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
+        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
+        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
+        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
+        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
 .endm

 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
-        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
-        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
-        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
-        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h
-        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
-        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
-        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
-        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
+        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
+        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
+        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
+        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
+        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
+        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
 .endm

 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
-        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
-        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
-        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
-        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
-        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
-        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
-        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
+        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
+        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
+        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
+        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
+        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
+        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H

-        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
-        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
-        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
-        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
-        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
-        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
-        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
-        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s
+        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
+        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
+        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
+        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
+        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
+        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
+        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S

-        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
-        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
+        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
+        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D

-        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
-        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
+        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
+        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D

-        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
-        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d
+        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
+        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D

-        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
-        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d
+        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
+        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D

 .endm
@@ -33,81 +33,81 @@ const tab_x2, align=4
 endconst

 function ff_opus_deemphasis_neon, export=1
-        movrel          x4, tab_st
-        ld1             {v4.4s}, [x4]
-        movrel          x4, tab_x0
-        ld1             {v5.4s}, [x4]
-        movrel          x4, tab_x1
-        ld1             {v6.4s}, [x4]
-        movrel          x4, tab_x2
-        ld1             {v7.4s}, [x4]
+        movrel  x4, tab_st
+        ld1    {v4.4s}, [x4]
+        movrel  x4, tab_x0
+        ld1    {v5.4s}, [x4]
+        movrel  x4, tab_x1
+        ld1    {v6.4s}, [x4]
+        movrel  x4, tab_x2
+        ld1    {v7.4s}, [x4]

-        fmul            v0.4s, v4.4s, v0.s[0]
+        fmul v0.4s, v4.4s, v0.s[0]

-1:      ld1             {v1.4s, v2.4s}, [x1], #32
+1:      ld1  {v1.4s, v2.4s}, [x1], #32

-        fmla            v0.4s, v5.4s, v1.s[0]
-        fmul            v3.4s, v7.4s, v2.s[2]
+        fmla v0.4s, v5.4s, v1.s[0]
+        fmul v3.4s, v7.4s, v2.s[2]

-        fmla            v0.4s, v6.4s, v1.s[1]
-        fmla            v3.4s, v6.4s, v2.s[1]
+        fmla v0.4s, v6.4s, v1.s[1]
+        fmla v3.4s, v6.4s, v2.s[1]

-        fmla            v0.4s, v7.4s, v1.s[2]
-        fmla            v3.4s, v5.4s, v2.s[0]
+        fmla v0.4s, v7.4s, v1.s[2]
+        fmla v3.4s, v5.4s, v2.s[0]

-        fadd            v1.4s, v1.4s, v0.4s
-        fadd            v2.4s, v2.4s, v3.4s
+        fadd v1.4s, v1.4s, v0.4s
+        fadd v2.4s, v2.4s, v3.4s

-        fmla            v2.4s, v4.4s, v1.s[3]
+        fmla v2.4s, v4.4s, v1.s[3]

-        st1             {v1.4s, v2.4s}, [x0], #32
-        fmul            v0.4s, v4.4s, v2.s[3]
+        st1  {v1.4s, v2.4s}, [x0], #32
+        fmul v0.4s, v4.4s, v2.s[3]

-        subs            w2, w2, #8
-        b.gt            1b
+        subs w2, w2, #8
+        b.gt 1b

-        mov             s0, v2.s[3]
+        mov s0, v2.s[3]

        ret
 endfunc

 function ff_opus_postfilter_neon, export=1
-        ld1             {v0.4s}, [x2]
-        dup             v1.4s, v0.s[1]
-        dup             v2.4s, v0.s[2]
-        dup             v0.4s, v0.s[0]
+        ld1 {v0.4s}, [x2]
+        dup v1.4s, v0.s[1]
+        dup v2.4s, v0.s[2]
+        dup v0.4s, v0.s[0]

-        add             w1, w1, #2
-        sub             x1, x0, x1, lsl #2
+        add w1, w1, #2
+        sub x1, x0, x1, lsl #2

-        ld1             {v3.4s}, [x1]
-        fmul            v3.4s, v3.4s, v2.4s
+        ld1 {v3.4s}, [x1]
+        fmul v3.4s, v3.4s, v2.4s

-1:      add             x1, x1, #4
-        ld1             {v4.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v5.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v6.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v7.4s}, [x1]
+1:      add x1, x1, #4
+        ld1 {v4.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v5.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v6.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v7.4s}, [x1]

-        fmla            v3.4s, v7.4s, v2.4s
-        fadd            v6.4s, v6.4s, v4.4s
+        fmla v3.4s, v7.4s, v2.4s
+        fadd v6.4s, v6.4s, v4.4s

-        ld1             {v4.4s}, [x0]
-        fmla            v4.4s, v5.4s, v0.4s
+        ld1 {v4.4s}, [x0]
+        fmla v4.4s, v5.4s, v0.4s

-        fmul            v6.4s, v6.4s, v1.4s
-        fadd            v6.4s, v6.4s, v3.4s
+        fmul v6.4s, v6.4s, v1.4s
+        fadd v6.4s, v6.4s, v3.4s

-        fadd            v4.4s, v4.4s, v6.4s
-        fmul            v3.4s, v7.4s, v2.4s
+        fadd v4.4s, v4.4s, v6.4s
+        fmul v3.4s, v7.4s, v2.4s

-        st1             {v4.4s}, [x0], #16
+        st1  {v4.4s}, [x0], #16

-        subs            w3, w3, #4
-        b.gt            1b
+        subs w3, w3, #4
+        b.gt 1b

        ret
 endfunc
@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64
-1:      ld1             {v0.4s}, [x0]
-        ld1             {v1.4s}, [x1], #16
-        fadd            v0.4s, v0.4s, v1.4s
-        ld1             {v2.4s}, [x2], #16
-        fadd            v0.4s, v0.4s, v2.4s
-        ld1             {v3.4s}, [x3], #16
-        fadd            v0.4s, v0.4s, v3.4s
-        ld1             {v4.4s}, [x4], #16
-        fadd            v0.4s, v0.4s, v4.4s
-        st1             {v0.4s}, [x0], #16
+1:      ld1             {v0.4S}, [x0]
+        ld1             {v1.4S}, [x1], #16
+        fadd            v0.4S, v0.4S, v1.4S
+        ld1             {v2.4S}, [x2], #16
+        fadd            v0.4S, v0.4S, v2.4S
+        ld1             {v3.4S}, [x3], #16
+        fadd            v0.4S, v0.4S, v3.4S
+        ld1             {v4.4S}, [x4], #16
+        fadd            v0.4S, v0.4S, v4.4S
+        st1             {v0.4S}, [x0], #16
        subs            x5, x5, #4
        b.gt            1b
        ret
 endfunc

 function ff_sbr_sum_square_neon, export=1
-        movi            v0.4s, #0
-1:      ld1             {v1.4s}, [x0], #16
-        fmla            v0.4s, v1.4s, v1.4s
+        movi            v0.4S, #0
+1:      ld1             {v1.4S}, [x0], #16
+        fmla            v0.4S, v1.4S, v1.4S
        subs            w1, w1, #2
        b.gt            1b
-        faddp           v0.4s, v0.4s, v0.4s
-        faddp           v0.4s, v0.4s, v0.4s
+        faddp           v0.4S, v0.4S, v0.4S
+        faddp           v0.4S, v0.4S, v0.4S
        ret
 endfunc

 function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0
-        movi            v5.4s, #1<<7, lsl #24
-        ld2             {v0.4s, v1.4s}, [x0], #32
-        eor             v1.16b, v1.16b, v5.16b
-        ld2             {v2.4s, v3.4s}, [x0], #32
+        movi            v5.4S, #1<<7, lsl #24
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
 .rept 3
-        st2             {v0.4s, v1.4s}, [x1], #32
-        eor             v3.16b, v3.16b, v5.16b
-        ld2             {v0.4s, v1.4s}, [x0], #32
-        st2             {v2.4s, v3.4s}, [x1], #32
-        eor             v1.16b, v1.16b, v5.16b
-        ld2             {v2.4s, v3.4s}, [x0], #32
+        st2             {v0.4S, v1.4S}, [x1], #32
+        eor             v3.16B, v3.16B, v5.16B
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
 .endr
-        eor             v3.16b, v3.16b, v5.16b
-        st2             {v0.4s, v1.4s}, [x1], #32
-        st2             {v2.4s, v3.4s}, [x1], #32
+        eor             v3.16B, v3.16B, v5.16B
+        st2             {v0.4S, v1.4S}, [x1], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
        ret
 endfunc

@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x2, x0, #64*4
        mov             x3, #-16
        mov             x4, #-4
-        movi            v6.4s, #1<<7, lsl #24
-        ld1             {v0.2s}, [x0], #8
-        st1             {v0.2s}, [x2], #8
+        movi            v6.4S, #1<<7, lsl #24
+        ld1             {v0.2S}, [x0], #8
+        st1             {v0.2S}, [x2], #8
 .rept 7
-        ld1             {v1.4s}, [x1], x3
-        ld1             {v2.4s}, [x0], #16
-        eor             v1.16b, v1.16b, v6.16b
-        rev64           v1.4s, v1.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        st2             {v1.4s, v2.4s}, [x2], #32
+        ld1             {v1.4S}, [x1], x3
+        ld1             {v2.4S}, [x0], #16
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st2             {v1.4S, v2.4S}, [x2], #32
 .endr
        add             x1, x1, #8
-        ld1             {v1.2s}, [x1], x4
-        ld1             {v2.2s}, [x0], #8
-        ld1             {v1.s}[3], [x1]
-        ld1             {v2.s}[2], [x0]
-        eor             v1.16b, v1.16b, v6.16b
-        rev64           v1.4s, v1.4s
-        st2             {v1.2s, v2.2s}, [x2], #16
-        st2             {v1.s, v2.s}[2], [x2]
+        ld1             {v1.2S}, [x1], x4
+        ld1             {v2.2S}, [x0], #8
+        ld1             {v1.S}[3], [x1]
+        ld1             {v2.S}[2], [x0]
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        st2             {v1.2S, v2.2S}, [x2], #16
+        st2             {v1.S, v2.S}[2], [x2]
        ret
 endfunc

@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4
        mov             x3, #-16
        mov             x4, #32
-        movi            v6.4s, #1<<7, lsl #24
-1:      ld1             {v0.4s}, [x2], x3
-        ld1             {v1.4s}, [x1], #16
-        eor             v0.16b, v0.16b, v6.16b
-        rev64           v0.4s, v0.4s
-        ext             v0.16b, v0.16b, v0.16b, #8
-        st2             {v0.4s, v1.4s}, [x0], #32
+        movi            v6.4S, #1<<7, lsl #24
+1:      ld1             {v0.4S}, [x2], x3
+        ld1             {v1.4S}, [x1], #16
+        eor             v0.16B, v0.16B, v6.16B
+        rev64           v0.4S, v0.4S
+        ext             v0.16B, v0.16B, v0.16B, #8
+        st2             {v0.4S, v1.4S}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
        add             x2, x0, #60*4
        mov             x3, #-32
        mov             x4, #32
-        movi            v2.4s, #1<<7, lsl #24
-1:      ld2             {v0.4s, v1.4s}, [x1], x3
-        eor             v0.16b, v0.16b, v2.16b
-        rev64           v1.4s, v1.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v0.4s}, [x2]
-        st1             {v1.4s}, [x0], #16
+        movi            v2.4S, #1<<7, lsl #24
+1:      ld2             {v0.4S, v1.4S}, [x1], x3
+        eor             v0.16B, v0.16B, v2.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st1             {v0.4S}, [x2]
+        st1             {v1.4S}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x3, x0, #124*4
        mov             x4, #64
        mov             x5, #-16
-1:      ld1             {v0.4s}, [x1], #16
-        ld1             {v1.4s}, [x2], x5
-        rev64           v2.4s, v0.4s
-        ext             v2.16b, v2.16b, v2.16b, #8
-        rev64           v3.4s, v1.4s
-        ext             v3.16b, v3.16b, v3.16b, #8
-        fadd            v1.4s, v1.4s, v2.4s
-        fsub            v0.4s, v0.4s, v3.4s
-        st1             {v0.4s}, [x0], #16
-        st1             {v1.4s}, [x3], x5
+1:      ld1             {v0.4S}, [x1], #16
+        ld1             {v1.4S}, [x2], x5
+        rev64           v2.4S, v0.4S
+        ext             v2.16B, v2.16B, v2.16B, #8
+        rev64           v3.4S, v1.4S
+        ext             v3.16B, v3.16B, v3.16B, #8
+        fadd            v1.4S, v1.4S, v2.4S
+        fsub            v0.4S, v0.4S, v3.4S
+        st1             {v0.4S}, [x0], #16
+        st1             {v1.4S}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
-        ld1             {v7.4s}, [x6]
-        dup             v1.4s, v0.s[0]
-        mov             v2.8b, v1.8b
-        mov             v2.s[2], v7.s[0]
-        mov             v2.s[3], v7.s[0]
-        fmul            v1.4s, v1.4s, v2.4s
-        ld1             {v0.d}[0], [x3]
-        ld1             {v0.d}[1], [x2]
-        fmul            v0.4s, v0.4s, v1.4s
-        fmul            v1.4s, v0.4s, v7.4s
-        rev64           v0.4s, v0.4s
+        ld1             {v7.4S}, [x6]
+        dup             v1.4S, v0.S[0]
+        mov             v2.8B, v1.8B
+        mov             v2.S[2], v7.S[0]
+        mov             v2.S[3], v7.S[0]
+        fmul            v1.4S, v1.4S, v2.4S
+        ld1             {v0.D}[0], [x3]
+        ld1             {v0.D}[1], [x2]
+        fmul            v0.4S, v0.4S, v1.4S
+        fmul            v1.4S, v0.4S, v7.4S
+        rev64           v0.4S, v0.4S
        sub             x7, x5, x4
        add             x0, x0, x4, lsl #3
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16
-1:      ld1             {v2.4s}, [x1], #16
-        ld1             {v3.2s}, [x1]
-        fmul            v4.4s, v2.4s, v1.4s
-        fmul            v5.4s, v2.4s, v0.4s
-        faddp           v4.4s, v4.4s, v4.4s
-        faddp           v5.4s, v5.4s, v5.4s
-        faddp           v4.4s, v4.4s, v4.4s
-        faddp           v5.4s, v5.4s, v5.4s
-        mov             v4.s[1], v5.s[0]
-        fadd            v4.2s, v4.2s, v3.2s
-        st1             {v4.2s}, [x0], #8
+1:      ld1             {v2.4S}, [x1], #16
+        ld1             {v3.2S}, [x1]
+        fmul            v4.4S, v2.4S, v1.4S
+        fmul            v5.4S, v2.4S, v0.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        mov             v4.S[1], v5.S[0]
+        fadd            v4.2S, v4.2S, v3.2S
+        st1             {v4.2S}, [x0], #8
        sub             x1, x1, #8
        subs            x7, x7, #1
        b.gt            1b
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x4, w4
        mov             x5, #40*2*4
        add             x1, x1, x4, lsl #3
-1:      ld1             {v0.2s}, [x1], x5
-        ld1             {v1.s}[0], [x2], #4
-        fmul            v2.4s, v0.4s, v1.s[0]
-        st1             {v2.2s}, [x0], #8
+1:      ld1             {v0.2S}, [x1], x5
+        ld1             {v1.S}[0], [x2], #4
+        fmul            v2.4S, v0.4S, v1.S[0]
+        st1             {v2.2S}, [x0], #8
        subs            x3, x3, #1
        b.gt            1b
        ret
@@ -227,46 +227,46 @@ endfunc
 function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38
        movrel          x3, factors
-        ld1             {v0.4s}, [x3]
-        movi            v1.4s, #0
-        movi            v2.4s, #0
-        movi            v3.4s, #0
-        ld1             {v4.2s}, [x0], #8
-        ld1             {v5.2s}, [x0], #8
-        fmul            v16.2s, v4.2s, v4.2s
-        fmul            v17.2s, v5.2s, v4.s[0]
-        fmul            v18.2s, v5.2s, v4.s[1]
-1:      ld1             {v5.d}[1], [x0], #8
-        fmla            v1.2s, v4.2s, v4.2s
-        fmla            v2.4s, v5.4s, v4.s[0]
-        fmla            v3.4s, v5.4s, v4.s[1]
-        mov             v4.d[0], v5.d[0]
-        mov             v5.d[0], v5.d[1]
+        ld1             {v0.4S}, [x3]            // accumulates lagged products of the float pairs at x0 and stores them at x1; v0 = 4 floats from `factors`
+        movi            v1.4S, #0                // v1: accumulator for v4 * v4
+        movi            v2.4S, #0                // v2: accumulator for v5 * v4.S[0]
+        movi            v3.4S, #0                // v3: accumulator for v5 * v4.S[1]
+        ld1             {v4.2S}, [x0], #8        // v4 = first 8-byte pair at x0; x0 += 8
+        ld1             {v5.2S}, [x0], #8        // v5 = second pair; x0 += 8
+        fmul            v16.2S, v4.2S, v4.2S     // products of the first pair, subtracted again after the loop
+        fmul            v17.2S, v5.2S, v4.S[0]   // likewise, subtracted from v22 below
+        fmul            v18.2S, v5.2S, v4.S[1]   // likewise, subtracted from v23 below
+1:      ld1             {v5.D}[1], [x0], #8      // loop (x2 counts down from 38): next pair -> upper half of v5
+        fmla            v1.2S, v4.2S, v4.2S      // v1 += v4 * v4 (lane-wise)
+        fmla            v2.4S, v5.4S, v4.S[0]    // v2 += v5 * v4.S[0]
+        fmla            v3.4S, v5.4S, v4.S[1]    // v3 += v5 * v4.S[1]
+        mov             v4.D[0], v5.D[0]         // slide the window: v4 <- low half of v5
+        mov             v5.D[0], v5.D[1]         // v5 low half <- v5 high half
        subs            x2, x2, #1
        b.gt            1b
-        fmul            v19.2s, v4.2s, v4.2s
-        fmul            v20.2s, v5.2s, v4.s[0]
-        fmul            v21.2s, v5.2s, v4.s[1]
-        fadd            v22.4s, v2.4s, v20.4s
-        fsub            v22.4s, v22.4s, v17.4s
-        fadd            v23.4s, v3.4s, v21.4s
-        fsub            v23.4s, v23.4s, v18.4s
-        rev64           v23.4s, v23.4s
-        fmul            v23.4s, v23.4s, v0.4s
-        fadd            v22.4s, v22.4s, v23.4s
-        st1             {v22.4s}, [x1], #16
-        fadd            v23.2s, v1.2s, v19.2s
-        fsub            v23.2s, v23.2s, v16.2s
-        faddp           v23.2s, v23.2s, v23.2s
-        st1             {v23.s}[0], [x1]
+        fmul            v19.2S, v4.2S, v4.2S     // products for the pairs left in v4/v5 after the loop
+        fmul            v20.2S, v5.2S, v4.S[0]
+        fmul            v21.2S, v5.2S, v4.S[1]
+        fadd            v22.4S, v2.4S, v20.4S    // v22 = v2 + v20 (a .2S write zeroes the upper two lanes, so only the low lanes change)
+        fsub            v22.4S, v22.4S, v17.4S   // v22 -= first-pair products
+        fadd            v23.4S, v3.4S, v21.4S    // v23 = v3 + v21
+        fsub            v23.4S, v23.4S, v18.4S   // v23 -= first-pair products
+        rev64           v23.4S, v23.4S           // swap the two 32-bit lanes inside each 64-bit half
+        fmul            v23.4S, v23.4S, v0.4S    // scale by `factors`
+        fadd            v22.4S, v22.4S, v23.4S   // v22 += swapped, scaled v23
+        st1             {v22.4S}, [x1], #16      // store 16 bytes at x1; x1 += 16
+        fadd            v23.2S, v1.2S, v19.2S    // v23 = v1 + v19
+        fsub            v23.2S, v23.2S, v16.2S   // v23 -= first-pair products
+        faddp           v23.2S, v23.2S, v23.2S   // lane 0 = lane 0 + lane 1
+        st1             {v23.S}[0], [x1]         // store that 4-byte sum at x1 (no post-increment)
        add             x1, x1, #8
-        rev64           v3.2s, v3.2s
-        fmul            v3.2s, v3.2s, v0.2s
-        fadd            v2.2s, v2.2s, v3.2s
-        st1             {v2.2s}, [x1]
+        rev64           v3.2S, v3.2S             // swap the low two lanes of the raw v3 accumulator
+        fmul            v3.2S, v3.2S, v0.2S      // scale by the first two `factors`
+        fadd            v2.2S, v2.2S, v3.2S      // v2 (low lanes) += swapped, scaled v3
+        st1             {v2.2S}, [x1]            // store 8 bytes at x1
        add             x1, x1, #16
-        faddp           v1.2s, v1.2s, v1.2s
-        st1             {v1.s}[0], [x1]
+        faddp           v1.2S, v1.2S, v1.2S      // lane 0 = lane 0 + lane 1 of the raw v1 accumulator
+        st1             {v1.S}[0], [x1]          // store that 4-byte sum at x1
        ret
 endfunc

@@ -278,25 +278,25 @@ endfunc
 1:      and             x3, x3, #0x1ff
        add             x8, x7, x3, lsl #3
        add             x3, x3, #2
-        ld1             {v2.4s}, [x0]
-        ld1             {v3.2s}, [x1], #8
-        ld1             {v4.2s}, [x2], #8
-        ld1             {v5.4s}, [x8]
-        mov             v6.16b, v2.16b
-        zip1            v3.4s, v3.4s, v3.4s
-        zip1            v4.4s, v4.4s, v4.4s
-        fmla            v6.4s, v1.4s, v3.4s
-        fmla            v2.4s, v5.4s, v4.4s
-        fcmeq           v7.4s, v3.4s, #0
-        bif             v2.16b, v6.16b, v7.16b
-        st1             {v2.4s}, [x0], #16
+        ld1             {v2.4S}, [x0]
+        ld1             {v3.2S}, [x1], #8
+        ld1             {v4.2S}, [x2], #8
+        ld1             {v5.4S}, [x8]
+        mov             v6.16B, v2.16B
+        zip1            v3.4S, v3.4S, v3.4S
+        zip1            v4.4S, v4.4S, v4.4S
+        fmla            v6.4S, v1.4S, v3.4S
+        fmla            v2.4S, v5.4S, v4.4S
+        fcmeq           v7.4S, v3.4S, #0
+        bif             v2.16B, v6.16B, v7.16B
+        st1             {v2.4S}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
 .endm

 function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]            // v1 = four 32-bit lanes from phi_noise_0, loaded before expanding apply_noise_common
        apply_noise_common
        ret
 endfunc
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc

 function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]            // v1 = four 32-bit lanes from phi_noise_2, loaded before expanding apply_noise_common
        apply_noise_common
        ret
 endfunc
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -54,7 +54,7 @@ endconst
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
-        ld1             {v0.2d}, [x3]
+        ld1             {v0.2D}, [x3]
 .endm

 .macro idct_end
@@ -74,146 +74,146 @@ endconst
 .endm

 .macro idct_col4_top y1, y2, y3, y4, i, l
-        smull\i         v7.4s,  \y3\l, z2
-        smull\i         v16.4s, \y3\l, z6
-        smull\i         v17.4s, \y2\l, z1
-        add             v19.4s, v23.4s, v7.4s
-        smull\i         v18.4s, \y2\l, z3
-        add             v20.4s, v23.4s, v16.4s
-        smull\i         v5.4s,  \y2\l, z5
-        sub             v21.4s, v23.4s, v16.4s
-        smull\i         v6.4s,  \y2\l, z7
-        sub             v22.4s, v23.4s, v7.4s
+        smull\i         v7.4S,  \y3\l, z2        // shared first part of the IDCT: builds v19..v22 from v23 and \y3, and v17/v18/v5/v6 from \y2 and \y4; v7 = \y3 * z2 (\i is appended to the mnemonic; presumably selects smull/smull2 - confirm)
+        smull\i         v16.4S, \y3\l, z6        // v16 = \y3 * z6
+        smull\i         v17.4S, \y2\l, z1        // v17 = \y2 * z1
+        add             v19.4S, v23.4S, v7.4S    // v19 = v23 + \y3*z2
+        smull\i         v18.4S, \y2\l, z3        // v18 = \y2 * z3
+        add             v20.4S, v23.4S, v16.4S   // v20 = v23 + \y3*z6
+        smull\i         v5.4S,  \y2\l, z5        // v5  = \y2 * z5
+        sub             v21.4S, v23.4S, v16.4S   // v21 = v23 - \y3*z6
+        smull\i         v6.4S,  \y2\l, z7        // v6  = \y2 * z7
+        sub             v22.4S, v23.4S, v7.4S    // v22 = v23 - \y3*z2

-        smlal\i         v17.4s, \y4\l, z3
-        smlsl\i         v18.4s, \y4\l, z7
-        smlsl\i         v5.4s,  \y4\l, z1
-        smlsl\i         v6.4s,  \y4\l, z5
+        smlal\i         v17.4S, \y4\l, z3        // v17 += \y4 * z3
+        smlsl\i         v18.4S, \y4\l, z7        // v18 -= \y4 * z7
+        smlsl\i         v5.4S,  \y4\l, z1        // v5  -= \y4 * z1
+        smlsl\i         v6.4S,  \y4\l, z5        // v6  -= \y4 * z5
 .endm

 .macro idct_row4_neon y1, y2, y3, y4, pass
-        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
-        movi            v23.4s, #1<<2, lsl #8
-        orr             v5.16b, \y1\().16b, \y2\().16b
-        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
-        orr             v6.16b, \y3\().16b, \y4\().16b
-        orr             v5.16b, v5.16b, v6.16b
-        mov             x3, v5.d[1]
-        smlal           v23.4s, \y1\().4h, z4
+        ld1             {\y1\().2D,\y2\().2D}, [x2], #32   // row pass over 64 bytes at x2: transform, shift by ROW_SHIFT, re-interleave into \y1..\y4; \y1,\y2 = first 32 bytes; x2 += 32
+        movi            v23.4S, #1<<2, lsl #8              // every 32-bit lane of v23 = (1<<2) << 8
+        orr             v5.16B, \y1\().16B, \y2\().16B     // v5 = \y1 | \y2
+        ld1             {\y3\().2D,\y4\().2D}, [x2], #32   // \y3,\y4 = next 32 bytes; x2 += 32
+        orr             v6.16B, \y3\().16B, \y4\().16B     // v6 = \y3 | \y4
+        orr             v5.16B, v5.16B, v6.16B             // v5 = OR of all four inputs
+        mov             x3, v5.D[1]                        // x3 = upper 64 bits of that OR; tested against 0 below
+        smlal           v23.4S, \y1\().4H, z4              // v23 += low four halfwords of \y1 * z4

-        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4h
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H         // shared first part, applied to the low (.4H) halves

        cmp             x3, #0
        b.eq            \pass\()f

-        smull2          v7.4s, \y1\().8h, z4
-        smlal2          v17.4s, \y2\().8h, z5
-        smlsl2          v18.4s, \y2\().8h, z1
-        smull2          v16.4s, \y3\().8h, z2
-        smlal2          v5.4s, \y2\().8h, z7
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v7.4s
-        sub             v21.4s, v21.4s, v7.4s
-        add             v22.4s, v22.4s, v7.4s
-        smlal2          v6.4s, \y2\().8h, z3
-        smull2          v7.4s, \y3\().8h, z6
-        smlal2          v17.4s, \y4\().8h, z7
-        smlsl2          v18.4s, \y4\().8h, z5
-        smlal2          v5.4s, \y4\().8h, z3
-        smlsl2          v6.4s, \y4\().8h, z1
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v16.4s
-        add             v21.4s, v21.4s, v16.4s
-        sub             v22.4s, v22.4s, v7.4s
+        smull2          v7.4S, \y1\().8H, z4               // upper-half terms (skipped by the b.eq above when x3 == 0): v7 = hi(\y1) * z4
+        smlal2          v17.4S, \y2\().8H, z5              // v17 += hi(\y2) * z5
+        smlsl2          v18.4S, \y2\().8H, z1              // v18 -= hi(\y2) * z1
+        smull2          v16.4S, \y3\().8H, z2              // v16 = hi(\y3) * z2
+        smlal2          v5.4S, \y2\().8H, z7               // v5  += hi(\y2) * z7
+        add             v19.4S, v19.4S, v7.4S              // v19 += hi(\y1)*z4
+        sub             v20.4S, v20.4S, v7.4S              // v20 -= hi(\y1)*z4
+        sub             v21.4S, v21.4S, v7.4S              // v21 -= hi(\y1)*z4
+        add             v22.4S, v22.4S, v7.4S              // v22 += hi(\y1)*z4
+        smlal2          v6.4S, \y2\().8H, z3               // v6  += hi(\y2) * z3
+        smull2          v7.4S, \y3\().8H, z6               // v7 reused: v7 = hi(\y3) * z6
+        smlal2          v17.4S, \y4\().8H, z7              // v17 += hi(\y4) * z7
+        smlsl2          v18.4S, \y4\().8H, z5              // v18 -= hi(\y4) * z5
+        smlal2          v5.4S, \y4\().8H, z3               // v5  += hi(\y4) * z3
+        smlsl2          v6.4S, \y4\().8H, z1               // v6  -= hi(\y4) * z1
+        add             v19.4S, v19.4S, v7.4S              // v19 += hi(\y3)*z6
+        sub             v20.4S, v20.4S, v16.4S             // v20 -= hi(\y3)*z2
+        add             v21.4S, v21.4S, v16.4S             // v21 += hi(\y3)*z2
+        sub             v22.4S, v22.4S, v7.4S              // v22 -= hi(\y3)*z6

 \pass:  add             \y3\().4S, v19.4S, v17.4S
-        add             \y4\().4s, v20.4s, v18.4s
-        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
-        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
-        add             v7.4s, v21.4s, v5.4s
-        add             v16.4s, v22.4s, v6.4s
-        shrn            \y3\().4h, v7.4s, #ROW_SHIFT
-        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
-        sub             v22.4s, v22.4s, v6.4s
-        sub             v19.4s, v19.4s, v17.4s
-        sub             v21.4s, v21.4s, v5.4s
-        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
-        sub             v20.4s, v20.4s, v18.4s
-        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
-        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
-        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT
+        add             \y4\().4S, v20.4S, v18.4S          // \y4 = v20 + v18 (used as a temporary)
+        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT   // low half of \y1 = (v19 + v17) >> ROW_SHIFT, narrowed to 16 bit
+        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT   // low half of \y2 = (v20 + v18) >> ROW_SHIFT
+        add             v7.4S, v21.4S, v5.4S               // v7  = v21 + v5
+        add             v16.4S, v22.4S, v6.4S              // v16 = v22 + v6
+        shrn            \y3\().4H, v7.4S, #ROW_SHIFT       // low half of \y3 = (v21 + v5) >> ROW_SHIFT
+        shrn            \y4\().4H, v16.4S, #ROW_SHIFT      // low half of \y4 = (v22 + v6) >> ROW_SHIFT
+        sub             v22.4S, v22.4S, v6.4S              // v22 -= v6
+        sub             v19.4S, v19.4S, v17.4S             // v19 -= v17
+        sub             v21.4S, v21.4S, v5.4S              // v21 -= v5
+        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT      // high half of \y1 = (v22 - v6) >> ROW_SHIFT
+        sub             v20.4S, v20.4S, v18.4S             // v20 -= v18
+        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT      // high half of \y2 = (v21 - v5) >> ROW_SHIFT
+        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT      // high half of \y3 = (v20 - v18) >> ROW_SHIFT
+        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT      // high half of \y4 = (v19 - v17) >> ROW_SHIFT

-        trn1            v16.8h, \y1\().8h, \y2\().8h
-        trn2            v17.8h, \y1\().8h, \y2\().8h
-        trn1            v18.8h, \y3\().8h, \y4\().8h
-        trn2            v19.8h, \y3\().8h, \y4\().8h
-        trn1            \y1\().4s, v16.4s, v18.4s
-        trn1            \y2\().4s, v17.4s, v19.4s
-        trn2            \y3\().4s, v16.4s, v18.4s
-        trn2            \y4\().4s, v17.4s, v19.4s
+        trn1            v16.8H, \y1\().8H, \y2\().8H       // even 16-bit lanes of \y1/\y2 interleaved
+        trn2            v17.8H, \y1\().8H, \y2\().8H       // odd 16-bit lanes of \y1/\y2 interleaved
+        trn1            v18.8H, \y3\().8H, \y4\().8H       // even 16-bit lanes of \y3/\y4 interleaved
+        trn2            v19.8H, \y3\().8H, \y4\().8H       // odd 16-bit lanes of \y3/\y4 interleaved
+        trn1            \y1\().4S, v16.4S, v18.4S          // 32-bit trn step; NOTE(review): the 16-bit + 32-bit trn pairs look like a 4x4 transpose per half - confirm
+        trn1            \y2\().4S, v17.4S, v19.4S          // even 32-bit lanes of v17/v19
+        trn2            \y3\().4S, v16.4S, v18.4S          // odd 32-bit lanes of v16/v18
+        trn2            \y4\().4S, v17.4S, v19.4S          // odd 32-bit lanes of v17/v19
 .endm

 .macro declare_idct_col4_neon i, l
 function idct_col4_neon\i
-        dup             v23.4h, z4c
+        dup             v23.4H, z4c
 .if \i == 1
-        add             v23.4h, v23.4h, v24.4h
+        add             v23.4H, v23.4H, v24.4H
 .else
-        mov             v5.d[0], v24.d[1]
-        add             v23.4h, v23.4h, v5.4h
+        mov             v5.D[0], v24.D[1]
+        add             v23.4H, v23.4H, v5.4H
 .endif
-        smull           v23.4s, v23.4h, z4
+        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

-        mov             x4, v28.d[\i - 1]
-        mov             x5, v29.d[\i - 1]
+        mov             x4, v28.D[\i - 1]
+        mov             x5, v29.D[\i - 1]
        cmp             x4, #0
        b.eq            1f

-        smull\i         v7.4s,  v28\l,  z4
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v7.4s
-        sub             v21.4s, v21.4s, v7.4s
-        add             v22.4s, v22.4s, v7.4s
+        smull\i         v7.4S,  v28\l,  z4
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S

-1:      mov             x4, v30.d[\i - 1]
+1:      mov             x4, v30.D[\i - 1]
        cmp             x5, #0
        b.eq            2f

-        smlal\i         v17.4s, v29\l, z5
-        smlsl\i         v18.4s, v29\l, z1
-        smlal\i         v5.4s,  v29\l, z7
-        smlal\i         v6.4s,  v29\l, z3
+        smlal\i         v17.4S, v29\l, z5
+        smlsl\i         v18.4S, v29\l, z1
+        smlal\i         v5.4S,  v29\l, z7
+        smlal\i         v6.4S,  v29\l, z3

-2:      mov             x5, v31.d[\i - 1]
+2:      mov             x5, v31.D[\i - 1]
        cmp             x4, #0
        b.eq            3f

-        smull\i         v7.4s,  v30\l, z6
-        smull\i         v16.4s, v30\l, z2
-        add             v19.4s, v19.4s, v7.4s
-        sub             v22.4s, v22.4s, v7.4s
-        sub             v20.4s, v20.4s, v16.4s
-        add             v21.4s, v21.4s, v16.4s
+        smull\i         v7.4S,  v30\l, z6
+        smull\i         v16.4S, v30\l, z2
+        add             v19.4S, v19.4S, v7.4S
+        sub             v22.4S, v22.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S

 3:      cmp             x5, #0
        b.eq            4f

-        smlal\i         v17.4s, v31\l, z7
-        smlsl\i         v18.4s, v31\l, z5
-        smlal\i         v5.4s,  v31\l, z3
-        smlsl\i         v6.4s,  v31\l, z1
+        smlal\i         v17.4S, v31\l, z7
+        smlsl\i         v18.4S, v31\l, z5
+        smlal\i         v5.4S,  v31\l, z3
+        smlsl\i         v6.4S,  v31\l, z1

-4:      addhn           v7.4h, v19.4s, v17.4s
-        addhn2          v7.8h, v20.4s, v18.4s
-        subhn           v18.4h, v20.4s, v18.4s
-        subhn2          v18.8h, v19.4s, v17.4s
+4:      addhn           v7.4H, v19.4S, v17.4S
+        addhn2          v7.8H, v20.4S, v18.4S
+        subhn           v18.4H, v20.4S, v18.4S
+        subhn2          v18.8H, v19.4S, v17.4S

-        addhn           v16.4h, v21.4s, v5.4s
-        addhn2          v16.8h, v22.4s, v6.4s
-        subhn           v17.4h, v22.4s, v6.4s
-        subhn2          v17.8h, v21.4s, v5.4s
+        addhn           v16.4H, v21.4S, v5.4S
+        addhn2          v16.8H, v22.4S, v6.4S
+        subhn           v17.4H, v22.4S, v6.4S
+        subhn2          v17.8H, v21.4S, v5.4S

        ret
 endfunc
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
-        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
-        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
-        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16
+        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
-        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
-        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
-        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16
+        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16

-        zip1            v16.4s, v1.4s, v2.4s
-        zip2            v17.4s, v1.4s, v2.4s
+        zip1            v16.4S, v1.4S, v2.4S
+        zip2            v17.4S, v1.4S, v2.4S

-        st1             {v16.d}[0], [x0], x1
-        st1             {v16.d}[1], [x0], x1
+        st1             {v16.D}[0], [x0], x1
+        st1             {v16.D}[1], [x0], x1

-        zip1            v18.4s, v3.4s, v4.4s
-        zip2            v19.4s, v3.4s, v4.4s
+        zip1            v18.4S, v3.4S, v4.4S
+        zip2            v19.4S, v3.4S, v4.4S

-        st1             {v17.d}[0], [x0], x1
-        st1             {v17.d}[1], [x0], x1
-        st1             {v18.d}[0], [x0], x1
-        st1             {v18.d}[1], [x0], x1
-        st1             {v19.d}[0], [x0], x1
-        st1             {v19.d}[1], [x0], x1
+        st1             {v17.D}[0], [x0], x1
+        st1             {v17.D}[1], [x0], x1
+        st1             {v18.D}[0], [x0], x1
+        st1             {v18.D}[1], [x0], x1
+        st1             {v19.D}[0], [x0], x1
+        st1             {v19.D}[1], [x0], x1

        idct_end
 endfunc
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sshr            v1.8h, v7.8h, #COL_SHIFT-16
-        sshr            v2.8h, v16.8h, #COL_SHIFT-16
-        sshr            v3.8h, v17.8h, #COL_SHIFT-16
-        sshr            v4.8h, v18.8h, #COL_SHIFT-16
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8h, v7.8h, #COL_SHIFT-16
-        sshr            v16.8h, v16.8h, #COL_SHIFT-16
-        sshr            v17.8h, v17.8h, #COL_SHIFT-16
-        sshr            v18.8h, v18.8h, #COL_SHIFT-16
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        mov             x9,  x0
-        ld1             {v19.d}[0], [x0], x1
-        zip1            v23.2d, v1.2d, v7.2d
-        zip2            v24.2d, v1.2d, v7.2d
-        ld1             {v19.d}[1], [x0], x1
-        zip1            v25.2d, v2.2d, v16.2d
-        zip2            v26.2d, v2.2d, v16.2d
-        ld1             {v20.d}[0], [x0], x1
-        zip1            v27.2d, v3.2d, v17.2d
-        zip2            v28.2d, v3.2d, v17.2d
-        ld1             {v20.d}[1], [x0], x1
-        zip1            v29.2d, v4.2d, v18.2d
-        zip2            v30.2d, v4.2d, v18.2d
-        ld1             {v21.d}[0], [x0], x1
-        uaddw           v23.8h, v23.8h, v19.8b
-        uaddw2          v24.8h, v24.8h, v19.16b
-        ld1             {v21.d}[1], [x0], x1
-        sqxtun          v23.8b, v23.8h
-        sqxtun2         v23.16b, v24.8h
-        ld1             {v22.d}[0], [x0], x1
-        uaddw           v24.8h, v25.8h, v20.8b
-        uaddw2          v25.8h, v26.8h, v20.16b
-        ld1             {v22.d}[1], [x0], x1
-        sqxtun          v24.8b, v24.8h
-        sqxtun2         v24.16b, v25.8h
-        st1             {v23.d}[0], [x9], x1
-        uaddw           v25.8h, v27.8h, v21.8b
-        uaddw2          v26.8h, v28.8h, v21.16b
-        st1             {v23.d}[1], [x9], x1
-        sqxtun          v25.8b, v25.8h
-        sqxtun2         v25.16b, v26.8h
-        st1             {v24.d}[0], [x9], x1
-        uaddw           v26.8h, v29.8h, v22.8b
-        uaddw2          v27.8h, v30.8h, v22.16b
-        st1             {v24.d}[1], [x9], x1
-        sqxtun          v26.8b, v26.8h
-        sqxtun2         v26.16b, v27.8h
-        st1             {v25.d}[0], [x9], x1
-        st1             {v25.d}[1], [x9], x1
-        st1             {v26.d}[0], [x9], x1
-        st1             {v26.d}[1], [x9], x1
+        ld1             {v19.D}[0], [x0], x1
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        ld1             {v19.D}[1], [x0], x1
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        ld1             {v20.D}[0], [x0], x1
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        ld1             {v20.D}[1], [x0], x1
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        ld1             {v21.D}[0], [x0], x1
+        uaddw           v23.8H, v23.8H, v19.8B
+        uaddw2          v24.8H, v24.8H, v19.16B
+        ld1             {v21.D}[1], [x0], x1
+        sqxtun          v23.8B, v23.8H
+        sqxtun2         v23.16B, v24.8H
+        ld1             {v22.D}[0], [x0], x1
+        uaddw           v24.8H, v25.8H, v20.8B
+        uaddw2          v25.8H, v26.8H, v20.16B
+        ld1             {v22.D}[1], [x0], x1
+        sqxtun          v24.8B, v24.8H
+        sqxtun2         v24.16B, v25.8H
+        st1             {v23.D}[0], [x9], x1
+        uaddw           v25.8H, v27.8H, v21.8B
+        uaddw2          v26.8H, v28.8H, v21.16B
+        st1             {v23.D}[1], [x9], x1
+        sqxtun          v25.8B, v25.8H
+        sqxtun2         v25.16B, v26.8H
+        st1             {v24.D}[0], [x9], x1
+        uaddw           v26.8H, v29.8H, v22.8B
+        uaddw2          v27.8H, v30.8H, v22.16B
+        st1             {v24.D}[1], [x9], x1
+        sqxtun          v26.8B, v26.8H
+        sqxtun2         v26.16B, v27.8H
+        st1             {v25.D}[0], [x9], x1
+        st1             {v25.D}[1], [x9], x1
+        st1             {v26.D}[0], [x9], x1
+        st1             {v26.D}[1], [x9], x1

        idct_end
 endfunc
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
        sub             x2, x2, #128
        bl              idct_col4_neon1

-        sshr            v1.8h, v7.8h, #COL_SHIFT-16
-        sshr            v2.8h, v16.8h, #COL_SHIFT-16
-        sshr            v3.8h, v17.8h, #COL_SHIFT-16
-        sshr            v4.8h, v18.8h, #COL_SHIFT-16
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8h, v7.8h, #COL_SHIFT-16
-        sshr            v16.8h, v16.8h, #COL_SHIFT-16
-        sshr            v17.8h, v17.8h, #COL_SHIFT-16
-        sshr            v18.8h, v18.8h, #COL_SHIFT-16
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16

-        zip1            v23.2d, v1.2d, v7.2d
-        zip2            v24.2d, v1.2d, v7.2d
-        st1             {v23.2d,v24.2d}, [x2], #32
-        zip1            v25.2d, v2.2d, v16.2d
-        zip2            v26.2d, v2.2d, v16.2d
-        st1             {v25.2d,v26.2d}, [x2], #32
-        zip1            v27.2d, v3.2d, v17.2d
-        zip2            v28.2d, v3.2d, v17.2d
-        st1             {v27.2d,v28.2d}, [x2], #32
-        zip1            v29.2d, v4.2d, v18.2d
-        zip2            v30.2d, v4.2d, v18.2d
-        st1             {v29.2d,v30.2d}, [x2], #32
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        st1             {v23.2D,v24.2D}, [x2], #32
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        st1             {v25.2D,v26.2D}, [x2], #32
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        st1             {v27.2D,v28.2D}, [x2], #32
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        st1             {v29.2D,v30.2D}, [x2], #32

        idct_end
 endfunc
@@ -330,32 +330,32 @@ endfunc
        //   v17: hev

        // convert to signed value:
-        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
-        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

-        movi            v20.8h, #3
-        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
-        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
-        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
-        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
-        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
-        mul             v19.8h, v19.8h, v20.8h
+        movi           v20.8h, #3
+        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul            v19.8h, v19.8h, v20.8h

-        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
-        movi            v22.16b, #4
-        movi            v23.16b, #3
+        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi           v22.16b, #4
+        movi           v23.16b, #3
    .if \inner
-        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
-        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
-        saddw2          v19.8h,  v19.8h, v20.16b
-        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
-        sqxtn2          v18.16b, v19.8h
+        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2         v19.8h,  v19.8h, v20.16b
+        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
-        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
-        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
-        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit
+        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
        //   P0 = s2u(PS0 + c2);

    .if \simple
-        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
-        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
-        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
-        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
-        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
-        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
-        and             v20.16b, v18.16b, v17.16b           // w & hev
-        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        and            v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
@@ -424,35 +424,35 @@ endfunc
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
-        movi            v17.8h,  #63
-        sshll           v22.8h,  v18.8b, #3
-        sshll2          v23.8h,  v18.16b, #3
-        saddw           v22.8h,  v22.8h, v18.8b
-        saddw2          v23.8h,  v23.8h, v18.16b
-        add             v16.8h,  v17.8h, v22.8h
-        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
-        add             v19.8h,  v16.8h, v22.8h
-        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
-        add             v22.8h,  v19.8h, v22.8h
-        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
-        sqshrn          v16.8b,  v16.8h,  #7
-        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
-        sqshrn          v19.8b,  v19.8h, #7
-        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
-        sqshrn          v22.8b,  v22.8h, #7
-        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
-        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
-        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
-        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
-        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
-        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
-        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
-        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
-        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
-        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
-        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
-        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+        movi           v17.8h,  #63
+        sshll          v22.8h,  v18.8b, #3
+        sshll2         v23.8h,  v18.16b, #3
+        saddw          v22.8h,  v22.8h, v18.8b
+        saddw2         v23.8h,  v23.8h, v18.16b
+        add            v16.8h,  v17.8h, v22.8h
+        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add            v19.8h,  v16.8h, v22.8h
+        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add            v22.8h,  v19.8h, v22.8h
+        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn         v16.8b,  v16.8h,  #7
+        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn         v19.8b,  v19.8h, #7
+        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn         v22.8b,  v22.8h, #7
+        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
 .endm

@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
-        ld1             {v0.d}[0],     [x0], x2  // P3
-        ld1             {v0.d}[1],     [x1], x2  // P3
-        ld1             {v1.d}[0],     [x0], x2  // P2
-        ld1             {v1.d}[1],     [x1], x2  // P2
-        ld1             {v2.d}[0],     [x0], x2  // P1
-        ld1             {v2.d}[1],     [x1], x2  // P1
-        ld1             {v3.d}[0],     [x0], x2  // P0
-        ld1             {v3.d}[1],     [x1], x2  // P0
-        ld1             {v4.d}[0],     [x0], x2  // Q0
-        ld1             {v4.d}[1],     [x1], x2  // Q0
-        ld1             {v5.d}[0],     [x0], x2  // Q1
-        ld1             {v5.d}[1],     [x1], x2  // Q1
-        ld1             {v6.d}[0],     [x0], x2  // Q2
-        ld1             {v6.d}[1],     [x1], x2  // Q2
-        ld1             {v7.d}[0],     [x0]      // Q3
-        ld1             {v7.d}[1],     [x1]      // Q3
+        ld1          {v0.d}[0],     [x0], x2  // P3
+        ld1          {v0.d}[1],     [x1], x2  // P3
+        ld1          {v1.d}[0],     [x0], x2  // P2
+        ld1          {v1.d}[1],     [x1], x2  // P2
+        ld1          {v2.d}[0],     [x0], x2  // P1
+        ld1          {v2.d}[1],     [x1], x2  // P1
+        ld1          {v3.d}[0],     [x0], x2  // P0
+        ld1          {v3.d}[1],     [x1], x2  // P0
+        ld1          {v4.d}[0],     [x0], x2  // Q0
+        ld1          {v4.d}[1],     [x1], x2  // Q0
+        ld1          {v5.d}[0],     [x0], x2  // Q1
+        ld1          {v5.d}[1],     [x1], x2  // Q1
+        ld1          {v6.d}[0],     [x0], x2  // Q2
+        ld1          {v6.d}[1],     [x1], x2  // Q2
+        ld1          {v7.d}[0],     [x0]      // Q3
+        ld1          {v7.d}[1],     [x1]      // Q3

-        dup             v22.16b, w3                 // flim_E
-        dup             v23.16b, w4                 // flim_I
+        dup          v22.16b, w3                 // flim_E
+        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
-        sub             x0,  x0,  x2,  lsl #2
-        sub             x1,  x1,  x2,  lsl #2
-        sub             x0,  x0,  x2,  lsl #1
-        sub             x1,  x1,  x2,  lsl #1
+        sub          x0,  x0,  x2,  lsl #2
+        sub          x1,  x1,  x2,  lsl #2
+        sub          x0,  x0,  x2,  lsl #1
+        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

-        st1             {v1.d}[0],     [x0], x2  // P2
-        st1             {v1.d}[1],     [x1], x2  // P2
-        st1             {v2.d}[0],     [x0], x2  // P1
-        st1             {v2.d}[1],     [x1], x2  // P1
-        st1             {v3.d}[0],     [x0], x2  // P0
-        st1             {v3.d}[1],     [x1], x2  // P0
-        st1             {v4.d}[0],     [x0], x2  // Q0
-        st1             {v4.d}[1],     [x1], x2  // Q0
-        st1             {v5.d}[0],     [x0], x2  // Q1
-        st1             {v5.d}[1],     [x1], x2  // Q1
-        st1             {v6.d}[0],     [x0]      // Q2
-        st1             {v6.d}[1],     [x1]      // Q2
+        st1          {v1.d}[0],     [x0], x2  // P2
+        st1          {v1.d}[1],     [x1], x2  // P2
+        st1          {v2.d}[0],     [x0], x2  // P1
+        st1          {v2.d}[1],     [x1], x2  // P1
+        st1          {v3.d}[0],     [x0], x2  // P0
+        st1          {v3.d}[1],     [x1], x2  // P0
+        st1          {v4.d}[0],     [x0], x2  // Q0
+        st1          {v4.d}[1],     [x1], x2  // Q0
+        st1          {v5.d}[0],     [x0], x2  // Q1
+        st1          {v5.d}[1],     [x1], x2  // Q1
+        st1          {v6.d}[0],     [x0]      // Q2
+        st1          {v6.d}[1],     [x1]      // Q2

        ret
 endfunc
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x1,  x1,  #4

        // Load pixels:
-        ld1             {v0.d}[0],     [x0], x2 // load u
-        ld1             {v0.d}[1],     [x1], x2 // load v
-        ld1             {v1.d}[0],     [x0], x2
-        ld1             {v1.d}[1],     [x1], x2
-        ld1             {v2.d}[0],     [x0], x2
-        ld1             {v2.d}[1],     [x1], x2
-        ld1             {v3.d}[0],     [x0], x2
-        ld1             {v3.d}[1],     [x1], x2
-        ld1             {v4.d}[0],     [x0], x2
-        ld1             {v4.d}[1],     [x1], x2
-        ld1             {v5.d}[0],     [x0], x2
-        ld1             {v5.d}[1],     [x1], x2
-        ld1             {v6.d}[0],     [x0], x2
-        ld1             {v6.d}[1],     [x1], x2
-        ld1             {v7.d}[0],     [x0], x2
-        ld1             {v7.d}[1],     [x1], x2
+        ld1          {v0.d}[0],     [x0], x2 // load u
+        ld1          {v0.d}[1],     [x1], x2 // load v
+        ld1          {v1.d}[0],     [x0], x2
+        ld1          {v1.d}[1],     [x1], x2
+        ld1          {v2.d}[0],     [x0], x2
+        ld1          {v2.d}[1],     [x1], x2
+        ld1          {v3.d}[0],     [x0], x2
+        ld1          {v3.d}[1],     [x1], x2
+        ld1          {v4.d}[0],     [x0], x2
+        ld1          {v4.d}[1],     [x1], x2
+        ld1          {v5.d}[0],     [x0], x2
+        ld1          {v5.d}[1],     [x1], x2
+        ld1          {v6.d}[0],     [x0], x2
+        ld1          {v6.d}[1],     [x1], x2
+        ld1          {v7.d}[0],     [x0], x2
+        ld1          {v7.d}[1],     [x1], x2

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
-        st1             {v0.d}[0],     [x0], x2 // load u
-        st1             {v0.d}[1],     [x1], x2 // load v
-        st1             {v1.d}[0],     [x0], x2
-        st1             {v1.d}[1],     [x1], x2
-        st1             {v2.d}[0],     [x0], x2
-        st1             {v2.d}[1],     [x1], x2
-        st1             {v3.d}[0],     [x0], x2
-        st1             {v3.d}[1],     [x1], x2
-        st1             {v4.d}[0],     [x0], x2
-        st1             {v4.d}[1],     [x1], x2
-        st1             {v5.d}[0],     [x0], x2
-        st1             {v5.d}[1],     [x1], x2
-        st1             {v6.d}[0],     [x0], x2
-        st1             {v6.d}[1],     [x1], x2
-        st1             {v7.d}[0],     [x0]
-        st1             {v7.d}[1],     [x1]
+        st1          {v0.d}[0],     [x0], x2 // store u
+        st1          {v0.d}[1],     [x1], x2 // store v
+        st1          {v1.d}[0],     [x0], x2
+        st1          {v1.d}[1],     [x1], x2
+        st1          {v2.d}[0],     [x0], x2
+        st1          {v2.d}[1],     [x1], x2
+        st1          {v3.d}[0],     [x0], x2
+        st1          {v3.d}[1],     [x1], x2
+        st1          {v4.d}[0],     [x0], x2
+        st1          {v4.d}[1],     [x1], x2
+        st1          {v5.d}[0],     [x0], x2
+        st1          {v5.d}[1],     [x1], x2
+        st1          {v6.d}[0],     [x0], x2
+        st1          {v6.d}[1],     [x1], x2
+        st1          {v7.d}[0],     [x0]
+        st1          {v7.d}[1],     [x1]

        ret

@@ -230,9 +230,6 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        // reduced dst stride
 .if \size >= 16
        sub             x1,  x1,  x5
-.elseif \size == 4
-        add             x12, x2,  #8
-        add             x13, x7,  #8
 .endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
@@ -251,14 +248,9 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
-.elseif \size == 8
+.else
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
-.else // \size == 4
-        ld1             {v4.8b},  [x2]
-        ld1             {v16.8b}, [x7]
-        ld1             {v5.s}[0],  [x12], x3
-        ld1             {v17.s}[0], [x13], x3
 .endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
@@ -104,26 +104,26 @@ static int aasc_decode_frame(AVCodecContext *avctx,
        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
        break;
    case MKTAG('A', 'A', 'S', 'C'):
-        switch (compr) {
-        case 0:
-            stride = (avctx->width * psize + psize) & ~psize;
-            if (buf_size < stride * avctx->height)
-                return AVERROR_INVALIDDATA;
-            for (i = avctx->height - 1; i >= 0; i--) {
-                memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
-                buf += stride;
-                buf_size -= stride;
-            }
-            break;
-        case 1:
-            bytestream2_init(&s->gb, buf, buf_size);
-            ff_msrle_decode(avctx, s->frame, 8, &s->gb);
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+    switch (compr) {
+    case 0:
+        stride = (avctx->width * psize + psize) & ~psize;
+        if (buf_size < stride * avctx->height)
            return AVERROR_INVALIDDATA;
+        for (i = avctx->height - 1; i >= 0; i--) {
+            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
+            buf += stride;
+            buf_size -= stride;
        }
        break;
+    case 1:
+        bytestream2_init(&s->gb, buf, buf_size);
+        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+        return AVERROR_INVALIDDATA;
+    }
+        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
        return -1;
@@ -27,6 +27,7 @@
 #ifndef AVCODEC_AC3_H
 #define AVCODEC_AC3_H

+#define AC3_MAX_CODED_FRAME_SIZE 3840 /* in bytes */
 #define EAC3_MAX_CHANNELS 16          /**< maximum number of channels in EAC3 */
 #define AC3_MAX_CHANNELS 7            /**< maximum number of channels, including coupling channel */
 #define CPL_CH 0                      /**< coupling channel index */
@@ -74,7 +75,6 @@
 #define AC3_DYNAMIC_RANGE1      0

 typedef int                     INTFLOAT;
-typedef unsigned int            UINTFLOAT;
 typedef int16_t                 SHORTFLOAT;

 #else /* USE_FIXED */
@@ -94,7 +94,6 @@ typedef int16_t                 SHORTFLOAT;
 #define AC3_DYNAMIC_RANGE1      1.0f

 typedef float                   INTFLOAT;
-typedef float                   UINTFLOAT;
 typedef float                   SHORTFLOAT;

 #endif /* USE_FIXED */
@@ -179,9 +179,7 @@ int av_ac3_parse_header(const uint8_t *buf, size_t size,
    AC3HeaderInfo hdr;
    int err;

-    err = init_get_bits8(&gb, buf, size);
-    if (err < 0)
-        return AVERROR_INVALIDDATA;
+    init_get_bits8(&gb, buf, size);
    err = ff_ac3_parse_header(&gb, &hdr);
    if (err < 0)
        return AVERROR_INVALIDDATA;
@@ -1729,7 +1729,7 @@ static void ac3_output_frame(AC3EncodeContext *s, unsigned char *frame)
 {
    int blk;

-    init_put_bits(&s->pb, frame, s->frame_size);
+    init_put_bits(&s->pb, frame, AC3_MAX_CODED_FRAME_SIZE);

    s->output_frame_header(s);

@@ -100,7 +100,7 @@ static const int8_t mtf_index_table[16] = {
 typedef struct ADPCMDecodeContext {
    ADPCMChannelStatus status[14];
    int vqa_version;                /**< VQA version. Used for ADPCM_IMA_WS */
-    int has_status;                 /**< Status flag. Reset to 0 after a flush. */
+    int has_status;
 } ADPCMDecodeContext;

 static av_cold int adpcm_decode_init(AVCodecContext * avctx)
@@ -735,8 +735,6 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,

    if(ch <= 0)
        return 0;
-    if (buf_size > INT_MAX / 2)
-        return 0;

    switch (avctx->codec->id) {
    /* constant, only check buf_size */
@@ -1813,6 +1811,11 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
        }
        break;
    case AV_CODEC_ID_ADPCM_AICA:
+        if (!c->has_status) {
+            for (channel = 0; channel < avctx->channels; channel++)
+                c->status[channel].step = 0;
+            c->has_status = 1;
+        }
        for (channel = 0; channel < avctx->channels; channel++) {
            samples = samples_p[channel];
            for (n = nb_samples >> 1; n > 0; n--) {
@@ -2074,6 +2077,13 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
        }
        break;
    case AV_CODEC_ID_ADPCM_ZORK:
+        if (!c->has_status) {
+            for (channel = 0; channel < avctx->channels; channel++) {
+                c->status[channel].predictor  = 0;
+                c->status[channel].step_index = 0;
+            }
+            c->has_status = 1;
+        }
        for (n = 0; n < nb_samples * avctx->channels; n++) {
            int v = bytestream2_get_byteu(&gb);
            *samples++ = adpcm_zork_expand_nibble(&c->status[n % avctx->channels], v);
@@ -2111,37 +2121,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
 static void adpcm_flush(AVCodecContext *avctx)
 {
    ADPCMDecodeContext *c = avctx->priv_data;
-
-    switch(avctx->codec_id) {
-    case AV_CODEC_ID_ADPCM_AICA:
-        for (int channel = 0; channel < avctx->channels; channel++)
-            c->status[channel].step = 0;
-        break;
-
-    case AV_CODEC_ID_ADPCM_ARGO:
-        for (int channel = 0; channel < avctx->channels; channel++) {
-            c->status[channel].sample1 = 0;
-            c->status[channel].sample2 = 0;
-        }
-        break;
-
-    case AV_CODEC_ID_ADPCM_IMA_ALP:
-    case AV_CODEC_ID_ADPCM_IMA_CUNNING:
-    case AV_CODEC_ID_ADPCM_IMA_SSI:
-    case AV_CODEC_ID_ADPCM_ZORK:
-        for (int channel = 0; channel < avctx->channels; channel++) {
-            c->status[channel].predictor  = 0;
-            c->status[channel].step_index = 0;
-        }
-        break;
-
-    default:
-        /* Other codecs may want to handle this during decoding. */
-        c->has_status = 0;
-        return;
-    }
-
-    c->has_status = 1;
+    c->has_status = 0;
 }


@@ -959,14 +959,14 @@ static const AVOption options[] = {
    { NULL }
 };

+static const AVClass adpcm_encoder_class = {
+    .class_name = "ADPCM Encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 #define ADPCM_ENCODER(id_, name_, sample_fmts_, capabilities_, long_name_) \
-static const AVClass name_ ## _encoder_class = {                           \
-    .class_name = #name_,                                                  \
-    .item_name  = av_default_item_name,                                    \
-    .option     = options,                                                 \
-    .version    = LIBAVUTIL_VERSION_INT,                                   \
-};                                                                         \
-                                                                           \
 AVCodec ff_ ## name_ ## _encoder = {                                       \
    .name           = #name_,                                              \
    .long_name      = NULL_IF_CONFIG_SMALL(long_name_),                    \
@@ -979,7 +979,7 @@ AVCodec ff_ ## name_ ## _encoder = {                                       \
    .sample_fmts    = sample_fmts_,                                        \
    .capabilities   = capabilities_,                                       \
    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_INIT_THREADSAFE, \
-    .priv_class     = &name_ ## _encoder_class,                            \
+    .priv_class     = &adpcm_encoder_class,                                \
 }

 ADPCM_ENCODER(AV_CODEC_ID_ADPCM_ARGO,    adpcm_argo,    sample_fmts_p, 0,                             "ADPCM Argonaut Games");
@@ -472,7 +472,8 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
        }
    }

-    ctx->slice_data = av_calloc(ctx->slice_width, AIC_BAND_COEFFS * sizeof(*ctx->slice_data));
+    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
+                                * sizeof(*ctx->slice_data));
    if (!ctx->slice_data) {
        av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");

@@ -29,12 +29,12 @@ static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
    int i;

    for (i = 0; i < nb_samples; i++) {
-        uint32_t a, b;
+        int32_t a, b;

        a = buffer[0][i];
        b = buffer[1][i];

-        a -= (int)(b * decorr_left_weight) >> decorr_shift;
+        a -= (b * decorr_left_weight) >> decorr_shift;
        b += a;

        buffer[0][i] = b;
@@ -1017,7 +1017,7 @@ static int read_block(ALSDecContext *ctx, ALSBlockData *bd)

    *bd->shift_lsbs = 0;

-    if (get_bits_left(gb) < 7)
+    if (get_bits_left(gb) < 1)
        return AVERROR_INVALIDDATA;

    // read block type flag and read the samples accordingly
@@ -1529,12 +1529,8 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                    return AVERROR_INVALIDDATA;
                }

-                j = 0;
                for (i = 0; i < frame_length; ++i) {
-                    if (ctx->raw_samples[c][i] == 0) {
-                        ctx->raw_mantissa[c][i] = AV_RB32(larray + j);
-                        j += 4;
-                    }
+                    ctx->raw_mantissa[c][i] = AV_RB32(larray);
                }
            }
        }
@@ -1545,10 +1541,7 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                if (ctx->raw_samples[c][i] != 0) {
                    //The following logic is taken from Tabel 14.45 and 14.46 from the ISO spec
                    if (av_cmp_sf_ieee754(acf[c], FLOAT_1)) {
-                        int nbit = av_log2(FFABSU(ctx->raw_samples[c][i]));
-                        if (nbit > 23)
-                            return AVERROR_INVALIDDATA;
-                        nbits[i] = 23 - nbit;
+                        nbits[i] = 23 - av_log2(abs(ctx->raw_samples[c][i]));
                    } else {
                        nbits[i] = 23;
                    }
@@ -1622,7 +1615,7 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                tmp_32 = (sign << 31) | ((e + EXP_BIAS) << 23) | (mantissa);
                ctx->raw_samples[c][i] = tmp_32;
            } else {
-                ctx->raw_samples[c][i] = raw_mantissa[c][i];
+                ctx->raw_samples[c][i] = raw_mantissa[c][i] & 0x007fffffUL;
            }
        }
        align_get_bits(gb);
@@ -1639,7 +1632,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
    AVCodecContext *avctx    = ctx->avctx;
    GetBitContext *gb = &ctx->gb;
    unsigned int div_blocks[32];                ///< block sizes.
-    int c;
+    unsigned int c;
    unsigned int js_blocks[2];
    uint32_t bs_info = 0;
    int ret;
@@ -1776,9 +1769,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
    }

    if (sconf->floating) {
-        ret = read_diff_float_data(ctx, ra_frame);
-        if (ret < 0)
-            return ret;
+        read_diff_float_data(ctx, ra_frame);
    }

    if (get_bits_left(gb) < 0) {
@@ -1819,17 +1810,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
    else
        ctx->cur_frame_length = sconf->frame_length;

-    ctx->highest_decoded_channel = -1;
+    ctx->highest_decoded_channel = 0;
    // decode the frame data
    if ((invalid_frame = read_frame_data(ctx, ra_frame)) < 0)
        av_log(ctx->avctx, AV_LOG_WARNING,
               "Reading frame data failed. Skipping RA unit.\n");

-    if (ctx->highest_decoded_channel == -1) {
-        av_log(ctx->avctx, AV_LOG_WARNING,
-               "No channel data decoded.\n");
+    if (ctx->highest_decoded_channel == 0)
        return AVERROR_INVALIDDATA;
-    }

    ctx->frame_id++;

@@ -102,7 +102,7 @@ typedef struct APEFilter {
    int16_t *historybuffer; ///< filter memory
    int16_t *delay;         ///< filtered values

-    uint32_t avg;
+    int avg;
 } APEFilter;

 typedef struct APERice {
@@ -879,7 +879,7 @@ static av_always_inline int filter_fast_3320(APEPredictor *p,
    }

    predictionA = p->buf[delayA] * 2U - p->buf[delayA - 1];
-    p->lastA[filter] = decoded + (unsigned)((int32_t)(predictionA  * p->coeffsA[filter][0]) >> 9);
+    p->lastA[filter] = decoded + ((int32_t)(predictionA  * p->coeffsA[filter][0]) >> 9);

    if ((decoded ^ predictionA) > 0)
        p->coeffsA[filter][0]++;
@@ -909,8 +909,8 @@ static av_always_inline int filter_3800(APEPredictor *p,
        return predictionA;
    }
    d2 =  p->buf[delayA];
-    d1 = (p->buf[delayA] - (unsigned)p->buf[delayA - 1]) * 2;
-    d0 =  p->buf[delayA] + ((p->buf[delayA - 2] - (unsigned)p->buf[delayA - 1]) * 8);
+    d1 = (p->buf[delayA] - p->buf[delayA - 1]) * 2U;
+    d0 =  p->buf[delayA] + ((p->buf[delayA - 2] - p->buf[delayA - 1]) * 8U);
    d3 =  p->buf[delayB] * 2U - p->buf[delayB - 1];
    d4 =  p->buf[delayB];

@@ -930,7 +930,7 @@ static av_always_inline int filter_3800(APEPredictor *p,
    p->coeffsB[filter][0] += (((d3 >> 29) & 4) - 2) * sign;
    p->coeffsB[filter][1] -= (((d4 >> 30) & 2) - 1) * sign;

-    p->filterB[filter] = p->lastA[filter] + (unsigned)(predictionB >> shift);
+    p->filterB[filter] = p->lastA[filter] + (predictionB >> shift);
    p->filterA[filter] = p->filterB[filter] + (unsigned)((int)(p->filterA[filter] * 31U) >> 5);

    return p->filterA[filter];
@@ -955,7 +955,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift, int len
            dotprod += delay[j] * (unsigned)coeffs[j];
            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
        }
-        buffer[i] -= (unsigned)(dotprod >> shift);
+        buffer[i] -= dotprod >> shift;
        for (j = 0; j < order - 1; j++)
            delay[j] = delay[j + 1];
        delay[order - 1] = buffer[i];
@@ -979,7 +979,7 @@ static void long_filter_ehigh_3830(int32_t *buffer, int length)
        for (j = 7; j > 0; j--)
            delay[j] = delay[j - 1];
        delay[0] = buffer[i];
-        buffer[i] -= (unsigned)(dotprod >> 9);
+        buffer[i] -= dotprod >> 9;
    }
 }

@@ -1088,13 +1088,13 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
                                                  const int delayA)
 {
    int32_t predictionA, sign;
-    uint32_t d0, d1, d2, d3;
+    int32_t d0, d1, d2, d3;

    p->buf[delayA]     = p->lastA[filter];
    d0 = p->buf[delayA    ];
-    d1 = p->buf[delayA    ] - (unsigned)p->buf[delayA - 1];
-    d2 = p->buf[delayA - 1] - (unsigned)p->buf[delayA - 2];
-    d3 = p->buf[delayA - 2] - (unsigned)p->buf[delayA - 3];
+    d1 = p->buf[delayA    ] - p->buf[delayA - 1];
+    d2 = p->buf[delayA - 1] - p->buf[delayA - 2];
+    d3 = p->buf[delayA - 2] - p->buf[delayA - 3];

    predictionA = d0 * p->coeffsA[filter][0] +
                  d1 * p->coeffsA[filter][1] +
@@ -1105,10 +1105,10 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
    p->filterA[filter] = p->lastA[filter] + ((int)(p->filterA[filter] * 31U) >> 5);

    sign = APESIGN(decoded);
-    p->coeffsA[filter][0] += (((int32_t)d0 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][1] += (((int32_t)d1 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][2] += (((int32_t)d2 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][3] += (((int32_t)d3 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][0] += ((d0 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][1] += ((d1 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][2] += ((d2 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][3] += ((d3 < 0) * 2 - 1) * sign;

    return p->filterA[filter];
 }
@@ -1166,8 +1166,7 @@ static void predictor_decode_mono_3930(APEContext *ctx, int count)
 static av_always_inline int predictor_update_filter(APEPredictor64 *p,
                                                    const int decoded, const int filter,
                                                    const int delayA,  const int delayB,
-                                                    const int adaptA,  const int adaptB,
-                                                    int compression_level)
+                                                    const int adaptA,  const int adaptB)
 {
    int64_t predictionA, predictionB;
    int32_t sign;
@@ -1195,13 +1194,7 @@ static av_always_inline int predictor_update_filter(APEPredictor64 *p,
                  p->buf[delayB - 3] * p->coeffsB[filter][3] +
                  p->buf[delayB - 4] * p->coeffsB[filter][4];

-    if (compression_level < COMPRESSION_LEVEL_INSANE) {
-        predictionA = (int32_t)predictionA;
-        predictionB = (int32_t)predictionB;
-        p->lastA[filter] = (int32_t)(decoded + (unsigned)((int32_t)(predictionA + (predictionB >> 1)) >> 10));
-    } else {
-        p->lastA[filter] = decoded + ((int64_t)((uint64_t)predictionA + (predictionB >> 1)) >> 10);
-    }
+    p->lastA[filter] = decoded + ((int64_t)((uint64_t)predictionA + (predictionB >> 1)) >> 10);
    p->filterA[filter] = p->lastA[filter] + ((int64_t)(p->filterA[filter] * 31ULL) >> 5);

    sign = APESIGN(decoded);
@@ -1229,12 +1222,10 @@ static void predictor_decode_stereo_3950(APEContext *ctx, int count)
    while (count--) {
        /* Predictor Y */
        *decoded0 = predictor_update_filter(p, *decoded0, 0, YDELAYA, YDELAYB,
-                                            YADAPTCOEFFSA, YADAPTCOEFFSB,
-                                            ctx->compression_level);
+                                            YADAPTCOEFFSA, YADAPTCOEFFSB);
        decoded0++;
        *decoded1 = predictor_update_filter(p, *decoded1, 1, XDELAYA, XDELAYB,
-                                            XADAPTCOEFFSA, XADAPTCOEFFSB,
-                                            ctx->compression_level);
+                                            XADAPTCOEFFSA, XADAPTCOEFFSB);
        decoded1++;

        /* Combined */
@@ -1346,7 +1337,7 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
            absres = FFABSU(res);
            if (absres)
                *f->adaptcoeffs = APESIGN(res) *
-                                  (8 << ((absres > f->avg * 3LL) + (absres > (f->avg + f->avg / 3))));
+                                  (8 << ((absres > f->avg * 3) + (absres > f->avg * 4 / 3)));
                /* equivalent to the following code
                    if (absres <= f->avg * 4 / 3)
                        *f->adaptcoeffs = APESIGN(res) * 8;
@@ -1596,7 +1587,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
        for (ch = 0; ch < s->channels; ch++) {
            sample8 = (uint8_t *)frame->data[ch];
            for (i = 0; i < blockstodecode; i++)
-                *sample8++ = (s->decoded[ch][i] + 0x80U) & 0xff;
+                *sample8++ = (s->decoded[ch][i] + 0x80) & 0xff;
        }
        break;
    case 16:
@@ -1618,24 +1609,13 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
    s->samples -= blockstodecode;

    if (avctx->err_recognition & AV_EF_CRCCHECK &&
-        s->fileversion >= 3900) {
+        s->fileversion >= 3900 && s->bps < 24) {
        uint32_t crc = s->CRC_state;
        const AVCRC *crc_tab = av_crc_get_table(AV_CRC_32_IEEE_LE);
-        int stride = s->bps == 24 ? 4 : (s->bps>>3);
-        int offset = s->bps == 24;
-        int bytes  = s->bps >> 3;
-
        for (i = 0; i < blockstodecode; i++) {
            for (ch = 0; ch < s->channels; ch++) {
-#if HAVE_BIGENDIAN
-                uint8_t *smp_native = frame->data[ch] + i*stride;
-                uint8_t smp[4];
-                for(int j = 0; j<stride; j++)
-                    smp[j] = smp_native[stride-j-1];
-#else
-                uint8_t *smp = frame->data[ch] + i*stride;
-#endif
-                crc = av_crc(crc_tab, crc, smp+offset, bytes);
+                uint8_t *smp = frame->data[ch] + (i*(s->bps >> 3));
+                crc = av_crc(crc_tab, crc, smp, s->bps >> 3);
            }
        }

@@ -59,7 +59,7 @@ static int decode_pal8(AVCodecContext *avctx, uint32_t *pal)
        return AVERROR_INVALIDDATA;

    for (int i = 0; i < count; i++)
-        pal[start + i] = (0xFFU << 24) | bytestream2_get_be24u(gb);
+        pal[start + i] = (0xFF << 24U) | bytestream2_get_be24u(gb);

    return 0;
 }
@@ -608,9 +608,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
    uint32_t chunk;
    int ret;

-    if (avpkt->size < 4)
-        return AVERROR_INVALIDDATA;
-
    bytestream2_init(gb, avpkt->data, avpkt->size);

    if ((ret = ff_reget_buffer(avctx, frame, 0)) < 0)
@@ -688,11 +685,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
             return AVERROR_PATCHWELCOME;
    }

-    if (avctx->width % 2 || avctx->height % 2) {
-        avpriv_request_sample(s, "Odd dimensions\n");
-        return AVERROR_PATCHWELCOME;
-    }
-
    s->frame = av_frame_alloc();
    if (!s->frame)
        return AVERROR(ENOMEM);
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
        vmov.32         r0,  d3[0]
        bx              lr
 endfunc
+
@@ -229,7 +229,7 @@ A .endif
  .endif

        // Begin loop
-1:
+01:
  .if TOTAL_TAPS == 0
        // Things simplify a lot in this case
        // In fact this could be pipelined further if it's worth it...
@@ -241,7 +241,7 @@ A .endif
        str     ST0, [PST, #-4]!
        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST0, [PSAMP], #4 * MAX_CHANNELS
-        bne     1b
+        bne     01b
  .else
    .if \fir_taps & 1
      .set LOAD_REG, 1
@@ -333,7 +333,7 @@ T       orr     AC0, AC0, AC1
        str     ST3, [PST, #-4]!
        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST3, [PSAMP], #4 * MAX_CHANNELS
-        bne     1b
+        bne     01b
  .endif
        b       99f

@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
-        vld1.16         {d16, d17}, [r0, :64]!
-        vld1.16         {d20, d21}, [r2, :128]!
+        vld1.16         {d4, d5}, [r0, :64]!
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmull.s16       q0, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmull.s16       q1, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
+        vmull.s16       q0, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmull.s16       q1, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!

-        vmlal.s16       q0, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmlal.s16       q1, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmlal.s16       q0, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q1, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q0, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmlal.s16       q1, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
+        vmlal.s16       q0, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q1, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!

-        vmlal.s16       q0, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmlal.s16       q1, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmlal.s16       q0, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q1, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q0, d16, d20
-        vmlal.s16       q1, d17, d21
+        vmlal.s16       q0, d4, d8
+        vmlal.s16       q1, d5, d9

        vpadd.s32       d0, d0, d1
        vpadd.s32       d1, d2, d3

        vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

-        vld1.16         {d16, d17, d18, d19}, [r2, :128]!
+        vld1.16         {d2, d3, d4, d5}, [r2, :128]!

        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

-        vmull.s16       q10, d16, d0
-        vmull.s16       q11, d17, d0
-        vmlal.s16       q10, d18, d1
-        vmlal.s16       q11, d19, d1
+        vmull.s16       q3, d2, d0
+        vmull.s16       q4, d3, d0
+        vmlal.s16       q3, d4, d1
+        vmlal.s16       q4, d5, d1

-        vpadd.s32       d0, d20, d21 /* TODO: can be eliminated */
-        vpadd.s32       d1, d22, d23 /* TODO: can be eliminated */
+        vpadd.s32       d0, d6, d7 /* TODO: can be eliminated */
+        vpadd.s32       d1, d8, d9 /* TODO: can be eliminated */

        vst1.32         {d0, d1}, [r1, :128]

@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
-        vld1.16         {d16, d17}, [r0, :64]!
-        vld1.16         {d20, d21}, [r2, :128]!
+        vld1.16         {d4, d5}, [r0, :64]!
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmull.s16       q12, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmull.s16       q13, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
-        vmull.s16       q14, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmull.s16       q15, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmull.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmull.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmull.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmull.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q12, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmlal.s16       q13, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
-        vmlal.s16       q14, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmlal.s16       q15, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q12, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmlal.s16       q13, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
-        vmlal.s16       q14, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmlal.s16       q15, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q12, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmlal.s16       q13, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
-        vmlal.s16       q14, d18, d22
-        vld1.16         {d16, d17}, [r0, :64]!
-        vmlal.s16       q15, d19, d23
-        vld1.16         {d20, d21}, [r2, :128]!
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!

-        vmlal.s16       q12, d16, d20
-        vld1.16         {d18, d19}, [r0, :64]!
-        vmlal.s16       q13, d17, d21
-        vld1.16         {d22, d23}, [r2, :128]!
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!

-        vmlal.s16       q14, d18, d22
-        vmlal.s16       q15, d19, d23
+        vmlal.s16       q8, d6, d10
+        vmlal.s16       q9, d7, d11

-        vpadd.s32       d0, d24, d25
-        vpadd.s32       d1, d26, d27
-        vpadd.s32       d2, d28, d29
-        vpadd.s32       d3, d30, d31
+        vpadd.s32       d0, d12, d13
+        vpadd.s32       d1, d14, d15
+        vpadd.s32       d2, d16, d17
+        vpadd.s32       d3, d18, d19

        vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
        vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

-        vld1.16         {d16, d17}, [r2, :128]!
-        vmull.s16       q12, d16, d0
-        vld1.16         {d18, d19}, [r2, :128]!
-        vmull.s16       q13, d17, d0
-        vmull.s16       q14, d18, d0
-        vmull.s16       q15, d19, d0
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmull.s16       q6, d4, d0
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmull.s16       q7, d5, d0
+        vmull.s16       q8, d6, d0
+        vmull.s16       q9, d7, d0

-        vld1.16         {d16, d17}, [r2, :128]!
-        vmlal.s16       q12, d16, d1
-        vld1.16         {d18, d19}, [r2, :128]!
-        vmlal.s16       q13, d17, d1
-        vmlal.s16       q14, d18, d1
-        vmlal.s16       q15, d19, d1
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d1
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d1
+        vmlal.s16       q8, d6, d1
+        vmlal.s16       q9, d7, d1

-        vld1.16         {d16, d17}, [r2, :128]!
-        vmlal.s16       q12, d16, d2
-        vld1.16         {d18, d19}, [r2, :128]!
-        vmlal.s16       q13, d17, d2
-        vmlal.s16       q14, d18, d2
-        vmlal.s16       q15, d19, d2
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d2
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d2
+        vmlal.s16       q8, d6, d2
+        vmlal.s16       q9, d7, d2

-        vld1.16         {d16, d17}, [r2, :128]!
-        vmlal.s16       q12, d16, d3
-        vld1.16         {d18, d19}, [r2, :128]!
-        vmlal.s16       q13, d17, d3
-        vmlal.s16       q14, d18, d3
-        vmlal.s16       q15, d19, d3
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d3
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d3
+        vmlal.s16       q8, d6, d3
+        vmlal.s16       q9, d7, d3

-        vpadd.s32       d0, d24, d25 /* TODO: can be eliminated */
-        vpadd.s32       d1, d26, d27 /* TODO: can be eliminated */
-        vpadd.s32       d2, d28, d29 /* TODO: can be eliminated */
-        vpadd.s32       d3, d30, d31 /* TODO: can be eliminated */
+        vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
+        vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
+        vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
+        vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */

        vst1.32         {d0, d1, d2, d3}, [r1, :128]

@@ -279,13 +279,11 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r1,  r1,  r5
 .endif
        @ size >= 16 loads two qwords and increments r2,
-        @ size 4 loads 1 d word, increments r2 and loads 1 32-bit lane
-        @ for size 8 it's enough with one qword and no postincrement
+        @ for size 4/8 it's enough with one qword and no
+        @ postincrement
 .if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
-.elseif \size == 4
-        sub             r3,  r3,  #8
 .endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
@@ -297,14 +295,9 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
-.elseif \size == 8
+.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
-.else @ size == 4
-        vld1.8          {d18}, [r2]!
-        vld1.8          {d24}, [r7]!
-        vld1.32         {d19[0]}, [r2]
-        vld1.32         {d25[0]}, [r7]
 .endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
@@ -376,7 +376,7 @@ ASSSplitContext *ff_ass_split(const char *buf)
    ASSSplitContext *ctx = av_mallocz(sizeof(*ctx));
    if (!ctx)
        return NULL;
-    if (buf && !strncmp(buf, "\xef\xbb\xbf", 3)) // Skip UTF-8 BOM header
+    if (buf && !memcmp(buf, "\xef\xbb\xbf", 3)) // Skip UTF-8 BOM header
        buf += 3;
    ctx->current_section = -1;
    if (ass_split(ctx, buf) < 0) {
@@ -114,13 +114,6 @@ enum {
    AV1_WARP_MODEL_TRANSLATION = 1,
    AV1_WARP_MODEL_ROTZOOM     = 2,
    AV1_WARP_MODEL_AFFINE      = 3,
-    AV1_WARP_PARAM_REDUCE_BITS = 6,
-
-    AV1_DIV_LUT_BITS      = 8,
-    AV1_DIV_LUT_PREC_BITS = 14,
-    AV1_DIV_LUT_NUM       = 257,
-
-    AV1_MAX_LOOP_FILTER = 63,
 };


@@ -28,7 +28,6 @@ typedef struct AV1MetadataContext {
    CBSBSFContext common;

    int td;
-    AV1RawOBU td_obu;

    int color_primaries;
    int transfer_characteristics;
@@ -108,11 +107,12 @@ static int av1_metadata_update_fragment(AVBSFContext *bsf, AVPacket *pkt,
                                        CodedBitstreamFragment *frag)
 {
    AV1MetadataContext *ctx = bsf->priv_data;
+    AV1RawOBU td, *obu;
    int err, i;

    for (i = 0; i < frag->nb_units; i++) {
        if (frag->units[i].type == AV1_OBU_SEQUENCE_HEADER) {
-            AV1RawOBU *obu = frag->units[i].content;
+            obu = frag->units[i].content;
            err = av1_metadata_update_sequence_header(bsf, &obu->obu.sequence_header);
            if (err < 0)
                return err;
@@ -120,12 +120,16 @@ static int av1_metadata_update_fragment(AVBSFContext *bsf, AVPacket *pkt,
    }

    // If a Temporal Delimiter is present, it must be the first OBU.
-    if (frag->nb_units && frag->units[0].type == AV1_OBU_TEMPORAL_DELIMITER) {
+    if (frag->units[0].type == AV1_OBU_TEMPORAL_DELIMITER) {
        if (ctx->td == BSF_ELEMENT_REMOVE)
            ff_cbs_delete_unit(frag, 0);
    } else if (pkt && ctx->td == BSF_ELEMENT_INSERT) {
+        td = (AV1RawOBU) {
+            .header.obu_type = AV1_OBU_TEMPORAL_DELIMITER,
+        };
+
        err = ff_cbs_insert_unit_content(frag, 0, AV1_OBU_TEMPORAL_DELIMITER,
-                                         &ctx->td_obu, NULL);
+                                         &td, NULL);
        if (err < 0) {
            av_log(bsf, AV_LOG_ERROR, "Failed to insert Temporal Delimiter.\n");
            return err;
@@ -151,12 +155,6 @@ static const CBSBSFType av1_metadata_type = {

 static int av1_metadata_init(AVBSFContext *bsf)
 {
-    AV1MetadataContext *ctx = bsf->priv_data;
-
-    ctx->td_obu = (AV1RawOBU) {
-        .header.obu_type = AV1_OBU_TEMPORAL_DELIMITER,
-    };
-
    return ff_cbs_bsf_generic_init(bsf, &av1_metadata_type);
 }

@@ -28,34 +28,6 @@
 #include "internal.h"
 #include "profiles.h"

-/**< same with Div_Lut defined in spec 7.11.3.7 */
-static const uint16_t div_lut[AV1_DIV_LUT_NUM] = {
-  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
-  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
-  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
-  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
-  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
-  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
-  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
-  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
-  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
-  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
-  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
-  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
-  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
-  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
-  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
-  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
-  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
-  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
-  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
-  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
-  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
-  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
-  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
-  8240,  8224,  8208,  8192
-};
-
 static uint32_t inverse_recenter(int r, uint32_t v)
 {
    if (v > 2 * r)
@@ -85,11 +57,12 @@ static int32_t decode_signed_subexp_with_ref(uint32_t sub_exp, int low,

 static void read_global_param(AV1DecContext *s, int type, int ref, int idx)
 {
-    int primary_frame;
+    uint8_t primary_frame, prev_frame;
    uint32_t abs_bits, prec_bits, round, prec_diff, sub, mx;
    int32_t r, prev_gm_param;

    primary_frame = s->raw_frame_header->primary_ref_frame;
+    prev_frame = s->raw_frame_header->ref_frame_idx[primary_frame];
    abs_bits = AV1_GM_ABS_ALPHA_BITS;
    prec_bits = AV1_GM_ALPHA_PREC_BITS;

@@ -99,10 +72,8 @@ static void read_global_param(AV1DecContext *s, int type, int ref, int idx)
     */
    if (s->raw_frame_header->primary_ref_frame == AV1_PRIMARY_REF_NONE)
        prev_gm_param = s->cur_frame.gm_params[ref][idx];
-    else {
-        int prev_frame = s->raw_frame_header->ref_frame_idx[primary_frame];
+    else
        prev_gm_param = s->ref[prev_frame].gm_params[ref][idx];
-    }

    if (idx < 2) {
        if (type == AV1_WARP_MODEL_TRANSLATION) {
@@ -126,70 +97,6 @@ static void read_global_param(AV1DecContext *s, int type, int ref, int idx)
                                       -mx, mx + 1, r) << prec_diff) + round;
 }

-static uint64_t round_two(uint64_t x, uint16_t n)
-{
-    if (n == 0)
-        return x;
-    return ((x + ((uint64_t)1 << (n - 1))) >> n);
-}
-
-static int64_t round_two_signed(int64_t x, uint16_t n)
-{
-    return ((x<0) ? -((int64_t)round_two(-x, n)) : (int64_t)round_two(x, n));
-}
-
-/**
- * Resolve divisor process.
- * see spec 7.11.3.7
- */
-static int16_t resolve_divisor(uint32_t d, uint16_t *shift)
-{
-    int32_t e, f;
-
-    *shift = av_log2(d);
-    e = d - (1 << (*shift));
-    if (*shift > AV1_DIV_LUT_BITS)
-        f = round_two(e, *shift - AV1_DIV_LUT_BITS);
-    else
-        f = e << (AV1_DIV_LUT_BITS - (*shift));
-
-    *shift += AV1_DIV_LUT_PREC_BITS;
-
-    return div_lut[f];
-}
-
-/**
- * check if global motion params is valid.
- * see spec 7.11.3.6
- */
-static uint8_t get_shear_params_valid(AV1DecContext *s, int idx)
-{
-    int16_t alpha, beta, gamma, delta, divf, divs;
-    int64_t v, w;
-    int32_t *param = &s->cur_frame.gm_params[idx][0];
-    if (param[2] <= 0)
-        return 0;
-
-    alpha = av_clip_int16(param[2] - (1 << AV1_WARPEDMODEL_PREC_BITS));
-    beta  = av_clip_int16(param[3]);
-    divf  = resolve_divisor(abs(param[2]), &divs);
-    v     = (int64_t)param[4] * (1 << AV1_WARPEDMODEL_PREC_BITS);
-    w     = (int64_t)param[3] * param[4];
-    gamma = av_clip_int16((int)round_two_signed((v * divf), divs));
-    delta = av_clip_int16(param[5] - (int)round_two_signed((w * divf), divs) - (1 << AV1_WARPEDMODEL_PREC_BITS));
-
-    alpha = round_two_signed(alpha, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
-    beta  = round_two_signed(beta,  AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
-    gamma = round_two_signed(gamma, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
-    delta = round_two_signed(delta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
-
-    if ((4 * abs(alpha) + 7 * abs(beta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS) ||
-        (4 * abs(gamma) + 4 * abs(delta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS))
-        return 0;
-
-    return 1;
-}
-
 /**
 * update gm type/params, since cbs already implemented part of this funcation,
 * so we don't need to full implement spec.
@@ -237,9 +144,6 @@ static void global_motion_params(AV1DecContext *s)
            read_global_param(s, type, ref, 0);
            read_global_param(s, type, ref, 1);
        }
-        if (type <= AV1_WARP_MODEL_AFFINE) {
-            s->cur_frame.gm_invalid[ref] = !get_shear_params_valid(s, ref);
-        }
    }
 }

@@ -605,9 +509,6 @@ static int av1_frame_ref(AVCodecContext *avctx, AV1Frame *dst, const AV1Frame *s

    dst->spatial_id = src->spatial_id;
    dst->temporal_id = src->temporal_id;
-    memcpy(dst->gm_invalid,
-           src->gm_invalid,
-           AV1_NUM_REF_FRAMES * sizeof(uint8_t));
    memcpy(dst->gm_type,
           src->gm_type,
           AV1_NUM_REF_FRAMES * sizeof(uint8_t));
@@ -662,7 +563,7 @@ static int set_context_with_sequence(AVCodecContext *avctx,
    avctx->color_range =
        seq->color_config.color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
    avctx->color_primaries = seq->color_config.color_primaries;
-    avctx->colorspace = seq->color_config.matrix_coefficients;
+    avctx->colorspace = seq->color_config.color_primaries;
    avctx->color_trc = seq->color_config.transfer_characteristics;

    switch (seq->color_config.chroma_sample_position) {
@@ -1018,8 +919,6 @@ static int av1_decode_frame(AVCodecContext *avctx, void *frame,
            }

            s->raw_seq = &obu->obu.sequence_header;
-            s->raw_frame_header = NULL;
-            raw_tile_group      = NULL;

            ret = set_context_with_sequence(avctx, s->raw_seq);
            if (ret < 0) {
@@ -1069,8 +968,6 @@ static int av1_decode_frame(AVCodecContext *avctx, void *frame,
                goto end;
            }

-            raw_tile_group      = NULL;
-
            if (unit->type == AV1_OBU_FRAME)
                s->raw_frame_header = &obu->obu.frame.header;
            else
@@ -1149,11 +1046,8 @@ static int av1_decode_frame(AVCodecContext *avctx, void *frame,
                }
            }
            break;
-        case AV1_OBU_TEMPORAL_DELIMITER:
-            s->raw_frame_header = NULL;
-            raw_tile_group      = NULL;
-        // fall-through
        case AV1_OBU_TILE_LIST:
+        case AV1_OBU_TEMPORAL_DELIMITER:
        case AV1_OBU_PADDING:
        case AV1_OBU_METADATA:
            break;
@@ -42,7 +42,6 @@ typedef struct AV1Frame {
    int temporal_id;
    int spatial_id;

-    uint8_t gm_invalid[AV1_NUM_REF_FRAMES];
    uint8_t gm_type[AV1_NUM_REF_FRAMES];
    int32_t gm_params[AV1_NUM_REF_FRAMES][6];

@@ -318,13 +318,6 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
        avctx->time_base.den = avctx->sample_rate;
    }

-    if (av_codec_is_encoder(avctx->codec))
-        ret = ff_encode_preinit(avctx);
-    else
-        ret = ff_decode_preinit(avctx);
-    if (ret < 0)
-        goto free_and_end;
-
    if (!HAVE_THREADS)
        av_log(avctx, AV_LOG_WARNING, "Warning: not compiled with thread support, using thread emulation\n");

@@ -346,6 +339,13 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
    if (!HAVE_THREADS && !(codec->caps_internal & FF_CODEC_CAP_AUTO_THREADS))
        avctx->thread_count = 1;

+    if (av_codec_is_encoder(avctx->codec))
+        ret = ff_encode_preinit(avctx);
+    else
+        ret = ff_decode_preinit(avctx);
+    if (ret < 0)
+        goto free_and_end;
+
    if (   avctx->codec->init && (!(avctx->active_thread_type&FF_THREAD_FRAME)
        || avci->frame_thread_encoder)) {
        ret = avctx->codec->init(avctx);
@@ -644,11 +644,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
    return 0;
 }

-static const char *unknown_if_null(const char *str)
-{
-    return str ? str : "unknown";
-}
-
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 {
    const char *codec_type;
@@ -658,7 +653,6 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
    int new_line = 0;
    AVRational display_aspect_ratio;
    const char *separator = enc->dump_separator ? (const char *)enc->dump_separator : ", ";
-    const char *str;

    if (!buf || buf_size <= 0)
        return;
@@ -694,27 +688,28 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
            av_strlcat(buf, separator, buf_size);

            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     "%s", enc->pix_fmt == AV_PIX_FMT_NONE ? "none" :
-                     unknown_if_null(av_get_pix_fmt_name(enc->pix_fmt)));
+                 "%s", enc->pix_fmt == AV_PIX_FMT_NONE ? "none" :
+                     av_get_pix_fmt_name(enc->pix_fmt));
            if (enc->bits_per_raw_sample && enc->pix_fmt != AV_PIX_FMT_NONE &&
                enc->bits_per_raw_sample < av_pix_fmt_desc_get(enc->pix_fmt)->comp[0].depth)
                av_strlcatf(detail, sizeof(detail), "%d bpc, ", enc->bits_per_raw_sample);
-            if (enc->color_range != AVCOL_RANGE_UNSPECIFIED &&
-                (str = av_color_range_name(enc->color_range)))
-                av_strlcatf(detail, sizeof(detail), "%s, ", str);
+            if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_color_range_name(enc->color_range));

            if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
                enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
                enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
-                const char *col = unknown_if_null(av_color_space_name(enc->colorspace));
-                const char *pri = unknown_if_null(av_color_primaries_name(enc->color_primaries));
-                const char *trc = unknown_if_null(av_color_transfer_name(enc->color_trc));
-                if (strcmp(col, pri) || strcmp(col, trc)) {
+                if (enc->colorspace != (int)enc->color_primaries ||
+                    enc->colorspace != (int)enc->color_trc) {
                    new_line = 1;
                    av_strlcatf(detail, sizeof(detail), "%s/%s/%s, ",
-                                col, pri, trc);
+                                av_color_space_name(enc->colorspace),
+                                av_color_primaries_name(enc->color_primaries),
+                                av_color_transfer_name(enc->color_trc));
                } else
-                    av_strlcatf(detail, sizeof(detail), "%s, ", col);
+                    av_strlcatf(detail, sizeof(detail), "%s, ",
+                                av_get_colorspace_name(enc->colorspace));
            }

            if (enc->field_order != AV_FIELD_UNKNOWN) {
@@ -732,9 +727,9 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
            }

            if (av_log_get_level() >= AV_LOG_VERBOSE &&
-                enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED &&
-                (str = av_chroma_location_name(enc->chroma_sample_location)))
-                av_strlcatf(detail, sizeof(detail), "%s, ", str);
+                enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_chroma_location_name(enc->chroma_sample_location));

            if (strlen(detail) > 1) {
                detail[strlen(detail) - 2] = 0;
@@ -792,10 +787,9 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                     "%d Hz, ", enc->sample_rate);
        }
        av_get_channel_layout_string(buf + strlen(buf), buf_size - strlen(buf), enc->channels, enc->channel_layout);
-        if (enc->sample_fmt != AV_SAMPLE_FMT_NONE &&
-            (str = av_get_sample_fmt_name(enc->sample_fmt))) {
+        if (enc->sample_fmt != AV_SAMPLE_FMT_NONE) {
            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     ", %s", str);
+                     ", %s", av_get_sample_fmt_name(enc->sample_fmt));
        }
        if (   enc->bits_per_raw_sample > 0
            && enc->bits_per_raw_sample != av_get_bytes_per_sample(enc->sample_fmt) * 8)
@@ -1304,10 +1304,6 @@ typedef struct AVCodecContext {
     *   this callback and filled with the extra buffers if there are more
     *   buffers than buf[] can hold. extended_buf will be freed in
     *   av_frame_unref().
-     *   Decoders will generally initialize the whole buffer before it is output
-     *   but it can in rare error conditions happen that uninitialized data is passed
-     *   through. \important The buffers returned by get_buffer* should thus not contain sensitive
-     *   data.
     *
     * If AV_CODEC_CAP_DR1 is not set then get_buffer2() must call
     * avcodec_default_get_buffer2() instead of providing buffers allocated by
@@ -869,7 +869,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,

    binkb_init_bundles(c);
    ref_start = frame->data[plane_idx];
-    ref_end   = frame->data[plane_idx] + ((bh - 1) * frame->linesize[plane_idx] + bw - 1) * 8;
+    ref_end   = frame->data[plane_idx] + (bh * frame->linesize[plane_idx] + bw) * 8;

    for (i = 0; i < 64; i++)
        coordmap[i] = (i & 7) + (i >> 3) * stride;
@@ -925,7 +925,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8*stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->put_pixels_tab(dst, ref, stride, 8);
@@ -941,7 +941,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8 * stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->put_pixels_tab(dst, ref, stride, 8);
@@ -973,7 +973,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8 * stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->put_pixels_tab(dst, ref, stride, 8);
@@ -1086,7 +1086,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
        for (bx = 0; bx < bw; bx++, dst += 8, prev += 8) {
            blk = get_value(c, BINK_SRC_BLOCK_TYPES);
            // 16x16 block type on odd line means part of the already decoded block, so skip it
-            if (((by & 1) || (bx & 1)) && blk == SCALED_BLOCK) {
+            if ((by & 1) && blk == SCALED_BLOCK) {
                bx++;
                dst  += 8;
                prev += 8;
@@ -70,7 +70,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    BinkAudioContext *s = avctx->priv_data;
    int sample_rate = avctx->sample_rate;
    int sample_rate_half;
-    int i, ret;
+    int i;
    int frame_len_bits;

    /* determine frame length */
@@ -132,13 +132,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
    s->first = 1;

    if (CONFIG_BINKAUDIO_RDFT_DECODER && avctx->codec->id == AV_CODEC_ID_BINKAUDIO_RDFT)
-        ret = ff_rdft_init(&s->trans.rdft, frame_len_bits, DFT_C2R);
+        ff_rdft_init(&s->trans.rdft, frame_len_bits, DFT_C2R);
    else if (CONFIG_BINKAUDIO_DCT_DECODER)
-        ret = ff_dct_init(&s->trans.dct, frame_len_bits, DCT_III);
+        ff_dct_init(&s->trans.dct, frame_len_bits, DCT_III);
    else
        av_assert0(0);
-    if (ret < 0)
-        return ret;

    s->pkt = av_packet_alloc();
    if (!s->pkt)
@@ -347,7 +345,6 @@ AVCodec ff_binkaudio_rdft_decoder = {
    .close          = decode_end,
    .receive_frame  = binkaudio_receive_frame,
    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };

 AVCodec ff_binkaudio_dct_decoder = {
@@ -360,5 +357,4 @@ AVCodec ff_binkaudio_dct_decoder = {
    .close          = decode_end,
    .receive_frame  = binkaudio_receive_frame,
    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
@@ -130,7 +130,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
        rgb[1] = bytestream_get_le32(&buf);
        rgb[2] = bytestream_get_le32(&buf);
        if (ihsize > 40)
-            alpha = bytestream_get_le32(&buf);
+        alpha = bytestream_get_le32(&buf);
    }

    ret = ff_set_dimensions(avctx, width, height > 0 ? height : -(unsigned)height);
@@ -45,15 +45,14 @@ void av_bsf_free(AVBSFContext **pctx)
        return;
    ctx = *pctx;

-    if (ctx->internal) {
-        if (ctx->filter->close)
-            ctx->filter->close(ctx);
-        av_packet_free(&ctx->internal->buffer_pkt);
-        av_freep(&ctx->internal);
-    }
+    if (ctx->filter->close)
+        ctx->filter->close(ctx);
    if (ctx->filter->priv_class && ctx->priv_data)
        av_opt_free(ctx->priv_data);

+    if (ctx->internal)
+        av_packet_free(&ctx->internal->buffer_pkt);
+    av_freep(&ctx->internal);
    av_freep(&ctx->priv_data);

    avcodec_parameters_free(&ctx->par_in);
@@ -111,20 +110,7 @@ int av_bsf_alloc(const AVBitStreamFilter *filter, AVBSFContext **pctx)
        ret = AVERROR(ENOMEM);
        goto fail;
    }
-    /* allocate priv data and init private options */
-    if (filter->priv_data_size) {
-        ctx->priv_data = av_mallocz(filter->priv_data_size);
-        if (!ctx->priv_data) {
-            ret = AVERROR(ENOMEM);
-            goto fail;
-        }
-        if (filter->priv_class) {
-            *(const AVClass **)ctx->priv_data = filter->priv_class;
-            av_opt_set_defaults(ctx->priv_data);
-        }
-    }
-    /* Allocate AVBSFInternal; must happen after priv_data has been allocated
-     * so that a filter->close needing priv_data is never called without. */
+
    bsfi = av_mallocz(sizeof(*bsfi));
    if (!bsfi) {
        ret = AVERROR(ENOMEM);
@@ -138,6 +124,19 @@ int av_bsf_alloc(const AVBitStreamFilter *filter, AVBSFContext **pctx)
        goto fail;
    }

+    /* allocate priv data and init private options */
+    if (filter->priv_data_size) {
+        ctx->priv_data = av_mallocz(filter->priv_data_size);
+        if (!ctx->priv_data) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        if (filter->priv_class) {
+            *(const AVClass **)ctx->priv_data = filter->priv_class;
+            av_opt_set_defaults(ctx->priv_data);
+        }
+    }
+
    *pctx = ctx;
    return 0;
 fail:
@@ -37,7 +37,7 @@ static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc,
        position = get_bits_count(gbc);

    zeroes = 0;
-    while (zeroes < 32) {
+    while (1) {
        if (get_bits_left(gbc) < 1) {
            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
                   "%s: bitstream ended.\n", name);
@@ -50,18 +50,7 @@ static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc,
    }

    if (zeroes >= 32) {
-        // The spec allows at least thirty-two zero bits followed by a
-        // one to mean 2^32-1, with no constraint on the number of
-        // zeroes.  The libaom reference decoder does not match this,
-        // instead reading thirty-two zeroes but not the following one
-        // to mean 2^32-1.  These two interpretations are incompatible
-        // and other implementations may follow one or the other.
-        // Therefore we reject thirty-two zeroes because the intended
-        // behaviour is not clear.
-        av_log(ctx->log_ctx, AV_LOG_ERROR, "Thirty-two zero bits in "
-               "%s uvlc code: considered invalid due to conflicting "
-               "standard and reference decoder behaviour.\n", name);
-        return AVERROR_INVALIDDATA;
+        value = MAX_UINT_BITS(32);
    } else {
        if (get_bits_left(gbc) < zeroes) {
            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
@@ -390,7 +379,7 @@ static int cbs_av1_write_increment(CodedBitstreamContext *ctx, PutBitContext *pb
    }

    if (len > 0)
-        put_bits(pbc, len, (1U << len) - 1 - (value != range_max));
+        put_bits(pbc, len, (1 << len) - 1 - (value != range_max));

    return 0;
 }
@@ -355,7 +355,7 @@ static int FUNC(set_frame_refs)(CodedBitstreamContext *ctx, RWContext *rw,
        AV1_REF_FRAME_ALTREF2, AV1_REF_FRAME_ALTREF
    };
    int8_t ref_frame_idx[AV1_REFS_PER_FRAME], used_frame[AV1_NUM_REF_FRAMES];
-    int16_t shifted_order_hints[AV1_NUM_REF_FRAMES];
+    int8_t shifted_order_hints[AV1_NUM_REF_FRAMES];
    int cur_frame_hint, latest_order_hint, earliest_order_hint, ref;
    int i, j;

@@ -728,7 +728,7 @@ static int FUNC(sps_scc_extension)(CodedBitstreamContext *ctx, RWContext *rw,

        flag(sps_palette_predictor_initializer_present_flag);
        if (current->sps_palette_predictor_initializer_present_flag) {
-            ue(sps_num_palette_predictor_initializer_minus1, 0, 127);
+            ue(sps_num_palette_predictor_initializer_minus1, 0, 128);
            for (comp = 0; comp < (current->chroma_format_idc ? 3 : 1); comp++) {
                int bit_depth = comp == 0 ? current->bit_depth_luma_minus8 + 8
                                          : current->bit_depth_chroma_minus8 + 8;
@@ -166,13 +166,13 @@ static int cbs_jpeg_split_fragment(CodedBitstreamContext *ctx,
            }
        } else {
            i = start;
-            if (i > frag->data_size - 2) {
+            if (i + 2 > frag->data_size) {
                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
                       "truncated at %02x marker.\n", marker);
                return AVERROR_INVALIDDATA;
            }
            length = AV_RB16(frag->data + i);
-            if (length > frag->data_size - i) {
+            if (i + length > frag->data_size) {
                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
                       "truncated at %02x marker segment.\n", marker);
                return AVERROR_INVALIDDATA;
@@ -422,7 +422,7 @@ static int cbs_vp9_split_fragment(CodedBitstreamContext *ctx,
    superframe_header = frag->data[frag->data_size - 1];

    if ((superframe_header & 0xe0) == 0xc0) {
-        VP9RawSuperframeIndex sfi = {0};
+        VP9RawSuperframeIndex sfi;
        GetBitContext gbc;
        size_t index_size, pos;
        int i;
@@ -239,7 +239,7 @@ static void cdg_scroll(CDGraphicsContext *cc, uint8_t *data,
    for (y = FFMAX(0, vinc); y < FFMIN(CDG_FULL_HEIGHT + vinc, CDG_FULL_HEIGHT); y++)
        memcpy(out + FFMAX(0, hinc) + stride * y,
               in + FFMAX(0, hinc) - hinc + (y - vinc) * stride,
-               FFABS(stride) - FFABS(hinc));
+               FFMIN(stride + hinc, stride));

    if (vinc > 0)
        cdg_fill_wrapper(0, 0, out,
@@ -78,7 +78,7 @@ int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length);
 *
 * @return value << offset, if offset>=0; value >> -offset - otherwise
 */
-static inline unsigned bidir_sal(unsigned value, int offset)
+static inline int bidir_sal(int value, int offset)
 {
    if(offset < 0) return value >> -offset;
    else           return value <<  offset;
@@ -221,7 +221,6 @@ static void free_buffers(CFHDContext *s)
    int i, j;

    for (i = 0; i < FF_ARRAY_ELEMS(s->plane); i++) {
-        Plane *p = &s->plane[i];
        av_freep(&s->plane[i].idwt_buf);
        av_freep(&s->plane[i].idwt_tmp);
        s->plane[i].idwt_size = 0;
@@ -231,16 +230,9 @@ static void free_buffers(CFHDContext *s)

        for (j = 0; j < 10; j++)
            s->plane[i].l_h[j] = NULL;
-
-        for (j = 0; j < DWT_LEVELS_3D; j++)
-            p->band[j][0].read_ok =
-            p->band[j][1].read_ok =
-            p->band[j][2].read_ok =
-            p->band[j][3].read_ok = 0;
    }
    s->a_height = 0;
    s->a_width  = 0;
-    s->a_transform_type = INT_MIN;
 }

 static int alloc_buffers(AVCodecContext *avctx)
@@ -274,9 +266,6 @@ static int alloc_buffers(AVCodecContext *avctx)
        int height = (i || bayer) ? s->coded_height >> chroma_y_shift : s->coded_height;
        ptrdiff_t stride = (FFALIGN(width  / 8, 8) + 64) * 8;

-        if ((ret = av_image_check_size2(stride, height, avctx->max_pixels, s->coded_format, 0, avctx)) < 0)
-            return ret;
-
        if (chroma_y_shift && !bayer)
            height = FFALIGN(height / 8, 2) * 8;
        s->plane[i].width  = width;
@@ -367,7 +356,6 @@ static int alloc_buffers(AVCodecContext *avctx)
        }
    }

-    s->a_transform_type = s->transform_type;
    s->a_height = s->coded_height;
    s->a_width  = s->coded_width;
    s->a_format = s->coded_format;
@@ -639,7 +627,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
        } else
            av_log(avctx, AV_LOG_DEBUG,  "Unknown tag %i data %x\n", tag, data);

-        if (tag == BitstreamMarker && data == CoefficientSegment &&
+        if (tag == BitstreamMarker && data == 0xf0f &&
            s->coded_format != AV_PIX_FMT_NONE) {
            int lowpass_height = s->plane[s->channel_num].band[0][0].height;
            int lowpass_width  = s->plane[s->channel_num].band[0][0].width;
@@ -667,8 +655,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                s->coded_height = s->a_height;

            if (s->a_width != s->coded_width || s->a_height != s->coded_height ||
-                s->a_format != s->coded_format ||
-                s->transform_type != s->a_transform_type) {
+                s->a_format != s->coded_format) {
                free_buffers(s);
                if ((ret = alloc_buffers(avctx)) < 0) {
                    free_buffers(s);
@@ -708,26 +695,14 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,

        if (s->subband_num_actual == 255)
            goto finish;
-
-        if (tag == BitstreamMarker && data == CoefficientSegment || tag == BandHeader || tag == BandSecondPass || s->peak.level)
-            if (s->transform_type != s->a_transform_type)
-                return AVERROR_PATCHWELCOME;
-
        coeff_data = s->plane[s->channel_num].subband[s->subband_num_actual];

        /* Lowpass coefficients */
-        if (tag == BitstreamMarker && data == CoefficientSegment) {
-            int lowpass_height, lowpass_width, lowpass_a_height, lowpass_a_width;
-
-            if (!s->a_width || !s->a_height) {
-                ret = AVERROR_INVALIDDATA;
-                goto end;
-            }
-
-            lowpass_height = s->plane[s->channel_num].band[0][0].height;
-            lowpass_width  = s->plane[s->channel_num].band[0][0].width;
-            lowpass_a_height = s->plane[s->channel_num].band[0][0].a_height;
-            lowpass_a_width  = s->plane[s->channel_num].band[0][0].a_width;
+        if (tag == BitstreamMarker && data == 0xf0f && s->a_width && s->a_height) {
+            int lowpass_height = s->plane[s->channel_num].band[0][0].height;
+            int lowpass_width  = s->plane[s->channel_num].band[0][0].width;
+            int lowpass_a_height = s->plane[s->channel_num].band[0][0].a_height;
+            int lowpass_a_width  = s->plane[s->channel_num].band[0][0].a_width;

            if (lowpass_width < 3 ||
                lowpass_width > lowpass_a_width) {
@@ -774,30 +749,20 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                       lowpass_width * sizeof(*coeff_data));
            }

-            s->plane[s->channel_num].band[0][0].read_ok = 1;
-
            av_log(avctx, AV_LOG_DEBUG, "Lowpass coefficients %d\n", lowpass_width * lowpass_height);
        }

-        av_assert0(s->subband_num_actual != 255);
-        if (tag == BandHeader || tag == BandSecondPass) {
-            int highpass_height, highpass_width, highpass_a_width, highpass_a_height, highpass_stride, a_expected;
+        if ((tag == BandHeader || tag == BandSecondPass) && s->subband_num_actual != 255 && s->a_width && s->a_height) {
+            int highpass_height = s->plane[s->channel_num].band[s->level][s->subband_num].height;
+            int highpass_width  = s->plane[s->channel_num].band[s->level][s->subband_num].width;
+            int highpass_a_width = s->plane[s->channel_num].band[s->level][s->subband_num].a_width;
+            int highpass_a_height = s->plane[s->channel_num].band[s->level][s->subband_num].a_height;
+            int highpass_stride = s->plane[s->channel_num].band[s->level][s->subband_num].stride;
            int expected;
+            int a_expected = highpass_a_height * highpass_a_width;
            int level, run, coeff;
            int count = 0, bytes;

-            if (!s->a_width || !s->a_height) {
-                ret = AVERROR_INVALIDDATA;
-                goto end;
-            }
-
-            highpass_height = s->plane[s->channel_num].band[s->level][s->subband_num].height;
-            highpass_width  = s->plane[s->channel_num].band[s->level][s->subband_num].width;
-            highpass_a_width = s->plane[s->channel_num].band[s->level][s->subband_num].a_width;
-            highpass_a_height = s->plane[s->channel_num].band[s->level][s->subband_num].a_height;
-            highpass_stride = s->plane[s->channel_num].band[s->level][s->subband_num].stride;
-            a_expected = highpass_a_height * highpass_a_width;
-
            if (!got_buffer) {
                av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
                ret = AVERROR(EINVAL);
@@ -846,7 +811,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                            const uint16_t q = s->quantisation;

                            for (i = 0; i < run; i++) {
-                                *coeff_data |= coeff * 256U;
+                                *coeff_data |= coeff * 256;
                                *coeff_data++ *= q;
                            }
                        } else {
@@ -877,7 +842,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                            const uint16_t q = s->quantisation;

                            for (i = 0; i < run; i++) {
-                                *coeff_data |= coeff * 256U;
+                                *coeff_data |= coeff * 256;
                                *coeff_data++ *= q;
                            }
                        } else {
@@ -908,7 +873,6 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                bytestream2_seek(&gb, bytes, SEEK_CUR);

            av_log(avctx, AV_LOG_DEBUG, "End subband coeffs %i extra %i\n", count, count - expected);
-            s->plane[s->channel_num].band[s->level][s->subband_num].read_ok = 1;
 finish:
            if (s->subband_num_actual != 255)
                s->codebook = 0;
@@ -924,7 +888,6 @@ finish:
    ff_thread_finish_setup(avctx);

    if (!s->a_width || !s->a_height || s->a_format == AV_PIX_FMT_NONE ||
-        s->a_transform_type == INT_MIN ||
        s->coded_width || s->coded_height || s->coded_format != AV_PIX_FMT_NONE) {
        av_log(avctx, AV_LOG_ERROR, "Invalid dimensions\n");
        ret = AVERROR(EINVAL);
@@ -937,22 +900,6 @@ finish:
        goto end;
    }

-    for (plane = 0; plane < s->planes; plane++) {
-        int o, level;
-
-        for (level = 0; level < (s->transform_type == 0 ? DWT_LEVELS : DWT_LEVELS_3D) ; level++) {
-            if (s->transform_type == 2)
-                if (level == 2 || level == 5)
-                    continue;
-            for (o = !!level; o < 4 ; o++) {
-                if (!s->plane[plane].band[level][o].read_ok) {
-                    ret = AVERROR_INVALIDDATA;
-                    goto end;
-                }
-            }
-        }
-    }
-
    if (s->transform_type == 0 && s->sample_type != 1) {
        for (plane = 0; plane < s->planes && !ret; plane++) {
            /* level 1 */
@@ -1434,14 +1381,12 @@ static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
    if (pdst->plane[0].idwt_size != psrc->plane[0].idwt_size ||
        pdst->a_format != psrc->a_format ||
        pdst->a_width != psrc->a_width ||
-        pdst->a_height != psrc->a_height ||
-        pdst->a_transform_type != psrc->a_transform_type)
+        pdst->a_height != psrc->a_height)
        free_buffers(pdst);

    pdst->a_format = psrc->a_format;
    pdst->a_width  = psrc->a_width;
    pdst->a_height = psrc->a_height;
-    pdst->a_transform_type = psrc->a_transform_type;
    pdst->transform_type = psrc->transform_type;
    pdst->progressive = psrc->progressive;
    pdst->planes = psrc->planes;
@@ -1450,7 +1395,6 @@ static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
        pdst->coded_width  = pdst->a_width;
        pdst->coded_height = pdst->a_height;
        pdst->coded_format = pdst->a_format;
-        pdst->transform_type = pdst->a_transform_type;
        ret = alloc_buffers(dst);
        if (ret < 0)
            return ret;
@@ -95,15 +95,6 @@ enum CFHDParam {
    ChannelHeight    = 105,
 };

-enum CFHDSegment {
-    LowPassSegment      = 0x1a4a,
-    LowPassEndSegment   = 0x1b4b,
-    HighPassSegment     = 0x0d0d,
-    BandSegment         = 0x0e0e,
-    HighPassEndSegment  = 0x0c0c,
-    CoefficientSegment  = 0x0f0f,
-};
-
 #define VLC_BITS       9
 #define SUBBAND_COUNT 10
 #define SUBBAND_COUNT_3D 17
@@ -123,7 +114,6 @@ typedef struct SubBand {
    int width;
    int a_height;
    int height;
-    int8_t read_ok;
 } SubBand;

 typedef struct Plane {
@@ -175,7 +165,6 @@ typedef struct CFHDContext {
    int a_width;
    int a_height;
    int a_format;
-    int a_transform_type;

    int bpc; // bits per channel/component
    int channel_cnt;
@@ -258,11 +258,6 @@ static av_cold int cfhd_encode_init(AVCodecContext *avctx)
    if (ret < 0)
        return ret;

-    if (avctx->height < 32) {
-        av_log(avctx, AV_LOG_ERROR, "Height must be >= 32.\n");
-        return AVERROR_INVALIDDATA;
-    }
-
    if (avctx->width & 15) {
        av_log(avctx, AV_LOG_ERROR, "Width must be multiple of 16.\n");
        return AVERROR_INVALIDDATA;
@@ -552,7 +547,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         width, height * 2);
    }

-    ret = ff_alloc_packet2(avctx, pkt, 256LL + s->planes * (4LL * avctx->width * (avctx->height + 15) + 2048LL), 0);
+    ret = ff_alloc_packet2(avctx, pkt, 64LL + s->planes * (2LL * avctx->width * avctx->height + 1000LL), 0);
    if (ret < 0)
        return ret;

@@ -624,7 +619,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
        }

        bytestream2_put_be16(pby, BitstreamMarker);
-        bytestream2_put_be16(pby, LowPassSegment);
+        bytestream2_put_be16(pby, 0x1a4a);

        pos = bytestream2_tell_p(pby);

@@ -650,7 +645,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
        bytestream2_put_be16(pby, 16);

        bytestream2_put_be16(pby, BitstreamMarker);
-        bytestream2_put_be16(pby, CoefficientSegment);
+        bytestream2_put_be16(pby, 0x0f0f);

        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++)
@@ -659,7 +654,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
        }

        bytestream2_put_be16(pby, BitstreamMarker);
-        bytestream2_put_be16(pby, LowPassEndSegment);
+        bytestream2_put_be16(pby, 0x1b4b);

        for (int l = 0; l < 3; l++) {
            for (int i = 0; i < 3; i++) {
@@ -674,7 +669,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
            int height = s->plane[p].band[l][0].height;

            bytestream2_put_be16(pby, BitstreamMarker);
-            bytestream2_put_be16(pby, HighPassSegment);
+            bytestream2_put_be16(pby, 0x0d0d);

            bytestream2_put_be16(pby, WaveletType);
            bytestream2_put_be16(pby, 3 + 2 * (l == 2));
@@ -711,7 +706,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                int count = 0, padd = 0;

                bytestream2_put_be16(pby, BitstreamMarker);
-                bytestream2_put_be16(pby, BandSegment);
+                bytestream2_put_be16(pby, 0x0e0e);

                bytestream2_put_be16(pby, SubbandNumber);
                bytestream2_put_be16(pby, i + 1);
@@ -781,7 +776,7 @@ static int cfhd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
            }

            bytestream2_put_be16(pby, BitstreamMarker);
-            bytestream2_put_be16(pby, HighPassEndSegment);
+            bytestream2_put_be16(pby, 0x0c0c);
        }

        s->plane[p].size = bytestream2_tell_p(pby) - pos;
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .4.7
 .4.git