swscale/x86/yuv2rgb_template: Add emms to MMX(EXT) functions

Fixes issue #22333. Note: These functions have been removed in commit 61e851381f, so the issue only affects releases 7.0 and older. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> (cherry picked from commit 5e6c584b98cea9b9d78b037728b915887758666d)
swscale/output: Fix integer overflow in yuv2ya16_X_c_template()
2026-03-05 14:59:24 +01:00 · 2026-01-02 21:58:51 +00:00 · 2026-01-02 21:58:51 +00:00 · 2026-01-02 21:58:51 +00:00 · 2026-01-02 21:58:51 +00:00 · 2026-01-02 21:58:51 +00:00
153 changed files with 3424 additions and 2563 deletions
@@ -0,0 +1,23 @@
+exclude: ^tests/ref/
+
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v5.0.0
+  hooks:
+    - id: check-case-conflict
+    - id: check-executables-have-shebangs
+    - id: check-illegal-windows-names
+    - id: check-shebang-scripts-are-executable
+    - id: check-yaml
+    - id: end-of-file-fixer
+    - id: fix-byte-order-marker
+    - id: mixed-line-ending
+    - id: trailing-whitespace
+- repo: local
+  hooks:
+    - id: aarch64-asm-indent
+      name: fix aarch64 assembly indentation
+      files: ^.*/aarch64/.*\.S$
+      language: script
+      entry: ./tools/check_arm_indent.sh --apply
+      pass_filenames: false
@@ -0,0 +1,29 @@
+name: Lint
+
+on:
+  push:
+    branches:
+      - release/4.3
+  pull_request:
+
+jobs:
+  lint:
+    name: Pre-Commit
+    runs-on: utilities
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install pre-commit CI
+        id: install
+        run: |
+            python3 -m venv ~/pre-commit
+            ~/pre-commit/bin/pip install --upgrade pip setuptools
+            ~/pre-commit/bin/pip install pre-commit
+            echo "envhash=$({ python3 --version && cat .forgejo/pre-commit/config.yaml; } | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
+      - name: Cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pre-commit
+          key: pre-commit-${{ steps.install.outputs.envhash }}
+      - name: Run pre-commit CI
+        run: ~/pre-commit/bin/pre-commit run -c .forgejo/pre-commit/config.yaml --show-diff-on-failure --color=always --all-files
@@ -0,0 +1,80 @@
+name: Test
+
+on:
+  push:
+    branches:
+      - release/4.3
+  pull_request:
+
+jobs:
+  run_fate:
+    name: Fate (${{ matrix.runner }}, ${{ matrix.shared }}, ${{ matrix.bits }} bit)
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-aarch64]
+        shared: ['static']
+        bits: ['64']
+        include:
+          - runner: linux-amd64
+            shared: 'static'
+            bits: '32'
+          - runner: linux-amd64
+            shared: 'shared'
+            bits: '64'
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Configure
+        run: |
+          ./configure --enable-gpl --enable-nonfree --enable-memory-poisoning --assert-level=2 \
+              $([ "${{ matrix.bits }}" != "32" ] || echo --arch=x86_32 --extra-cflags=-m32 --extra-cxxflags=-m32 --extra-ldflags=-m32) \
+              $([ "${{ matrix.shared }}" != "shared" ] || echo --enable-shared --disable-static) \
+              && CFGRES=0 || CFGRES=$?  # keep configure's own status; chaining a second "CFGRES=$?" would only capture the first assignment's 0
+          cat ffbuild/config.log  # print the log whether configure succeeded or not
+          exit $CFGRES
+      - name: Build
+        run: make -j$(nproc)
+      - name: Restore Cached Fate-Suite
+        id: cache
+        uses: actions/cache/restore@v4
+        with:
+          path: fate-suite
+          key: fate-suite
+          restore-keys: |
+            fate-suite-
+      - name: Sync Fate-Suite
+        id: fate
+        run: |
+          make fate-rsync SAMPLES=$PWD/fate-suite
+          echo "hash=$(find fate-suite -type f -printf "%P %s %T@\n" | sort | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
+      - name: Cache Fate-Suite
+        uses: actions/cache/save@v4
+        if: ${{ format('fate-suite-{0}', steps.fate.outputs.hash) != steps.cache.outputs.cache-matched-key }}
+        with:
+          path: fate-suite
+          key: fate-suite-${{ steps.fate.outputs.hash }}
+      - name: Run Fate
+        run: LD_LIBRARY_PATH="$(printf "%s:" "$PWD"/lib*)$PWD" make fate fate-build SAMPLES=$PWD/fate-suite -j$(nproc)
+  compile_only:
+    name: Fate (Win64, Build-Only)
+    strategy:
+      fail-fast: false
+      matrix:
+        image: ["ghcr.io/btbn/ffmpeg-builds/win64-gpl-4.3:latest"]
+    runs-on: linux-amd64
+    container: ${{ matrix.image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Configure
+        run: |
+          ./configure --pkg-config-flags="--static" $FFBUILD_TARGET_FLAGS $FF_CONFIGURE \
+              --cc="$CC" --cxx="$CXX" --ar="$AR" --ranlib="$RANLIB" --nm="$NM" \
+              --extra-cflags="$FF_CFLAGS" --extra-cxxflags="$FF_CXXFLAGS" \
+              --extra-libs="$FF_LIBS" --extra-ldflags="$FF_LDFLAGS" --extra-ldexeflags="$FF_LDEXEFLAGS"
+      - name: Build
+        run: make -j$(nproc)
+      - name: Run Fate
+        run: make -j$(nproc) fate-build
@@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
-
+
  Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
@@ -111,7 +111,7 @@ modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
-
+
                  GNU LESSER GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

@@ -158,7 +158,7 @@ Library.
  You may charge a fee for the physical act of transferring a copy,
 and you may at your option offer warranty protection in exchange for a
 fee.
-
+
  2. You may modify your copy or copies of the Library or any portion
 of it, thus forming a work based on the Library, and copy and
 distribute such modifications or work under the terms of Section 1
@@ -216,7 +216,7 @@ instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
-
+
  Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
@@ -267,7 +267,7 @@ Library will still fall under Section 6.)
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
-
+
  6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
@@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
-
+
  7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
@@ -370,7 +370,7 @@ subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
-
+
  11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
@@ -422,7 +422,7 @@ conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
-
+
  14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
@@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.

                     END OF TERMS AND CONDITIONS
-
+
           How to Apply These Terms to Your New Libraries

  If you develop a new library, and you want it to be of the greatest
@@ -2,6 +2,111 @@ Entries are sorted chronologically from oldest to youngest within each release,
 releases are sorted from youngest to oldest.


+version 4.3.9:
+ configure: update copyright year
+ avformat/hls: Partially revert "reduce default max reload to 3"
+ avformat/hls: Fix twitter
+ libavformat/hls: Be more restrictive on mpegts extensions
+ avformat/hls: .ts is always ok even if its a mov/mp4
+ avformat/hls: Print input format in error message
+ avformat/hls: Be more picky on extensions
+ avformat: add ff_match_url_ext()
+ avfilter/bwdif: account for chroma sub-sampling in min size calculation
+ avformat/iff: Check that we have a stream in read_dst_frame()
+ avformat/mlvdec: fix size checks
+ avformat/mxfdec: Check edit unit for overflow in mxf_set_current_edit_unit()
+ avcodec/h263dec: Check against previous dimensions instead of coded
+ avformat/mxfdec: Check avio_read() success in mxf_decrypt_triplet()
+ avcodec/huffyuvdec: Initialize whole output for decode_gray_bitstream()
+ avformat/ipmovie: Check signature_buffer read
+ avformat/wtvdec: Initialize buf
+ avcodec/cbs_vp9: Initialize VP9RawSuperframeIndex
+ avformat/vqf: Propagate errors from add_metadata()
+ avformat/vqf: Check avio_read() in add_metadata()
+ avformat/dashdec: Check whitelist
+ avutil/avstring: dont mess with NULL pointers in av_match_list()
+ avcodec/mpegvideo_enc: Check FLV1 resolution limits
+ avcodec/ffv1enc: Fix handling of 32bit unsigned symbols
+ avcodec/vc1dec: Clear block_index in vc1_decode_reset()
+ avcodec/aacsbr_template: Clear n_q on error
+ swscale/output: Fix undefined overflow in yuv2rgba64_full_X_c_template()
+ avfilter/af_pan: Fix sscanf() use
+ avfilter/vf_addroi: Add missing NULL termination to addroi_var_names[]()
+ avformat/rmdec: check that buf if completely filled
+ avcodec/hapdec: Clear tex buffer
+ avformat/mxfdec: Check that key was read sucessfull
+ avformat/rpl: Fix check for negative values
+ avformat/mlvdec: Check avio_read()
+ avcodec/utils: Fix block align overflow for ADPCM_IMA_WAV
+ avformat/matroskadec: Check pre_ns for overflow
+ avcodec/webp: Check ref_x/y
+ avcodec/ilbcdec: Initialize tempbuff2
+ avformat/dxa: check bpc
+ swscale/slice: clear allocated memory in alloc_lines()
+ avformat/icodec: fix integer overflow with nb_pal
+ doc/developer: Document relationship between git accounts and MAINTAINERS
+ avformat/vividas: Check avio_read() for failure
+ avformat/ilbc: Check avio_read() for failure
+ avformat/nistspheredec: Clear buffer
+ INSTALL: explain the circular dependency issue and solution
+ avformat/mpegts: Initialize predefined_SLConfigDescriptor_seen
+ avformat/mxfdec: Fix overflow in midpoint computation
+ swscale/output: used unsigned for bit accumulation
+ avcodec/rangecoder: only perform renorm check/loop for callers that need it
+ avcodec/ffv1dec: Fix end computation with ec=2
+ avcodec/ffv1enc: Prevent generation of files with broken slices
+ avformat/matroskadec: Check desc_bytes so bits fit in 64bit
+ avcodec/ffv1enc: Correct error message about unsupported version
+ avcodec/ffv1enc: Slice combination is unsupported
+ avcodec/ffv1enc: 2Pass mode is not possible with golomb coding
+ avcodec/ffv1enc: Fix >8bit context size
+ avcodec/xan: Add basic input size check
+ avcodec/svq3: Check for minimum size input
+ avcodec/eacmv: Check input size for intra frames
+ avcodec/jfdctint_template: use unsigned z* in row_fdct()
+ avformat/mxfdec: Check timecode for overflow
+ avformat/mxfdec: More offset_temp checks
+ swscale/output: Fix undefined integer overflow in yuv2rgba64_2_c_template()
+ swscale/swscale: Use unsigned operation to avoid undefined behavior
+ avcodec/vc2enc: basic sanity check on slice_max_bytes
+ avformat/mvdec: Check if name was fully read
+ avcodec/wmavoice: Do not use uninitialized pitch[0]
+ avcodec/notchlc: Check bytes left before reading
+ avcodec/vc1_block: propagate error codes
+ avformat/apetag: Check APETAGEX
+ avcodec/avcodec: Warn about data returned from get_buffer*()
+ avcodec/aic: Clear slice_data
+ avcodec/vc1dec: Clear mb_type_base and ttblk_base
+ avcodec/shorten: clear padding
+ avformat/mpeg: Check an avio_read() for failure
+ avcodec/mvha: Clear remaining space after inflate()
+ avformat/segafilm: Set keyframe
+ avcodec/dxva2: initialize hr in ff_dxva2_common_end_frame()
+ avcodec/dxva2: initialize validate
+ avcodec/dxva2: Initialize ConfigBitstreamRaw
+ avcodec/dxva2: Initialize dxva_size and check it
+ avfilter/vf_xfade: Compute w2, h2 with float
+ avfilter/vf_v360: Assert that vf was initialized
+ avfilter/vf_tonemap_opencl: Dereference after NULL check
+ avfilter/vf_xfade_opencl: Check ff_inlink_consume_frame() for failure
+ avformat/lmlm4: Eliminate some AVERROR(EIO)
+ avformat/wtvdec: Check length of read mpeg2_descriptor
+ avformat/wtvdec: clear sectors
+ vp9: recon: Use emulated edge to prevent buffer overflows
+ arm: vp9mc: Load only 12 pixels in the 4 pixel wide horizontal filter
+ aarch64: vp9mc: Load only 12 pixels in the 4 pixel wide horizontal filter
+ avformat/libzmq: fix check for zmq protocol prefix
+ configure: improve check for POSIX ioctl
+ configure: restore autodetection of v4l2 and fbdev
+ configure: use just the pkg-config for sndio
+ configure: enable ffnvcodec, nvenc, nvdec for FreeBSD
+ avutil/ppc/cpu: Also use the machdep.altivec sysctl on NetBSD
+ avutil/ppc/cpu: Use proper header for OpenBSD PPC CPU detection
+ lavd/v4l2: Use proper field type for second parameter of ioctl() with BSD's
+ configure: use pkg-config for sndio
+ libavcodec/arm/mlpdsp_armv5te: fix label format to work with binutils 2.43
+
+
 version 4.3.8:
 avcodec/parser: ensure input padding is zeroed
 avformat/img2dec: Clear padding data after EOF
@@ -15,3 +15,11 @@ NOTICE
 ------

 - Non system dependencies (e.g. libx264, libvpx) are disabled by default.
+
+NOTICE for Package Maintainers
+------------------------------
+
+ - It is recommended to build FFmpeg twice, first with minimal external dependencies so
+   that 3rd party packages, which depend on FFmpeg's libavutil/libavfilter/libavcodec/libavformat,
+   can then be built. Then build FFmpeg again with full dependencies (which may in turn depend on
+   some of these 3rd party packages). This avoids circular dependencies during build.
@@ -1 +1 @@
-4.3.8
+4.3.9
@@ -2330,6 +2330,7 @@ HAVE_LIST="
    opencl_vaapi_intel_media
    perl
    pod2man
+    posix_ioctl
    texi2html
 "

@@ -6541,11 +6542,13 @@ perl -v            > /dev/null 2>&1 && enable perl      || disable perl
 pod2man --help     > /dev/null 2>&1 && enable pod2man   || disable pod2man
 rsync --help 2> /dev/null | grep -q 'contimeout' && enable rsync_contimeout || disable rsync_contimeout

+check_headers linux/fb.h
+check_headers linux/videodev2.h
+test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
+test_code cc sys/ioctl.h "int ioctl(int, int, ...)" && enable posix_ioctl
+
 # check V4L2 codecs available in the API
 if enabled v4l2_m2m; then
-    check_headers linux/fb.h
-    check_headers linux/videodev2.h
-    test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
    check_cc v4l2_m2m linux/videodev2.h "int i = V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M | V4L2_BUF_FLAG_LAST;"
    check_cc vc1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VC1_ANNEX_G;"
    check_cc mpeg1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG1;"
@@ -6590,7 +6593,7 @@ enabled alsa && { check_pkg_config alsa alsa "alsa/asoundlib.h" snd_pcm_htimesta
 enabled libjack &&
    require_pkg_config libjack jack jack/jack.h jack_port_get_latency_range

-enabled sndio && check_lib sndio sndio.h sio_open -lsndio
+enabled sndio && check_pkg_config sndio sndio sndio.h sio_open

 if enabled libcdio; then
    check_pkg_config libcdio libcdio_paranoia "cdio/cdda.h cdio/paranoia.h" cdio_cddap_open ||
@@ -6687,7 +6690,7 @@ enabled vulkan &&

 if enabled x86; then
    case $target_os in
-        mingw32*|mingw64*|win32|win64|linux|cygwin*)
+        freebsd|mingw32*|mingw64*|win32|win64|linux|cygwin*)
            ;;
        *)
            disable ffnvcodec cuvid nvdec nvenc
@@ -7515,7 +7518,7 @@ cat > $TMPH <<EOF
 #define FFMPEG_CONFIG_H
 #define FFMPEG_CONFIGURATION "$(c_escape $FFMPEG_CONFIGURATION)"
 #define FFMPEG_LICENSE "$(c_escape $license)"
-#define CONFIG_THIS_YEAR 2024
+#define CONFIG_THIS_YEAR 2025
 #define FFMPEG_DATADIR "$(eval c_escape $datadir)"
 #define AVCONV_DATADIR "$(eval c_escape $datadir)"
 #define CC_IDENT "$(c_escape ${cc_ident:-Unknown compiler})"
@@ -38,7 +38,7 @@ PROJECT_NAME           = FFmpeg
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 4.3.8
+PROJECT_NUMBER         = 4.3.9

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -63,4 +63,3 @@ make -j<num>
 make -k
    Continue build in case of errors, this is useful for the regression tests
    sometimes but note that it will still not run all reg tests.
-
@@ -327,6 +327,13 @@ segment index to start live streams at (negative values are from the end).
@item allowed_extensions
 ',' separated list of file extensions that hls is allowed to access.

+@item extension_picky
+This blocks disallowed extensions from probing.
+It also requires all available segments to have extensions matching the format,
+except mpegts, which is always allowed.
+It is recommended to set the whitelists correctly instead of depending on extensions.
+Enabled by default.
+
@item max_reload
 Maximum number of times a insufficient list is attempted to be reloaded.
 Default value is 1000.
@@ -762,6 +762,25 @@ In case you need finer control over how valgrind is invoked, use the
@code{--target-exec='valgrind <your_custom_valgrind_options>} option in
 your configure line instead.

+@anchor{Maintenance}
+@chapter Maintenance process
+
+@anchor{MAINTAINERS}
+@section MAINTAINERS
+
+The developers maintaining each part of the codebase are listed in @file{MAINTAINERS}.
+Being listed in @file{MAINTAINERS} gives one the right to have git write access to
+the specific repository.
+
+@anchor{Becoming a maintainer}
+@section Becoming a maintainer
+
+People add themselves to @file{MAINTAINERS} by sending a patch like any other code
+change. These get reviewed by the community like any other patch. It is expected
+that, if someone has an objection to a new maintainer, she is willing to object
+in public with her full name and is willing to take over maintainership for the area.
+
+
@anchor{Release process}
@chapter Release process

@@ -157,4 +157,3 @@ PFD[32]   would for example be signed 32 bit little-endian IEEE float
@item XVID @tab non-compliant MPEG-4 generated by old Xvid
@item XVIX @tab non-compliant MPEG-4 generated by old Xvid with interlacing bug
@end multitable
-
@@ -44,4 +44,3 @@ a+b*c;
 here the reader knows that a,b,c are meant to be signed integers but for C
 standard compliance / to avoid undefined behavior they are stored in unsigned
 ints.
-
@@ -1,3 +1,5 @@
+#!/bin/sh
+
 toupper(){
    echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
@@ -1174,7 +1174,7 @@ SKIPHEADERS-$(CONFIG_QSV)              += qsv.h qsv_internal.h
 SKIPHEADERS-$(CONFIG_QSVDEC)           += qsvdec.h
 SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h
 SKIPHEADERS-$(CONFIG_XVMC)             += xvmc.h
-SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_encode.h
+SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_hevc.h vaapi_encode.h
 SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h
 SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vt_internal.h
 SKIPHEADERS-$(CONFIG_V4L2_M2M)         += v4l2_buffers.h v4l2_context.h v4l2_m2m.h
@@ -173,6 +173,7 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
                      sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
    const int sfb_len = sfb_end - sfb_start;
    const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
+    const int n_filt = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;

    if (coef_len <= 0 || sfb_len <= 0) {
        sce->tns.present = 0;
@@ -180,16 +181,30 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
    }

    for (w = 0; w < sce->ics.num_windows; w++) {
-        float en[2] = {0.0f, 0.0f};
-        int oc_start = 0, os_start = 0;
+        float en[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        int oc_start = 0;
        int coef_start = sce->ics.swb_offset[sfb_start];

-        for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
-            FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
-            if (g > sfb_start + (sfb_len/2))
-                en[1] += band->energy;
-            else
-                en[0] += band->energy;
+        if (n_filt == 2) {
+            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+                    if (g > sfb_start + (sfb_len/2))
+                        en[1] += band->energy; /* End */
+                    else
+                        en[0] += band->energy; /* Start */
+            }
+            en[2] = en[0];
+        } else {
+            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+                    if (g > sfb_start + (sfb_len/2) + (sfb_len/4))
+                        en[2] += band->energy; /* End */
+                    else if (g > sfb_start + (sfb_len/2) - (sfb_len/4))
+                        en[1] += band->energy; /* Middle */
+                    else
+                        en[0] += band->energy; /* Start */
+            }
+            en[3] = en[0];
        }

        /* LPC */
@@ -199,15 +214,14 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
        if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
            continue;

-        tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
+        tns->n_filt[w] = n_filt;
        for (g = 0; g < tns->n_filt[w]; g++) {
-            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[!g];
-            tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start;
-            tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
+            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[g + 1];
+            tns->order[w][g] = order/tns->n_filt[w];
+            tns->length[w][g] = sfb_len/tns->n_filt[w];
            quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
                            tns->order[w][g], c_bits);
            oc_start += tns->order[w][g];
-            os_start += tns->length[w][g];
        }
        count++;
    }
@@ -592,6 +592,7 @@ static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)

    if (sbr->n_q > 5) {
        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
+        sbr->n_q = 1;
        return -1;
    }

@@ -19,130 +19,130 @@
 #include "libavutil/aarch64/asm.S"

 function ff_ps_add_squares_neon, export=1
-1:      ld1         {v0.4S,v1.4S}, [x1], #32
-        fmul        v0.4S, v0.4S, v0.4S
-        fmul        v1.4S, v1.4S, v1.4S
-        faddp       v2.4S, v0.4S, v1.4S
-        ld1         {v3.4S}, [x0]
-        fadd        v3.4S, v3.4S, v2.4S
-        st1         {v3.4S}, [x0], #16
-        subs        w2, w2, #4
-        b.gt        1b
+1:      ld1             {v0.4s,v1.4s}, [x1], #32
+        fmul            v0.4s, v0.4s, v0.4s
+        fmul            v1.4s, v1.4s, v1.4s
+        faddp           v2.4s, v0.4s, v1.4s
+        ld1             {v3.4s}, [x0]
+        fadd            v3.4s, v3.4s, v2.4s
+        st1             {v3.4s}, [x0], #16
+        subs            w2, w2, #4
+        b.gt            1b
        ret
 endfunc

 function ff_ps_mul_pair_single_neon, export=1
-1:      ld1         {v0.4S,v1.4S}, [x1], #32
-        ld1         {v2.4S},       [x2], #16
-        zip1        v3.4S, v2.4S, v2.4S
-        zip2        v4.4S, v2.4S, v2.4S
-        fmul        v0.4S, v0.4S, v3.4S
-        fmul        v1.4S, v1.4S, v4.4S
-        st1         {v0.4S,v1.4S}, [x0], #32
-        subs        w3, w3, #4
-        b.gt        1b
+1:      ld1             {v0.4s,v1.4s}, [x1], #32
+        ld1             {v2.4s},       [x2], #16
+        zip1            v3.4s, v2.4s, v2.4s
+        zip2            v4.4s, v2.4s, v2.4s
+        fmul            v0.4s, v0.4s, v3.4s
+        fmul            v1.4s, v1.4s, v4.4s
+        st1             {v0.4s,v1.4s}, [x0], #32
+        subs            w3, w3, #4
+        b.gt            1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_neon, export=1
-        ld1         {v0.4S}, [x2]
-        ld1         {v1.4S}, [x3]
-        zip1        v4.4S, v0.4S, v0.4S
-        zip2        v5.4S, v0.4S, v0.4S
-        zip1        v6.4S, v1.4S, v1.4S
-        zip2        v7.4S, v1.4S, v1.4S
-1:      ld1         {v2.2S}, [x0]
-        ld1         {v3.2S}, [x1]
-        fadd        v4.4S, v4.4S, v6.4S
-        fadd        v5.4S, v5.4S, v7.4S
-        mov         v2.D[1], v2.D[0]
-        mov         v3.D[1], v3.D[0]
-        fmul        v2.4S, v2.4S, v4.4S
-        fmla        v2.4S, v3.4S, v5.4S
-        st1         {v2.D}[0], [x0], #8
-        st1         {v2.D}[1], [x1], #8
-        subs        w4, w4, #1
-        b.gt        1b
+        ld1             {v0.4s}, [x2]
+        ld1             {v1.4s}, [x3]
+        zip1            v4.4s, v0.4s, v0.4s
+        zip2            v5.4s, v0.4s, v0.4s
+        zip1            v6.4s, v1.4s, v1.4s
+        zip2            v7.4s, v1.4s, v1.4s
+1:      ld1             {v2.2s}, [x0]
+        ld1             {v3.2s}, [x1]
+        fadd            v4.4s, v4.4s, v6.4s
+        fadd            v5.4s, v5.4s, v7.4s
+        mov             v2.d[1], v2.d[0]
+        mov             v3.d[1], v3.d[0]
+        fmul            v2.4s, v2.4s, v4.4s
+        fmla            v2.4s, v3.4s, v5.4s
+        st1             {v2.d}[0], [x0], #8
+        st1             {v2.d}[1], [x1], #8
+        subs            w4, w4, #1
+        b.gt            1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
-        ld1         {v0.4S,v1.4S}, [x2]
-        ld1         {v6.4S,v7.4S}, [x3]
-        fneg        v2.4S, v1.4S
-        fneg        v3.4S, v7.4S
-        zip1        v16.4S, v0.4S, v0.4S
-        zip2        v17.4S, v0.4S, v0.4S
-        zip1        v18.4S, v2.4S, v1.4S
-        zip2        v19.4S, v2.4S, v1.4S
-        zip1        v20.4S, v6.4S, v6.4S
-        zip2        v21.4S, v6.4S, v6.4S
-        zip1        v22.4S, v3.4S, v7.4S
-        zip2        v23.4S, v3.4S, v7.4S
-1:      ld1         {v2.2S}, [x0]
-        ld1         {v3.2S}, [x1]
-        fadd        v16.4S, v16.4S, v20.4S
-        fadd        v17.4S, v17.4S, v21.4S
-        mov         v2.D[1], v2.D[0]
-        mov         v3.D[1], v3.D[0]
-        fmul        v4.4S, v2.4S, v16.4S
-        fmla        v4.4S, v3.4S, v17.4S
-        fadd        v18.4S, v18.4S, v22.4S
-        fadd        v19.4S, v19.4S, v23.4S
-        ext         v2.16B, v2.16B, v2.16B, #4
-        ext         v3.16B, v3.16B, v3.16B, #4
-        fmla        v4.4S, v2.4S, v18.4S
-        fmla        v4.4S, v3.4S, v19.4S
-        st1         {v4.D}[0], [x0], #8
-        st1         {v4.D}[1], [x1], #8
-        subs        w4, w4, #1
-        b.gt        1b
+        ld1             {v0.4s,v1.4s}, [x2]
+        ld1             {v6.4s,v7.4s}, [x3]
+        fneg            v2.4s, v1.4s
+        fneg            v3.4s, v7.4s
+        zip1            v16.4s, v0.4s, v0.4s
+        zip2            v17.4s, v0.4s, v0.4s
+        zip1            v18.4s, v2.4s, v1.4s
+        zip2            v19.4s, v2.4s, v1.4s
+        zip1            v20.4s, v6.4s, v6.4s
+        zip2            v21.4s, v6.4s, v6.4s
+        zip1            v22.4s, v3.4s, v7.4s
+        zip2            v23.4s, v3.4s, v7.4s
+1:      ld1             {v2.2s}, [x0]
+        ld1             {v3.2s}, [x1]
+        fadd            v16.4s, v16.4s, v20.4s
+        fadd            v17.4s, v17.4s, v21.4s
+        mov             v2.d[1], v2.d[0]
+        mov             v3.d[1], v3.d[0]
+        fmul            v4.4s, v2.4s, v16.4s
+        fmla            v4.4s, v3.4s, v17.4s
+        fadd            v18.4s, v18.4s, v22.4s
+        fadd            v19.4s, v19.4s, v23.4s
+        ext             v2.16b, v2.16b, v2.16b, #4
+        ext             v3.16b, v3.16b, v3.16b, #4
+        fmla            v4.4s, v2.4s, v18.4s
+        fmla            v4.4s, v3.4s, v19.4s
+        st1             {v4.d}[0], [x0], #8
+        st1             {v4.d}[1], [x1], #8
+        subs            w4, w4, #1
+        b.gt            1b
        ret
 endfunc

 function ff_ps_hybrid_analysis_neon, export=1
-        lsl         x3, x3, #3
-        ld2         {v0.4S,v1.4S}, [x1], #32
-        ld2         {v2.2S,v3.2S}, [x1], #16
-        ld1         {v24.2S},      [x1], #8
-        ld2         {v4.2S,v5.2S}, [x1], #16
-        ld2         {v6.4S,v7.4S}, [x1]
-        rev64       v6.4S, v6.4S
-        rev64       v7.4S, v7.4S
-        ext         v6.16B, v6.16B, v6.16B, #8
-        ext         v7.16B, v7.16B, v7.16B, #8
-        rev64       v4.2S, v4.2S
-        rev64       v5.2S, v5.2S
-        mov         v2.D[1], v3.D[0]
-        mov         v4.D[1], v5.D[0]
-        mov         v5.D[1], v2.D[0]
-        mov         v3.D[1], v4.D[0]
-        fadd        v16.4S, v0.4S, v6.4S
-        fadd        v17.4S, v1.4S, v7.4S
-        fsub        v18.4S, v1.4S, v7.4S
-        fsub        v19.4S, v0.4S, v6.4S
-        fadd        v22.4S, v2.4S, v4.4S
-        fsub        v23.4S, v5.4S, v3.4S
-        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
-        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
-1:      ld2         {v2.4S,v3.4S}, [x2], #32
-        ld2         {v4.2S,v5.2S}, [x2], #16
-        ld1         {v6.2S},       [x2], #8
-        add         x2, x2, #8
-        mov         v4.D[1], v5.D[0]
-        mov         v6.S[1], v6.S[0]
-        fmul        v6.2S, v6.2S, v24.2S
-        fmul        v0.4S, v2.4S, v16.4S
-        fmul        v1.4S, v2.4S, v17.4S
-        fmls        v0.4S, v3.4S, v18.4S
-        fmla        v1.4S, v3.4S, v19.4S
-        fmla        v0.4S, v4.4S, v20.4S
-        fmla        v1.4S, v4.4S, v21.4S
-        faddp       v0.4S, v0.4S, v1.4S
-        faddp       v0.4S, v0.4S, v0.4S
-        fadd        v0.2S, v0.2S, v6.2S
-        st1         {v0.2S}, [x0], x3
-        subs        w4, w4, #1
-        b.gt        1b
+        lsl             x3, x3, #3
+        ld2             {v0.4s,v1.4s}, [x1], #32
+        ld2             {v2.2s,v3.2s}, [x1], #16
+        ld1             {v24.2s},      [x1], #8
+        ld2             {v4.2s,v5.2s}, [x1], #16
+        ld2             {v6.4s,v7.4s}, [x1]
+        rev64           v6.4s, v6.4s
+        rev64           v7.4s, v7.4s
+        ext             v6.16b, v6.16b, v6.16b, #8
+        ext             v7.16b, v7.16b, v7.16b, #8
+        rev64           v4.2s, v4.2s
+        rev64           v5.2s, v5.2s
+        mov             v2.d[1], v3.d[0]
+        mov             v4.d[1], v5.d[0]
+        mov             v5.d[1], v2.d[0]
+        mov             v3.d[1], v4.d[0]
+        fadd            v16.4s, v0.4s, v6.4s
+        fadd            v17.4s, v1.4s, v7.4s
+        fsub            v18.4s, v1.4s, v7.4s
+        fsub            v19.4s, v0.4s, v6.4s
+        fadd            v22.4s, v2.4s, v4.4s
+        fsub            v23.4s, v5.4s, v3.4s
+        trn1            v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2            v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2             {v2.4s,v3.4s}, [x2], #32
+        ld2             {v4.2s,v5.2s}, [x2], #16
+        ld1             {v6.2s},       [x2], #8
+        add             x2, x2, #8
+        mov             v4.d[1], v5.d[0]
+        mov             v6.s[1], v6.s[0]
+        fmul            v6.2s, v6.2s, v24.2s
+        fmul            v0.4s, v2.4s, v16.4s
+        fmul            v1.4s, v2.4s, v17.4s
+        fmls            v0.4s, v3.4s, v18.4s
+        fmla            v1.4s, v3.4s, v19.4s
+        fmla            v0.4s, v4.4s, v20.4s
+        fmla            v1.4s, v4.4s, v21.4s
+        faddp           v0.4s, v0.4s, v1.4s
+        faddp           v0.4s, v0.4s, v0.4s
+        fadd            v0.2s, v0.2s, v6.2s
+        st1             {v0.2s}, [x0], x3
+        subs            w4, w4, #1
+        b.gt            1b
        ret
 endfunc
@@ -353,18 +353,18 @@ function fft\n\()_neon, align=6
 endfunc
 .endm

-        def_fft    32,    16,     8
-        def_fft    64,    32,    16
-        def_fft   128,    64,    32
-        def_fft   256,   128,    64
-        def_fft   512,   256,   128
-        def_fft  1024,   512,   256
-        def_fft  2048,  1024,   512
-        def_fft  4096,  2048,  1024
-        def_fft  8192,  4096,  2048
-        def_fft 16384,  8192,  4096
-        def_fft 32768, 16384,  8192
-        def_fft 65536, 32768, 16384
+        def_fft         32,    16,     8            // operands on every line: a size, half of it, a quarter of it
+        def_fft         64,    32,    16            // NOTE(review): presumably each call emits one fft<size>_neon -- confirm in the def_fft macro
+        def_fft         128,    64,    32
+        def_fft         256,   128,    64
+        def_fft         512,   256,   128
+        def_fft         1024,   512,   256
+        def_fft         2048,  1024,   512
+        def_fft         4096,  2048,  1024
+        def_fft         8192,  4096,  2048
+        def_fft         16384,  8192,  4096
+        def_fft         32768, 16384,  8192
+        def_fft         65536, 32768, 16384         // largest size instantiated in this list

 function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
@@ -36,11 +36,11 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, UXTW
-        ld1r            {v22.8H}, [x6]
+        add             x6,  x6,  w9, uxtw
+        ld1r            {v22.8h}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8H,   #28
+        movi            v22.8h,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -53,139 +53,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v0.8B,  w4
-        dup             v1.8B,  w12
-        ld1             {v4.8B, v5.8B}, [x1], x2
-        dup             v2.8B,  w6
-        dup             v3.8B,  w7
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-1:      ld1             {v6.8B, v7.8B}, [x1], x2
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v5.8B,  v1.8B
-        ext             v7.8B,  v6.8B,  v7.8B,  #1
-        ld1             {v4.8B, v5.8B}, [x1], x2
-        umlal           v16.8H, v6.8B,  v2.8B
+        dup             v0.8b,  w4
+        dup             v1.8b,  w12
+        ld1             {v4.8b, v5.8b}, [x1], x2
+        dup             v2.8b,  w6
+        dup             v3.8b,  w7
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+1:      ld1             {v6.8b, v7.8b}, [x1], x2
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v5.8b,  v1.8b
+        ext             v7.8b,  v6.8b,  v7.8b,  #1
+        ld1             {v4.8b, v5.8b}, [x1], x2
+        umlal           v16.8h, v6.8b,  v2.8b
        prfm            pldl1strm, [x1]
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-        umlal           v16.8H, v7.8B,  v3.8B
-        umull           v17.8H, v6.8B,  v0.8B
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+        umlal           v16.8h, v7.8b,  v3.8b
+        umull           v17.8h, v6.8b,  v0.8b
        subs            w3,  w3,  #2
-        umlal           v17.8H, v7.8B, v1.8B
-        umlal           v17.8H, v4.8B, v2.8B
-        umlal           v17.8H, v5.8B, v3.8B
+        umlal           v17.8h, v7.8b, v1.8b
+        umlal           v17.8h, v4.8b, v2.8b
+        umlal           v17.8h, v5.8b, v3.8b
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v0.8B, w4
+        dup             v0.8b, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v1.8B, w12
+        dup             v1.8b, w12
        b.eq            4f

-        ld1             {v4.8B}, [x1], x2
-3:      ld1             {v6.8B}, [x1], x2
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v6.8B,  v1.8B
-        ld1             {v4.8B}, [x1], x2
-        umull           v17.8H, v6.8B,  v0.8B
-        umlal           v17.8H, v4.8B,  v1.8B
+        ld1             {v4.8b}, [x1], x2
+3:      ld1             {v6.8b}, [x1], x2
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v6.8b,  v1.8b
+        ld1             {v4.8b}, [x1], x2
+        umull           v17.8h, v6.8b,  v0.8b
+        umlal           v17.8h, v4.8b,  v1.8b
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
  .endif
        subs            w3,  w3,  #2
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8B, v5.8B}, [x1], x2
-        ld1             {v6.8B, v7.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-        ext             v7.8B,  v6.8B,  v7.8B,  #1
+4:      ld1             {v4.8b, v5.8b}, [x1], x2
+        ld1             {v6.8b, v7.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+        ext             v7.8b,  v6.8b,  v7.8b,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8H, v4.8B, v0.8B
-        umlal           v16.8H, v5.8B, v1.8B
-        umull           v17.8H, v6.8B, v0.8B
-        umlal           v17.8H, v7.8B, v1.8B
+        umull           v16.8h, v4.8b, v0.8b
+        umlal           v16.8h, v5.8b, v1.8b
+        umull           v17.8h, v6.8b, v0.8b
+        umlal           v17.8h, v7.8b, v1.8b
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.8B}, [x1], x2
-        ld1             {v5.8B}, [x1], x2
+5:      ld1             {v4.8b}, [x1], x2
+        ld1             {v5.8b}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8H, v4.8B, v0.8B
-        umull           v17.8H, v5.8B, v0.8B
+        umull           v16.8h, v4.8b, v0.8b
+        umull           v17.8h, v5.8b, v0.8b
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -206,11 +206,11 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, UXTW
-        ld1r            {v22.8H}, [x6]
+        add             x6,  x6,  w9, uxtw
+        ld1r            {v22.8h}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8H,   #28
+        movi            v22.8h,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -223,133 +223,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v24.8B,  w4
-        dup             v25.8B,  w12
-        ld1             {v4.8B}, [x1], x2
-        dup             v26.8B,  w6
-        dup             v27.8B,  w7
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        trn1            v0.2S,  v24.2S, v25.2S
-        trn1            v2.2S,  v26.2S, v27.2S
-        trn1            v4.2S,  v4.2S,  v5.2S
-1:      ld1             {v6.8B}, [x1], x2
-        ext             v7.8B,  v6.8B,  v7.8B, #1
-        trn1            v6.2S,  v6.2S,  v7.2S
-        umull           v18.8H, v4.8B,  v0.8B
-        umlal           v18.8H, v6.8B,  v2.8B
-        ld1             {v4.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        trn1            v4.2S,  v4.2S,  v5.2S
+        dup             v24.8b,  w4
+        dup             v25.8b,  w12
+        ld1             {v4.8b}, [x1], x2
+        dup             v26.8b,  w6
+        dup             v27.8b,  w7
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        trn1            v0.2s,  v24.2s, v25.2s
+        trn1            v2.2s,  v26.2s, v27.2s
+        trn1            v4.2s,  v4.2s,  v5.2s
+1:      ld1             {v6.8b}, [x1], x2
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        trn1            v6.2s,  v6.2s,  v7.2s
+        umull           v18.8h, v4.8b,  v0.8b
+        umlal           v18.8h, v6.8b,  v2.8b
+        ld1             {v4.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        trn1            v4.2s,  v4.2s,  v5.2s
        prfm            pldl1strm, [x1]
-        umull           v19.8H, v6.8B,  v0.8B
-        umlal           v19.8H, v4.8B,  v2.8B
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
+        umull           v19.8h, v6.8b,  v0.8b
+        umlal           v19.8h, v4.8b,  v2.8b
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
+        rshrn           v16.8b, v18.8h, #6
  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
  .endif
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v30.8B, w4
+        dup             v30.8b, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v31.8B, w12
-        trn1            v0.2S,  v30.2S, v31.2S
-        trn2            v1.2S,  v30.2S, v31.2S
+        dup             v31.8b, w12
+        trn1            v0.2s,  v30.2s, v31.2s
+        trn2            v1.2s,  v30.2s, v31.2s
        b.eq            4f

-        ext             v1.8B,  v0.8B,  v1.8B, #4
-        ld1             {v4.S}[0], [x1], x2
-3:      ld1             {v4.S}[1], [x1], x2
-        umull           v18.8H, v4.8B,  v0.8B
-        ld1             {v4.S}[0], [x1], x2
-        umull           v19.8H, v4.8B,  v1.8B
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
+        ext             v1.8b,  v0.8b,  v1.8b, #4
+        ld1             {v4.s}[0], [x1], x2
+3:      ld1             {v4.s}[1], [x1], x2
+        umull           v18.8h, v4.8b,  v0.8b
+        ld1             {v4.s}[0], [x1], x2
+        umull           v19.8h, v4.8b,  v1.8b
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
+        rshrn           v16.8b, v18.8h, #6
  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8B}, [x1], x2
-        ld1             {v6.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        ext             v7.8B,  v6.8B,  v7.8B, #1
-        trn1            v4.2S,  v4.2S,  v5.2S
-        trn1            v6.2S,  v6.2S,  v7.2S
-        umull           v18.8H, v4.8B,  v0.8B
-        umull           v19.8H, v6.8B,  v0.8B
+4:      ld1             {v4.8b}, [x1], x2
+        ld1             {v6.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        trn1            v4.2s,  v4.2s,  v5.2s
+        trn1            v6.2s,  v6.2s,  v7.2s
+        umull           v18.8h, v4.8b,  v0.8b
+        umull           v19.8h, v6.8b,  v0.8b
        subs            w3,  w3,  #2
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
+        rshrn           v16.8b, v18.8h, #6
  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.S}[0], [x1], x2
-        ld1             {v4.S}[1], [x1], x2
-        umull           v18.8H, v4.8B,  v30.8B
+5:      ld1             {v4.s}[0], [x1], x2
+        ld1             {v4.s}[1], [x1], x2
+        umull           v18.8h, v4.8b,  v30.8b
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
+        rshrn           v16.8b, v18.8h, #6
  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -370,51 +370,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
-        dup             v0.8B,  w4
-        dup             v2.8B,  w12
-        dup             v1.8B,  w6
-        dup             v3.8B,  w7
-        trn1            v0.4H,  v0.4H,  v2.4H
-        trn1            v1.4H,  v1.4H,  v3.4H
+        dup             v0.8b,  w4
+        dup             v2.8b,  w12
+        dup             v1.8b,  w6
+        dup             v3.8b,  w7
+        trn1            v0.4h,  v0.4h,  v2.4h
+        trn1            v1.4h,  v1.4h,  v3.4h
 1:
-        ld1             {v4.S}[0],  [x1], x2
-        ld1             {v4.S}[1],  [x1], x2
-        rev64           v5.2S,  v4.2S
-        ld1             {v5.S}[1],  [x1]
-        ext             v6.8B,  v4.8B,  v5.8B,  #1
-        ext             v7.8B,  v5.8B,  v4.8B,  #1
-        trn1            v4.4H,  v4.4H,  v6.4H
-        trn1            v5.4H,  v5.4H,  v7.4H
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v5.8B,  v1.8B
+        ld1             {v4.s}[0],  [x1], x2
+        ld1             {v4.s}[1],  [x1], x2
+        rev64           v5.2s,  v4.2s
+        ld1             {v5.s}[1],  [x1]
+        ext             v6.8b,  v4.8b,  v5.8b,  #1
+        ext             v7.8b,  v5.8b,  v4.8b,  #1
+        trn1            v4.4h,  v4.4h,  v6.4h
+        trn1            v5.4h,  v5.4h,  v7.4h
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v5.8b,  v1.8b
  .ifc \type,avg
-        ld1             {v18.H}[0], [x0], x2
-        ld1             {v18.H}[2], [x0]
+        ld1             {v18.h}[0], [x0], x2
+        ld1             {v18.h}[2], [x0]
        sub             x0,  x0,  x2
  .endif
-        rev64           v17.4S, v16.4S
-        add             v16.8H, v16.8H, v17.8H
-        rshrn           v16.8B, v16.8H, #6
+        rev64           v17.4s, v16.4s
+        add             v16.8h, v16.8h, v17.8h
+        rshrn           v16.8b, v16.8h, #6
  .ifc \type,avg
-        urhadd          v16.8B, v16.8B, v18.8B
+        urhadd          v16.8b, v16.8b, v18.8b
  .endif
-        st1             {v16.H}[0], [x0], x2
-        st1             {v16.H}[2], [x0], x2
+        st1             {v16.h}[0], [x0], x2
+        st1             {v16.h}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

 2:
-        ld1             {v16.H}[0], [x1], x2
-        ld1             {v16.H}[1], [x1], x2
+        ld1             {v16.h}[0], [x1], x2
+        ld1             {v16.h}[1], [x1], x2
  .ifc \type,avg
-        ld1             {v18.H}[0], [x0], x2
-        ld1             {v18.H}[1], [x0]
+        ld1             {v18.h}[0], [x0], x2
+        ld1             {v18.h}[1], [x0]
        sub             x0,  x0,  x2
-        urhadd          v16.8B, v16.8B, v18.8B
+        urhadd          v16.8b, v16.8b, v18.8b
  .endif
-        st1             {v16.H}[0], [x0], x2
-        st1             {v16.H}[1], [x0], x2
+        st1             {v16.h}[0], [x0], x2
+        st1             {v16.h}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
@@ -27,114 +27,114 @@
 .macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
-        mov             v6.S[0], \r
+        mov             v6.s[0], \r                 // r holds (20 << 16) | 5 here, so v6.h[0] = 5 and v6.h[1] = 20
 .endm

 //trashes v0-v5
 .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
-        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
-        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
-        uaddl           v2.8H,      v2.8B,     v3.8B
-        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
-        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
-        uaddl           v4.8H,      v4.8B,     v5.8B
-        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
-        uaddl           \d0\().8H,  \r0\().8B, v1.8B
-        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
-        mla             \d0\().8H,  v2.8H,     v6.H[1]
-        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
-        uaddl           v0.8H,      v0.8B,     v1.8B
-        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
-        mls             \d0\().8H,  v4.8H,     v6.H[0]
-        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
-        uaddl           v1.8H,      v1.8B,     v3.8B
-        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
-        uaddl           \d1\().8H,  \r2\().8B, v2.8B
-        mla             \d1\().8H,  v0.8H,     v6.H[1]
-        mls             \d1\().8H,  v1.8H,     v6.H[0]
+        ext             v2.8b,      \r0\().8b, \r1\().8b, #2       // 6-tap filter over two rows: p = r0:r1 -> d0, q = r2:r3 -> d1; this takes p bytes 2..9
+        ext             v3.8b,      \r0\().8b, \r1\().8b, #3       // p bytes 3..10
+        uaddl           v2.8h,      v2.8b,     v3.8b               // v2 = p[2] + p[3], widened to 16 bit
+        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
+        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
+        uaddl           v4.8h,      v4.8b,     v5.8b               // v4 = p[1] + p[4]
+        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
+        uaddl           \d0\().8h,  \r0\().8b, v1.8b               // d0 = p[0] + p[5]
+        ext             v0.8b,      \r2\().8b, \r3\().8b, #2       // second row (q) is interleaved with the first from here on
+        mla             \d0\().8h,  v2.8h,     v6.h[1]             // d0 += (p[2] + p[3]) * v6.h[1]  (20 if v6 came from lowpass_const)
+        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
+        uaddl           v0.8h,      v0.8b,     v1.8b               // v0 = q[2] + q[3]
+        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
+        mls             \d0\().8h,  v4.8h,     v6.h[0]             // d0 -= (p[1] + p[4]) * v6.h[0]  (5 if v6 came from lowpass_const)
+        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
+        uaddl           v1.8h,      v1.8b,     v3.8b               // v1 = q[1] + q[4]
+        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
+        uaddl           \d1\().8h,  \r2\().8b, v2.8b               // d1 = q[0] + q[5]
+        mla             \d1\().8h,  v0.8h,     v6.h[1]             // same multiply-accumulate pair as for d0
+        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
-        sqrshrun        \d0\().8B,  \d0\().8H, #5
-        sqrshrun        \d1\().8B,  \d1\().8H, #5
+        sqrshrun        \d0\().8b,  \d0\().8h, #5                  // default path: rounding shift right by 5, saturated to unsigned 8 bit
+        sqrshrun        \d1\().8b,  \d1\().8h, #5                  // with narrow=0 the 16-bit sums are left in d0/d1 instead
  .endif
 .endm

 //trashes v0-v5, v7, v30-v31
 .macro  lowpass_8H      r0,  r1
-        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
-        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
-        uaddl           v0.8H,      v0.8B,      v1.8B
-        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
-        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
-        uaddl           v2.8H,      v2.8B,      v3.8B
-        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
-        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
-        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
-        mla             \r0\().8H,  v0.8H,      v6.H[1]
-        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
-        uaddl           v4.8H,      v4.8B,      v5.8B
-        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
-        mls             \r0\().8H,  v2.8H,      v6.H[0]
-        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
-        uaddl           v7.8H,      v7.8B,      v0.8B
-        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
-        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
-        mla             \r1\().8H,  v4.8H,      v6.H[1]
-        mls             \r1\().8H,  v7.8H,      v6.H[0]
+        ext             v0.16b,     \r0\().16b, \r0\().16b, #2     // 6-tap filter, each row held in ONE 16-byte register; ext with itself rotates it
+        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
+        uaddl           v0.8h,      v0.8b,      v1.8b              // low halves only: v0 = p[2] + p[3]
+        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
+        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
+        uaddl           v2.8h,      v2.8b,      v3.8b              // v2 = p[1] + p[4]
+        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
+        uaddl           \r0\().8h,  \r0\().8b,  v30.8b             // r0 is overwritten in place with p[0] + p[5] (16 bit)
+        ext             v4.16b,     \r1\().16b, \r1\().16b, #2     // second row (q) starts here, interleaved with the first
+        mla             \r0\().8h,  v0.8h,      v6.h[1]            // r0 += (p[2] + p[3]) * v6.h[1]
+        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
+        uaddl           v4.8h,      v4.8b,      v5.8b              // v4 = q[2] + q[3]
+        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
+        mls             \r0\().8h,  v2.8h,      v6.h[0]            // r0 -= (p[1] + p[4]) * v6.h[0]
+        ext             v0.16b,     \r1\().16b, \r1\().16b, #4     // v0 is free again, its sum was consumed by the mla above
+        uaddl           v7.8h,      v7.8b,      v0.8b              // v7 = q[1] + q[4]
+        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
+        uaddl           \r1\().8h,  \r1\().8b,  v31.8b             // r1 = q[0] + q[5]
+        mla             \r1\().8h,  v4.8h,      v6.h[1]
+        mls             \r1\().8h,  v7.8h,      v6.h[0]            // no narrowing here: r0 and r1 leave as 16-bit sums
 .endm

 // trashes v2-v5, v30
 .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
-        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
-        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
-        uaddl           v2.8H,     v2.8B,     v3.8B
-        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
-        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
-        uaddl           v4.8H,     v4.8B,     v5.8B
-        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
-        uaddl           \d0\().8H, \r0\().8B, v30.8B
-        mla             \d0\().8H, v2.8H,     v6.H[1]
-        mls             \d0\().8H, v4.8H,     v6.H[0]
+        ext             v2.8b,     \r0\().8b, \r1\().8b, #2        // single-row 6-tap filter: source row is r0:r1, result goes to d0
+        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
+        uaddl           v2.8h,     v2.8b,     v3.8b                // v2 = p[2] + p[3], widened to 16 bit
+        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
+        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
+        uaddl           v4.8h,     v4.8b,     v5.8b                // v4 = p[1] + p[4]
+        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
+        uaddl           \d0\().8h, \r0\().8b, v30.8b               // d0 = p[0] + p[5]
+        mla             \d0\().8h, v2.8h,     v6.h[1]              // d0 += v2 * v6.h[1]
+        mls             \d0\().8h, v4.8h,     v6.h[0]              // d0 -= v4 * v6.h[0]
  .if \narrow
-        sqrshrun        \d0\().8B, \d0\().8H, #5
+        sqrshrun        \d0\().8b, \d0\().8h, #5                   // default path: rounding shift right by 5, saturated to unsigned 8 bit
  .endif
 .endm

 // trashed v0-v7
 .macro  lowpass_8.16    r0,  r1,  r2
-        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
-        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
-        saddl           v5.4S,      v1.4H,      v0.4H
-        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
-        saddl2          v1.4S,      v1.8H,      v0.8H
-        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
-        saddl           v6.4S,      v2.4H,      v3.4H
-        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
-        saddl2          v2.4S,      v2.8H,      v3.8H
-        saddl           v0.4S,      \r0\().4H,  \r1\().4H
-        saddl2          v4.4S,      \r0\().8H,  \r1\().8H
+        ext             v1.16b,     \r0\().16b, \r1\().16b, #4     // 6-tap filter on signed 16-bit input r0:r1, 8-bit result in r2; byte offset 4 = element 2
+        ext             v0.16b,     \r0\().16b, \r1\().16b, #6     // element 3
+        saddl           v5.4s,      v1.4h,      v0.4h              // v5 = p[2] + p[3], lanes 0-3, widened to 32 bit
+        ext             v2.16b,     \r0\().16b, \r1\().16b, #2     // element 1
+        saddl2          v1.4s,      v1.8h,      v0.8h              // v1 = p[2] + p[3], lanes 4-7
+        ext             v3.16b,     \r0\().16b, \r1\().16b, #8     // element 4
+        saddl           v6.4s,      v2.4h,      v3.4h              // v6 = p[1] + p[4], lanes 0-3; NB: this overwrites v6
+        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10    // element 5; the r1 input is replaced here
+        saddl2          v2.4s,      v2.8h,      v3.8h              // v2 = p[1] + p[4], lanes 4-7
+        saddl           v0.4s,      \r0\().4h,  \r1\().4h          // v0 = p[0] + p[5], lanes 0-3
+        saddl2          v4.4s,      \r0\().8h,  \r1\().8h          // v4 = p[0] + p[5], lanes 4-7

-        shl             v3.4S,  v5.4S,  #4
-        shl             v5.4S,  v5.4S,  #2
-        shl             v7.4S,  v6.4S,  #2
-        add             v5.4S,  v5.4S,  v3.4S
-        add             v6.4S,  v6.4S,  v7.4S
+        shl             v3.4s,  v5.4s,  #4                         // multiplies are done as shift + add: 16x
+        shl             v5.4s,  v5.4s,  #2                         // 4x
+        shl             v7.4s,  v6.4s,  #2
+        add             v5.4s,  v5.4s,  v3.4s                      // v5 = 20 * (p[2] + p[3])
+        add             v6.4s,  v6.4s,  v7.4s                      // v6 = 5 * (p[1] + p[4])

-        shl             v3.4S,  v1.4S,  #4
-        shl             v1.4S,  v1.4S,  #2
-        shl             v7.4S,  v2.4S,  #2
-        add             v1.4S,  v1.4S,  v3.4S
-        add             v2.4S,  v2.4S,  v7.4S
+        shl             v3.4s,  v1.4s,  #4                         // same scaling for lanes 4-7
+        shl             v1.4s,  v1.4s,  #2
+        shl             v7.4s,  v2.4s,  #2
+        add             v1.4s,  v1.4s,  v3.4s                      // v1 = 20 * (p[2] + p[3])
+        add             v2.4s,  v2.4s,  v7.4s                      // v2 = 5 * (p[1] + p[4])

-        add             v5.4S,  v5.4S,  v0.4S
-        sub             v5.4S,  v5.4S,  v6.4S
+        add             v5.4s,  v5.4s,  v0.4s                      // lanes 0-3: 20*(p2+p3) + (p0+p5) ...
+        sub             v5.4s,  v5.4s,  v6.4s                      // ... - 5*(p1+p4)

-        add             v1.4S,  v1.4S,  v4.4S
-        sub             v1.4S,  v1.4S,  v2.4S
+        add             v1.4s,  v1.4s,  v4.4s                      // lanes 4-7, same combination
+        sub             v1.4s,  v1.4s,  v2.4s

-        rshrn           v5.4H,  v5.4S,  #10
-        rshrn2          v5.8H,  v1.4S,  #10
+        rshrn           v5.4h,  v5.4s,  #10                        // rounding shift right by 10, narrowed to 16 bit (low half)
+        rshrn2          v5.8h,  v1.4s,  #10                        // high half of v5

-        sqxtun          \r2\().8B,  v5.8H
+        sqxtun          \r2\().8b,  v5.8h                          // saturate to unsigned 8 bit into r2
 .endm

 function put_h264_qpel16_h_lowpass_neon_packed
@@ -163,19 +163,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
 endfunc

 function \type\()_h264_qpel8_h_lowpass_neon
-1:      ld1             {v28.8B, v29.8B}, [x1], x2
-        ld1             {v16.8B, v17.8B}, [x1], x2
+1:      ld1             {v28.8b, v29.8b}, [x1], x2
+        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
-        ld1             {v2.8B},    [x0], x3
-        urhadd          v28.8B, v28.8B,  v2.8B
-        ld1             {v3.8B},    [x0]
-        urhadd          v16.8B, v16.8B, v3.8B
+        ld1             {v2.8b},    [x0], x3
+        urhadd          v28.8b, v28.8b,  v2.8b
+        ld1             {v3.8b},    [x0]
+        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3
  .endif
-        st1             {v28.8B},    [x0], x3
-        st1             {v16.8B},    [x0], x3
+        st1             {v28.8b},    [x0], x3
+        st1             {v16.8b},    [x0], x3
        b.ne            1b
        ret
 endfunc
@@ -200,23 +200,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
 endfunc

 function \type\()_h264_qpel8_h_lowpass_l2_neon
-1:      ld1             {v26.8B, v27.8B}, [x1], x2
-        ld1             {v16.8B, v17.8B}, [x1], x2
-        ld1             {v28.8B},     [x3], x2
-        ld1             {v29.8B},     [x3], x2
+1:      ld1             {v26.8b, v27.8b}, [x1], x2
+        ld1             {v16.8b, v17.8b}, [x1], x2
+        ld1             {v28.8b},     [x3], x2
+        ld1             {v29.8b},     [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
-        urhadd          v26.8B, v26.8B, v28.8B
-        urhadd          v27.8B, v27.8B, v29.8B
+        urhadd          v26.8b, v26.8b, v28.8b
+        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
-        ld1             {v2.8B},      [x0], x2
-        urhadd          v26.8B, v26.8B, v2.8B
-        ld1             {v3.8B},      [x0]
-        urhadd          v27.8B, v27.8B, v3.8B
+        ld1             {v2.8b},      [x0], x2
+        urhadd          v26.8b, v26.8b, v2.8b
+        ld1             {v3.8b},      [x0]
+        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2
  .endif
-        st1             {v26.8B},     [x0], x2
-        st1             {v27.8B},     [x0], x2
+        st1             {v26.8b},     [x0], x2
+        st1             {v27.8b},     [x0], x2
        b.ne            1b
        ret
 endfunc
@@ -257,19 +257,19 @@ function \type\()_h264_qpel16_v_lowpass_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_neon
-        ld1             {v16.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v28.8B}, [x1], x3
-        ld1             {v30.8B}, [x1], x3
-        ld1             {v17.8B}, [x1], x3
-        ld1             {v19.8B}, [x1], x3
-        ld1             {v21.8B}, [x1], x3
-        ld1             {v23.8B}, [x1], x3
-        ld1             {v25.8B}, [x1]
+        ld1             {v16.8b}, [x1], x3
+        ld1             {v18.8b}, [x1], x3
+        ld1             {v20.8b}, [x1], x3
+        ld1             {v22.8b}, [x1], x3
+        ld1             {v24.8b}, [x1], x3
+        ld1             {v26.8b}, [x1], x3
+        ld1             {v28.8b}, [x1], x3
+        ld1             {v30.8b}, [x1], x3
+        ld1             {v17.8b}, [x1], x3
+        ld1             {v19.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
+        ld1             {v23.8b}, [x1], x3
+        ld1             {v25.8b}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -280,33 +280,33 @@ function \type\()_h264_qpel8_v_lowpass_neon
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
-        ld1             {v24.8B},  [x0], x2
-        urhadd          v16.8B, v16.8B, v24.8B
-        ld1             {v25.8B}, [x0], x2
-        urhadd          v17.8B, v17.8B, v25.8B
-        ld1             {v26.8B}, [x0], x2
-        urhadd          v18.8B, v18.8B, v26.8B
-        ld1             {v27.8B}, [x0], x2
-        urhadd          v19.8B, v19.8B, v27.8B
-        ld1             {v28.8B}, [x0], x2
-        urhadd          v20.8B, v20.8B, v28.8B
-        ld1             {v29.8B}, [x0], x2
-        urhadd          v21.8B, v21.8B, v29.8B
-        ld1             {v30.8B}, [x0], x2
-        urhadd          v22.8B, v22.8B, v30.8B
-        ld1             {v31.8B}, [x0], x2
-        urhadd          v23.8B, v23.8B, v31.8B
+        ld1             {v24.8b},  [x0], x2
+        urhadd          v16.8b, v16.8b, v24.8b
+        ld1             {v25.8b}, [x0], x2
+        urhadd          v17.8b, v17.8b, v25.8b
+        ld1             {v26.8b}, [x0], x2
+        urhadd          v18.8b, v18.8b, v26.8b
+        ld1             {v27.8b}, [x0], x2
+        urhadd          v19.8b, v19.8b, v27.8b
+        ld1             {v28.8b}, [x0], x2
+        urhadd          v20.8b, v20.8b, v28.8b
+        ld1             {v29.8b}, [x0], x2
+        urhadd          v21.8b, v21.8b, v29.8b
+        ld1             {v30.8b}, [x0], x2
+        urhadd          v22.8b, v22.8b, v30.8b
+        ld1             {v31.8b}, [x0], x2
+        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
-        st1             {v18.8B}, [x0], x2
-        st1             {v19.8B}, [x0], x2
-        st1             {v20.8B}, [x0], x2
-        st1             {v21.8B}, [x0], x2
-        st1             {v22.8B}, [x0], x2
-        st1             {v23.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
+        st1             {v18.8b}, [x0], x2
+        st1             {v19.8b}, [x0], x2
+        st1             {v20.8b}, [x0], x2
+        st1             {v21.8b}, [x0], x2
+        st1             {v22.8b}, [x0], x2
+        st1             {v23.8b}, [x0], x2

        ret
 endfunc
@@ -334,19 +334,19 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_l2_neon
-        ld1             {v16.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v28.8B}, [x1], x3
-        ld1             {v30.8B}, [x1], x3
-        ld1             {v17.8B}, [x1], x3
-        ld1             {v19.8B}, [x1], x3
-        ld1             {v21.8B}, [x1], x3
-        ld1             {v23.8B}, [x1], x3
-        ld1             {v25.8B}, [x1]
+        ld1             {v16.8b}, [x1], x3
+        ld1             {v18.8b}, [x1], x3
+        ld1             {v20.8b}, [x1], x3
+        ld1             {v22.8b}, [x1], x3
+        ld1             {v24.8b}, [x1], x3
+        ld1             {v26.8b}, [x1], x3
+        ld1             {v28.8b}, [x1], x3
+        ld1             {v30.8b}, [x1], x3
+        ld1             {v17.8b}, [x1], x3
+        ld1             {v19.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
+        ld1             {v23.8b}, [x1], x3
+        ld1             {v25.8b}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -356,51 +356,51 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

-        ld1             {v24.8B},  [x12], x2
-        ld1             {v25.8B},  [x12], x2
-        ld1             {v26.8B},  [x12], x2
-        ld1             {v27.8B},  [x12], x2
-        ld1             {v28.8B},  [x12], x2
-        urhadd          v16.8B, v24.8B, v16.8B
-        urhadd          v17.8B, v25.8B, v17.8B
-        ld1             {v29.8B},  [x12], x2
-        urhadd          v18.8B, v26.8B, v18.8B
-        urhadd          v19.8B, v27.8B, v19.8B
-        ld1             {v30.8B}, [x12], x2
-        urhadd          v20.8B, v28.8B, v20.8B
-        urhadd          v21.8B, v29.8B, v21.8B
-        ld1             {v31.8B}, [x12], x2
-        urhadd          v22.8B, v30.8B, v22.8B
-        urhadd          v23.8B, v31.8B, v23.8B
+        ld1             {v24.8b},  [x12], x2
+        ld1             {v25.8b},  [x12], x2
+        ld1             {v26.8b},  [x12], x2
+        ld1             {v27.8b},  [x12], x2
+        ld1             {v28.8b},  [x12], x2
+        urhadd          v16.8b, v24.8b, v16.8b
+        urhadd          v17.8b, v25.8b, v17.8b
+        ld1             {v29.8b},  [x12], x2
+        urhadd          v18.8b, v26.8b, v18.8b
+        urhadd          v19.8b, v27.8b, v19.8b
+        ld1             {v30.8b}, [x12], x2
+        urhadd          v20.8b, v28.8b, v20.8b
+        urhadd          v21.8b, v29.8b, v21.8b
+        ld1             {v31.8b}, [x12], x2
+        urhadd          v22.8b, v30.8b, v22.8b
+        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
-        ld1             {v24.8B}, [x0], x3
-        urhadd          v16.8B, v16.8B, v24.8B
-        ld1             {v25.8B}, [x0], x3
-        urhadd          v17.8B, v17.8B, v25.8B
-        ld1             {v26.8B}, [x0], x3
-        urhadd          v18.8B, v18.8B, v26.8B
-        ld1             {v27.8B}, [x0], x3
-        urhadd          v19.8B, v19.8B, v27.8B
-        ld1             {v28.8B}, [x0], x3
-        urhadd          v20.8B, v20.8B, v28.8B
-        ld1             {v29.8B}, [x0], x3
-        urhadd          v21.8B, v21.8B, v29.8B
-        ld1             {v30.8B}, [x0], x3
-        urhadd          v22.8B, v22.8B, v30.8B
-        ld1             {v31.8B}, [x0], x3
-        urhadd          v23.8B, v23.8B, v31.8B
+        ld1             {v24.8b}, [x0], x3
+        urhadd          v16.8b, v16.8b, v24.8b
+        ld1             {v25.8b}, [x0], x3
+        urhadd          v17.8b, v17.8b, v25.8b
+        ld1             {v26.8b}, [x0], x3
+        urhadd          v18.8b, v18.8b, v26.8b
+        ld1             {v27.8b}, [x0], x3
+        urhadd          v19.8b, v19.8b, v27.8b
+        ld1             {v28.8b}, [x0], x3
+        urhadd          v20.8b, v20.8b, v28.8b
+        ld1             {v29.8b}, [x0], x3
+        urhadd          v21.8b, v21.8b, v29.8b
+        ld1             {v30.8b}, [x0], x3
+        urhadd          v22.8b, v22.8b, v30.8b
+        ld1             {v31.8b}, [x0], x3
+        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3,  lsl #3
  .endif

-        st1             {v16.8B}, [x0], x3
-        st1             {v17.8B}, [x0], x3
-        st1             {v18.8B}, [x0], x3
-        st1             {v19.8B}, [x0], x3
-        st1             {v20.8B}, [x0], x3
-        st1             {v21.8B}, [x0], x3
-        st1             {v22.8B}, [x0], x3
-        st1             {v23.8B}, [x0], x3
+        st1             {v16.8b}, [x0], x3
+        st1             {v17.8b}, [x0], x3
+        st1             {v18.8b}, [x0], x3
+        st1             {v19.8b}, [x0], x3
+        st1             {v20.8b}, [x0], x3
+        st1             {v21.8b}, [x0], x3
+        st1             {v22.8b}, [x0], x3
+        st1             {v23.8b}, [x0], x3

        ret
 endfunc
@@ -411,19 +411,19 @@ endfunc

 function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
-        ld1             {v16.8H}, [x1], x3
-        ld1             {v17.8H}, [x1], x3
-        ld1             {v18.8H}, [x1], x3
-        ld1             {v19.8H}, [x1], x3
-        ld1             {v20.8H}, [x1], x3
-        ld1             {v21.8H}, [x1], x3
-        ld1             {v22.8H}, [x1], x3
-        ld1             {v23.8H}, [x1], x3
-        ld1             {v24.8H}, [x1], x3
-        ld1             {v25.8H}, [x1], x3
-        ld1             {v26.8H}, [x1], x3
-        ld1             {v27.8H}, [x1], x3
-        ld1             {v28.8H}, [x1]
+        ld1             {v16.8h}, [x1], x3
+        ld1             {v17.8h}, [x1], x3
+        ld1             {v18.8h}, [x1], x3
+        ld1             {v19.8h}, [x1], x3
+        ld1             {v20.8h}, [x1], x3
+        ld1             {v21.8h}, [x1], x3
+        ld1             {v22.8h}, [x1], x3
+        ld1             {v23.8h}, [x1], x3
+        ld1             {v24.8h}, [x1], x3
+        ld1             {v25.8h}, [x1], x3
+        ld1             {v26.8h}, [x1], x3
+        ld1             {v27.8h}, [x1], x3
+        ld1             {v28.8h}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
@@ -447,7 +447,7 @@ function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

-        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
 endfunc
@@ -457,33 +457,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
-        ld1             {v0.8B},      [x0], x2
-        urhadd          v16.8B, v16.8B, v0.8B
-        ld1             {v1.8B},      [x0], x2
-        urhadd          v17.8B, v17.8B, v1.8B
-        ld1             {v2.8B},      [x0], x2
-        urhadd          v18.8B, v18.8B, v2.8B
-        ld1             {v3.8B},      [x0], x2
-        urhadd          v19.8B, v19.8B, v3.8B
-        ld1             {v4.8B},      [x0], x2
-        urhadd          v20.8B, v20.8B, v4.8B
-        ld1             {v5.8B},      [x0], x2
-        urhadd          v21.8B, v21.8B, v5.8B
-        ld1             {v6.8B},      [x0], x2
-        urhadd          v22.8B, v22.8B, v6.8B
-        ld1             {v7.8B},      [x0], x2
-        urhadd          v23.8B, v23.8B, v7.8B
+        ld1             {v0.8b},      [x0], x2
+        urhadd          v16.8b, v16.8b, v0.8b
+        ld1             {v1.8b},      [x0], x2
+        urhadd          v17.8b, v17.8b, v1.8b
+        ld1             {v2.8b},      [x0], x2
+        urhadd          v18.8b, v18.8b, v2.8b
+        ld1             {v3.8b},      [x0], x2
+        urhadd          v19.8b, v19.8b, v3.8b
+        ld1             {v4.8b},      [x0], x2
+        urhadd          v20.8b, v20.8b, v4.8b
+        ld1             {v5.8b},      [x0], x2
+        urhadd          v21.8b, v21.8b, v5.8b
+        ld1             {v6.8b},      [x0], x2
+        urhadd          v22.8b, v22.8b, v6.8b
+        ld1             {v7.8b},      [x0], x2
+        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8B},     [x0], x2
-        st1             {v17.8B},     [x0], x2
-        st1             {v18.8B},     [x0], x2
-        st1             {v19.8B},     [x0], x2
-        st1             {v20.8B},     [x0], x2
-        st1             {v21.8B},     [x0], x2
-        st1             {v22.8B},     [x0], x2
-        st1             {v23.8B},     [x0], x2
+        st1             {v16.8b},     [x0], x2
+        st1             {v17.8b},     [x0], x2
+        st1             {v18.8b},     [x0], x2
+        st1             {v19.8b},     [x0], x2
+        st1             {v20.8b},     [x0], x2
+        st1             {v21.8b},     [x0], x2
+        st1             {v22.8b},     [x0], x2
+        st1             {v23.8b},     [x0], x2

        ret             x10
 endfunc
@@ -497,45 +497,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

-        ld1             {v0.8B, v1.8B},  [x2], #16
-        ld1             {v2.8B, v3.8B},  [x2], #16
-        urhadd          v0.8B,  v0.8B,  v16.8B
-        urhadd          v1.8B,  v1.8B,  v17.8B
-        ld1             {v4.8B, v5.8B},  [x2], #16
-        urhadd          v2.8B,  v2.8B,  v18.8B
-        urhadd          v3.8B,  v3.8B,  v19.8B
-        ld1             {v6.8B, v7.8B},  [x2], #16
-        urhadd          v4.8B,  v4.8B,  v20.8B
-        urhadd          v5.8B,  v5.8B,  v21.8B
-        urhadd          v6.8B,  v6.8B,  v22.8B
-        urhadd          v7.8B,  v7.8B,  v23.8B
+        ld1             {v0.8b, v1.8b},  [x2], #16
+        ld1             {v2.8b, v3.8b},  [x2], #16
+        urhadd          v0.8b,  v0.8b,  v16.8b
+        urhadd          v1.8b,  v1.8b,  v17.8b
+        ld1             {v4.8b, v5.8b},  [x2], #16
+        urhadd          v2.8b,  v2.8b,  v18.8b
+        urhadd          v3.8b,  v3.8b,  v19.8b
+        ld1             {v6.8b, v7.8b},  [x2], #16
+        urhadd          v4.8b,  v4.8b,  v20.8b
+        urhadd          v5.8b,  v5.8b,  v21.8b
+        urhadd          v6.8b,  v6.8b,  v22.8b
+        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
-        ld1             {v16.8B},     [x0], x3
-        urhadd          v0.8B,  v0.8B,  v16.8B
-        ld1             {v17.8B},     [x0], x3
-        urhadd          v1.8B,  v1.8B,  v17.8B
-        ld1             {v18.8B},     [x0], x3
-        urhadd          v2.8B,  v2.8B,  v18.8B
-        ld1             {v19.8B},     [x0], x3
-        urhadd          v3.8B,  v3.8B,  v19.8B
-        ld1             {v20.8B},     [x0], x3
-        urhadd          v4.8B,  v4.8B,  v20.8B
-        ld1             {v21.8B},     [x0], x3
-        urhadd          v5.8B,  v5.8B,  v21.8B
-        ld1             {v22.8B},     [x0], x3
-        urhadd          v6.8B,  v6.8B,  v22.8B
-        ld1             {v23.8B},     [x0], x3
-        urhadd          v7.8B,  v7.8B,  v23.8B
+        ld1             {v16.8b},     [x0], x3
+        urhadd          v0.8b,  v0.8b,  v16.8b
+        ld1             {v17.8b},     [x0], x3
+        urhadd          v1.8b,  v1.8b,  v17.8b
+        ld1             {v18.8b},     [x0], x3
+        urhadd          v2.8b,  v2.8b,  v18.8b
+        ld1             {v19.8b},     [x0], x3
+        urhadd          v3.8b,  v3.8b,  v19.8b
+        ld1             {v20.8b},     [x0], x3
+        urhadd          v4.8b,  v4.8b,  v20.8b
+        ld1             {v21.8b},     [x0], x3
+        urhadd          v5.8b,  v5.8b,  v21.8b
+        ld1             {v22.8b},     [x0], x3
+        urhadd          v6.8b,  v6.8b,  v22.8b
+        ld1             {v23.8b},     [x0], x3
+        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3,  lsl #3
  .endif
-        st1             {v0.8B},      [x0], x3
-        st1             {v1.8B},      [x0], x3
-        st1             {v2.8B},      [x0], x3
-        st1             {v3.8B},      [x0], x3
-        st1             {v4.8B},      [x0], x3
-        st1             {v5.8B},      [x0], x3
-        st1             {v6.8B},      [x0], x3
-        st1             {v7.8B},      [x0], x3
+        st1             {v0.8b},      [x0], x3
+        st1             {v1.8b},      [x0], x3
+        st1             {v2.8b},      [x0], x3
+        st1             {v3.8b},      [x0], x3
+        st1             {v4.8b},      [x0], x3
+        st1             {v5.8b},      [x0], x3
+        st1             {v6.8b},      [x0], x3
+        st1             {v7.8b},      [x0], x3

        ret             x10
 endfunc
@@ -579,8 +579,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm

-        h264_qpel16_hv put
-        h264_qpel16_hv avg
+        h264_qpel16_hv  put
+        h264_qpel16_hv  avg

 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -758,8 +758,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel8 put
-        h264_qpel8 avg
+        h264_qpel8      put
+        h264_qpel8      avg

 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -930,5 +930,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel16 put
-        h264_qpel16 avg
+        h264_qpel16     put
+        h264_qpel16     avg
@@ -26,295 +26,295 @@
  .if \avg
        mov             x12, x0
  .endif
-1:      ld1             {v0.16B},  [x1], x2
-        ld1             {v1.16B},  [x1], x2
-        ld1             {v2.16B},  [x1], x2
-        ld1             {v3.16B},  [x1], x2
+1:      ld1             {v0.16b},  [x1], x2
+        ld1             {v1.16b},  [x1], x2
+        ld1             {v2.16b},  [x1], x2
+        ld1             {v3.16b},  [x1], x2
  .if \avg
-        ld1             {v4.16B},  [x12], x2
-        urhadd          v0.16B,  v0.16B,  v4.16B
-        ld1             {v5.16B},  [x12], x2
-        urhadd          v1.16B,  v1.16B,  v5.16B
-        ld1             {v6.16B},  [x12], x2
-        urhadd          v2.16B,  v2.16B,  v6.16B
-        ld1             {v7.16B},  [x12], x2
-        urhadd          v3.16B,  v3.16B,  v7.16B
+        ld1             {v4.16b},  [x12], x2
+        urhadd          v0.16b,  v0.16b,  v4.16b
+        ld1             {v5.16b},  [x12], x2
+        urhadd          v1.16b,  v1.16b,  v5.16b
+        ld1             {v6.16b},  [x12], x2
+        urhadd          v2.16b,  v2.16b,  v6.16b
+        ld1             {v7.16b},  [x12], x2
+        urhadd          v3.16b,  v3.16b,  v7.16b
  .endif
        subs            w3,  w3,  #4
-        st1             {v0.16B},  [x0], x2
-        st1             {v1.16B},  [x0], x2
-        st1             {v2.16B},  [x0], x2
-        st1             {v3.16B},  [x0], x2
+        st1             {v0.16b},  [x0], x2
+        st1             {v1.16b},  [x0], x2
+        st1             {v2.16b},  [x0], x2
+        st1             {v3.16b},  [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_x2     rnd=1, avg=0
-1:      ld1             {v0.16B, v1.16B}, [x1], x2
-        ld1             {v2.16B, v3.16B}, [x1], x2
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        ld1             {v2.16b, v3.16b}, [x1], x2
        subs            w3,  w3,  #2
-        ext             v1.16B,  v0.16B,  v1.16B,  #1
-        avg             v0.16B,  v0.16B,  v1.16B
-        ext             v3.16B,  v2.16B,  v3.16B,  #1
-        avg             v2.16B,  v2.16B,  v3.16B
+        ext             v1.16b,  v0.16b,  v1.16b,  #1
+        avg             v0.16b,  v0.16b,  v1.16b
+        ext             v3.16b,  v2.16b,  v3.16b,  #1
+        avg             v2.16b,  v2.16b,  v3.16b
  .if \avg
-        ld1             {v1.16B}, [x0], x2
-        ld1             {v3.16B}, [x0]
-        urhadd          v0.16B,  v0.16B,  v1.16B
-        urhadd          v2.16B,  v2.16B,  v3.16B
+        ld1             {v1.16b}, [x0], x2
+        ld1             {v3.16b}, [x0]
+        urhadd          v0.16b,  v0.16b,  v1.16b
+        urhadd          v2.16b,  v2.16b,  v3.16b
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.16B}, [x0], x2
-        st1             {v2.16B}, [x0], x2
+        st1             {v0.16b}, [x0], x2
+        st1             {v2.16b}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16B}, [x1], x2
-        ld1             {v1.16B}, [x1], x2
+        ld1             {v0.16b}, [x1], x2
+        ld1             {v1.16b}, [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v2.16B,  v0.16B,  v1.16B
-        ld1             {v0.16B}, [x1], x2
-        avg             v3.16B,  v0.16B,  v1.16B
-        ld1             {v1.16B}, [x1], x2
+        avg             v2.16b,  v0.16b,  v1.16b
+        ld1             {v0.16b}, [x1], x2
+        avg             v3.16b,  v0.16b,  v1.16b
+        ld1             {v1.16b}, [x1], x2
  .if \avg
-        ld1             {v4.16B}, [x0], x2
-        ld1             {v5.16B}, [x0]
-        urhadd          v2.16B,  v2.16B,  v4.16B
-        urhadd          v3.16B,  v3.16B,  v5.16B
+        ld1             {v4.16b}, [x0], x2
+        ld1             {v5.16b}, [x0]
+        urhadd          v2.16b,  v2.16b,  v4.16b
+        urhadd          v3.16b,  v3.16b,  v5.16b
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16B}, [x0], x2
-        st1             {v3.16B}, [x0], x2
+        st1             {v2.16b}, [x0], x2
+        st1             {v3.16b}, [x0], x2
        b.ne            1b

-        avg             v2.16B,  v0.16B,  v1.16B
-        ld1             {v0.16B}, [x1], x2
-        avg             v3.16B,  v0.16B,  v1.16B
+        avg             v2.16b,  v0.16b,  v1.16b
+        ld1             {v0.16b}, [x1], x2
+        avg             v3.16b,  v0.16b,  v1.16b
  .if \avg
-        ld1             {v4.16B}, [x0], x2
-        ld1             {v5.16B}, [x0]
-        urhadd          v2.16B,  v2.16B,  v4.16B
-        urhadd          v3.16B,  v3.16B,  v5.16B
+        ld1             {v4.16b}, [x0], x2
+        ld1             {v5.16b}, [x0]
+        urhadd          v2.16b,  v2.16b,  v4.16b
+        urhadd          v3.16b,  v3.16b,  v5.16b
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16B},     [x0], x2
-        st1             {v3.16B},     [x0], x2
+        st1             {v2.16b},     [x0], x2
+        st1             {v3.16b},     [x0], x2

        ret
 .endm

 .macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16B, v1.16B}, [x1], x2
-        ld1             {v4.16B, v5.16B}, [x1], x2
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        ld1             {v4.16b, v5.16b}, [x1], x2
 NRND    movi            v26.8H, #1
-        ext             v1.16B,  v0.16B,  v1.16B,  #1
-        ext             v5.16B,  v4.16B,  v5.16B,  #1
-        uaddl           v16.8H,  v0.8B,   v1.8B
-        uaddl2          v20.8H,  v0.16B,  v1.16B
-        uaddl           v18.8H,  v4.8B,   v5.8B
-        uaddl2          v22.8H,  v4.16B,  v5.16B
+        ext             v1.16b,  v0.16b,  v1.16b,  #1
+        ext             v5.16b,  v4.16b,  v5.16b,  #1
+        uaddl           v16.8h,  v0.8b,   v1.8b
+        uaddl2          v20.8h,  v0.16b,  v1.16b
+        uaddl           v18.8h,  v4.8b,   v5.8b
+        uaddl2          v22.8h,  v4.16b,  v5.16b
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16B, v1.16B}, [x1], x2
-        add             v24.8H,  v16.8H,  v18.8H
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        add             v24.8h,  v16.8h,  v18.8h
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16B, v0.16B,  v1.16B,  #1
-        add             v1.8H,   v20.8H,  v22.8H
-        mshrn           v28.8B,  v24.8H,  #2
+        ext             v30.16b, v0.16b,  v1.16b,  #1
+        add             v1.8h,   v20.8h,  v22.8h
+        mshrn           v28.8b,  v24.8h,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16B, v1.8H,   #2
+        mshrn2          v28.16b, v1.8h,   #2
  .if \avg
-        ld1             {v16.16B},        [x0]
-        urhadd          v28.16B, v28.16B, v16.16B
+        ld1             {v16.16b},        [x0]
+        urhadd          v28.16b, v28.16b, v16.16b
  .endif
-        uaddl           v16.8H,  v0.8B,   v30.8B
-        ld1             {v2.16B, v3.16B}, [x1], x2
-        uaddl2          v20.8H,  v0.16B,  v30.16B
-        st1             {v28.16B},        [x0], x2
-        add             v24.8H,  v16.8H,  v18.8H
+        uaddl           v16.8h,  v0.8b,   v30.8b
+        ld1             {v2.16b, v3.16b}, [x1], x2
+        uaddl2          v20.8h,  v0.16b,  v30.16b
+        st1             {v28.16b},        [x0], x2
+        add             v24.8h,  v16.8h,  v18.8h
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v3.16B,  v2.16B,  v3.16B,  #1
-        add             v0.8H,   v20.8H,  v22.8H
-        mshrn           v30.8B,  v24.8H,  #2
+        ext             v3.16b,  v2.16b,  v3.16b,  #1
+        add             v0.8h,   v20.8h,  v22.8h
+        mshrn           v30.8b,  v24.8h,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16B, v0.8H,   #2
+        mshrn2          v30.16b, v0.8h,   #2
  .if \avg
-        ld1             {v18.16B},        [x0]
-        urhadd          v30.16B, v30.16B, v18.16B
+        ld1             {v18.16b},        [x0]
+        urhadd          v30.16b, v30.16b, v18.16b
  .endif
-        uaddl           v18.8H,   v2.8B,  v3.8B
-        uaddl2          v22.8H,   v2.16B, v3.16B
-        st1             {v30.16B},        [x0], x2
+        uaddl           v18.8h,   v2.8b,  v3.8b
+        uaddl2          v22.8h,   v2.16b, v3.16b
+        st1             {v30.16b},        [x0], x2
        b.gt            1b

-        ld1             {v0.16B, v1.16B}, [x1], x2
-        add             v24.8H,  v16.8H,  v18.8H
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        add             v24.8h,  v16.8h,  v18.8h
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16B, v0.16B,  v1.16B,  #1
-        add             v1.8H,   v20.8H,  v22.8H
-        mshrn           v28.8B,  v24.8H,  #2
+        ext             v30.16b, v0.16b,  v1.16b,  #1
+        add             v1.8h,   v20.8h,  v22.8h
+        mshrn           v28.8b,  v24.8h,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16B, v1.8H,   #2
+        mshrn2          v28.16b, v1.8h,   #2
  .if \avg
-        ld1             {v16.16B},        [x0]
-        urhadd          v28.16B, v28.16B, v16.16B
+        ld1             {v16.16b},        [x0]
+        urhadd          v28.16b, v28.16b, v16.16b
  .endif
-        uaddl           v16.8H,  v0.8B,   v30.8B
-        uaddl2          v20.8H,  v0.16B,  v30.16B
-        st1             {v28.16B},        [x0], x2
-        add             v24.8H,  v16.8H,  v18.8H
+        uaddl           v16.8h,  v0.8b,   v30.8b
+        uaddl2          v20.8h,  v0.16b,  v30.16b
+        st1             {v28.16b},        [x0], x2
+        add             v24.8h,  v16.8h,  v18.8h
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v0.8H,   v20.8H,  v22.8H
-        mshrn           v30.8B,  v24.8H,  #2
+        add             v0.8h,   v20.8h,  v22.8h
+        mshrn           v30.8b,  v24.8h,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16B, v0.8H,   #2
+        mshrn2          v30.16b, v0.8h,   #2
  .if \avg
-        ld1             {v18.16B},        [x0]
-        urhadd          v30.16B, v30.16B, v18.16B
+        ld1             {v18.16b},        [x0]
+        urhadd          v30.16b, v30.16b, v18.16b
  .endif
-        st1             {v30.16B},        [x0], x2
+        st1             {v30.16b},        [x0], x2

        ret
 .endm

 // pixels8: each pass of loop 1 loads four 8-byte rows from [x1] and stores
 // them to [x0], both pointers post-incremented by x2, until w3 (decremented
 // by 4 per pass) reaches zero. With \avg set, every loaded row is first
 // merged with the row already at [x0] using urhadd (rounding halving add).
 // \rnd is accepted but not referenced in this body.
 // NOTE(review): presumably x0 = dst, x1 = src, x2 = line stride, w3 = row
 // count (a non-zero multiple of 4) - confirm against the callers.
 .macro  pixels8         rnd=1, avg=0
-1:      ld1             {v0.8B}, [x1], x2
-        ld1             {v1.8B}, [x1], x2
-        ld1             {v2.8B}, [x1], x2
-        ld1             {v3.8B}, [x1], x2
+1:      ld1             {v0.8b}, [x1], x2
+        ld1             {v1.8b}, [x1], x2
+        ld1             {v2.8b}, [x1], x2
+        ld1             {v3.8b}, [x1], x2
  .if \avg
-        ld1             {v4.8B}, [x0], x2
-        urhadd          v0.8B,  v0.8B,  v4.8B
-        ld1             {v5.8B}, [x0], x2
-        urhadd          v1.8B,  v1.8B,  v5.8B
-        ld1             {v6.8B}, [x0], x2
-        urhadd          v2.8B,  v2.8B,  v6.8B
-        ld1             {v7.8B}, [x0], x2
-        urhadd          v3.8B,  v3.8B,  v7.8B
+        ld1             {v4.8b}, [x0], x2
+        urhadd          v0.8b,  v0.8b,  v4.8b
+        ld1             {v5.8b}, [x0], x2
+        urhadd          v1.8b,  v1.8b,  v5.8b
+        ld1             {v6.8b}, [x0], x2
+        urhadd          v2.8b,  v2.8b,  v6.8b
+        ld1             {v7.8b}, [x0], x2
+        urhadd          v3.8b,  v3.8b,  v7.8b
        // The four loads above advanced x0 by 4*x2; step back so the stores
        // below overwrite the rows that were just read.
        sub             x0,  x0,  x2,  lsl #2
  .endif
        // Flags set here are consumed by b.ne below; st1 leaves them untouched.
        subs            w3,  w3,  #4
-        st1             {v0.8B}, [x0], x2
-        st1             {v1.8B}, [x0], x2
-        st1             {v2.8B}, [x0], x2
-        st1             {v3.8B}, [x0], x2
+        st1             {v0.8b}, [x0], x2
+        st1             {v1.8b}, [x0], x2
+        st1             {v2.8b}, [x0], x2
+        st1             {v3.8b}, [x0], x2
        b.ne            1b
        ret
 .endm

 // pixels8_x2: two rows per pass. Each row is read as 16 bytes (two 8-byte
 // registers) from [x1]; ext #1 builds the same row shifted left by one byte,
 // and the `avg` macro combines the row with its shifted copy. With \avg set
 // the result is additionally merged (urhadd) with the rows already at [x0].
 // w3 is decremented by 2 per pass and the loop ends when it reaches zero.
 // NOTE(review): `avg` is a macro defined outside this view; presumably it
 // selects a rounding or truncating halving add depending on \rnd - confirm.
 .macro  pixels8_x2      rnd=1, avg=0
-1:      ld1             {v0.8B, v1.8B}, [x1], x2
-        ext             v1.8B,  v0.8B,  v1.8B,  #1
-        ld1             {v2.8B, v3.8B}, [x1], x2
-        ext             v3.8B,  v2.8B,  v3.8B,  #1
+1:      ld1             {v0.8b, v1.8b}, [x1], x2
+        ext             v1.8b,  v0.8b,  v1.8b,  #1
+        ld1             {v2.8b, v3.8b}, [x1], x2
+        ext             v3.8b,  v2.8b,  v3.8b,  #1
        subs            w3,  w3,  #2
-        avg             v0.8B,   v0.8B,   v1.8B
-        avg             v2.8B,   v2.8B,   v3.8B
+        avg             v0.8b,   v0.8b,   v1.8b
+        avg             v2.8b,   v2.8b,   v3.8b
  .if \avg
-        ld1             {v4.8B},     [x0], x2
-        ld1             {v5.8B},     [x0]
-        urhadd          v0.8B,   v0.8B,   v4.8B
-        urhadd          v2.8B,   v2.8B,   v5.8B
+        ld1             {v4.8b},     [x0], x2
+        ld1             {v5.8b},     [x0]
+        urhadd          v0.8b,   v0.8b,   v4.8b
+        urhadd          v2.8b,   v2.8b,   v5.8b
        // Only the first of the two loads post-incremented x0; undo it.
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.8B}, [x0], x2
-        st1             {v2.8B}, [x0], x2
+        st1             {v0.8b}, [x0], x2
+        st1             {v2.8b}, [x0], x2
        b.ne            1b
        ret
 .endm

 // pixels8_y2: combines each 8-byte row from [x1] with the row below it via
 // the `avg` macro and stores the result to [x0]. Two rows are preloaded
 // (v0, v1); every loop pass emits two output rows and loads two new input
 // rows, alternating which register holds the newer row. w3 is reduced by 2
 // up front so that the final two output rows are produced after the loop,
 // where only one more input row is loaded.
 // With \avg set, results are merged (urhadd) with the rows already at [x0].
 // NOTE(review): `avg` is a macro defined outside this view - confirm its
 // rounding behaviour there.
 .macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.8B},  [x1], x2
-        ld1             {v1.8B},  [x1], x2
+        ld1             {v0.8b},  [x1], x2
+        ld1             {v1.8b},  [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v4.8B,  v0.8B,  v1.8B
-        ld1             {v0.8B},  [x1], x2
-        avg             v5.8B,  v0.8B,  v1.8B
-        ld1             {v1.8B},  [x1], x2
+        avg             v4.8b,  v0.8b,  v1.8b
+        ld1             {v0.8b},  [x1], x2
+        avg             v5.8b,  v0.8b,  v1.8b
+        ld1             {v1.8b},  [x1], x2
  .if \avg
-        ld1             {v2.8B},     [x0], x2
-        ld1             {v3.8B},     [x0]
-        urhadd          v4.8B,  v4.8B,  v2.8B
-        urhadd          v5.8B,  v5.8B,  v3.8B
+        ld1             {v2.8b},     [x0], x2
+        ld1             {v3.8b},     [x0]
+        urhadd          v4.8b,  v4.8b,  v2.8b
+        urhadd          v5.8b,  v5.8b,  v3.8b
        // Undo the single post-increment done by the first load above.
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8B},     [x0], x2
-        st1             {v5.8B},     [x0], x2
+        st1             {v4.8b},     [x0], x2
+        st1             {v5.8b},     [x0], x2
        b.ne            1b

        // Tail: last two output rows; only one further input row is read.
-        avg             v4.8B,  v0.8B,  v1.8B
-        ld1             {v0.8B},  [x1], x2
-        avg             v5.8B,  v0.8B,  v1.8B
+        avg             v4.8b,  v0.8b,  v1.8b
+        ld1             {v0.8b},  [x1], x2
+        avg             v5.8b,  v0.8b,  v1.8b
  .if \avg
-        ld1             {v2.8B},     [x0], x2
-        ld1             {v3.8B},     [x0]
-        urhadd          v4.8B,  v4.8B,  v2.8B
-        urhadd          v5.8B,  v5.8B,  v3.8B
+        ld1             {v2.8b},     [x0], x2
+        ld1             {v3.8b},     [x0]
+        urhadd          v4.8b,  v4.8b,  v2.8b
+        urhadd          v5.8b,  v5.8b,  v3.8b
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8B},     [x0], x2
-        st1             {v5.8B},     [x0], x2
+        st1             {v4.8b},     [x0], x2
+        st1             {v5.8b},     [x0], x2

        ret
 .endm

 // pixels8_xy2: for every output row, sums each pixel with its right-hand
 // neighbour in two vertically adjacent input rows (four pixels in total),
 // then narrows the 16-bit sum back to bytes with `mshrn ... #2`.
 // v16 and v17 hold the widened horizontal sums (uaddl) of the two most
 // recent input rows; each new row replaces the older of the two. Two output
 // rows are written per loop pass; w3 is reduced by 2 up front and the last
 // two rows are produced after the loop with a single additional row load.
 // With \avg set, results are merged (urhadd) with the rows already at [x0].
 // NOTE(review): `NRND` and `mshrn` are macros defined outside this view;
 // presumably NRND emits its instruction only in the no-rounding variant
 // (adding the bias of 1 held in v19) and mshrn picks a rounding or
 // truncating narrowing shift - confirm at their definitions.
 .macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16B},     [x1], x2
-        ld1             {v1.16B},     [x1], x2
+        ld1             {v0.16b},     [x1], x2
+        ld1             {v1.16b},     [x1], x2
 NRND    movi            v19.8H, #1
        // ext #1 yields the row shifted left by one byte. Its top byte comes
        // from the previous contents of v4/v6, but only the low 8 bytes are
        // consumed by the uaddl that follows.
-        ext             v4.16B,  v0.16B,  v4.16B,  #1
-        ext             v6.16B,  v1.16B,  v6.16B,  #1
-        uaddl           v16.8H,  v0.8B,  v4.8B
-        uaddl           v17.8H,  v1.8B,  v6.8B
+        ext             v4.16b,  v0.16b,  v4.16b,  #1
+        ext             v6.16b,  v1.16b,  v6.16b,  #1
+        uaddl           v16.8h,  v0.8b,  v4.8b
+        uaddl           v17.8h,  v1.8b,  v6.8b
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16B},     [x1], x2
-        add             v18.8H, v16.8H,  v17.8H
-        ext             v4.16B,  v0.16B,  v4.16B,  #1
+        ld1             {v0.16b},     [x1], x2
+        add             v18.8h, v16.8h,  v17.8h
+        ext             v4.16b,  v0.16b,  v4.16b,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8H,  v0.8B,  v4.8B
-        mshrn           v5.8B,  v18.8H, #2
-        ld1             {v1.16B},     [x1], x2
-        add             v18.8H, v16.8H,  v17.8H
+        uaddl           v16.8h,  v0.8b,  v4.8b
+        mshrn           v5.8b,  v18.8h, #2
+        ld1             {v1.16b},     [x1], x2
+        add             v18.8h, v16.8h,  v17.8h
  .if \avg
-        ld1             {v7.8B},     [x0]
-        urhadd          v5.8B,  v5.8B,  v7.8B
+        ld1             {v7.8b},     [x0]
+        urhadd          v5.8b,  v5.8b,  v7.8b
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8B},     [x0], x2
-        mshrn           v7.8B,  v18.8H, #2
+        st1             {v5.8b},     [x0], x2
+        mshrn           v7.8b,  v18.8h, #2
  .if \avg
-        ld1             {v5.8B},     [x0]
-        urhadd          v7.8B,  v7.8B,  v5.8B
+        ld1             {v5.8b},     [x0]
+        urhadd          v7.8b,  v7.8b,  v5.8b
  .endif
-        ext             v6.16B,  v1.16B,  v6.16B,  #1
-        uaddl           v17.8H,  v1.8B,   v6.8B
-        st1             {v7.8B},     [x0], x2
+        ext             v6.16b,  v1.16b,  v6.16b,  #1
+        uaddl           v17.8h,  v1.8b,   v6.8b
+        st1             {v7.8b},     [x0], x2
        // Flags come from the subs at label 1; nothing in between sets them.
        b.gt            1b

-        ld1             {v0.16B},     [x1], x2
-        add             v18.8H, v16.8H, v17.8H
-        ext             v4.16B, v0.16B, v4.16B,  #1
+        ld1             {v0.16b},     [x1], x2
+        add             v18.8h, v16.8h, v17.8h
+        ext             v4.16b, v0.16b, v4.16b,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8H,  v0.8B, v4.8B
-        mshrn           v5.8B,  v18.8H, #2
-        add             v18.8H, v16.8H, v17.8H
+        uaddl           v16.8h,  v0.8b, v4.8b
+        mshrn           v5.8b,  v18.8h, #2
+        add             v18.8h, v16.8h, v17.8h
  .if \avg
-        ld1             {v7.8B},     [x0]
-        urhadd          v5.8B,  v5.8B,  v7.8B
+        ld1             {v7.8b},     [x0]
+        urhadd          v5.8b,  v5.8b,  v7.8b
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8B},     [x0], x2
-        mshrn           v7.8B,  v18.8H, #2
+        st1             {v5.8b},     [x0], x2
+        mshrn           v7.8b,  v18.8h, #2
  .if \avg
-        ld1             {v5.8B},     [x0]
-        urhadd          v7.8B,  v7.8B,  v5.8B
+        ld1             {v5.8b},     [x0]
+        urhadd          v7.8b,  v7.8b,  v5.8b
  .endif
-        st1             {v7.8B},     [x0], x2
+        st1             {v7.8b},     [x0], x2

        ret
 .endm
@@ -19,6 +19,7 @@
 #ifndef AVCODEC_AARCH64_IDCT_H
 #define AVCODEC_AARCH64_IDCT_H

+#include <stddef.h>
 #include <stdint.h>

 void ff_simple_idct_neon(int16_t *data);
@@ -17,133 +17,133 @@
 */

 // transpose_8x8B: three rounds of trn1/trn2 over \r0-\r7 at byte (.8b),
 // halfword (.4h) and word (.2s) granularity, i.e. the usual interleave
 // network for transposing eight 8-byte rows (as the name indicates).
 // \r8 and \r9 are scratch: they are written in the first round and only
 // read afterwards. The final round writes every one of \r0-\r7.
 .macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
-        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
-        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
-        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
-        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
-        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
-        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
-        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B
+        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
+        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
+        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
+        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b

-        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
-        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
-        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
-        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
-        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
-        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
-        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
-        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H
+        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
+        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
+        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
+        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
+        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
+        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
+        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
+        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h

-        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
-        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S
+        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
+        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s

-        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
-        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S
+        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s

-        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
-        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S
+        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
+        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s

-        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
-        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
+        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
+        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
 .endm

 // transpose_8x16B: same three-round trn1/trn2 network as an 8x8 byte
 // transpose, but applied to full 128-bit registers (.16b, .8h, .4s), so both
 // 64-bit halves of \r0-\r7 are processed at once.
 // \t0 and \t1 are scratch: written in the first round, only read afterwards.
 // The final round writes every one of \r0-\r7.
 .macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-        trn1            \t0\().16B, \r0\().16B, \r1\().16B
-        trn2            \t1\().16B, \r0\().16B, \r1\().16B
-        trn1            \r1\().16B, \r2\().16B, \r3\().16B
-        trn2            \r3\().16B, \r2\().16B, \r3\().16B
-        trn1            \r0\().16B, \r4\().16B, \r5\().16B
-        trn2            \r5\().16B, \r4\().16B, \r5\().16B
-        trn1            \r2\().16B, \r6\().16B, \r7\().16B
-        trn2            \r7\().16B, \r6\().16B, \r7\().16B
+        trn1            \t0\().16b, \r0\().16b, \r1\().16b
+        trn2            \t1\().16b, \r0\().16b, \r1\().16b
+        trn1            \r1\().16b, \r2\().16b, \r3\().16b
+        trn2            \r3\().16b, \r2\().16b, \r3\().16b
+        trn1            \r0\().16b, \r4\().16b, \r5\().16b
+        trn2            \r5\().16b, \r4\().16b, \r5\().16b
+        trn1            \r2\().16b, \r6\().16b, \r7\().16b
+        trn2            \r7\().16b, \r6\().16b, \r7\().16b

-        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
-        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
-        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
-        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
-        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
-        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
-        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
-        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H
+        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
+        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
+        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
+        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
+        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
+        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
+        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h

-        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
-        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S
+        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
+        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s

-        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
-        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S
+        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
+        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s

-        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
-        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S
+        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
+        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s

-        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
-        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
+        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
 .endm

 // transpose_4x16B: two rounds of trn1/trn2 on 128-bit registers. The first
 // round interleaves bytes (.16b) of \r0-\r3 pairwise into scratch \t4-\t7;
 // the second interleaves halfwords (.8h) of the scratch registers back into
 // \r0-\r3. Each group of four bytes is thereby transposed across \r0-\r3.
 .macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
-        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
-        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
-        trn2            \t7\().16B, \r2\().16B,  \r3\().16B
+        trn1            \t4\().16b, \r0\().16b,  \r1\().16b
+        trn2            \t5\().16b, \r0\().16b,  \r1\().16b
+        trn1            \t6\().16b, \r2\().16b,  \r3\().16b
+        trn2            \t7\().16b, \r2\().16b,  \r3\().16b

-        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
-        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
-        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
-        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
+        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
+        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
+        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
+        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
 .endm

 // transpose_4x8B: 64-bit counterpart of the 4-row byte transpose. Bytes
 // (.8b) of \r0-\r3 are interleaved pairwise into scratch \t4-\t7, then
 // halfwords (.4h) of the scratch registers are interleaved back into
 // \r0-\r3.
 .macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
-        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
-        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
-        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B
+        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b

-        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
-        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
-        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
-        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
+        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
+        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
+        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
+        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
 .endm

 // transpose_4x4H: transposes four rows of four halfwords held in \r0-\r3.
 // Halfwords (.4h) are interleaved pairwise into scratch \r4-\r7, then words
 // (.2s) of the scratch registers are interleaved back into \r0-\r3.
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
-        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
-        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
-        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
-        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
-        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
-        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
-        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
-        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
+        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
+        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
+        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
+        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h
+        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
+        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
+        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
+        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
 .endm

 // transpose_8x8H: three rounds of trn1/trn2 over \r0-\r7 at halfword (.8h),
 // word (.4s) and doubleword (.2d) granularity, i.e. the interleave network
 // for transposing eight rows of eight halfwords (as the name indicates).
 // \r8 and \r9 are scratch: written in the first round, only read afterwards.
 // The final round writes every one of \r0-\r7.
 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
-        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
-        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
-        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
-        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
-        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
-        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
-        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H
+        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
+        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
+        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
+        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

-        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
-        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
-        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
-        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
-        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
-        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
-        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
-        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S
+        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
+        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
+        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
+        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
+        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
+        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
+        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s

-        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
-        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D
+        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
+        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d

-        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
-        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D
+        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d

-        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
-        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D
+        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
+        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d

-        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
-        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
+        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d

 .endm
@@ -33,81 +33,81 @@ const tab_x2, align=4
 endconst

 // ff_opus_deemphasis_neon: processes eight floats per loop pass, reading
 // from [x1] and writing to [x0] (both post-incremented by 32 bytes), until
 // w2 (decremented by 8 per pass) is no longer positive.
 // v4-v7 are loaded once from the tables tab_st, tab_x0, tab_x1 and tab_x2.
 // v0 carries state between groups of four outputs: it starts as
 // tab_st * v0.s[0] (the incoming s0) and is refreshed from the last output
 // of each pass (v2.s[3]). On exit s0 holds the last output sample written.
 // NOTE(review): presumably x0 = out, x1 = in, s0 = previous filter state,
 // w2 = sample count (multiple of 8) - confirm against the C prototype. The
 // table contents are outside this view.
 function ff_opus_deemphasis_neon, export=1
-        movrel  x4, tab_st
-        ld1    {v4.4s}, [x4]
-        movrel  x4, tab_x0
-        ld1    {v5.4s}, [x4]
-        movrel  x4, tab_x1
-        ld1    {v6.4s}, [x4]
-        movrel  x4, tab_x2
-        ld1    {v7.4s}, [x4]
+        movrel          x4, tab_st
+        ld1             {v4.4s}, [x4]
+        movrel          x4, tab_x0
+        ld1             {v5.4s}, [x4]
+        movrel          x4, tab_x1
+        ld1             {v6.4s}, [x4]
+        movrel          x4, tab_x2
+        ld1             {v7.4s}, [x4]

-        fmul v0.4s, v4.4s, v0.s[0]
+        fmul            v0.4s, v4.4s, v0.s[0]

-1:      ld1  {v1.4s, v2.4s}, [x1], #32
+1:      ld1             {v1.4s, v2.4s}, [x1], #32

        // v0 accumulates the terms for the first four samples (from v1's
        // lanes 0-2), v3 those for the second four (from v2's lanes 0-2).
-        fmla v0.4s, v5.4s, v1.s[0]
-        fmul v3.4s, v7.4s, v2.s[2]
+        fmla            v0.4s, v5.4s, v1.s[0]
+        fmul            v3.4s, v7.4s, v2.s[2]

-        fmla v0.4s, v6.4s, v1.s[1]
-        fmla v3.4s, v6.4s, v2.s[1]
+        fmla            v0.4s, v6.4s, v1.s[1]
+        fmla            v3.4s, v6.4s, v2.s[1]

-        fmla v0.4s, v7.4s, v1.s[2]
-        fmla v3.4s, v5.4s, v2.s[0]
+        fmla            v0.4s, v7.4s, v1.s[2]
+        fmla            v3.4s, v5.4s, v2.s[0]

-        fadd v1.4s, v1.4s, v0.4s
-        fadd v2.4s, v2.4s, v3.4s
+        fadd            v1.4s, v1.4s, v0.4s
+        fadd            v2.4s, v2.4s, v3.4s

        // The second group also depends on the last finished sample of the
        // first group (v1.s[3]), scaled by tab_st.
-        fmla v2.4s, v4.4s, v1.s[3]
+        fmla            v2.4s, v4.4s, v1.s[3]

-        st1  {v1.4s, v2.4s}, [x0], #32
-        fmul v0.4s, v4.4s, v2.s[3]
+        st1             {v1.4s, v2.4s}, [x0], #32
+        fmul            v0.4s, v4.4s, v2.s[3]

-        subs w2, w2, #8
-        b.gt 1b
+        subs            w2, w2, #8
+        b.gt            1b

-        mov s0, v2.s[3]
+        mov             s0, v2.s[3]

        ret
 endfunc

 // ff_opus_postfilter_neon: adds a symmetric 5-tap filter of earlier data to
 // the floats at [x0], in place, four floats per loop pass, until w3
 // (decremented by 4 per pass) is no longer positive.
 // Three coefficients are loaded from [x2] and broadcast: v0 = lane 0
 // (centre tap), v1 = lane 1 (applied to the two inner neighbours),
 // v2 = lane 2 (applied to the two outer neighbours).
 // The read pointer x1 is set to x0 - 4*(w1 + 2), i.e. (w1 + 2) floats behind
 // x0, and advances 16 bytes per pass in four 4-byte steps.
 // NOTE(review): presumably x0 = data, w1 = period, x2 = gains, w3 = length
 // (multiple of 4) - confirm against the C prototype.
 function ff_opus_postfilter_neon, export=1
-        ld1 {v0.4s}, [x2]
-        dup v1.4s, v0.s[1]
-        dup v2.4s, v0.s[2]
-        dup v0.4s, v0.s[0]
+        ld1             {v0.4s}, [x2]
+        dup             v1.4s, v0.s[1]
+        dup             v2.4s, v0.s[2]
+        dup             v0.4s, v0.s[0]

-        add w1, w1, #2
-        sub x1, x0, x1, lsl #2
+        add             w1, w1, #2
+        sub             x1, x0, x1, lsl #2

        // v3 holds the outer-tap product for the vector at the current x1;
        // at the end of each pass it is recomputed from v7 for the next one.
-        ld1 {v3.4s}, [x1]
-        fmul v3.4s, v3.4s, v2.4s
+        ld1             {v3.4s}, [x1]
+        fmul            v3.4s, v3.4s, v2.4s

        // Four overlapping 4-float windows at offsets +1..+4 floats from x1.
-1:      add x1, x1, #4
-        ld1 {v4.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v5.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v6.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v7.4s}, [x1]
+1:      add             x1, x1, #4
+        ld1             {v4.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v5.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v6.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v7.4s}, [x1]

-        fmla v3.4s, v7.4s, v2.4s
-        fadd v6.4s, v6.4s, v4.4s
+        fmla            v3.4s, v7.4s, v2.4s
+        fadd            v6.4s, v6.4s, v4.4s

-        ld1 {v4.4s}, [x0]
-        fmla v4.4s, v5.4s, v0.4s
+        ld1             {v4.4s}, [x0]
+        fmla            v4.4s, v5.4s, v0.4s

-        fmul v6.4s, v6.4s, v1.4s
-        fadd v6.4s, v6.4s, v3.4s
+        fmul            v6.4s, v6.4s, v1.4s
+        fadd            v6.4s, v6.4s, v3.4s

-        fadd v4.4s, v4.4s, v6.4s
-        fmul v3.4s, v7.4s, v2.4s
+        fadd            v4.4s, v4.4s, v6.4s
+        fmul            v3.4s, v7.4s, v2.4s

-        st1  {v4.4s}, [x0], #16
+        st1             {v4.4s}, [x0], #16

-        subs w3, w3, #4
-        b.gt 1b
+        subs            w3, w3, #4
+        b.gt            1b

        ret
 endfunc
@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64
-1:      ld1             {v0.4S}, [x0]
-        ld1             {v1.4S}, [x1], #16
-        fadd            v0.4S, v0.4S, v1.4S
-        ld1             {v2.4S}, [x2], #16
-        fadd            v0.4S, v0.4S, v2.4S
-        ld1             {v3.4S}, [x3], #16
-        fadd            v0.4S, v0.4S, v3.4S
-        ld1             {v4.4S}, [x4], #16
-        fadd            v0.4S, v0.4S, v4.4S
-        st1             {v0.4S}, [x0], #16
+1:      ld1             {v0.4s}, [x0]
+        ld1             {v1.4s}, [x1], #16
+        fadd            v0.4s, v0.4s, v1.4s
+        ld1             {v2.4s}, [x2], #16
+        fadd            v0.4s, v0.4s, v2.4s
+        ld1             {v3.4s}, [x3], #16
+        fadd            v0.4s, v0.4s, v3.4s
+        ld1             {v4.4s}, [x4], #16
+        fadd            v0.4s, v0.4s, v4.4s
+        st1             {v0.4s}, [x0], #16
        subs            x5, x5, #4
        b.gt            1b
        ret
 endfunc

 // ff_sbr_sum_square_neon: accumulates the squares of the floats at [x0].
 // Each loop pass loads four floats, adds their squares lane-wise into v0 and
 // decrements w1 by 2; the loop runs while w1 stays positive. The two faddp
 // instructions then fold the four partial sums so that every lane of v0
 // (and therefore s0) holds the total.
 // NOTE(review): w1 drops by 2 per four floats, so it presumably counts
 // pairs of floats (e.g. complex values) - confirm against the C prototype.
 function ff_sbr_sum_square_neon, export=1
-        movi            v0.4S, #0
-1:      ld1             {v1.4S}, [x0], #16
-        fmla            v0.4S, v1.4S, v1.4S
+        movi            v0.4s, #0
+1:      ld1             {v1.4s}, [x0], #16
+        fmla            v0.4s, v1.4s, v1.4s
        subs            w1, w1, #2
        b.gt            1b
-        faddp           v0.4S, v0.4S, v0.4S
-        faddp           v0.4S, v0.4S, v0.4S
+        faddp           v0.4s, v0.4s, v0.4s
+        faddp           v0.4s, v0.4s, v0.4s
        ret
 endfunc

 // ff_sbr_neg_odd_64_neon: negates every odd-indexed float of the 64 floats
 // at [x0], in place. ld2 de-interleaves eight floats into even elements
 // (v0/v2) and odd elements (v1/v3); eor with v5 (0x80000000 in every 32-bit
 // lane) flips the sign bit of the odd ones; st2 re-interleaves them through
 // x1, a copy of the original pointer that trails the read pointer x0.
 // Eight ld2/st2 pairs (two up front, six in the .rept body, stores drained
 // at the end) cover 8 * 8 = 64 floats; loads are issued ahead of the stores.
 function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0
-        movi            v5.4S, #1<<7, lsl #24
-        ld2             {v0.4S, v1.4S}, [x0], #32
-        eor             v1.16B, v1.16B, v5.16B
-        ld2             {v2.4S, v3.4S}, [x0], #32
+        movi            v5.4s, #1<<7, lsl #24
+        ld2             {v0.4s, v1.4s}, [x0], #32
+        eor             v1.16b, v1.16b, v5.16b
+        ld2             {v2.4s, v3.4s}, [x0], #32
 .rept 3
-        st2             {v0.4S, v1.4S}, [x1], #32
-        eor             v3.16B, v3.16B, v5.16B
-        ld2             {v0.4S, v1.4S}, [x0], #32
-        st2             {v2.4S, v3.4S}, [x1], #32
-        eor             v1.16B, v1.16B, v5.16B
-        ld2             {v2.4S, v3.4S}, [x0], #32
+        st2             {v0.4s, v1.4s}, [x1], #32
+        eor             v3.16b, v3.16b, v5.16b
+        ld2             {v0.4s, v1.4s}, [x0], #32
+        st2             {v2.4s, v3.4s}, [x1], #32
+        eor             v1.16b, v1.16b, v5.16b
+        ld2             {v2.4s, v3.4s}, [x0], #32
 .endr
-        eor             v3.16B, v3.16B, v5.16B
-        st2             {v0.4S, v1.4S}, [x1], #32
-        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v3.16b, v3.16b, v5.16b
+        st2             {v0.4s, v1.4s}, [x1], #32
+        st2             {v2.4s, v3.4s}, [x1], #32
        ret
 endfunc

@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x2, x0, #64*4
        mov             x3, #-16
        mov             x4, #-4
-        movi            v6.4S, #1<<7, lsl #24
-        ld1             {v0.2S}, [x0], #8
-        st1             {v0.2S}, [x2], #8
+        movi            v6.4s, #1<<7, lsl #24
+        ld1             {v0.2s}, [x0], #8
+        st1             {v0.2s}, [x2], #8
 .rept 7
-        ld1             {v1.4S}, [x1], x3
-        ld1             {v2.4S}, [x0], #16
-        eor             v1.16B, v1.16B, v6.16B
-        rev64           v1.4S, v1.4S
-        ext             v1.16B, v1.16B, v1.16B, #8
-        st2             {v1.4S, v2.4S}, [x2], #32
+        ld1             {v1.4s}, [x1], x3
+        ld1             {v2.4s}, [x0], #16
+        eor             v1.16b, v1.16b, v6.16b
+        rev64           v1.4s, v1.4s
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st2             {v1.4s, v2.4s}, [x2], #32
 .endr
        add             x1, x1, #8
-        ld1             {v1.2S}, [x1], x4
-        ld1             {v2.2S}, [x0], #8
-        ld1             {v1.S}[3], [x1]
-        ld1             {v2.S}[2], [x0]
-        eor             v1.16B, v1.16B, v6.16B
-        rev64           v1.4S, v1.4S
-        st2             {v1.2S, v2.2S}, [x2], #16
-        st2             {v1.S, v2.S}[2], [x2]
+        ld1             {v1.2s}, [x1], x4
+        ld1             {v2.2s}, [x0], #8
+        ld1             {v1.s}[3], [x1]
+        ld1             {v2.s}[2], [x0]
+        eor             v1.16b, v1.16b, v6.16b
+        rev64           v1.4s, v1.4s
+        st2             {v1.2s, v2.2s}, [x2], #16
+        st2             {v1.s, v2.s}[2], [x2]
        ret
 endfunc

@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4
        mov             x3, #-16
        mov             x4, #32
-        movi            v6.4S, #1<<7, lsl #24
-1:      ld1             {v0.4S}, [x2], x3
-        ld1             {v1.4S}, [x1], #16
-        eor             v0.16B, v0.16B, v6.16B
-        rev64           v0.4S, v0.4S
-        ext             v0.16B, v0.16B, v0.16B, #8
-        st2             {v0.4S, v1.4S}, [x0], #32
+        movi            v6.4s, #1<<7, lsl #24
+1:      ld1             {v0.4s}, [x2], x3
+        ld1             {v1.4s}, [x1], #16
+        eor             v0.16b, v0.16b, v6.16b
+        rev64           v0.4s, v0.4s
+        ext             v0.16b, v0.16b, v0.16b, #8
+        st2             {v0.4s, v1.4s}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
        add             x2, x0, #60*4
        mov             x3, #-32
        mov             x4, #32
-        movi            v2.4S, #1<<7, lsl #24
-1:      ld2             {v0.4S, v1.4S}, [x1], x3
-        eor             v0.16B, v0.16B, v2.16B
-        rev64           v1.4S, v1.4S
-        ext             v1.16B, v1.16B, v1.16B, #8
-        st1             {v0.4S}, [x2]
-        st1             {v1.4S}, [x0], #16
+        movi            v2.4s, #1<<7, lsl #24
+1:      ld2             {v0.4s, v1.4s}, [x1], x3
+        eor             v0.16b, v0.16b, v2.16b
+        rev64           v1.4s, v1.4s
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st1             {v0.4s}, [x2]
+        st1             {v1.4s}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x3, x0, #124*4
        mov             x4, #64
        mov             x5, #-16
-1:      ld1             {v0.4S}, [x1], #16
-        ld1             {v1.4S}, [x2], x5
-        rev64           v2.4S, v0.4S
-        ext             v2.16B, v2.16B, v2.16B, #8
-        rev64           v3.4S, v1.4S
-        ext             v3.16B, v3.16B, v3.16B, #8
-        fadd            v1.4S, v1.4S, v2.4S
-        fsub            v0.4S, v0.4S, v3.4S
-        st1             {v0.4S}, [x0], #16
-        st1             {v1.4S}, [x3], x5
+1:      ld1             {v0.4s}, [x1], #16
+        ld1             {v1.4s}, [x2], x5
+        rev64           v2.4s, v0.4s
+        ext             v2.16b, v2.16b, v2.16b, #8
+        rev64           v3.4s, v1.4s
+        ext             v3.16b, v3.16b, v3.16b, #8
+        fadd            v1.4s, v1.4s, v2.4s
+        fsub            v0.4s, v0.4s, v3.4s
+        st1             {v0.4s}, [x0], #16
+        st1             {v1.4s}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
-        ld1             {v7.4S}, [x6]
-        dup             v1.4S, v0.S[0]
-        mov             v2.8B, v1.8B
-        mov             v2.S[2], v7.S[0]
-        mov             v2.S[3], v7.S[0]
-        fmul            v1.4S, v1.4S, v2.4S
-        ld1             {v0.D}[0], [x3]
-        ld1             {v0.D}[1], [x2]
-        fmul            v0.4S, v0.4S, v1.4S
-        fmul            v1.4S, v0.4S, v7.4S
-        rev64           v0.4S, v0.4S
+        ld1             {v7.4s}, [x6]
+        dup             v1.4s, v0.s[0]
+        mov             v2.8b, v1.8b
+        mov             v2.s[2], v7.s[0]
+        mov             v2.s[3], v7.s[0]
+        fmul            v1.4s, v1.4s, v2.4s
+        ld1             {v0.d}[0], [x3]
+        ld1             {v0.d}[1], [x2]
+        fmul            v0.4s, v0.4s, v1.4s
+        fmul            v1.4s, v0.4s, v7.4s
+        rev64           v0.4s, v0.4s
        sub             x7, x5, x4
        add             x0, x0, x4, lsl #3
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16
-1:      ld1             {v2.4S}, [x1], #16
-        ld1             {v3.2S}, [x1]
-        fmul            v4.4S, v2.4S, v1.4S
-        fmul            v5.4S, v2.4S, v0.4S
-        faddp           v4.4S, v4.4S, v4.4S
-        faddp           v5.4S, v5.4S, v5.4S
-        faddp           v4.4S, v4.4S, v4.4S
-        faddp           v5.4S, v5.4S, v5.4S
-        mov             v4.S[1], v5.S[0]
-        fadd            v4.2S, v4.2S, v3.2S
-        st1             {v4.2S}, [x0], #8
+1:      ld1             {v2.4s}, [x1], #16
+        ld1             {v3.2s}, [x1]
+        fmul            v4.4s, v2.4s, v1.4s
+        fmul            v5.4s, v2.4s, v0.4s
+        faddp           v4.4s, v4.4s, v4.4s
+        faddp           v5.4s, v5.4s, v5.4s
+        faddp           v4.4s, v4.4s, v4.4s
+        faddp           v5.4s, v5.4s, v5.4s
+        mov             v4.s[1], v5.s[0]
+        fadd            v4.2s, v4.2s, v3.2s
+        st1             {v4.2s}, [x0], #8
        sub             x1, x1, #8
        subs            x7, x7, #1
        b.gt            1b
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x4, w4
        mov             x5, #40*2*4
        add             x1, x1, x4, lsl #3
-1:      ld1             {v0.2S}, [x1], x5
-        ld1             {v1.S}[0], [x2], #4
-        fmul            v2.4S, v0.4S, v1.S[0]
-        st1             {v2.2S}, [x0], #8
+1:      ld1             {v0.2s}, [x1], x5
+        ld1             {v1.s}[0], [x2], #4
+        fmul            v2.4s, v0.4s, v1.s[0]
+        st1             {v2.2s}, [x0], #8
        subs            x3, x3, #1
        b.gt            1b
        ret
@@ -227,46 +227,46 @@ endfunc
 function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38
        movrel          x3, factors
-        ld1             {v0.4S}, [x3]
-        movi            v1.4S, #0
-        movi            v2.4S, #0
-        movi            v3.4S, #0
-        ld1             {v4.2S}, [x0], #8
-        ld1             {v5.2S}, [x0], #8
-        fmul            v16.2S, v4.2S, v4.2S
-        fmul            v17.2S, v5.2S, v4.S[0]
-        fmul            v18.2S, v5.2S, v4.S[1]
-1:      ld1             {v5.D}[1], [x0], #8
-        fmla            v1.2S, v4.2S, v4.2S
-        fmla            v2.4S, v5.4S, v4.S[0]
-        fmla            v3.4S, v5.4S, v4.S[1]
-        mov             v4.D[0], v5.D[0]
-        mov             v5.D[0], v5.D[1]
+        ld1             {v0.4s}, [x3]
+        movi            v1.4s, #0
+        movi            v2.4s, #0
+        movi            v3.4s, #0
+        ld1             {v4.2s}, [x0], #8
+        ld1             {v5.2s}, [x0], #8
+        fmul            v16.2s, v4.2s, v4.2s
+        fmul            v17.2s, v5.2s, v4.s[0]
+        fmul            v18.2s, v5.2s, v4.s[1]
+1:      ld1             {v5.d}[1], [x0], #8
+        fmla            v1.2s, v4.2s, v4.2s
+        fmla            v2.4s, v5.4s, v4.s[0]
+        fmla            v3.4s, v5.4s, v4.s[1]
+        mov             v4.d[0], v5.d[0]
+        mov             v5.d[0], v5.d[1]
        subs            x2, x2, #1
        b.gt            1b
-        fmul            v19.2S, v4.2S, v4.2S
-        fmul            v20.2S, v5.2S, v4.S[0]
-        fmul            v21.2S, v5.2S, v4.S[1]
-        fadd            v22.4S, v2.4S, v20.4S
-        fsub            v22.4S, v22.4S, v17.4S
-        fadd            v23.4S, v3.4S, v21.4S
-        fsub            v23.4S, v23.4S, v18.4S
-        rev64           v23.4S, v23.4S
-        fmul            v23.4S, v23.4S, v0.4S
-        fadd            v22.4S, v22.4S, v23.4S
-        st1             {v22.4S}, [x1], #16
-        fadd            v23.2S, v1.2S, v19.2S
-        fsub            v23.2S, v23.2S, v16.2S
-        faddp           v23.2S, v23.2S, v23.2S
-        st1             {v23.S}[0], [x1]
+        fmul            v19.2s, v4.2s, v4.2s
+        fmul            v20.2s, v5.2s, v4.s[0]
+        fmul            v21.2s, v5.2s, v4.s[1]
+        fadd            v22.4s, v2.4s, v20.4s
+        fsub            v22.4s, v22.4s, v17.4s
+        fadd            v23.4s, v3.4s, v21.4s
+        fsub            v23.4s, v23.4s, v18.4s
+        rev64           v23.4s, v23.4s
+        fmul            v23.4s, v23.4s, v0.4s
+        fadd            v22.4s, v22.4s, v23.4s
+        st1             {v22.4s}, [x1], #16
+        fadd            v23.2s, v1.2s, v19.2s
+        fsub            v23.2s, v23.2s, v16.2s
+        faddp           v23.2s, v23.2s, v23.2s
+        st1             {v23.s}[0], [x1]
        add             x1, x1, #8
-        rev64           v3.2S, v3.2S
-        fmul            v3.2S, v3.2S, v0.2S
-        fadd            v2.2S, v2.2S, v3.2S
-        st1             {v2.2S}, [x1]
+        rev64           v3.2s, v3.2s
+        fmul            v3.2s, v3.2s, v0.2s
+        fadd            v2.2s, v2.2s, v3.2s
+        st1             {v2.2s}, [x1]
        add             x1, x1, #16
-        faddp           v1.2S, v1.2S, v1.2S
-        st1             {v1.S}[0], [x1]
+        faddp           v1.2s, v1.2s, v1.2s
+        st1             {v1.s}[0], [x1]
        ret
 endfunc

@@ -278,25 +278,25 @@ endfunc
 1:      and             x3, x3, #0x1ff
        add             x8, x7, x3, lsl #3
        add             x3, x3, #2
-        ld1             {v2.4S}, [x0]
-        ld1             {v3.2S}, [x1], #8
-        ld1             {v4.2S}, [x2], #8
-        ld1             {v5.4S}, [x8]
-        mov             v6.16B, v2.16B
-        zip1            v3.4S, v3.4S, v3.4S
-        zip1            v4.4S, v4.4S, v4.4S
-        fmla            v6.4S, v1.4S, v3.4S
-        fmla            v2.4S, v5.4S, v4.4S
-        fcmeq           v7.4S, v3.4S, #0
-        bif             v2.16B, v6.16B, v7.16B
-        st1             {v2.4S}, [x0], #16
+        ld1             {v2.4s}, [x0]
+        ld1             {v3.2s}, [x1], #8
+        ld1             {v4.2s}, [x2], #8
+        ld1             {v5.4s}, [x8]
+        mov             v6.16b, v2.16b
+        zip1            v3.4s, v3.4s, v3.4s
+        zip1            v4.4s, v4.4s, v4.4s
+        fmla            v6.4s, v1.4s, v3.4s
+        fmla            v2.4s, v5.4s, v4.4s
+        fcmeq           v7.4s, v3.4s, #0
+        bif             v2.16b, v6.16b, v7.16b
+        st1             {v2.4s}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
 .endm

 function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
 endfunc

 function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -54,7 +54,7 @@ endconst
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
-        ld1             {v0.2D}, [x3]
+        ld1             {v0.2d}, [x3]
 .endm

 .macro idct_end
@@ -74,146 +74,146 @@ endconst
 .endm

 .macro idct_col4_top y1, y2, y3, y4, i, l
-        smull\i         v7.4S,  \y3\l, z2
-        smull\i         v16.4S, \y3\l, z6
-        smull\i         v17.4S, \y2\l, z1
-        add             v19.4S, v23.4S, v7.4S
-        smull\i         v18.4S, \y2\l, z3
-        add             v20.4S, v23.4S, v16.4S
-        smull\i         v5.4S,  \y2\l, z5
-        sub             v21.4S, v23.4S, v16.4S
-        smull\i         v6.4S,  \y2\l, z7
-        sub             v22.4S, v23.4S, v7.4S
+        smull\i         v7.4s,  \y3\l, z2
+        smull\i         v16.4s, \y3\l, z6
+        smull\i         v17.4s, \y2\l, z1
+        add             v19.4s, v23.4s, v7.4s
+        smull\i         v18.4s, \y2\l, z3
+        add             v20.4s, v23.4s, v16.4s
+        smull\i         v5.4s,  \y2\l, z5
+        sub             v21.4s, v23.4s, v16.4s
+        smull\i         v6.4s,  \y2\l, z7
+        sub             v22.4s, v23.4s, v7.4s

-        smlal\i         v17.4S, \y4\l, z3
-        smlsl\i         v18.4S, \y4\l, z7
-        smlsl\i         v5.4S,  \y4\l, z1
-        smlsl\i         v6.4S,  \y4\l, z5
+        smlal\i         v17.4s, \y4\l, z3
+        smlsl\i         v18.4s, \y4\l, z7
+        smlsl\i         v5.4s,  \y4\l, z1
+        smlsl\i         v6.4s,  \y4\l, z5
 .endm

 .macro idct_row4_neon y1, y2, y3, y4, pass
-        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
-        movi            v23.4S, #1<<2, lsl #8
-        orr             v5.16B, \y1\().16B, \y2\().16B
-        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
-        orr             v6.16B, \y3\().16B, \y4\().16B
-        orr             v5.16B, v5.16B, v6.16B
-        mov             x3, v5.D[1]
-        smlal           v23.4S, \y1\().4H, z4
+        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
+        movi            v23.4s, #1<<2, lsl #8
+        orr             v5.16b, \y1\().16b, \y2\().16b
+        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
+        orr             v6.16b, \y3\().16b, \y4\().16b
+        orr             v5.16b, v5.16b, v6.16b
+        mov             x3, v5.d[1]
+        smlal           v23.4s, \y1\().4h, z4

-        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4h

        cmp             x3, #0
        b.eq            \pass\()f

-        smull2          v7.4S, \y1\().8H, z4
-        smlal2          v17.4S, \y2\().8H, z5
-        smlsl2          v18.4S, \y2\().8H, z1
-        smull2          v16.4S, \y3\().8H, z2
-        smlal2          v5.4S, \y2\().8H, z7
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v7.4S
-        sub             v21.4S, v21.4S, v7.4S
-        add             v22.4S, v22.4S, v7.4S
-        smlal2          v6.4S, \y2\().8H, z3
-        smull2          v7.4S, \y3\().8H, z6
-        smlal2          v17.4S, \y4\().8H, z7
-        smlsl2          v18.4S, \y4\().8H, z5
-        smlal2          v5.4S, \y4\().8H, z3
-        smlsl2          v6.4S, \y4\().8H, z1
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v16.4S
-        add             v21.4S, v21.4S, v16.4S
-        sub             v22.4S, v22.4S, v7.4S
+        smull2          v7.4s, \y1\().8h, z4
+        smlal2          v17.4s, \y2\().8h, z5
+        smlsl2          v18.4s, \y2\().8h, z1
+        smull2          v16.4s, \y3\().8h, z2
+        smlal2          v5.4s, \y2\().8h, z7
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v7.4s
+        sub             v21.4s, v21.4s, v7.4s
+        add             v22.4s, v22.4s, v7.4s
+        smlal2          v6.4s, \y2\().8h, z3
+        smull2          v7.4s, \y3\().8h, z6
+        smlal2          v17.4s, \y4\().8h, z7
+        smlsl2          v18.4s, \y4\().8h, z5
+        smlal2          v5.4s, \y4\().8h, z3
+        smlsl2          v6.4s, \y4\().8h, z1
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v16.4s
+        add             v21.4s, v21.4s, v16.4s
+        sub             v22.4s, v22.4s, v7.4s

 \pass:  add             \y3\().4S, v19.4S, v17.4S
-        add             \y4\().4S, v20.4S, v18.4S
-        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
-        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
-        add             v7.4S, v21.4S, v5.4S
-        add             v16.4S, v22.4S, v6.4S
-        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
-        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
-        sub             v22.4S, v22.4S, v6.4S
-        sub             v19.4S, v19.4S, v17.4S
-        sub             v21.4S, v21.4S, v5.4S
-        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
-        sub             v20.4S, v20.4S, v18.4S
-        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
-        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
-        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
+        add             \y4\().4s, v20.4s, v18.4s
+        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
+        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
+        add             v7.4s, v21.4s, v5.4s
+        add             v16.4s, v22.4s, v6.4s
+        shrn            \y3\().4h, v7.4s, #ROW_SHIFT
+        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
+        sub             v22.4s, v22.4s, v6.4s
+        sub             v19.4s, v19.4s, v17.4s
+        sub             v21.4s, v21.4s, v5.4s
+        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
+        sub             v20.4s, v20.4s, v18.4s
+        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
+        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
+        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT

-        trn1            v16.8H, \y1\().8H, \y2\().8H
-        trn2            v17.8H, \y1\().8H, \y2\().8H
-        trn1            v18.8H, \y3\().8H, \y4\().8H
-        trn2            v19.8H, \y3\().8H, \y4\().8H
-        trn1            \y1\().4S, v16.4S, v18.4S
-        trn1            \y2\().4S, v17.4S, v19.4S
-        trn2            \y3\().4S, v16.4S, v18.4S
-        trn2            \y4\().4S, v17.4S, v19.4S
+        trn1            v16.8h, \y1\().8h, \y2\().8h
+        trn2            v17.8h, \y1\().8h, \y2\().8h
+        trn1            v18.8h, \y3\().8h, \y4\().8h
+        trn2            v19.8h, \y3\().8h, \y4\().8h
+        trn1            \y1\().4s, v16.4s, v18.4s
+        trn1            \y2\().4s, v17.4s, v19.4s
+        trn2            \y3\().4s, v16.4s, v18.4s
+        trn2            \y4\().4s, v17.4s, v19.4s
 .endm

 .macro declare_idct_col4_neon i, l
 function idct_col4_neon\i
-        dup             v23.4H, z4c
+        dup             v23.4h, z4c
 .if \i == 1
-        add             v23.4H, v23.4H, v24.4H
+        add             v23.4h, v23.4h, v24.4h
 .else
-        mov             v5.D[0], v24.D[1]
-        add             v23.4H, v23.4H, v5.4H
+        mov             v5.d[0], v24.d[1]
+        add             v23.4h, v23.4h, v5.4h
 .endif
-        smull           v23.4S, v23.4H, z4
+        smull           v23.4s, v23.4h, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

-        mov             x4, v28.D[\i - 1]
-        mov             x5, v29.D[\i - 1]
+        mov             x4, v28.d[\i - 1]
+        mov             x5, v29.d[\i - 1]
        cmp             x4, #0
        b.eq            1f

-        smull\i         v7.4S,  v28\l,  z4
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v7.4S
-        sub             v21.4S, v21.4S, v7.4S
-        add             v22.4S, v22.4S, v7.4S
+        smull\i         v7.4s,  v28\l,  z4
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v7.4s
+        sub             v21.4s, v21.4s, v7.4s
+        add             v22.4s, v22.4s, v7.4s

-1:      mov             x4, v30.D[\i - 1]
+1:      mov             x4, v30.d[\i - 1]
        cmp             x5, #0
        b.eq            2f

-        smlal\i         v17.4S, v29\l, z5
-        smlsl\i         v18.4S, v29\l, z1
-        smlal\i         v5.4S,  v29\l, z7
-        smlal\i         v6.4S,  v29\l, z3
+        smlal\i         v17.4s, v29\l, z5
+        smlsl\i         v18.4s, v29\l, z1
+        smlal\i         v5.4s,  v29\l, z7
+        smlal\i         v6.4s,  v29\l, z3

-2:      mov             x5, v31.D[\i - 1]
+2:      mov             x5, v31.d[\i - 1]
        cmp             x4, #0
        b.eq            3f

-        smull\i         v7.4S,  v30\l, z6
-        smull\i         v16.4S, v30\l, z2
-        add             v19.4S, v19.4S, v7.4S
-        sub             v22.4S, v22.4S, v7.4S
-        sub             v20.4S, v20.4S, v16.4S
-        add             v21.4S, v21.4S, v16.4S
+        smull\i         v7.4s,  v30\l, z6
+        smull\i         v16.4s, v30\l, z2
+        add             v19.4s, v19.4s, v7.4s
+        sub             v22.4s, v22.4s, v7.4s
+        sub             v20.4s, v20.4s, v16.4s
+        add             v21.4s, v21.4s, v16.4s

 3:      cmp             x5, #0
        b.eq            4f

-        smlal\i         v17.4S, v31\l, z7
-        smlsl\i         v18.4S, v31\l, z5
-        smlal\i         v5.4S,  v31\l, z3
-        smlsl\i         v6.4S,  v31\l, z1
+        smlal\i         v17.4s, v31\l, z7
+        smlsl\i         v18.4s, v31\l, z5
+        smlal\i         v5.4s,  v31\l, z3
+        smlsl\i         v6.4s,  v31\l, z1

-4:      addhn           v7.4H, v19.4S, v17.4S
-        addhn2          v7.8H, v20.4S, v18.4S
-        subhn           v18.4H, v20.4S, v18.4S
-        subhn2          v18.8H, v19.4S, v17.4S
+4:      addhn           v7.4h, v19.4s, v17.4s
+        addhn2          v7.8h, v20.4s, v18.4s
+        subhn           v18.4h, v20.4s, v18.4s
+        subhn2          v18.8h, v19.4s, v17.4s

-        addhn           v16.4H, v21.4S, v5.4S
-        addhn2          v16.8H, v22.4S, v6.4S
-        subhn           v17.4H, v22.4S, v6.4S
-        subhn2          v17.8H, v21.4S, v5.4S
+        addhn           v16.4h, v21.4s, v5.4s
+        addhn2          v16.8h, v22.4s, v6.4s
+        subhn           v17.4h, v22.4s, v6.4s
+        subhn2          v17.8h, v21.4s, v5.4s

        ret
 endfunc
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
-        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
-        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
-        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
+        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
+        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
+        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
+        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
-        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
-        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
-        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
+        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
+        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
+        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
+        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16

-        zip1            v16.4S, v1.4S, v2.4S
-        zip2            v17.4S, v1.4S, v2.4S
+        zip1            v16.4s, v1.4s, v2.4s
+        zip2            v17.4s, v1.4s, v2.4s

-        st1             {v16.D}[0], [x0], x1
-        st1             {v16.D}[1], [x0], x1
+        st1             {v16.d}[0], [x0], x1
+        st1             {v16.d}[1], [x0], x1

-        zip1            v18.4S, v3.4S, v4.4S
-        zip2            v19.4S, v3.4S, v4.4S
+        zip1            v18.4s, v3.4s, v4.4s
+        zip2            v19.4s, v3.4s, v4.4s

-        st1             {v17.D}[0], [x0], x1
-        st1             {v17.D}[1], [x0], x1
-        st1             {v18.D}[0], [x0], x1
-        st1             {v18.D}[1], [x0], x1
-        st1             {v19.D}[0], [x0], x1
-        st1             {v19.D}[1], [x0], x1
+        st1             {v17.d}[0], [x0], x1
+        st1             {v17.d}[1], [x0], x1
+        st1             {v18.d}[0], [x0], x1
+        st1             {v18.d}[1], [x0], x1
+        st1             {v19.d}[0], [x0], x1
+        st1             {v19.d}[1], [x0], x1

        idct_end
 endfunc
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sshr            v1.8H, v7.8H, #COL_SHIFT-16
-        sshr            v2.8H, v16.8H, #COL_SHIFT-16
-        sshr            v3.8H, v17.8H, #COL_SHIFT-16
-        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+        sshr            v1.8h, v7.8h, #COL_SHIFT-16
+        sshr            v2.8h, v16.8h, #COL_SHIFT-16
+        sshr            v3.8h, v17.8h, #COL_SHIFT-16
+        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8H, v7.8H, #COL_SHIFT-16
-        sshr            v16.8H, v16.8H, #COL_SHIFT-16
-        sshr            v17.8H, v17.8H, #COL_SHIFT-16
-        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+        sshr            v7.8h, v7.8h, #COL_SHIFT-16
+        sshr            v16.8h, v16.8h, #COL_SHIFT-16
+        sshr            v17.8h, v17.8h, #COL_SHIFT-16
+        sshr            v18.8h, v18.8h, #COL_SHIFT-16

        mov             x9,  x0
-        ld1             {v19.D}[0], [x0], x1
-        zip1            v23.2D, v1.2D, v7.2D
-        zip2            v24.2D, v1.2D, v7.2D
-        ld1             {v19.D}[1], [x0], x1
-        zip1            v25.2D, v2.2D, v16.2D
-        zip2            v26.2D, v2.2D, v16.2D
-        ld1             {v20.D}[0], [x0], x1
-        zip1            v27.2D, v3.2D, v17.2D
-        zip2            v28.2D, v3.2D, v17.2D
-        ld1             {v20.D}[1], [x0], x1
-        zip1            v29.2D, v4.2D, v18.2D
-        zip2            v30.2D, v4.2D, v18.2D
-        ld1             {v21.D}[0], [x0], x1
-        uaddw           v23.8H, v23.8H, v19.8B
-        uaddw2          v24.8H, v24.8H, v19.16B
-        ld1             {v21.D}[1], [x0], x1
-        sqxtun          v23.8B, v23.8H
-        sqxtun2         v23.16B, v24.8H
-        ld1             {v22.D}[0], [x0], x1
-        uaddw           v24.8H, v25.8H, v20.8B
-        uaddw2          v25.8H, v26.8H, v20.16B
-        ld1             {v22.D}[1], [x0], x1
-        sqxtun          v24.8B, v24.8H
-        sqxtun2         v24.16B, v25.8H
-        st1             {v23.D}[0], [x9], x1
-        uaddw           v25.8H, v27.8H, v21.8B
-        uaddw2          v26.8H, v28.8H, v21.16B
-        st1             {v23.D}[1], [x9], x1
-        sqxtun          v25.8B, v25.8H
-        sqxtun2         v25.16B, v26.8H
-        st1             {v24.D}[0], [x9], x1
-        uaddw           v26.8H, v29.8H, v22.8B
-        uaddw2          v27.8H, v30.8H, v22.16B
-        st1             {v24.D}[1], [x9], x1
-        sqxtun          v26.8B, v26.8H
-        sqxtun2         v26.16B, v27.8H
-        st1             {v25.D}[0], [x9], x1
-        st1             {v25.D}[1], [x9], x1
-        st1             {v26.D}[0], [x9], x1
-        st1             {v26.D}[1], [x9], x1
+        ld1             {v19.d}[0], [x0], x1
+        zip1            v23.2d, v1.2d, v7.2d
+        zip2            v24.2d, v1.2d, v7.2d
+        ld1             {v19.d}[1], [x0], x1
+        zip1            v25.2d, v2.2d, v16.2d
+        zip2            v26.2d, v2.2d, v16.2d
+        ld1             {v20.d}[0], [x0], x1
+        zip1            v27.2d, v3.2d, v17.2d
+        zip2            v28.2d, v3.2d, v17.2d
+        ld1             {v20.d}[1], [x0], x1
+        zip1            v29.2d, v4.2d, v18.2d
+        zip2            v30.2d, v4.2d, v18.2d
+        ld1             {v21.d}[0], [x0], x1
+        uaddw           v23.8h, v23.8h, v19.8b
+        uaddw2          v24.8h, v24.8h, v19.16b
+        ld1             {v21.d}[1], [x0], x1
+        sqxtun          v23.8b, v23.8h
+        sqxtun2         v23.16b, v24.8h
+        ld1             {v22.d}[0], [x0], x1
+        uaddw           v24.8h, v25.8h, v20.8b
+        uaddw2          v25.8h, v26.8h, v20.16b
+        ld1             {v22.d}[1], [x0], x1
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v25.8h
+        st1             {v23.d}[0], [x9], x1
+        uaddw           v25.8h, v27.8h, v21.8b
+        uaddw2          v26.8h, v28.8h, v21.16b
+        st1             {v23.d}[1], [x9], x1
+        sqxtun          v25.8b, v25.8h
+        sqxtun2         v25.16b, v26.8h
+        st1             {v24.d}[0], [x9], x1
+        uaddw           v26.8h, v29.8h, v22.8b
+        uaddw2          v27.8h, v30.8h, v22.16b
+        st1             {v24.d}[1], [x9], x1
+        sqxtun          v26.8b, v26.8h
+        sqxtun2         v26.16b, v27.8h
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x9], x1
+        st1             {v26.d}[0], [x9], x1
+        st1             {v26.d}[1], [x9], x1

        idct_end
 endfunc
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
        sub             x2, x2, #128
        bl              idct_col4_neon1

-        sshr            v1.8H, v7.8H, #COL_SHIFT-16
-        sshr            v2.8H, v16.8H, #COL_SHIFT-16
-        sshr            v3.8H, v17.8H, #COL_SHIFT-16
-        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+        sshr            v1.8h, v7.8h, #COL_SHIFT-16
+        sshr            v2.8h, v16.8h, #COL_SHIFT-16
+        sshr            v3.8h, v17.8h, #COL_SHIFT-16
+        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8H, v7.8H, #COL_SHIFT-16
-        sshr            v16.8H, v16.8H, #COL_SHIFT-16
-        sshr            v17.8H, v17.8H, #COL_SHIFT-16
-        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+        sshr            v7.8h, v7.8h, #COL_SHIFT-16
+        sshr            v16.8h, v16.8h, #COL_SHIFT-16
+        sshr            v17.8h, v17.8h, #COL_SHIFT-16
+        sshr            v18.8h, v18.8h, #COL_SHIFT-16

-        zip1            v23.2D, v1.2D, v7.2D
-        zip2            v24.2D, v1.2D, v7.2D
-        st1             {v23.2D,v24.2D}, [x2], #32
-        zip1            v25.2D, v2.2D, v16.2D
-        zip2            v26.2D, v2.2D, v16.2D
-        st1             {v25.2D,v26.2D}, [x2], #32
-        zip1            v27.2D, v3.2D, v17.2D
-        zip2            v28.2D, v3.2D, v17.2D
-        st1             {v27.2D,v28.2D}, [x2], #32
-        zip1            v29.2D, v4.2D, v18.2D
-        zip2            v30.2D, v4.2D, v18.2D
-        st1             {v29.2D,v30.2D}, [x2], #32
+        zip1            v23.2d, v1.2d, v7.2d
+        zip2            v24.2d, v1.2d, v7.2d
+        st1             {v23.2d,v24.2d}, [x2], #32
+        zip1            v25.2d, v2.2d, v16.2d
+        zip2            v26.2d, v2.2d, v16.2d
+        st1             {v25.2d,v26.2d}, [x2], #32
+        zip1            v27.2d, v3.2d, v17.2d
+        zip2            v28.2d, v3.2d, v17.2d
+        st1             {v27.2d,v28.2d}, [x2], #32
+        zip1            v29.2d, v4.2d, v18.2d
+        zip2            v30.2d, v4.2d, v18.2d
+        st1             {v29.2d,v30.2d}, [x2], #32

        idct_end
 endfunc
@@ -330,32 +330,32 @@ endfunc
        //   v17: hev

        // convert to signed value:
-        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
-        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

-        movi           v20.8h, #3
-        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
-        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
-        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
-        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
-        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
-        mul            v19.8h, v19.8h, v20.8h
+        movi            v20.8h, #3
+        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul             v19.8h, v19.8h, v20.8h

-        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
-        movi           v22.16b, #4
-        movi           v23.16b, #3
+        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi            v22.16b, #4
+        movi            v23.16b, #3
    .if \inner
-        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
-        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
-        saddw2         v19.8h,  v19.8h, v20.16b
-        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
-        sqxtn2         v18.16b, v19.8h
+        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2          v19.8h,  v19.8h, v20.16b
+        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
-        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
-        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit
+        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
        //   P0 = s2u(PS0 + c2);

    .if \simple
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
-        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
-        and            v20.16b, v18.16b, v17.16b           // w & hev
-        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        and             v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
@@ -424,35 +424,35 @@ endfunc
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
-        movi           v17.8h,  #63
-        sshll          v22.8h,  v18.8b, #3
-        sshll2         v23.8h,  v18.16b, #3
-        saddw          v22.8h,  v22.8h, v18.8b
-        saddw2         v23.8h,  v23.8h, v18.16b
-        add            v16.8h,  v17.8h, v22.8h
-        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
-        add            v19.8h,  v16.8h, v22.8h
-        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
-        add            v22.8h,  v19.8h, v22.8h
-        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
-        sqshrn         v16.8b,  v16.8h,  #7
-        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
-        sqshrn         v19.8b,  v19.8h, #7
-        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
-        sqshrn         v22.8b,  v22.8h, #7
-        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
-        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
-        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
-        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
-        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
-        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
-        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
-        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
-        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
-        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+        movi            v17.8h,  #63
+        sshll           v22.8h,  v18.8b, #3
+        sshll2          v23.8h,  v18.16b, #3
+        saddw           v22.8h,  v22.8h, v18.8b
+        saddw2          v23.8h,  v23.8h, v18.16b
+        add             v16.8h,  v17.8h, v22.8h
+        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add             v19.8h,  v16.8h, v22.8h
+        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add             v22.8h,  v19.8h, v22.8h
+        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn          v16.8b,  v16.8h,  #7
+        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn          v19.8b,  v19.8h, #7
+        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn          v22.8b,  v22.8h, #7
+        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
 .endm

@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2  // P3
-        ld1          {v0.d}[1],     [x1], x2  // P3
-        ld1          {v1.d}[0],     [x0], x2  // P2
-        ld1          {v1.d}[1],     [x1], x2  // P2
-        ld1          {v2.d}[0],     [x0], x2  // P1
-        ld1          {v2.d}[1],     [x1], x2  // P1
-        ld1          {v3.d}[0],     [x0], x2  // P0
-        ld1          {v3.d}[1],     [x1], x2  // P0
-        ld1          {v4.d}[0],     [x0], x2  // Q0
-        ld1          {v4.d}[1],     [x1], x2  // Q0
-        ld1          {v5.d}[0],     [x0], x2  // Q1
-        ld1          {v5.d}[1],     [x1], x2  // Q1
-        ld1          {v6.d}[0],     [x0], x2  // Q2
-        ld1          {v6.d}[1],     [x1], x2  // Q2
-        ld1          {v7.d}[0],     [x0]      // Q3
-        ld1          {v7.d}[1],     [x1]      // Q3
+        ld1             {v0.d}[0],     [x0], x2  // P3
+        ld1             {v0.d}[1],     [x1], x2  // P3
+        ld1             {v1.d}[0],     [x0], x2  // P2
+        ld1             {v1.d}[1],     [x1], x2  // P2
+        ld1             {v2.d}[0],     [x0], x2  // P1
+        ld1             {v2.d}[1],     [x1], x2  // P1
+        ld1             {v3.d}[0],     [x0], x2  // P0
+        ld1             {v3.d}[1],     [x1], x2  // P0
+        ld1             {v4.d}[0],     [x0], x2  // Q0
+        ld1             {v4.d}[1],     [x1], x2  // Q0
+        ld1             {v5.d}[0],     [x0], x2  // Q1
+        ld1             {v5.d}[1],     [x1], x2  // Q1
+        ld1             {v6.d}[0],     [x0], x2  // Q2
+        ld1             {v6.d}[1],     [x1], x2  // Q2
+        ld1             {v7.d}[0],     [x0]      // Q3
+        ld1             {v7.d}[1],     [x1]      // Q3

-        dup          v22.16b, w3                 // flim_E
-        dup          v23.16b, w4                 // flim_I
+        dup             v22.16b, w3                 // flim_E
+        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
-        sub          x0,  x0,  x2,  lsl #2
-        sub          x1,  x1,  x2,  lsl #2
-        sub          x0,  x0,  x2,  lsl #1
-        sub          x1,  x1,  x2,  lsl #1
+        sub             x0,  x0,  x2,  lsl #2
+        sub             x1,  x1,  x2,  lsl #2
+        sub             x0,  x0,  x2,  lsl #1
+        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:

-        st1          {v1.d}[0],     [x0], x2  // P2
-        st1          {v1.d}[1],     [x1], x2  // P2
-        st1          {v2.d}[0],     [x0], x2  // P1
-        st1          {v2.d}[1],     [x1], x2  // P1
-        st1          {v3.d}[0],     [x0], x2  // P0
-        st1          {v3.d}[1],     [x1], x2  // P0
-        st1          {v4.d}[0],     [x0], x2  // Q0
-        st1          {v4.d}[1],     [x1], x2  // Q0
-        st1          {v5.d}[0],     [x0], x2  // Q1
-        st1          {v5.d}[1],     [x1], x2  // Q1
-        st1          {v6.d}[0],     [x0]      // Q2
-        st1          {v6.d}[1],     [x1]      // Q2
+        st1             {v1.d}[0],     [x0], x2  // P2
+        st1             {v1.d}[1],     [x1], x2  // P2
+        st1             {v2.d}[0],     [x0], x2  // P1
+        st1             {v2.d}[1],     [x1], x2  // P1
+        st1             {v3.d}[0],     [x0], x2  // P0
+        st1             {v3.d}[1],     [x1], x2  // P0
+        st1             {v4.d}[0],     [x0], x2  // Q0
+        st1             {v4.d}[1],     [x1], x2  // Q0
+        st1             {v5.d}[0],     [x0], x2  // Q1
+        st1             {v5.d}[1],     [x1], x2  // Q1
+        st1             {v6.d}[0],     [x0]      // Q2
+        st1             {v6.d}[1],     [x1]      // Q2

        ret
 endfunc
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x1,  x1,  #4

        // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2 // load u
-        ld1          {v0.d}[1],     [x1], x2 // load v
-        ld1          {v1.d}[0],     [x0], x2
-        ld1          {v1.d}[1],     [x1], x2
-        ld1          {v2.d}[0],     [x0], x2
-        ld1          {v2.d}[1],     [x1], x2
-        ld1          {v3.d}[0],     [x0], x2
-        ld1          {v3.d}[1],     [x1], x2
-        ld1          {v4.d}[0],     [x0], x2
-        ld1          {v4.d}[1],     [x1], x2
-        ld1          {v5.d}[0],     [x0], x2
-        ld1          {v5.d}[1],     [x1], x2
-        ld1          {v6.d}[0],     [x0], x2
-        ld1          {v6.d}[1],     [x1], x2
-        ld1          {v7.d}[0],     [x0], x2
-        ld1          {v7.d}[1],     [x1], x2
+        ld1             {v0.d}[0],     [x0], x2 // load u
+        ld1             {v0.d}[1],     [x1], x2 // load v
+        ld1             {v1.d}[0],     [x0], x2
+        ld1             {v1.d}[1],     [x1], x2
+        ld1             {v2.d}[0],     [x0], x2
+        ld1             {v2.d}[1],     [x1], x2
+        ld1             {v3.d}[0],     [x0], x2
+        ld1             {v3.d}[1],     [x1], x2
+        ld1             {v4.d}[0],     [x0], x2
+        ld1             {v4.d}[1],     [x1], x2
+        ld1             {v5.d}[0],     [x0], x2
+        ld1             {v5.d}[1],     [x1], x2
+        ld1             {v6.d}[0],     [x0], x2
+        ld1             {v6.d}[1],     [x1], x2
+        ld1             {v7.d}[0],     [x0], x2
+        ld1             {v7.d}[1],     [x1], x2

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
-        st1          {v0.d}[0],     [x0], x2 // load u
-        st1          {v0.d}[1],     [x1], x2 // load v
-        st1          {v1.d}[0],     [x0], x2
-        st1          {v1.d}[1],     [x1], x2
-        st1          {v2.d}[0],     [x0], x2
-        st1          {v2.d}[1],     [x1], x2
-        st1          {v3.d}[0],     [x0], x2
-        st1          {v3.d}[1],     [x1], x2
-        st1          {v4.d}[0],     [x0], x2
-        st1          {v4.d}[1],     [x1], x2
-        st1          {v5.d}[0],     [x0], x2
-        st1          {v5.d}[1],     [x1], x2
-        st1          {v6.d}[0],     [x0], x2
-        st1          {v6.d}[1],     [x1], x2
-        st1          {v7.d}[0],     [x0]
-        st1          {v7.d}[1],     [x1]
+        st1             {v0.d}[0],     [x0], x2 // store u
+        st1             {v0.d}[1],     [x1], x2 // store v
+        st1             {v1.d}[0],     [x0], x2
+        st1             {v1.d}[1],     [x1], x2
+        st1             {v2.d}[0],     [x0], x2
+        st1             {v2.d}[1],     [x1], x2
+        st1             {v3.d}[0],     [x0], x2
+        st1             {v3.d}[1],     [x1], x2
+        st1             {v4.d}[0],     [x0], x2
+        st1             {v4.d}[1],     [x1], x2
+        st1             {v5.d}[0],     [x0], x2
+        st1             {v5.d}[1],     [x1], x2
+        st1             {v6.d}[0],     [x0], x2
+        st1             {v6.d}[1],     [x1], x2
+        st1             {v7.d}[0],     [x0]
+        st1             {v7.d}[1],     [x1]

        ret

@@ -230,6 +230,9 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        // reduced dst stride
 .if \size >= 16
        sub             x1,  x1,  x5
+.elseif \size == 4
+        add             x12, x2,  #8
+        add             x13, x7,  #8
 .endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
@@ -248,9 +251,14 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
-.else
+.elseif \size == 8
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
+.else // \size == 4
+        ld1             {v4.8b},  [x2]
+        ld1             {v16.8b}, [x7]
+        ld1             {v5.s}[0],  [x12], x3
+        ld1             {v17.s}[0], [x13], x3
 .endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
@@ -470,8 +470,7 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
        }
    }

-    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
-                                * sizeof(*ctx->slice_data));
+    ctx->slice_data = av_calloc(ctx->slice_width, AIC_BAND_COEFFS * sizeof(*ctx->slice_data));
    if (!ctx->slice_data) {
        av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");

@@ -2116,8 +2116,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
        ctx->nbits  = av_malloc_array(ctx->cur_frame_length, sizeof(*ctx->nbits));
        ctx->mlz    = av_mallocz(sizeof(*ctx->mlz));

-        if (!ctx->mlz || !ctx->acf || !ctx->shift_value || !ctx->last_shift_value
-            || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
+        if (!ctx->larray || !ctx->nbits || !ctx->mlz || !ctx->acf || !ctx->shift_value
+            || !ctx->last_shift_value || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
            av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
            ret = AVERROR(ENOMEM);
            goto fail;
@@ -2128,6 +2128,10 @@ static av_cold int decode_init(AVCodecContext *avctx)

        for (c = 0; c < avctx->channels; ++c) {
            ctx->raw_mantissa[c] = av_mallocz_array(ctx->cur_frame_length, sizeof(**ctx->raw_mantissa));
+            if (!ctx->raw_mantissa[c]) {
+                av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
+                return AVERROR(ENOMEM);
+            }
        }
    }

@@ -48,4 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
        vmov.32         r0,  d3[0]
        bx              lr
 endfunc
-
@@ -229,7 +229,7 @@ A .endif
  .endif

        // Begin loop
-01:
+1:
  .if TOTAL_TAPS == 0
        // Things simplify a lot in this case
        // In fact this could be pipelined further if it's worth it...
@@ -241,7 +241,7 @@ A .endif
        str     ST0, [PST, #-4]!
        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST0, [PSAMP], #4 * MAX_CHANNELS
-        bne     01b
+        bne     1b
  .else
    .if \fir_taps & 1
      .set LOAD_REG, 1
@@ -333,7 +333,7 @@ T       orr     AC0, AC0, AC1
        str     ST3, [PST, #-4]!
        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST3, [PSAMP], #4 * MAX_CHANNELS
-        bne     01b
+        bne     1b
  .endif
        b       99f

@@ -279,11 +279,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r1,  r1,  r5
 .endif
        @ size >= 16 loads two qwords and increments r2,
-        @ for size 4/8 it's enough with one qword and no
-        @ postincrement
+        @ size 4 loads one d register, increments r2 and loads one 32-bit lane;
+        @ for size 8 it's enough with one qword and no postincrement
 .if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
+.elseif \size == 4
+        sub             r3,  r3,  #8
 .endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
@@ -295,9 +297,14 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
-.else
+.elseif \size == 8
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
+.else @ size == 4
+        vld1.8          {d18}, [r2]!
+        vld1.8          {d24}, [r7]!
+        vld1.32         {d19[0]}, [r2]
+        vld1.32         {d25[0]}, [r7]
 .endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
@@ -1294,6 +1294,10 @@ typedef struct AVCodecContext {
     *   this callback and filled with the extra buffers if there are more
     *   buffers than buf[] can hold. extended_buf will be freed in
     *   av_frame_unref().
+     *   Decoders will generally initialize the whole buffer before it is output,
+     *   but in rare error conditions uninitialized data can be passed through.
+     *   \important The buffers returned by get_buffer* should thus not contain
+     *   sensitive data.
     *
     * If AV_CODEC_CAP_DR1 is not set then get_buffer2() must call
     * avcodec_default_get_buffer2() instead of providing buffers allocated by
@@ -422,7 +422,7 @@ static int cbs_vp9_split_fragment(CodedBitstreamContext *ctx,
    superframe_header = frag->data[frag->data_size - 1];

    if ((superframe_header & 0xe0) == 0xc0) {
-        VP9RawSuperframeIndex sfi;
+        VP9RawSuperframeIndex sfi = {0};
        GetBitContext gbc;
        size_t index_size, pos;
        int i;
@@ -91,4 +91,3 @@ AVCodec ff_cljr_decoder = {
    .decode         = decode_frame,
    .capabilities   = AV_CODEC_CAP_DR1,
 };
-
@@ -259,4 +259,3 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)

    return 0;
 }
-
@@ -338,4 +338,3 @@ const AVDVProfile *av_dv_codec_profile2(int width, int height,

    return p;
 }
-
@@ -111,7 +111,7 @@ static int dxva_get_decoder_configuration(AVCodecContext *avctx,

    for (i = 0; i < cfg_count; i++) {
        unsigned score;
-        UINT ConfigBitstreamRaw;
+        UINT ConfigBitstreamRaw = 0;
        GUID guidConfigBitstreamEncryption;

 #if CONFIG_D3D11VA
@@ -262,7 +262,7 @@ static int dxva_get_decoder_guid(AVCodecContext *avctx, void *service, void *sur
    *decoder_guid = ff_GUID_NULL;
    for (i = 0; dxva_modes[i].guid; i++) {
        const dxva_mode *mode = &dxva_modes[i];
-        int validate;
+        int validate = 0;
        if (!dxva_check_codec_compatibility(avctx, mode))
            continue;

@@ -794,7 +794,7 @@ int ff_dxva2_commit_buffer(AVCodecContext *avctx,
                           unsigned type, const void *data, unsigned size,
                           unsigned mb_count)
 {
-    void     *dxva_data;
+    void     *dxva_data = NULL;
    unsigned dxva_size;
    int      result;
    HRESULT hr = 0;
@@ -816,7 +816,7 @@ int ff_dxva2_commit_buffer(AVCodecContext *avctx,
               type, (unsigned)hr);
        return -1;
    }
-    if (size <= dxva_size) {
+    if (dxva_data && size <= dxva_size) {
        memcpy(dxva_data, data, size);

 #if CONFIG_D3D11VA
@@ -894,7 +894,7 @@ int ff_dxva2_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
 #endif
    DECODER_BUFFER_DESC             *buffer = NULL, *buffer_slice = NULL;
    int result, runs = 0;
-    HRESULT hr;
+    HRESULT hr = -1;
    unsigned type;
    FFDXVASharedContext *sctx = DXVA_SHARED_CONTEXT(avctx);

@@ -198,12 +198,15 @@ static int cmv_decode_frame(AVCodecContext *avctx,
    if ((ret = av_image_check_size(s->width, s->height, 0, s->avctx)) < 0)
        return ret;

+    buf += EA_PREAMBLE_SIZE;
+    if (!(buf[0]&1) && buf_end - buf < s->width * s->height * (int64_t)(100 - s->avctx->discard_damaged_percentage) / 100)
+        return AVERROR_INVALIDDATA;
+
    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;

    memcpy(frame->data[1], s->palette, AVPALETTE_SIZE);

-    buf += EA_PREAMBLE_SIZE;
    if ((buf[0]&1)) {  // subtype
        cmv_decode_inter(s, frame, buf+2, buf_end);
        frame->key_frame = 0;
@@ -113,6 +113,13 @@ av_cold int ff_ffv1_init_slices_state(FFV1Context *f)
    return 0;
 }

+int ff_need_new_slices(int width, int num_h_slices, int chroma_shift) { // returns nonzero when the condition on the last line holds
+    int mpw = 1<<chroma_shift; // 2^chroma_shift; presumably the luma size of one chroma sample -- TODO confirm
+    int i = width * (int64_t)(num_h_slices - 1) / num_h_slices; // product is formed in 64 bits; presumably the start of the last slice -- TODO confirm
+
+    return width % mpw && (width - i) % mpw == 0; // width is not a multiple of mpw, yet (width - i) is
+}
+
 av_cold int ff_ffv1_init_slice_contexts(FFV1Context *f)
 {
    int i, max_slice_count = f->num_h_slices * f->num_v_slices;
@@ -146,6 +146,7 @@ int ff_ffv1_init_slice_contexts(FFV1Context *f);
 int ff_ffv1_allocate_initial_states(FFV1Context *f);
 void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
 int ff_ffv1_close(AVCodecContext *avctx);
+int ff_need_new_slices(int width, int num_h_slices, int chroma_shift);

 static av_always_inline int fold(int diff, int bits)
 {
@@ -50,4 +50,3 @@ static inline int RENAME(get_context)(PlaneContext *p, TYPE *src,
               p->quant_table[1][(LT - T) & 0xFF] +
               p->quant_table[2][(T - RT) & 0xFF];
 }
-
@@ -361,7 +361,7 @@ static int decode_slice(AVCodecContext *c, void *arg)
    if (fs->ac != AC_GOLOMB_RICE && f->version > 2) {
        int v;
        get_rac(&fs->c, (uint8_t[]) { 129 });
-        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5*f->ec;
+        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5*!!f->ec;
        if (v) {
            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n", v);
            fs->slice_damaged = 1;
@@ -199,7 +199,7 @@ static av_always_inline av_flatten void put_symbol_inline(RangeCoder *c,
    } while (0)

    if (v) {
-        const int a = FFABS(v);
+        const unsigned a = is_signed ? FFABS(v) : v;
        const int e = av_log2(a);
        put_rac(c, state + 0, 0);
        if (e <= 9) {
@@ -526,6 +526,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
        avctx->slices > 1)
        s->version = FFMAX(s->version, 2);

+    if ((avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) && s->ac == AC_GOLOMB_RICE) {
+        av_log(avctx, AV_LOG_ERROR, "2 Pass mode is not possible with golomb coding\n");
+        return AVERROR(EINVAL);
+    }
+
    // Unspecified level & slices, we choose version 1.2+ to ensure multithreaded decodability
    if (avctx->slices == 0 && avctx->level < 0 && avctx->width * avctx->height > 720*576)
        s->version = FFMAX(s->version, 2);
@@ -550,7 +555,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
        s->version = FFMAX(s->version, 3);

    if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR, "Version 2 needed for requested features but version 2 is experimental and not enabled\n");
+        av_log(avctx, AV_LOG_ERROR, "Version 2 or 4 needed for requested features but version 2 or 4 is experimental and not enabled\n");
        return AVERROR_INVALIDDATA;
    }

@@ -735,19 +740,21 @@ FF_ENABLE_DEPRECATION_WARNINGS
            s->quant_tables[1][2][i]=     11*11*quant5 [i];
            s->quant_tables[1][3][i]=   5*11*11*quant5 [i];
            s->quant_tables[1][4][i]= 5*5*11*11*quant5 [i];
+            s->context_count[0] = (11 * 11 * 11        + 1) / 2;
+            s->context_count[1] = (11 * 11 * 5 * 5 * 5 + 1) / 2;
        } else {
            s->quant_tables[0][0][i]=           quant9_10bit[i];
-            s->quant_tables[0][1][i]=        11*quant9_10bit[i];
-            s->quant_tables[0][2][i]=     11*11*quant9_10bit[i];
+            s->quant_tables[0][1][i]=         9*quant9_10bit[i];
+            s->quant_tables[0][2][i]=       9*9*quant9_10bit[i];
            s->quant_tables[1][0][i]=           quant9_10bit[i];
-            s->quant_tables[1][1][i]=        11*quant9_10bit[i];
-            s->quant_tables[1][2][i]=     11*11*quant5_10bit[i];
-            s->quant_tables[1][3][i]=   5*11*11*quant5_10bit[i];
-            s->quant_tables[1][4][i]= 5*5*11*11*quant5_10bit[i];
+            s->quant_tables[1][1][i]=         9*quant9_10bit[i];
+            s->quant_tables[1][2][i]=       9*9*quant5_10bit[i];
+            s->quant_tables[1][3][i]=     5*9*9*quant5_10bit[i];
+            s->quant_tables[1][4][i]=   5*5*9*9*quant5_10bit[i];
+            s->context_count[0] = (9 * 9 * 9         + 1) / 2;
+            s->context_count[1] = (9 * 9 * 5 * 5 * 5 + 1) / 2;
        }
    }
-    s->context_count[0] = (11 * 11 * 11        + 1) / 2;
-    s->context_count[1] = (11 * 11 * 5 * 5 * 5 + 1) / 2;
    memcpy(s->quant_table, s->quant_tables[s->context_model],
           sizeof(s->quant_table));

@@ -885,6 +892,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                    continue;
                if (maxw * maxh * (int64_t)(s->bits_per_raw_sample+1) * plane_count > 8<<24)
                    continue;
+                if (s->version < 4)
+                    if (  ff_need_new_slices(avctx->width , s->num_h_slices, s->chroma_h_shift)
+                        ||ff_need_new_slices(avctx->height, s->num_v_slices, s->chroma_v_shift))
+                        continue;
                if (avctx->slices == s->num_h_slices * s->num_v_slices && avctx->slices <= MAX_SLICES || !avctx->slices)
                    goto slices_ok;
            }
@@ -933,8 +944,8 @@ static void encode_slice_header(FFV1Context *f, FFV1Context *fs)

    put_symbol(c, state, (fs->slice_x     +1)*f->num_h_slices / f->width   , 0);
    put_symbol(c, state, (fs->slice_y     +1)*f->num_v_slices / f->height  , 0);
-    put_symbol(c, state, (fs->slice_width +1)*f->num_h_slices / f->width -1, 0);
-    put_symbol(c, state, (fs->slice_height+1)*f->num_v_slices / f->height-1, 0);
+    put_symbol(c, state, 0, 0);
+    put_symbol(c, state, 0, 0);
    for (j=0; j<f->plane_count; j++) {
        put_symbol(c, state, f->plane[j].quant_table_index, 0);
        av_assert0(f->plane[j].quant_table_index == f->context_model);
@@ -199,4 +199,3 @@ static int RENAME(encode_rgb_frame)(FFV1Context *s, const uint8_t *src[4],
    }
    return 0;
 }
-
@@ -426,7 +426,9 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
    MpegEncContext *s  = avctx->priv_data;
    int ret;
    int slice_ret = 0;
+
    AVFrame *pict = data;
+    int bak_width, bak_height;

    /* no supplementary picture */
    if (buf_size == 0) {
@@ -490,6 +492,9 @@ retry:
        // we need the idct permutation for reading a custom matrix
        ff_mpv_idct_init(s);

+    bak_width  = s->width;
+    bak_height = s->height;
+
    /* let's go :-) */
    if (CONFIG_WMV2_DECODER && s->msmpeg4_version == 5) {
        ret = ff_wmv2_decode_picture_header(s);
@@ -512,11 +517,12 @@ retry:
    }

    if (ret < 0 || ret == FRAME_SKIPPED) {
-        if (   s->width  != avctx->coded_width
-            || s->height != avctx->coded_height) {
+        if (   s->width  != bak_width
+            || s->height != bak_height) {
                av_log(s->avctx, AV_LOG_WARNING, "Reverting picture dimensions change due to header decoding failure\n");
-                s->width = avctx->coded_width;
-                s->height= avctx->coded_height;
+                s->width = bak_width;
+                s->height= bak_height;
+
        }
    }
    if (ret == FRAME_SKIPPED)
@@ -162,4 +162,3 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
    if (USES_LIST(mb_type, 1))
        prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
 }
-
@@ -372,6 +372,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
            ret = av_reallocp(&ctx->tex_buf, ctx->tex_size);
            if (ret < 0)
                return ret;
+            memset(ctx->tex_buf, 0, ctx->tex_size);

            avctx->execute2(avctx, decompress_chunks_thread, NULL,
                            ctx->chunk_results, ctx->chunk_count);
@@ -1554,4 +1554,3 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
    case 0: lc->pu.mvd.y = 0;                       break;
    }
 }
-
@@ -738,6 +738,8 @@ static void decode_gray_bitstream(HYuvContext *s, int count)
        for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
            READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
        }
+        for (; i < count; i++)
+            s->temp[0][2 * i] = s->temp[0][2 * i + 1] = 0;
    } else {
        for (i = 0; i < count; i++) {
            READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
@@ -653,7 +653,7 @@ static void get_codebook(int16_t * cbvec,   /* (o) Constructed codebook vector *
    int16_t k, base_size;
    int16_t lag;
    /* Stack based */
-    int16_t tempbuff2[SUBL + 5];
+    int16_t tempbuff2[SUBL + 5] = {0};

    /* Determine size of codebook sections */
    base_size = lMem - cbveclen + 1;
@@ -183,7 +183,7 @@ static av_always_inline void FUNC(row_fdct)(int16_t *data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
-  int z1, z2, z3, z4, z5;
+  unsigned z1, z2, z3, z4, z5;
  int16_t *dataptr;
  int ctr;

@@ -340,6 +340,25 @@ static int get_siz(Jpeg2000DecoderContext *s)
        return AVERROR_INVALIDDATA;
    }

+    for (i = 0; i < s->ncomponents; i++) {
+        if (s->cdef[i] < 0) {
+            for (i = 0; i < s->ncomponents; i++) {
+                s->cdef[i] = i + 1;
+            }
+            if ((s->ncomponents & 1) == 0)
+                s->cdef[s->ncomponents-1] = 0;
+        }
+    }
+    // after here we no longer have to consider negative cdef
+
+    int cdef_used = 0;
+    for (i = 0; i < s->ncomponents; i++)
+        cdef_used |= 1<<s->cdef[i];
+
+    // Check that the channels we have are what we expect for the number of components
+    if (cdef_used != ((int[]){0,2,3,14,15})[s->ncomponents])
+        return AVERROR_INVALIDDATA;
+
    for (i = 0; i < s->ncomponents; i++) { // Ssiz_i XRsiz_i, YRsiz_i
        uint8_t x    = bytestream2_get_byteu(&s->g);
        s->cbps[i]   = (x & 0x7f) + 1;
@@ -352,7 +371,9 @@ static int get_siz(Jpeg2000DecoderContext *s)
            av_log(s->avctx, AV_LOG_ERROR, "Invalid sample separation %d/%d\n", s->cdx[i], s->cdy[i]);
            return AVERROR_INVALIDDATA;
        }
-        log2_chroma_wh |= s->cdy[i] >> 1 << i * 4 | s->cdx[i] >> 1 << i * 4 + 2;
+        int i_remapped = s->cdef[i] ? s->cdef[i]-1 : (s->ncomponents-1);
+
+        log2_chroma_wh |= s->cdy[i] >> 1 << i_remapped * 4 | s->cdx[i] >> 1 << i_remapped * 4 + 2;
    }

    s->numXtiles = ff_jpeg2000_ceildiv(s->width  - s->tile_offset_x, s->tile_width);
@@ -1198,6 +1219,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,

                bytestream2_get_bufferu(&s->g, cblk->data + cblk->length, cblk->lengthinc[cwsno]);
                cblk->length   += cblk->lengthinc[cwsno];
+                memset(cblk->data + cblk->length, 0, 4);
                cblk->lengthinc[cwsno] = 0;
                if (cblk->nb_terminationsinc) {
                    cblk->nb_terminationsinc--;
@@ -398,4 +398,3 @@ void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in,
        out++;
    }
 }
-
@@ -782,4 +782,3 @@ static const AVCodecDefault mp2_defaults[] = {
    { "b", "0" },
    { NULL },
 };
-
@@ -562,6 +562,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
        av_log(avctx, AV_LOG_ERROR, "H.263 does not support resolutions above 2048x1152\n");
        return AVERROR(EINVAL);
    }
+    if (s->codec_id == AV_CODEC_ID_FLV1 &&
+        (avctx->width  > 65535 ||
+         avctx->height > 65535 )) {
+        av_log(avctx, AV_LOG_ERROR, "FLV does not support resolutions above 16bit\n");
+        return AVERROR(EINVAL);
+    }
    if ((s->codec_id == AV_CODEC_ID_H263  ||
         s->codec_id == AV_CODEC_ID_H263P) &&
        ((avctx->width &3) ||
@@ -334,4 +334,3 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
    *dc_val_ptr = &dc_val[0];
    return pred;
 }
-
@@ -187,6 +187,8 @@ static int decode_frame(AVCodecContext *avctx,
                    av_log(avctx, AV_LOG_ERROR, "Inflate error: %d\n", ret);
                    return AVERROR_EXTERNAL;
                }
+                if (s->zstream.avail_out > 0)
+                    memset(s->zstream.next_out, 0, s->zstream.avail_out);
            }
        }
    } else if (type == MKTAG('H','U','F','Y')) {
@@ -92,6 +92,9 @@ static int lz4_decompress(AVCodecContext *avctx,
            } while (current == 255);
        }

+        if (bytestream2_get_bytes_left(gb) < num_literals)
+            return AVERROR_INVALIDDATA;
+
        if (pos + num_literals < HISTORY_SIZE) {
            bytestream2_get_buffer(gb, history + pos, num_literals);
            pos += num_literals;
@@ -71,7 +71,6 @@ void ff_build_rac_states(RangeCoder *c, int factor, int max_p);
 static inline void renorm_encoder(RangeCoder *c)
 {
    // FIXME: optimize
-    while (c->range < 0x100) {
        if (c->outstanding_byte < 0) {
            c->outstanding_byte = c->low >> 8;
        } else if (c->low <= 0xFF00) {
@@ -90,7 +89,6 @@ static inline void renorm_encoder(RangeCoder *c)

        c->low     = (c->low & 0xFF) << 8;
        c->range <<= 8;
-    }
 }

 static inline int get_rac_count(RangeCoder *c)
@@ -117,7 +115,8 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
        *state   = c->one_state[*state];
    }

-    renorm_encoder(c);
+    while (c->range < 0x100)
+        renorm_encoder(c);
 }

 static inline void refill(RangeCoder *c)
@@ -558,6 +558,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
    buf               = &s->bitstream[s->bitstream_index];
    buf_size         += s->bitstream_size;
    s->bitstream_size = buf_size;
+    memset(buf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);

    /* do not decode until buffer has at least max_framesize bytes or
     * the end of the file has been reached */
@@ -856,5 +856,3 @@ av_cold void ff_dwt_init(SnowDWTContext *c)
    if (HAVE_MMX)
        ff_dwt_init_x86(c);
 }
-
-
@@ -1439,6 +1439,9 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
    if (svq3_decode_slice_header(avctx))
        return -1;

+    if (avpkt->size < s->mb_width * s->mb_height / 8)
+        return AVERROR_INVALIDDATA;
+
    s->pict_type = s->slice_type;

    if (s->pict_type != AV_PICTURE_TYPE_B)
@@ -1748,7 +1748,7 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                case AV_CODEC_ID_ADPCM_IMA_WAV:
                    if (bps < 2 || bps > 5)
                        return 0;
-                    tmp = blocks * (1LL + (ba - 4 * ch) / (bps * ch) * 8);
+                    tmp = blocks * (1LL + (ba - 4 * ch) / (bps * ch) * 8LL);
                    break;
                case AV_CODEC_ID_ADPCM_IMA_DK3:
                    tmp = blocks * (((ba - 16LL) * 2 / 3 * 4) / ch);
@@ -1313,6 +1313,7 @@ static int vc1_decode_p_mb(VC1Context *v)
    int dst_idx, off;
    int skipped, fourmv;
    int block_cbp = 0, pat, block_tt = 0, block_intra = 0;
+    int ret;

    mquant = v->pq; /* lossy initialization */

@@ -1371,8 +1372,10 @@ static int vc1_decode_p_mb(VC1Context *v)
                    if (i == 1 || i == 3 || s->mb_x)
                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-                    vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
-                                           (i & 4) ? v->codingset2 : v->codingset);
+                    ret = vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
+                                                 (i & 4) ? v->codingset2 : v->codingset);
+                    if (ret < 0)
+                        return ret;
                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                        continue;
                    v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
@@ -1474,8 +1477,10 @@ static int vc1_decode_p_mb(VC1Context *v)
                    if (i == 1 || i == 3 || s->mb_x)
                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-                    vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, is_coded[i], mquant,
-                                           (i & 4) ? v->codingset2 : v->codingset);
+                    ret = vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, is_coded[i], mquant,
+                                                 (i & 4) ? v->codingset2 : v->codingset);
+                    if (ret < 0)
+                        return ret;
                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                        continue;
                    v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
@@ -1546,6 +1551,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
    int block_cbp = 0, pat, block_tt = 0;
    int idx_mbmode = 0, mvbp;
    int fieldtx;
+    int ret;

    mquant = v->pq; /* Lossy initialization */

@@ -1618,8 +1624,10 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                if (i == 1 || i == 3 || s->mb_x)
                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-                vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
-                                       (i & 4) ? v->codingset2 : v->codingset);
+                ret = vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
+                                             (i & 4) ? v->codingset2 : v->codingset);
+                if (ret < 0)
+                    return ret;
                if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                    continue;
                v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
@@ -1755,6 +1763,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
    int pred_flag = 0;
    int block_cbp = 0, pat, block_tt = 0;
    int idx_mbmode = 0;
+    int ret;

    mquant = v->pq; /* Lossy initialization */

@@ -1786,8 +1795,10 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
            if (i == 1 || i == 3 || s->mb_x)
                v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-            vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
+            ret = vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
+                                         (i & 4) ? v->codingset2 : v->codingset);
+            if (ret < 0)
+                return ret;
            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                continue;
            v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
@@ -1878,6 +1889,7 @@ static int vc1_decode_b_mb(VC1Context *v)
    int skipped, direct;
    int dmv_x[2], dmv_y[2];
    int bmvtype = BMV_TYPE_BACKWARD;
+    int ret;

    mquant      = v->pq; /* lossy initialization */
    s->mb_intra = 0;
@@ -1990,8 +2002,10 @@ static int vc1_decode_b_mb(VC1Context *v)
            if (i == 1 || i == 3 || s->mb_x)
                v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
+            ret = vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                         (i & 4) ? v->codingset2 : v->codingset);
+            if (ret < 0)
+                return ret;
            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                continue;
            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
@@ -2037,6 +2051,7 @@ static int vc1_decode_b_mb_intfi(VC1Context *v)
    int bmvtype = BMV_TYPE_BACKWARD;
    int block_cbp = 0, pat, block_tt = 0;
    int idx_mbmode;
+    int ret;

    mquant      = v->pq; /* Lossy initialization */
    s->mb_intra = 0;
@@ -2069,8 +2084,10 @@ static int vc1_decode_b_mb_intfi(VC1Context *v)
            if (i == 1 || i == 3 || s->mb_x)
                v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
+            ret = vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                         (i & 4) ? v->codingset2 : v->codingset);
+            if (ret < 0)
+                return ret;
            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                continue;
            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
@@ -2207,6 +2224,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
    int stride_y, fieldtx;
    int bmvtype = BMV_TYPE_BACKWARD;
    int dir, dir2;
+    int ret;

    mquant = v->pq; /* Lossy initialization */
    s->mb_intra = 0;
@@ -2263,8 +2281,10 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
            if (i == 1 || i == 3 || s->mb_x)
                v->c_avail = v->mb_type[0][s->block_index[i] - 1];

-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
+            ret = vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                         (i & 4) ? v->codingset2 : v->codingset);
+            if (ret < 0)
+                return ret;
            if (CONFIG_GRAY && i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                continue;
            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
@@ -2808,6 +2828,7 @@ static void vc1_decode_p_blocks(VC1Context *v)
 {
    MpegEncContext *s = &v->s;
    int apply_loop_filter;
+    int ret;

    /* select coding mode used for VLC tables selection */
    switch (v->c_ac_table_index) {
@@ -2850,22 +2871,22 @@ static void vc1_decode_p_blocks(VC1Context *v)
                }

            if (v->fcm == ILACE_FIELD) {
-                vc1_decode_p_mb_intfi(v);
+                ret = vc1_decode_p_mb_intfi(v);
                if (apply_loop_filter)
                    ff_vc1_p_loop_filter(v);
            } else if (v->fcm == ILACE_FRAME) {
-                vc1_decode_p_mb_intfr(v);
+                ret = vc1_decode_p_mb_intfr(v);
                if (apply_loop_filter)
                    ff_vc1_p_intfr_loop_filter(v);
            } else {
-                vc1_decode_p_mb(v);
+                ret = vc1_decode_p_mb(v);
                if (apply_loop_filter)
                    ff_vc1_p_loop_filter(v);
            }
-            if (get_bits_left(&s->gb) < 0 || get_bits_count(&s->gb) < 0) {
+            if (ret < 0 || get_bits_left(&s->gb) < 0 || get_bits_count(&s->gb) < 0) {
                // TODO: may need modification to handle slice coding
                ff_er_add_slice(&s->er, 0, s->start_mb_y, s->mb_x, s->mb_y, ER_MB_ERROR);
-                av_log(s->avctx, AV_LOG_ERROR, "Bits overconsumption: %i > %i at %ix%i\n",
+                av_log(s->avctx, AV_LOG_ERROR, "Error or Bits overconsumption: %i > %i at %ix%i\n",
                       get_bits_count(&s->gb), s->gb.size_in_bits, s->mb_x, s->mb_y);
                return;
            }
@@ -344,7 +344,7 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
    if (!v->block || !v->cbp_base)
        goto error;
    v->cbp              = v->cbp_base + 2 * s->mb_stride;
-    v->ttblk_base       = av_malloc(sizeof(v->ttblk_base[0]) * 3 * s->mb_stride);
+    v->ttblk_base       = av_mallocz(sizeof(v->ttblk_base[0]) * 3 * s->mb_stride);
    if (!v->ttblk_base)
        goto error;
    v->ttblk            = v->ttblk_base + 2 * s->mb_stride;
@@ -358,7 +358,7 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
    v->luma_mv          = v->luma_mv_base + 2 * s->mb_stride;

    /* allocate block type info in that way so it could be used with s->block_index[] */
-    v->mb_type_base = av_malloc(s->b8_stride * (mb_height * 2 + 1) + s->mb_stride * (mb_height + 1) * 2);
+    v->mb_type_base = av_mallocz(s->b8_stride * (mb_height * 2 + 1) + s->mb_stride * (mb_height + 1) * 2);
    if (!v->mb_type_base)
        goto error;
    v->mb_type[0]   = v->mb_type_base + s->b8_stride + 1;
@@ -608,6 +608,7 @@ av_cold int ff_vc1_decode_end(AVCodecContext *avctx)
    av_freep(&v->hrd_rate);
    av_freep(&v->hrd_buffer);
    ff_mpv_common_end(&v->s);
+    memset(v->s.block_index, 0, sizeof(v->s.block_index));
    av_freep(&v->mv_type_mb_plane);
    av_freep(&v->direct_mb_plane);
    av_freep(&v->forward_mb_plane);
@@ -986,7 +986,7 @@ static av_cold int vc2_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    }

    s->slice_min_bytes = s->slice_max_bytes - s->slice_max_bytes*(s->tolerance/100.0f);
-    if (s->slice_min_bytes < 0)
+    if (s->slice_min_bytes < 0 || s->slice_max_bytes > INT_MAX >> 3)
        return AVERROR(EINVAL);

    ret = encode_frame(s, avpkt, frame, aux_data, header_size, s->interlaced);
@@ -1566,6 +1566,8 @@ static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
            av_log(avctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        }
+        ff_thread_await_progress(&s->s.refs[ref], INT_MAX, 0);
+
        if ((ret = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
            return ret;
        ((AVFrame *)frame)->pts = pkt->pts;
@@ -1732,10 +1734,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
        {
            ret = decode_tiles(avctx, data, size);
-            if (ret < 0) {
-                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
-                return ret;
-            }
+            if (ret < 0)
+                goto fail;
        }

        // Sum all counts fields into td[0].counts for tile threading
@@ -1749,20 +1749,21 @@ FF_ENABLE_DEPRECATION_WARNINGS
            ff_thread_finish_setup(avctx);
        }
    } while (s->pass++ == 1);
-    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);

    if (s->td->error_info < 0) {
        av_log(avctx, AV_LOG_ERROR, "Failed to decode tile data\n");
        s->td->error_info = 0;
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
    }
    if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS) {
        ret = vp9_export_enc_params(s, &s->s.frames[CUR_FRAME]);
        if (ret < 0)
-            return ret;
+            goto fail;
    }

 finish:
+    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
    // ref frame setup
    for (i = 0; i < 8; i++) {
        if (s->s.refs[i].f->buf[0])
@@ -1779,6 +1780,9 @@ finish:
    }

    return pkt->size;
+fail:
+    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+    return ret;
 }

 static void vp9_decode_flush(AVCodecContext *avctx)
@@ -318,7 +318,11 @@ static av_always_inline void mc_luma_unscaled(VP9TileData *td, vp9_mc_func (*mc)
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (!!my * 5) than horizontally (!!mx * 4).
+    // The arm/aarch64 _h filters read one more pixel than what actually is
+    // needed, so switch to emulated edge if that would read beyond the bottom
+    // right block.
    if (x < !!mx * 3 || y < !!my * 3 ||
+        ((ARCH_AARCH64 || ARCH_ARM) && (x + !!mx * 5 > w - bw) && (y + !!my * 5 + 1 > h - bh)) ||
        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
@@ -357,7 +361,11 @@ static av_always_inline void mc_chroma_unscaled(VP9TileData *td, vp9_mc_func (*m
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (!!my * 5) than horizontally (!!mx * 4).
+    // The arm/aarch64 _h filters read one more pixel than what actually is
+    // needed, so switch to emulated edge if that would read beyond the bottom
+    // right block.
    if (x < !!mx * 3 || y < !!my * 3 ||
+        ((ARCH_AARCH64 || ARCH_ARM) && (x + !!mx * 5 > w - bw) && (y + !!my * 5 + 1 > h - bh)) ||
        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
@@ -739,6 +739,9 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
            ref_x = FFMAX(0, ref_x);
            ref_y = FFMAX(0, ref_y);

+            if (ref_y == y && ref_x >= x)
+                return AVERROR_INVALIDDATA;
+
            /* copy pixels
             * source and dest regions can overlap and wrap lines, so just
             * copy per-pixel */
@@ -1491,6 +1491,8 @@ static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
    /* Parse frame type ("frame header"), see frame_descs */
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;

+    pitch[0] = INT_MAX;
+
    if (bd_idx < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid frame type VLC code, skipping\n");
@@ -1608,6 +1610,9 @@ static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
        double i_lsps[MAX_LSPS];
        float lpcs[MAX_LSPS];

+        if(frame_descs[bd_idx].fcb_type >= FCB_TYPE_AW_PULSES && pitch[0] == INT_MAX)
+            return AVERROR_INVALIDDATA;
+
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
@@ -121,4 +121,3 @@ INIT_XMM sse
 INT32_TO_FLOAT_FMUL_ARRAY8
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_ARRAY8
-
@@ -151,4 +151,3 @@ INIT_MMX mmx
 PIX_NORM1 0, 16
 INIT_XMM sse2
 PIX_NORM1 6, 8
-
@@ -163,6 +163,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
    dec              cntrq
    jge .bpp_loop
    POP               dstq
+    emms
    RET
 %endmacro

@@ -614,6 +614,9 @@ static int xan_decode_frame(AVCodecContext *avctx,
        return AVERROR_INVALIDDATA;
    }

+    if (buf_size < 9)
+        return AVERROR_INVALIDDATA;
+
    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;

@@ -95,10 +95,10 @@ struct video_data {
    int (*open_f)(const char *file, int oflag, ...);
    int (*close_f)(int fd);
    int (*dup_f)(int fd);
-#ifdef __GLIBC__
-    int (*ioctl_f)(int fd, unsigned long int request, ...);
-#else
+#if HAVE_POSIX_IOCTL
    int (*ioctl_f)(int fd, int request, ...);
+#else
+    int (*ioctl_f)(int fd, unsigned long int request, ...);
 #endif
    ssize_t (*read_f)(int fd, void *buffer, size_t n);
    void *(*mmap_f)(void *start, size_t length, int prot, int flags, int fd, int64_t offset);
@@ -22,52 +22,52 @@

 // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
 .macro acc_sum_store x, xb
-        dup             v24.4S, v24.S[3]                                // ...X -> XXXX
-        ext             v25.16B, v26.16B, \xb, #12                      // ext(0000,ABCD,12)=0ABC
-        add             v24.4S, v24.4S, \x                              // XXXX+ABCD={X+A,X+B,X+C,X+D}
-        add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B,X+D+C}       (+0ABC)
-        ext             v25.16B, v26.16B, v25.16B, #12                  // ext(0000,0ABC,12)=00AB
-        add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B+A,X+D+C+B}   (+00AB)
-        ext             v25.16B, v26.16B, v25.16B, #12                  // ext(0000,00AB,12)=000A
-        add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
-        st1             {v24.4S}, [x0], #16                             // write 4x32-bit final values
+        dup             v24.4s, v24.s[3]                                // ...X -> XXXX
+        ext             v25.16b, v26.16b, \xb, #12                      // ext(0000,ABCD,12)=0ABC
+        add             v24.4s, v24.4s, \x                              // XXXX+ABCD={X+A,X+B,X+C,X+D}
+        add             v24.4s, v24.4s, v25.4s                          // {X+A,X+B+A,X+C+B,X+D+C}       (+0ABC)
+        ext             v25.16b, v26.16b, v25.16b, #12                  // ext(0000,0ABC,12)=00AB
+        add             v24.4s, v24.4s, v25.4s                          // {X+A,X+B+A,X+C+B+A,X+D+C+B}   (+00AB)
+        ext             v25.16b, v26.16b, v25.16b, #12                  // ext(0000,00AB,12)=000A
+        add             v24.4s, v24.4s, v25.4s                          // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
+        st1             {v24.4s}, [x0], #16                             // write 4x32-bit final values
 .endm

 function ff_compute_safe_ssd_integral_image_neon, export=1
-        movi            v26.4S, #0                                      // used as zero for the "rotations" in acc_sum_store
-        sub             x3, x3, w6, UXTW                                // s1 padding (s1_linesize - w)
-        sub             x5, x5, w6, UXTW                                // s2 padding (s2_linesize - w)
-        sub             x9, x0, w1, UXTW #2                             // dst_top
-        sub             x1, x1, w6, UXTW                                // dst padding (dst_linesize_32 - w)
+        movi            v26.4s, #0                                      // used as zero for the "rotations" in acc_sum_store
+        sub             x3, x3, w6, uxtw                                // s1 padding (s1_linesize - w)
+        sub             x5, x5, w6, uxtw                                // s2 padding (s2_linesize - w)
+        sub             x9, x0, w1, uxtw #2                             // dst_top
+        sub             x1, x1, w6, uxtw                                // dst padding (dst_linesize_32 - w)
        lsl             x1, x1, #2                                      // dst padding expressed in bytes
 1:      mov             w10, w6                                         // width copy for each line
        sub             x0, x0, #16                                     // beginning of the dst line minus 4 sums
        sub             x8, x9, #4                                      // dst_top-1
-        ld1             {v24.4S}, [x0], #16                             // load ...X (contextual last sums)
-2:      ld1             {v0.16B}, [x2], #16                             // s1[x + 0..15]
-        ld1             {v1.16B}, [x4], #16                             // s2[x + 0..15]
-        ld1             {v16.4S,v17.4S}, [x8], #32                      // dst_top[x + 0..7 - 1]
-        usubl           v2.8H, v0.8B,  v1.8B                            // d[x + 0..7]  = s1[x + 0..7]  - s2[x + 0..7]
-        usubl2          v3.8H, v0.16B, v1.16B                           // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
-        ld1             {v18.4S,v19.4S}, [x8], #32                      // dst_top[x + 8..15 - 1]
-        smull           v4.4S, v2.4H, v2.4H                             // d[x + 0..3]^2
-        smull2          v5.4S, v2.8H, v2.8H                             // d[x + 4..7]^2
-        ld1             {v20.4S,v21.4S}, [x9], #32                      // dst_top[x + 0..7]
-        smull           v6.4S, v3.4H, v3.4H                             // d[x + 8..11]^2
-        smull2          v7.4S, v3.8H, v3.8H                             // d[x + 12..15]^2
-        ld1             {v22.4S,v23.4S}, [x9], #32                      // dst_top[x + 8..15]
-        sub             v0.4S, v20.4S, v16.4S                           // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
-        sub             v1.4S, v21.4S, v17.4S                           // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
-        add             v0.4S, v0.4S, v4.4S                             // + d[x + 0..3]^2
-        add             v1.4S, v1.4S, v5.4S                             // + d[x + 4..7]^2
-        sub             v2.4S, v22.4S, v18.4S                           // dst_top[x +  8..11] - dst_top[x +  8..11 - 1]
-        sub             v3.4S, v23.4S, v19.4S                           // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
-        add             v2.4S, v2.4S, v6.4S                             // + d[x +  8..11]^2
-        add             v3.4S, v3.4S, v7.4S                             // + d[x + 12..15]^2
-        acc_sum_store   v0.4S, v0.16B                                   // accumulate and store dst[ 0..3]
-        acc_sum_store   v1.4S, v1.16B                                   // accumulate and store dst[ 4..7]
-        acc_sum_store   v2.4S, v2.16B                                   // accumulate and store dst[ 8..11]
-        acc_sum_store   v3.4S, v3.16B                                   // accumulate and store dst[12..15]
+        ld1             {v24.4s}, [x0], #16                             // load ...X (contextual last sums)
+2:      ld1             {v0.16b}, [x2], #16                             // s1[x + 0..15]
+        ld1             {v1.16b}, [x4], #16                             // s2[x + 0..15]
+        ld1             {v16.4s,v17.4s}, [x8], #32                      // dst_top[x + 0..7 - 1]
+        usubl           v2.8h, v0.8b,  v1.8b                            // d[x + 0..7]  = s1[x + 0..7]  - s2[x + 0..7]
+        usubl2          v3.8h, v0.16b, v1.16b                           // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
+        ld1             {v18.4s,v19.4s}, [x8], #32                      // dst_top[x + 8..15 - 1]
+        smull           v4.4s, v2.4h, v2.4h                             // d[x + 0..3]^2
+        smull2          v5.4s, v2.8h, v2.8h                             // d[x + 4..7]^2
+        ld1             {v20.4s,v21.4s}, [x9], #32                      // dst_top[x + 0..7]
+        smull           v6.4s, v3.4h, v3.4h                             // d[x + 8..11]^2
+        smull2          v7.4s, v3.8h, v3.8h                             // d[x + 12..15]^2
+        ld1             {v22.4s,v23.4s}, [x9], #32                      // dst_top[x + 8..15]
+        sub             v0.4s, v20.4s, v16.4s                           // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
+        sub             v1.4s, v21.4s, v17.4s                           // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
+        add             v0.4s, v0.4s, v4.4s                             // + d[x + 0..3]^2
+        add             v1.4s, v1.4s, v5.4s                             // + d[x + 4..7]^2
+        sub             v2.4s, v22.4s, v18.4s                           // dst_top[x +  8..11] - dst_top[x +  8..11 - 1]
+        sub             v3.4s, v23.4s, v19.4s                           // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
+        add             v2.4s, v2.4s, v6.4s                             // + d[x +  8..11]^2
+        add             v3.4s, v3.4s, v7.4s                             // + d[x + 12..15]^2
+        acc_sum_store   v0.4s, v0.16b                                   // accumulate and store dst[ 0..3]
+        acc_sum_store   v1.4s, v1.16b                                   // accumulate and store dst[ 4..7]
+        acc_sum_store   v2.4s, v2.16b                                   // accumulate and store dst[ 8..11]
+        acc_sum_store   v3.4s, v3.16b                                   // accumulate and store dst[12..15]
        subs            w10, w10, #16                                   // width dec
        b.ne            2b                                              // loop til next line
        add             x2, x2, x3                                      // skip to next line (s1)
@@ -822,6 +822,8 @@ static int config_input(AVFilterLink *inlink)
    if (s->dumpfile) {
        s->analysis_rdft = av_rdft_init(rdft_bits, DFT_R2C);
        s->dump_buf = av_malloc_array(s->analysis_rdft_len, sizeof(*s->dump_buf));
+        if (!s->dump_buf)
+            return AVERROR(ENOMEM);
    }

    s->analysis_buf = av_malloc_array(s->analysis_rdft_len, sizeof(*s->analysis_buf));
@@ -186,7 +186,7 @@ static av_cold int init(AVFilterContext *ctx)
        sign = 1;
        while (1) {
            gain = 1;
-            if (sscanf(arg, "%lf%n *%n", &gain, &len, &len))
+            if (sscanf(arg, "%lf%n *%n", &gain, &len, &len) >= 1)
                arg += len;
            if (parse_channel_name(&arg, &in_ch_id, &named)){
                av_log(ctx, AV_LOG_ERROR,
@@ -69,4 +69,3 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
    }
    return sad;
 }
-
@@ -38,6 +38,7 @@ enum {
 static const char *const addroi_var_names[] = {
    "iw",
    "ih",
+    NULL, /* terminates the name list; NOTE(review): presumably required by whatever walks this array -- confirm against its consumer */
 };

 typedef struct AddROIContext {
@@ -343,13 +343,14 @@ static int config_props(AVFilterLink *link)
    if(yadif->mode&1)
        link->frame_rate = av_mul_q(link->src->inputs[0]->frame_rate, (AVRational){2,1});

-    if (link->w < 3 || link->h < 4) {
-        av_log(ctx, AV_LOG_ERROR, "Video of less than 3 columns or 4 lines is not supported\n");
+    yadif->csp = av_pix_fmt_desc_get(link->format);
+    yadif->filter = filter;
+
+    if (AV_CEIL_RSHIFT(link->w, yadif->csp->log2_chroma_w) < 3 || AV_CEIL_RSHIFT(link->h, yadif->csp->log2_chroma_h) < 4) {
+        av_log(ctx, AV_LOG_ERROR, "Video with planes less than 3 columns or 4 lines is not supported\n");
        return AVERROR(EINVAL);
    }

-    yadif->csp = av_pix_fmt_desc_get(link->format);
-    yadif->filter = filter;
    if (yadif->csp->comp[0].depth > 8) {
        s->filter_intra = filter_intra_16bit;
        s->filter_line  = filter_line_c_16bit;
@@ -51,4 +51,3 @@ __global__ void Overlay_Cuda(
 }

 }
-
@@ -345,8 +345,7 @@ static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
    int err;
    double peak = ctx->peak;

-    AVHWFramesContext *input_frames_ctx =
-        (AVHWFramesContext*)input->hw_frames_ctx->data;
+    AVHWFramesContext *input_frames_ctx;

    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
           av_get_pix_fmt_name(input->format),
@@ -354,6 +353,7 @@ static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)

    if (!input->hw_frames_ctx)
        return AVERROR(EINVAL);
+    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;

    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!output) {
@@ -3392,6 +3392,8 @@ static int barrelsplit_to_xyz(const V360Context *s,
            l_y =  0.5f;
            l_z = (-0.5f + vf) / scaleh;
            break;
+        default:
+            av_assert0(0);
        }
    }

@@ -899,7 +899,7 @@ static void vertopen##name##_transition(AVFilterContext *ctx,
 {                                                                                    \
    XFadeContext *s = ctx->priv;                                                     \
    const int width = out->width;                                                    \
-    const float w2 = out->width / 2;                                                 \
+    const float w2 = out->width / 2.0;                                                 \
                                                                                     \
    for (int y = slice_start; y < slice_end; y++) {                                  \
        for (int x = 0; x < width; x++) {                                            \
@@ -926,7 +926,7 @@ static void vertclose##name##_transition(AVFilterContext *ctx,
 {                                                                                    \
    XFadeContext *s = ctx->priv;                                                     \
    const int width = out->width;                                                    \
-    const float w2 = out->width / 2;                                                 \
+    const float w2 = out->width / 2.0;                                                 \
                                                                                     \
    for (int y = slice_start; y < slice_end; y++) {                                  \
        for (int x = 0; x < width; x++) {                                            \
@@ -953,7 +953,7 @@ static void horzopen##name##_transition(AVFilterContext *ctx,
 {                                                                                    \
    XFadeContext *s = ctx->priv;                                                     \
    const int width = out->width;                                                    \
-    const float h2 = out->height / 2;                                                \
+    const float h2 = out->height / 2.0;                                                \
                                                                                     \
    for (int y = slice_start; y < slice_end; y++) {                                  \
        const float smooth = 2.f - fabsf((y - h2) / h2) - progress * 2.f;            \
@@ -980,7 +980,7 @@ static void horzclose##name##_transition(AVFilterContext *ctx,
 {                                                                                    \
    XFadeContext *s = ctx->priv;                                                     \
    const int width = out->width;                                                    \
-    const float h2 = out->height / 2;                                                \
+    const float h2 = out->height / 2.0;                                                \
                                                                                     \
    for (int y = slice_start; y < slice_end; y++) {                                  \
        const float smooth = 1.f + fabsf((y - h2) / h2) - progress * 2.f;            \
@@ -294,7 +294,9 @@ static int xfade_opencl_activate(AVFilterContext *avctx)
            if (ctx->first_pts + ctx->offset_pts > ctx->xf[0]->pts) {
                ctx->xf[0] = NULL;
                ctx->need_second = 0;
-                ff_inlink_consume_frame(avctx->inputs[0], &in);
+                ret = ff_inlink_consume_frame(avctx->inputs[0], &in);
+                if (ret < 0)
+                    return ret;
                return ff_filter_frame(outlink, in);
            }

@@ -303,8 +305,14 @@ static int xfade_opencl_activate(AVFilterContext *avctx)
    }

    if (ctx->xf[0] && ff_inlink_queued_frames(avctx->inputs[1]) > 0) {
-        ff_inlink_consume_frame(avctx->inputs[0], &ctx->xf[0]);
-        ff_inlink_consume_frame(avctx->inputs[1], &ctx->xf[1]);
+        ret = ff_inlink_consume_frame(avctx->inputs[0], &ctx->xf[0]);
+        if (ret < 0)
+            return ret;
+        ret = ff_inlink_consume_frame(avctx->inputs[1], &ctx->xf[1]);
+        if (ret < 0) {
+            av_frame_free(&ctx->xf[0]);
+            return ret;
+        }

        ctx->last_pts = ctx->xf[1]->pts;
        ctx->pts = ctx->xf[0]->pts;
@@ -129,7 +129,8 @@ int64_t ff_ape_parse_tag(AVFormatContext *s)

    avio_seek(pb, file_size - APE_TAG_FOOTER_BYTES, SEEK_SET);

-    avio_read(pb, buf, 8);     /* APETAGEX */
+    if(avio_read(pb, buf, 8) != 8)     /* APETAGEX */
+        return 0;
    if (strncmp(buf, APE_TAG_PREAMBLE, 8)) {
        return 0;
    }
@@ -77,4 +77,3 @@ const AVCodecTag ff_codec_caf_tags[] = {
    { AV_CODEC_ID_PCM_F64BE,    MKTAG('l','p','c','m') },
    { AV_CODEC_ID_NONE,            0 },
 };
-
@@ -153,5 +153,3 @@ void ff_dash_fill_tmpl_params(char *dst, size_t buffer_size,
        t_cur = t_next;
    }
 }
-
-
@@ -439,7 +439,7 @@ static int open_url(AVFormatContext *s, AVIOContext **pb, const char *url,
    av_freep(pb);
    av_dict_copy(&tmp, *opts, 0);
    av_dict_copy(&tmp, opts2, 0);
-    ret = avio_open2(pb, url, AVIO_FLAG_READ, c->interrupt_callback, &tmp);
+    ret = ffio_open_whitelist(pb, url, AVIO_FLAG_READ, c->interrupt_callback, &tmp, s->protocol_whitelist, s->protocol_blacklist);
    if (ret >= 0) {
        // update cookies on http response with setcookies.
        char *new_cookies = NULL;
@@ -1217,7 +1217,7 @@ static int parse_manifest(AVFormatContext *s, const char *url, AVIOContext *in)
        close_in = 1;

        av_dict_copy(&opts, c->avio_opts, 0);
-        ret = avio_open2(&in, url, AVIO_FLAG_READ, c->interrupt_callback, &opts);
+        ret = ffio_open_whitelist(&in, url, AVIO_FLAG_READ, c->interrupt_callback, &opts, s->protocol_whitelist, s->protocol_blacklist);
        av_dict_free(&opts);
        if (ret < 0)
            return ret;
@@ -119,6 +119,8 @@ static int dxa_read_header(AVFormatContext *s)
            avio_skip(pb, fsize);
        }
        c->bpc = (fsize + (int64_t)c->frames - 1) / c->frames;
+        if (c->bpc < 0)
+            return AVERROR_INVALIDDATA;
        if(ast->codecpar->block_align) {
            if (c->bpc > INT_MAX - ast->codecpar->block_align + 1)
                return AVERROR_INVALIDDATA;
@@ -149,4 +149,3 @@ AVOutputFormat ff_fifo_test_muxer = {
    .priv_class     = &failing_muxer_class,
    .flags          = AVFMT_NOFILE | AVFMT_ALLOW_FLUSH,
 };
-
@@ -48,6 +48,31 @@ int av_match_ext(const char *filename, const char *extensions)
    return 0;
 }

+int ff_match_url_ext(const char *url, const char *extensions) /* match the extension found in url's path against `extensions` via av_match_name() */
+{
+    const char *ext;       /* scan cursor; ends on the '.' that starts the extension, if there is one */
+    URLComponents uc;
+    int ret;
+    char scratchpad[128];  /* receives a copy of the extension text that is handed to av_match_name() */
+
+    if (!url)
+        return 0;          /* a NULL url is reported as 0 rather than as an error */
+
+    ret = ff_url_decompose(&uc, url, NULL);
+    if (ret < 0 || !URL_COMPONENT_HAVE(uc, scheme))
+        return ret;        /* decompose failure, or no scheme component: ret is returned unchanged (presumably 0 in the no-scheme case -- confirm) */
+    for (ext = uc.query; *ext != '.' && ext > uc.path; ext--)
+        ;                  /* walk backwards from uc.query until a '.' is hit or uc.path is reached */
+
+    if (*ext != '.')
+        return 0;          /* no '.' anywhere from uc.path up to uc.query */
+    if (uc.query - ext > sizeof(scratchpad))
+        return AVERROR(ENOMEM); //not enough memory in our scratchpad
+    av_strlcpy(scratchpad, ext + 1, FFMIN(sizeof(scratchpad), uc.query - ext)); /* the check above already bounds uc.query - ext, so FFMIN always selects it; the copy starts just past the '.' */
+
+    return av_match_name(scratchpad, extensions);
+}
+
 ff_const59 AVOutputFormat *av_guess_format(const char *short_name, const char *filename,
                                const char *mime_type)
 {
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .3.8
 .3.9