Bump minor versions after branching 4.3

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2020-06-08 22:49:04 +02:00
657 changed files with 5664 additions and 10064 deletions
@@ -1,23 +0,0 @@
-exclude: ^tests/ref/
-
-repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v5.0.0
-  hooks:
-    - id: check-case-conflict
-    - id: check-executables-have-shebangs
-    - id: check-illegal-windows-names
-    - id: check-shebang-scripts-are-executable
-    - id: check-yaml
-    - id: end-of-file-fixer
-    - id: fix-byte-order-marker
-    - id: mixed-line-ending
-    - id: trailing-whitespace
- repo: local
-  hooks:
-    - id: aarch64-asm-indent
-      name: fix aarch64 assembly indentation
-      files: ^.*/aarch64/.*\.S$
-      language: script
-      entry: ./tools/check_arm_indent.sh --apply
-      pass_filenames: false
@@ -1,29 +0,0 @@
-name: Lint
-
-on:
-  push:
-    branches:
-      - release/4.3
-  pull_request:
-
-jobs:
-  lint:
-    name: Pre-Commit
-    runs-on: utilities
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install pre-commit CI
-        id: install
-        run: |
-            python3 -m venv ~/pre-commit
-            ~/pre-commit/bin/pip install --upgrade pip setuptools
-            ~/pre-commit/bin/pip install pre-commit
-            echo "envhash=$({ python3 --version && cat .forgejo/pre-commit/config.yaml; } | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
-      - name: Cache
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pre-commit
-          key: pre-commit-${{ steps.install.outputs.envhash }}
-      - name: Run pre-commit CI
-        run: ~/pre-commit/bin/pre-commit run -c .forgejo/pre-commit/config.yaml --show-diff-on-failure --color=always --all-files
@@ -1,80 +0,0 @@
-name: Test
-
-on:
-  push:
-    branches:
-      - release/4.3
-  pull_request:
-
-jobs:
-  run_fate:
-    name: Fate (${{ matrix.runner }}, ${{ matrix.shared }}, ${{ matrix.bits }} bit)
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux-aarch64]
-        shared: ['static']
-        bits: ['64']
-        include:
-          - runner: linux-amd64
-            shared: 'static'
-            bits: '32'
-          - runner: linux-amd64
-            shared: 'shared'
-            bits: '64'
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Configure
-        run: |
-          ./configure --enable-gpl --enable-nonfree --enable-memory-poisoning --assert-level=2 \
-              $([ "${{ matrix.bits }}" != "32" ] || echo --arch=x86_32 --extra-cflags=-m32 --extra-cxxflags=-m32 --extra-ldflags=-m32) \
-              $([ "${{ matrix.shared }}" != "shared" ] || echo --enable-shared --disable-static) \
-              || CFGRES=$? && CFGRES=$?
-          cat ffbuild/config.log
-          exit $CFGRES
-      - name: Build
-        run: make -j$(nproc)
-      - name: Restore Cached Fate-Suite
-        id: cache
-        uses: actions/cache/restore@v4
-        with:
-          path: fate-suite
-          key: fate-suite
-          restore-keys: |
-            fate-suite-
-      - name: Sync Fate-Suite
-        id: fate
-        run: |
-          make fate-rsync SAMPLES=$PWD/fate-suite
-          echo "hash=$(find fate-suite -type f -printf "%P %s %T@\n" | sort | sha256sum | cut -d' ' -f1)" >> $FORGEJO_OUTPUT
-      - name: Cache Fate-Suite
-        uses: actions/cache/save@v4
-        if: ${{ format('fate-suite-{0}', steps.fate.outputs.hash) != steps.cache.outputs.cache-matched-key }}
-        with:
-          path: fate-suite
-          key: fate-suite-${{ steps.fate.outputs.hash }}
-      - name: Run Fate
-        run: LD_LIBRARY_PATH="$(printf "%s:" "$PWD"/lib*)$PWD" make fate fate-build SAMPLES=$PWD/fate-suite -j$(nproc)
-  compile_only:
-    name: Fate (Win64, Build-Only)
-    strategy:
-      fail-fast: false
-      matrix:
-        image: ["ghcr.io/btbn/ffmpeg-builds/win64-gpl-4.3:latest"]
-    runs-on: linux-amd64
-    container: ${{ matrix.image }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Configure
-        run: |
-          ./configure --pkg-config-flags="--static" $FFBUILD_TARGET_FLAGS $FF_CONFIGURE \
-              --cc="$CC" --cxx="$CXX" --ar="$AR" --ranlib="$RANLIB" --nm="$NM" \
-              --extra-cflags="$FF_CFLAGS" --extra-cxxflags="$FF_CXXFLAGS" \
-              --extra-libs="$FF_LIBS" --extra-ldflags="$FF_LDFLAGS" --extra-ldexeflags="$FF_LDEXEFLAGS"
-      - name: Build
-        run: make -j$(nproc)
-      - name: Run Fate
-        run: make -j$(nproc) fate-build
@@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
-
+
  Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
@@ -111,7 +111,7 @@ modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
-
+
                  GNU LESSER GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

@@ -158,7 +158,7 @@ Library.
  You may charge a fee for the physical act of transferring a copy,
 and you may at your option offer warranty protection in exchange for a
 fee.
-
+
  2. You may modify your copy or copies of the Library or any portion
 of it, thus forming a work based on the Library, and copy and
 distribute such modifications or work under the terms of Section 1
@@ -216,7 +216,7 @@ instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
-
+
  Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
@@ -267,7 +267,7 @@ Library will still fall under Section 6.)
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
-
+
  6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
@@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
-
+
  7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
@@ -370,7 +370,7 @@ subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
-
+
  11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
@@ -422,7 +422,7 @@ conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
-
+
  14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
@@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.

                     END OF TERMS AND CONDITIONS
-
+
           How to Apply These Terms to Your New Libraries

  If you develop a new library, and you want it to be of the greatest
@@ -1,6 +1,6 @@
-See the Git history of the project (https://git.ffmpeg.org/ffmpeg) to
+See the Git history of the project (git://source.ffmpeg.org/ffmpeg) to
 get the names of people who have contributed to FFmpeg.

 To check the log, you can type the command "git log" in the FFmpeg
 source directory, or browse the online repository at
-https://git.ffmpeg.org/ffmpeg
+http://source.ffmpeg.org.
@@ -15,11 +15,3 @@ NOTICE
 ------

 - Non system dependencies (e.g. libx264, libvpx) are disabled by default.
-
-NOTICE for Package Maintainers
------------------------------
-
- - It is recommended to build FFmpeg twice, first with minimal external dependencies so
-   that 3rd party packages, which depend on FFmpegs libavutil/libavfilter/libavcodec/libavformat
-   can then be built. And last build FFmpeg with full dependancies (which may in turn depend on
-   some of these 3rd party packages). This avoids circular dependencies during build.
@@ -577,12 +577,10 @@ wm4
 Releases
 ========

-7.0                                     Michael Niedermayer
-6.1                                     Michael Niedermayer
-5.1                                     Michael Niedermayer
-4.4                                     Michael Niedermayer
-3.4                                     Michael Niedermayer
 2.8                                     Michael Niedermayer
+2.7                                     Michael Niedermayer
+2.6                                     Michael Niedermayer
+2.5                                     Michael Niedermayer

 If you want to maintain an older release, please contact us

@@ -612,7 +610,6 @@ Loren Merritt                 ABD9 08F4 C920 3F65 D8BE 35D7 1540 DAA7 060F 56DE
 Lou Logan (llogan)            7D68 DC73 CBEF EABB 671A B6CF 621C 2E28 82F8 DC3A
 Lynne                         FE50 139C 6805 72CA FD52 1F8D A2FE A5F0 3F03 4464
 Michael Niedermayer           9FF2 128B 147E F673 0BAD F133 611E C787 040B 0FAB
-                              DD1E C9E8 DE08 5C62 9B3E 1846 B18E 8928 B394 8D64
 Nicolas George                24CE 01CE 9ACC 5CEB 74D8 8D9D B063 D997 36E5 4C93
 Nikolay Aleksandrov           8978 1D8C FB71 588E 4B27 EAA8 C4F0 B5FC E011 13B1
 Panagiotis Issaris            6571 13A3 33D9 3726 F728 AA98 F643 B12E ECF3 E029
@@ -1 +1 @@
-4.3.9
+4.2.git
@@ -1,15 +0,0 @@
-
-              ┌────────────────────────────────────┐
-              │ RELEASE NOTES for FFmpeg 4.3 "4:3" │
-              └────────────────────────────────────┘
-
-   The FFmpeg Project proudly presents FFmpeg 4.3 "4:3", about 10
-   months after the release of FFmpeg 4.2.
-
-   A complete Changelog is available at the root of the project, and the
-   complete Git history on https://git.ffmpeg.org/gitweb/ffmpeg.git
-
-   We hope you will like this release as much as we enjoyed working on it, and
-   as usual, if you have any questions about it, or any FFmpeg related topic,
-   feel free to join us on the #ffmpeg IRC channel (on irc.libera.chat) or ask
-   on the mailing-lists.
@@ -532,7 +532,7 @@ die(){

 If you think configure made a mistake, make sure you are using the latest
 version from Git.  If the latest version fails, report the problem to the
-ffmpeg-user@ffmpeg.org mailing list or IRC #ffmpeg on irc.libera.chat.
+ffmpeg-user@ffmpeg.org mailing list or IRC #ffmpeg on irc.freenode.net.
 EOF
    if disabled logging; then
        cat <<EOF
@@ -2330,7 +2330,6 @@ HAVE_LIST="
    opencl_vaapi_intel_media
    perl
    pod2man
-    posix_ioctl
    texi2html
 "

@@ -3237,7 +3236,7 @@ librav1e_encoder_deps="librav1e"
 librav1e_encoder_select="extract_extradata_bsf"
 librsvg_decoder_deps="librsvg"
 libshine_encoder_deps="libshine"
-libshine_encoder_select="audio_frame_queue mpegaudioheader"
+libshine_encoder_select="audio_frame_queue"
 libspeex_decoder_deps="libspeex"
 libspeex_encoder_deps="libspeex"
 libspeex_encoder_select="audio_frame_queue"
@@ -5331,7 +5330,6 @@ case $target_os in
        ;;
    netbsd)
        disable symver
-        enable section_data_rel_ro
        oss_indev_extralibs="-lossaudio"
        oss_outdev_extralibs="-lossaudio"
        enabled gcc || check_ldflags -Wl,-zmuldefs
@@ -5350,7 +5348,6 @@ case $target_os in
        disable symver
        ;;
    freebsd)
-        enable section_data_rel_ro
        ;;
    bsd/os)
        add_extralibs -lpoll -lgnugetopt
@@ -6496,7 +6493,7 @@ fi

 if enabled sdl2; then
    SDL2_CONFIG="${cross_prefix}sdl2-config"
-    test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 3.0.0" SDL_events.h SDL_PollEvent
+    test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent
    if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then
        sdl2_cflags=$("${SDL2_CONFIG}" --cflags)
        sdl2_extralibs=$("${SDL2_CONFIG}" --libs)
@@ -6542,13 +6539,11 @@ perl -v            > /dev/null 2>&1 && enable perl      || disable perl
 pod2man --help     > /dev/null 2>&1 && enable pod2man   || disable pod2man
 rsync --help 2> /dev/null | grep -q 'contimeout' && enable rsync_contimeout || disable rsync_contimeout

-check_headers linux/fb.h
-check_headers linux/videodev2.h
-test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
-test_code cc sys/ioctl.h "int ioctl(int, int, ...)" && enable posix_ioctl
-
 # check V4L2 codecs available in the API
 if enabled v4l2_m2m; then
+    check_headers linux/fb.h
+    check_headers linux/videodev2.h
+    test_code cc linux/videodev2.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
    check_cc v4l2_m2m linux/videodev2.h "int i = V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M | V4L2_BUF_FLAG_LAST;"
    check_cc vc1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VC1_ANNEX_G;"
    check_cc mpeg1_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG1;"
@@ -6593,7 +6588,7 @@ enabled alsa && { check_pkg_config alsa alsa "alsa/asoundlib.h" snd_pcm_htimesta
 enabled libjack &&
    require_pkg_config libjack jack jack/jack.h jack_port_get_latency_range

-enabled sndio && check_pkg_config sndio sndio sndio.h sio_open
+enabled sndio && check_lib sndio sndio.h sio_open -lsndio

 if enabled libcdio; then
    check_pkg_config libcdio libcdio_paranoia "cdio/cdda.h cdio/paranoia.h" cdio_cddap_open ||
@@ -6690,7 +6685,7 @@ enabled vulkan &&

 if enabled x86; then
    case $target_os in
-        freebsd|mingw32*|mingw64*|win32|win64|linux|cygwin*)
+        mingw32*|mingw64*|win32|win64|linux|cygwin*)
            ;;
        *)
            disable ffnvcodec cuvid nvdec nvenc
@@ -7518,7 +7513,7 @@ cat > $TMPH <<EOF
 #define FFMPEG_CONFIG_H
 #define FFMPEG_CONFIGURATION "$(c_escape $FFMPEG_CONFIGURATION)"
 #define FFMPEG_LICENSE "$(c_escape $license)"
-#define CONFIG_THIS_YEAR 2025
+#define CONFIG_THIS_YEAR 2020
 #define FFMPEG_DATADIR "$(eval c_escape $datadir)"
 #define AVCONV_DATADIR "$(eval c_escape $datadir)"
 #define CC_IDENT "$(c_escape ${cc_ident:-Unknown compiler})"
@@ -38,7 +38,7 @@ PROJECT_NAME           = FFmpeg
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 4.3.9
+PROJECT_NUMBER         =

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -3,9 +3,9 @@
 The FFmpeg developers.

 For details about the authorship, see the Git history of the project
-(https://git.ffmpeg.org/ffmpeg), e.g. by typing the command
+(git://source.ffmpeg.org/ffmpeg), e.g. by typing the command
@command{git log} in the FFmpeg source directory, or browsing the
-online repository at @url{https://git.ffmpeg.org/ffmpeg}.
+online repository at @url{http://source.ffmpeg.org}.

 Maintainers for the specific components are listed in the file
@file{MAINTAINERS} in the source code tree.
@@ -63,3 +63,4 @@ make -j<num>
 make -k
    Continue build in case of errors, this is useful for the regression tests
    sometimes but note that it will still not run all reg tests.
+
@@ -317,7 +317,7 @@ list are dropped. You may use the special @code{*} string to match all pages,
 or @code{subtitle} to match all subtitle pages.
 Default value is *.
@item txt_default_region
-Set default character set used for decoding, a value between 0 and 87 (see
+Set default G0 character set used for decoding, a value between 0 and 80 (see
 ETS 300 706, Section 15, Table 32). Default value is -1, which does not
 override the libzvbi default. This option is needed for some legacy level 1.0
 transmissions which cannot signal the proper charset.
@@ -327,13 +327,6 @@ segment index to start live streams at (negative values are from the end).
@item allowed_extensions
 ',' separated list of file extensions that hls is allowed to access.

-@item extension_picky
-This blocks disallowed extensions from probing
-It also requires all available segments to have matching extensions to the format
-except mpegts, which is always allowed.
-It is recommended to set the whitelists correctly instead of depending on extensions
-Enabled by default.
-
@item max_reload
 Maximum number of times a insufficient list is attempted to be reloaded.
 Default value is 1000.
@@ -762,25 +762,6 @@ In case you need finer control over how valgrind is invoked, use the
@code{--target-exec='valgrind <your_custom_valgrind_options>} option in
 your configure line instead.

-@anchor{Maintenance}
-@chapter Maintenance process
-
-@anchor{MAINTAINERS}
-@section MAINTAINERS
-
-The developers maintaining each part of the codebase are listed in @file{MAINTAINERS}.
-Being listed in @file{MAINTAINERS}, gives one the right to have git write access to
-the specific repository.
-
-@anchor{Becoming a maintainer}
-@section Becoming a maintainer
-
-People add themselves to @file{MAINTAINERS} by sending a patch like any other code
-change. These get reviewed by the community like any other patch. It is expected
-that, if someone has an objection to a new maintainer, she is willing to object
-in public with her full name and is willing to take over maintainership for the area.
-
-
@anchor{Release process}
@chapter Release process

@@ -137,9 +137,11 @@ static int decode_packet(AVCodecContext *dec, const AVPacket *pkt)
            ret = output_audio_frame(frame);

        av_frame_unref(frame);
+        if (ret < 0)
+            return ret;
    }

-    return ret;
+    return 0;
 }

 static int open_codec_context(int *stream_idx,
@@ -221,8 +221,10 @@ static int dec_enc(AVPacket *pkt, AVCodec *enc_codec)

 fail:
        av_frame_free(&frame);
+        if (ret < 0)
+            return ret;
    }
-    return ret;
+    return 0;
 }

 int main(int argc, char **argv)
@@ -53,7 +53,7 @@ Most distribution and operating system provide a package for it.
@section Cloning the source tree

@example
-git clone https://git.ffmpeg.org/ffmpeg.git <target>
+git clone git://source.ffmpeg.org/ffmpeg <target>
@end example

 This will put the FFmpeg sources into the directory @var{<target>}.
@@ -187,18 +187,11 @@ to make sure you don't have untracked files or deletions.
 git add [-i|-p|-A] <filenames/dirnames>
@end example

-Make sure you have told Git your name, email address and GPG key
+Make sure you have told Git your name and email address

@example
 git config --global user.name "My Name"
 git config --global user.email my@@email.invalid
-git config --global user.signingkey ABCDEF0123245
-@end example
-
-Enable signing all commits or use -S
-
-@example
-git config --global commit.gpgsign true
@end example

 Use @option{--global} to set the global configuration for all your Git checkouts.
@@ -400,19 +393,6 @@ git checkout -b svn_23456 $SHA1
 where @var{$SHA1} is the commit hash from the @command{git log} output.


-@chapter gpg key generation
-
-If you have no gpg key yet, we recommend that you create a ed25519 based key as it
-is small, fast and secure. Especially it results in small signatures in git.
-
-@example
-gpg --default-new-key-algo "ed25519/cert,sign+cv25519/encr" --quick-generate-key "human@@server.com"
-@end example
-
-When generating a key, make sure the email specified matches the email used in git as some sites like
-github consider mismatches a reason to declare such commits unverified. After generating a key you
-can add it to the MAINTAINER file and upload it to a keyserver.
-
@chapter Pre-push checklist

 Once you have a set of commits that you feel are ready for pushing,
@@ -157,3 +157,4 @@ PFD[32]   would for example be signed 32 bit little-endian IEEE float
@item XVID @tab non-compliant MPEG-4 generated by old Xvid
@item XVIX @tab non-compliant MPEG-4 generated by old Xvid with interlacing bug
@end multitable
+
@@ -20,45 +20,8 @@
 # License along with FFmpeg; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

-# Texinfo 7.0 changed the syntax of various functions.
-# Provide a shim for older versions.
-sub ff_set_from_init_file($$) {
-    my $key = shift;
-    my $value = shift;
-    if (exists &{'texinfo_set_from_init_file'}) {
-        texinfo_set_from_init_file($key, $value);
-    } else {
-        set_from_init_file($key, $value);
-    }
-}
-
-sub ff_get_conf($) {
-    my $key = shift;
-    if (exists &{'texinfo_get_conf'}) {
-        texinfo_get_conf($key);
-    } else {
-        get_conf($key);
-    }
-}
-
-sub get_formatting_function($$) {
-    my $obj = shift;
-    my $func = shift;
-
-    my $sub = $obj->can('formatting_function');
-    if ($sub) {
-        return $obj->formatting_function($func);
-    } else {
-        return $obj->{$func};
-    }
-}
-
-# determine texinfo version
-my $program_version_num = version->declare(ff_get_conf('PACKAGE_VERSION'))->numify;
-my $program_version_6_8 = $program_version_num >= 6.008000;
-
 # no navigation elements
-ff_set_from_init_file('HEADERS', 0);
+set_from_init_file('HEADERS', 0);

 sub ffmpeg_heading_command($$$$$)
 {
@@ -92,7 +55,7 @@ sub ffmpeg_heading_command($$$$$)
        $element = $command->{'parent'};
    }
    if ($element) {
-        $result .= &{get_formatting_function($self, 'format_element_header')}($self, $cmdname,
+        $result .= &{$self->{'format_element_header'}}($self, $cmdname,
                                                       $command, $element);
    }

@@ -149,11 +112,7 @@ sub ffmpeg_heading_command($$$$$)
                $cmdname
                    = $Texinfo::Common::level_to_structuring_command{$cmdname}->[$heading_level];
            }
-            # format_heading_text expects an array of headings for texinfo >= 7.0
-            if ($program_version_num >= 7.000000) {
-                $heading = [$heading];
-            }
-            $result .= &{get_formatting_function($self,'format_heading_text')}(
+            $result .= &{$self->{'format_heading_text'}}(
                        $self, $cmdname, $heading,
                        $heading_level +
                        $self->get_conf('CHAPTER_HEADER_LEVEL') - 1, $command);
@@ -168,18 +127,14 @@ foreach my $command (keys(%Texinfo::Common::sectioning_commands), 'node') {
 }

 # print the TOC where @contents is used
-if ($program_version_6_8) {
-    ff_set_from_init_file('CONTENTS_OUTPUT_LOCATION', 'inline');
-} else {
-    ff_set_from_init_file('INLINE_CONTENTS', 1);
-}
+set_from_init_file('INLINE_CONTENTS', 1);

 # make chapters <h2>
-ff_set_from_init_file('CHAPTER_HEADER_LEVEL', 2);
+set_from_init_file('CHAPTER_HEADER_LEVEL', 2);

 # Do not add <hr>
-ff_set_from_init_file('DEFAULT_RULE', '');
-ff_set_from_init_file('BIG_RULE', '');
+set_from_init_file('DEFAULT_RULE', '');
+set_from_init_file('BIG_RULE', '');

 # Customized file beginning
 sub ffmpeg_begin_file($$$)
@@ -196,18 +151,7 @@ sub ffmpeg_begin_file($$$)
    my ($title, $description, $encoding, $date, $css_lines,
        $doctype, $bodytext, $copying_comment, $after_body_open,
        $extra_head, $program_and_version, $program_homepage,
-        $program, $generator);
-    if ($program_version_num >= 7.000000) {
-        ($title, $description, $encoding, $date, $css_lines,
-         $doctype, $bodytext, $copying_comment, $after_body_open,
-         $extra_head, $program_and_version, $program_homepage,
-         $program, $generator) = $self->_file_header_information($command);
-    } else {
-        ($title, $description, $encoding, $date, $css_lines,
-         $doctype, $bodytext, $copying_comment, $after_body_open,
-         $extra_head, $program_and_version, $program_homepage,
-         $program, $generator) = $self->_file_header_informations($command);
-    }
+        $program, $generator) = $self->_file_header_informations($command);

    my $links = $self->_get_links ($filename, $element);

@@ -240,11 +184,7 @@ EOT

    return $head1 . $head_title . $head2 . $head_title . $head3;
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_begin_file', \&ffmpeg_begin_file);
-} else {
-    texinfo_register_formatting_function('begin_file', \&ffmpeg_begin_file);
-}
+texinfo_register_formatting_function('begin_file', \&ffmpeg_begin_file);

 sub ffmpeg_program_string($)
 {
@@ -261,17 +201,13 @@ sub ffmpeg_program_string($)
      $self->gdt('This document was generated automatically.'));
  }
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_program_string', \&ffmpeg_program_string);
-} else {
-    texinfo_register_formatting_function('program_string', \&ffmpeg_program_string);
-}
+texinfo_register_formatting_function('program_string', \&ffmpeg_program_string);

 # Customized file ending
 sub ffmpeg_end_file($)
 {
    my $self = shift;
-    my $program_string = &{get_formatting_function($self,'format_program_string')}($self);
+    my $program_string = &{$self->{'format_program_string'}}($self);
    my $program_text = <<EOT;
      <p style="font-size: small;">
        $program_string
@@ -284,15 +220,11 @@ EOT
 EOT
    return $program_text . $footer;
 }
-if ($program_version_6_8) {
-    texinfo_register_formatting_function('format_end_file', \&ffmpeg_end_file);
-} else {
-    texinfo_register_formatting_function('end_file', \&ffmpeg_end_file);
-}
+texinfo_register_formatting_function('end_file', \&ffmpeg_end_file);

 # Dummy title command
 # Ignore title. Title is handled through ffmpeg_begin_file().
-ff_set_from_init_file('USE_TITLEPAGE_FOR_TITLE', 1);
+set_from_init_file('USE_TITLEPAGE_FOR_TITLE', 1);
 sub ffmpeg_title($$$$)
 {
    return '';
@@ -310,14 +242,8 @@ sub ffmpeg_float($$$$$)
    my $args = shift;
    my $content = shift;

-    my ($caption, $prepended);
-    if ($program_version_num >= 7.000000) {
-        ($caption, $prepended) = Texinfo::Convert::Converter::float_name_caption($self,
-                                                                                 $command);
-    } else {
-        ($caption, $prepended) = Texinfo::Common::float_name_caption($self,
-                                                                     $command);
-    }
+    my ($caption, $prepended) = Texinfo::Common::float_name_caption($self,
+                                                                $command);
    my $caption_text = '';
    my $prepended_text;
    my $prepended_save = '';
@@ -389,13 +315,8 @@ sub ffmpeg_float($$$$$)
            $caption->{'args'}->[0], 'float caption');
    }
    if ($prepended_text.$caption_text ne '') {
-        if ($program_version_num >= 7.000000) {
-            $prepended_text = $self->html_attribute_class('div',['float-caption']). '>'
-                    . $prepended_text;
-        } else {
-            $prepended_text = $self->_attribute_class('div','float-caption'). '>'
-                    . $prepended_text;
-        }
+        $prepended_text = $self->_attribute_class('div','float-caption'). '>'
+                . $prepended_text;
        $caption_text .= '</div>';
    }
    my $html_class = '';
@@ -408,13 +329,8 @@ sub ffmpeg_float($$$$$)
        $prepended_text = '';
        $caption_text   = '';
    }
-    if ($program_version_num >= 7.000000) {
-        return $self->html_attribute_class('div', [$html_class]). '>' . "\n" .
-            $prepended_text . $caption_text . $content . '</div>';
-    } else {
-        return $self->_attribute_class('div', $html_class). '>' . "\n" .
-            $prepended_text . $caption_text . $content . '</div>';
-    }
+    return $self->_attribute_class('div', $html_class). '>' . "\n" .
+        $prepended_text . $caption_text . $content . '</div>';
 }

 texinfo_register_command_formatting('float',
@@ -44,3 +44,4 @@ a+b*c;
 here the reader knows that a,b,c are meant to be signed integers but for C
 standard compliance / to avoid undefined behavior they are stored in unsigned
 ints.
+
@@ -418,4 +418,4 @@ done:

 When all of this is done, you can submit your patch to the ffmpeg-devel
 mailing-list for review.  If you need any help, feel free to come on our IRC
-channel, #ffmpeg-devel on irc.libera.chat.
+channel, #ffmpeg-devel on irc.freenode.net.
@@ -1,5 +1,3 @@
-#!/bin/sh
-
 toupper(){
    echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
@@ -538,7 +538,7 @@ static const AVOption *opt_find(void *obj, const char *name, const char *unit,
    return o;
 }

-#define FLAGS ((o->type == AV_OPT_TYPE_FLAGS && (arg[0]=='-' || arg[0]=='+')) ? AV_DICT_APPEND : 0)
+#define FLAGS ((o->type == AV_OPT_TYPE_FLAGS && (arg[0]=='-' || arg[0]=='+')) ? AV_DICT_APPEND : 0) /* whole expansion parenthesized so the ?: cannot rebind inside a larger expression */
 int opt_default(void *optctx, const char *opt, const char *arg)
 {
    const AVOption *o;
@@ -468,9 +468,8 @@ static int read_key(void)
        }
        //Read it
        if(nchars != 0) {
-            if (read(0, &ch, 1) == 1)
-                return ch;
-            return 0;
+            read(0, &ch, 1);
+            return ch;
        }else{
            return -1;
        }
@@ -529,7 +528,6 @@ static void ffmpeg_cleanup(int ret)
        for (j = 0; j < fg->nb_outputs; j++) {
            OutputFilter *ofilter = fg->outputs[j];

-            avfilter_inout_free(&ofilter->out_tmp);
            av_freep(&ofilter->name);
            av_freep(&ofilter->formats);
            av_freep(&ofilter->channel_layouts);
@@ -1151,8 +1151,6 @@ static void video_audio_display(VideoState *s)
        if (realloc_texture(&s->vis_texture, SDL_PIXELFORMAT_ARGB8888, s->width, s->height, SDL_BLENDMODE_NONE, 1) < 0)
            return;

-        if (s->xpos >= s->width)
-            s->xpos = 0;
        nb_display_channels= FFMIN(nb_display_channels, 2);
        if (rdft_bits != s->rdft_bits) {
            av_rdft_end(s->rdft);
@@ -1202,6 +1200,8 @@ static void video_audio_display(VideoState *s)
        }
        if (!s->paused)
            s->xpos++;
+        if (s->xpos >= s->width)
+            s->xpos= s->xleft;
    }
 }

@@ -131,8 +131,8 @@ static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
            u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
            v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
            memcpy(y, y_temp, sizeof(*y) * (width - x));
-            memcpy(u, u_temp, sizeof(*u) * ((width - x + 1) / 2));
-            memcpy(v, v_temp, sizeof(*v) * ((width - x + 1) / 2));
+            memcpy(u, u_temp, sizeof(*u) * (width - x + 1) / 2);
+            memcpy(v, v_temp, sizeof(*v) * (width - x + 1) / 2);
        }

        line_end += stride;
@@ -498,8 +498,8 @@ static int decode_i_block(FourXContext *f, int16_t *block)
 {
    int code, i, j, level, val;

-    if (get_bits_left(&f->pre_gb) < 2) {
-        av_log(f->avctx, AV_LOG_ERROR, "%d bits left before decode_i_block()\n", get_bits_left(&f->pre_gb));
+    if (get_bits_left(&f->gb) < 2){
+        av_log(f->avctx, AV_LOG_ERROR, "%d bits left before decode_i_block()\n", get_bits_left(&f->gb));
        return AVERROR_INVALIDDATA;
    }

@@ -885,8 +885,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
        }

        if (i >= CFRAME_BUFFER_COUNT) {
-            if (free_index < 0)
-                return AVERROR_INVALIDDATA;
            i             = free_index;
            f->cfrm[i].id = id;
        }
@@ -70,9 +70,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
    unsigned char *planemap = c->planemap;
    int ret;

-    if (buf_size < planes * height *2)
-        return AVERROR_INVALIDDATA;
-
    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
        return ret;

@@ -1174,7 +1174,7 @@ SKIPHEADERS-$(CONFIG_QSV)              += qsv.h qsv_internal.h
 SKIPHEADERS-$(CONFIG_QSVDEC)           += qsvdec.h
 SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h
 SKIPHEADERS-$(CONFIG_XVMC)             += xvmc.h
-SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_hevc.h vaapi_encode.h
+SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_encode.h
 SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h
 SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vt_internal.h
 SKIPHEADERS-$(CONFIG_V4L2_M2M)         += v4l2_buffers.h v4l2_context.h v4l2_m2m.h
@@ -407,7 +407,6 @@ AVCodec ff_a64multi_encoder = {
    .close          = a64multi_close_encoder,
    .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
 #endif
 #if CONFIG_A64MULTI5_ENCODER
@@ -422,6 +421,5 @@ AVCodec ff_a64multi5_encoder = {
    .close          = a64multi_close_encoder,
    .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
 #endif
@@ -843,25 +843,25 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
                                                    sce0->ics.swb_sizes[g],
                                                    sce0->sf_idx[w*16+g],
                                                    sce0->band_type[w*16+g],
-                                                    lambda / (band0->threshold + FLT_MIN), INFINITY, &b1, NULL, 0);
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                    R34,
                                                    sce1->ics.swb_sizes[g],
                                                    sce1->sf_idx[w*16+g],
                                                    sce1->band_type[w*16+g],
-                                                    lambda / (band1->threshold + FLT_MIN), INFINITY, &b2, NULL, 0);
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
                        dist2 += quantize_band_cost(s, M,
                                                    M34,
                                                    sce0->ics.swb_sizes[g],
                                                    mididx,
                                                    midcb,
-                                                    lambda / (minthr + FLT_MIN), INFINITY, &b3, NULL, 0);
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
                        dist2 += quantize_band_cost(s, S,
                                                    S34,
                                                    sce1->ics.swb_sizes[g],
                                                    sididx,
                                                    sidcb,
-                                                    mslambda / (minthr * bmax + FLT_MIN), INFINITY, &b4, NULL, 0);
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
                        B0 += b1+b2;
                        B1 += b3+b4;
                        dist1 -= b1+b2;
@@ -155,9 +155,9 @@ static void vector_pow43(int *coefs, int len)
    for (i=0; i<len; i++) {
        coef = coefs[i];
        if (coef < 0)
-            coef = -(int)ff_cbrt_tab_fixed[(-coef) & 8191];
+            coef = -(int)ff_cbrt_tab_fixed[-coef];
        else
-            coef =  (int)ff_cbrt_tab_fixed[  coef  & 8191];
+            coef = (int)ff_cbrt_tab_fixed[coef];
        coefs[i] = coef;
    }
 }
@@ -974,18 +974,14 @@ static int decode_audio_specific_config_gb(AACContext *ac,
 {
    int i, ret;
    GetBitContext gbc = *gb;
-    MPEG4AudioConfig m4ac_bak = *m4ac;

-    if ((i = ff_mpeg4audio_get_config_gb(m4ac, &gbc, sync_extension, avctx)) < 0) {
-        *m4ac = m4ac_bak;
+    if ((i = ff_mpeg4audio_get_config_gb(m4ac, &gbc, sync_extension, avctx)) < 0)
        return AVERROR_INVALIDDATA;
-    }

    if (m4ac->sampling_index > 12) {
        av_log(avctx, AV_LOG_ERROR,
               "invalid sampling rate index %d\n",
               m4ac->sampling_index);
-        *m4ac = m4ac_bak;
        return AVERROR_INVALIDDATA;
    }
    if (m4ac->object_type == AOT_ER_AAC_LD &&
@@ -993,7 +989,6 @@ static int decode_audio_specific_config_gb(AACContext *ac,
        av_log(avctx, AV_LOG_ERROR,
               "invalid low delay sampling rate index %d\n",
               m4ac->sampling_index);
-        *m4ac = m4ac_bak;
        return AVERROR_INVALIDDATA;
    }

@@ -2812,7 +2807,7 @@ static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)

 static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
 {
-    UINTFLOAT *in   = sce->coeffs;
+    INTFLOAT *in    = sce->coeffs;
    INTFLOAT *out   = sce->ret;
    INTFLOAT *saved = sce->saved;
    INTFLOAT *buf  = ac->buf_mdct;
@@ -28,7 +28,6 @@
 *              TODOs:
 * add sane pulse detection
 ***********************************/
-#include <float.h>

 #include "libavutil/libm.h"
 #include "libavutil/thread.h"
@@ -857,7 +856,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                /* Not so fast though */
                ratio = sqrtf(ratio);
            }
-            s->lambda = av_clipf(s->lambda * ratio, FLT_EPSILON, 65536.f);
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);

            /* Keep iterating if we must reduce and lambda is in the sky */
            if (ratio > 0.9f && ratio < 1.1f) {
@@ -902,7 +901,7 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
    AACEncContext *s = avctx->priv_data;

-    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_count ? s->lambda_sum / s->lambda_count : NAN);
+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);

    ff_mdct_end(&s->mdct1024);
    ff_mdct_end(&s->mdct128);
@@ -173,7 +173,6 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
                      sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
    const int sfb_len = sfb_end - sfb_start;
    const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
-    const int n_filt = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;

    if (coef_len <= 0 || sfb_len <= 0) {
        sce->tns.present = 0;
@@ -181,30 +180,16 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
    }

    for (w = 0; w < sce->ics.num_windows; w++) {
-        float en[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        int oc_start = 0;
+        float en[2] = {0.0f, 0.0f};
+        int oc_start = 0, os_start = 0;
        int coef_start = sce->ics.swb_offset[sfb_start];

-        if (n_filt == 2) {
-            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
-                    if (g > sfb_start + (sfb_len/2))
-                        en[1] += band->energy; /* End */
-                    else
-                        en[0] += band->energy; /* Start */
-            }
-            en[2] = en[0];
-        } else {
-            for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
-                    if (g > sfb_start + (sfb_len/2) + (sfb_len/4))
-                        en[2] += band->energy; /* End */
-                    else if (g > sfb_start + (sfb_len/2) - (sfb_len/4))
-                        en[1] += band->energy; /* Middle */
-                    else
-                        en[0] += band->energy; /* Start */
-            }
-            en[3] = en[0];
+        for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+            FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+            if (g > sfb_start + (sfb_len/2))
+                en[1] += band->energy;
+            else
+                en[0] += band->energy;
        }

        /* LPC */
@@ -214,14 +199,15 @@ void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
        if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
            continue;

-        tns->n_filt[w] = n_filt;
+        tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
        for (g = 0; g < tns->n_filt[w]; g++) {
-            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[g + 1];
-            tns->order[w][g] = order/tns->n_filt[w];
-            tns->length[w][g] = sfb_len/tns->n_filt[w];
+            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[!g];
+            tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start;
+            tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
            quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
                            tns->order[w][g], c_bits);
            oc_start += tns->order[w][g];
+            os_start += tns->length[w][g];
        }
        count++;
    }
@@ -308,9 +308,6 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
    const int bandwidth    = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
    const float num_bark   = calc_bark((float)bandwidth);

-    if (bandwidth <= 0)
-        return AVERROR(EINVAL);
-
    ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
    if (!ctx->model_priv_data)
        return AVERROR(ENOMEM);
@@ -797,7 +794,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,

        if (pe < 1.15f * desired_pe) {
            /* 6.6.1.3.6 "Final threshold modification by linearization" */
-            norm_fac = norm_fac ? 1.0f / norm_fac : 0;
+            norm_fac = 1.0f / norm_fac;
            for (w = 0; w < wi->num_windows*16; w += 16) {
                for (g = 0; g < num_bands; g++) {
                    AacPsyBand *band = &pch->band[w+g];
@@ -592,7 +592,6 @@ static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)

    if (sbr->n_q > 5) {
        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
-        sbr->n_q = 1;
        return -1;
    }

@@ -19,130 +19,130 @@
 #include "libavutil/aarch64/asm.S"

 function ff_ps_add_squares_neon, export=1
-1:      ld1             {v0.4s,v1.4s}, [x1], #32
-        fmul            v0.4s, v0.4s, v0.4s
-        fmul            v1.4s, v1.4s, v1.4s
-        faddp           v2.4s, v0.4s, v1.4s
-        ld1             {v3.4s}, [x0]
-        fadd            v3.4s, v3.4s, v2.4s
-        st1             {v3.4s}, [x0], #16
-        subs            w2, w2, #4
-        b.gt            1b
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        fmul        v0.4S, v0.4S, v0.4S
+        fmul        v1.4S, v1.4S, v1.4S
+        faddp       v2.4S, v0.4S, v1.4S
+        ld1         {v3.4S}, [x0]
+        fadd        v3.4S, v3.4S, v2.4S
+        st1         {v3.4S}, [x0], #16
+        subs        w2, w2, #4
+        b.gt        1b
        ret
 endfunc

 function ff_ps_mul_pair_single_neon, export=1
-1:      ld1             {v0.4s,v1.4s}, [x1], #32
-        ld1             {v2.4s},       [x2], #16
-        zip1            v3.4s, v2.4s, v2.4s
-        zip2            v4.4s, v2.4s, v2.4s
-        fmul            v0.4s, v0.4s, v3.4s
-        fmul            v1.4s, v1.4s, v4.4s
-        st1             {v0.4s,v1.4s}, [x0], #32
-        subs            w3, w3, #4
-        b.gt            1b
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        ld1         {v2.4S},       [x2], #16
+        zip1        v3.4S, v2.4S, v2.4S
+        zip2        v4.4S, v2.4S, v2.4S
+        fmul        v0.4S, v0.4S, v3.4S
+        fmul        v1.4S, v1.4S, v4.4S
+        st1         {v0.4S,v1.4S}, [x0], #32
+        subs        w3, w3, #4
+        b.gt        1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_neon, export=1
-        ld1             {v0.4s}, [x2]
-        ld1             {v1.4s}, [x3]
-        zip1            v4.4s, v0.4s, v0.4s
-        zip2            v5.4s, v0.4s, v0.4s
-        zip1            v6.4s, v1.4s, v1.4s
-        zip2            v7.4s, v1.4s, v1.4s
-1:      ld1             {v2.2s}, [x0]
-        ld1             {v3.2s}, [x1]
-        fadd            v4.4s, v4.4s, v6.4s
-        fadd            v5.4s, v5.4s, v7.4s
-        mov             v2.d[1], v2.d[0]
-        mov             v3.d[1], v3.d[0]
-        fmul            v2.4s, v2.4s, v4.4s
-        fmla            v2.4s, v3.4s, v5.4s
-        st1             {v2.d}[0], [x0], #8
-        st1             {v2.d}[1], [x1], #8
-        subs            w4, w4, #1
-        b.gt            1b
+        ld1         {v0.4S}, [x2]
+        ld1         {v1.4S}, [x3]
+        zip1        v4.4S, v0.4S, v0.4S
+        zip2        v5.4S, v0.4S, v0.4S
+        zip1        v6.4S, v1.4S, v1.4S
+        zip2        v7.4S, v1.4S, v1.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v4.4S, v4.4S, v6.4S
+        fadd        v5.4S, v5.4S, v7.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v2.4S, v2.4S, v4.4S
+        fmla        v2.4S, v3.4S, v5.4S
+        st1         {v2.D}[0], [x0], #8
+        st1         {v2.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
        ret
 endfunc

 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
-        ld1             {v0.4s,v1.4s}, [x2]
-        ld1             {v6.4s,v7.4s}, [x3]
-        fneg            v2.4s, v1.4s
-        fneg            v3.4s, v7.4s
-        zip1            v16.4s, v0.4s, v0.4s
-        zip2            v17.4s, v0.4s, v0.4s
-        zip1            v18.4s, v2.4s, v1.4s
-        zip2            v19.4s, v2.4s, v1.4s
-        zip1            v20.4s, v6.4s, v6.4s
-        zip2            v21.4s, v6.4s, v6.4s
-        zip1            v22.4s, v3.4s, v7.4s
-        zip2            v23.4s, v3.4s, v7.4s
-1:      ld1             {v2.2s}, [x0]
-        ld1             {v3.2s}, [x1]
-        fadd            v16.4s, v16.4s, v20.4s
-        fadd            v17.4s, v17.4s, v21.4s
-        mov             v2.d[1], v2.d[0]
-        mov             v3.d[1], v3.d[0]
-        fmul            v4.4s, v2.4s, v16.4s
-        fmla            v4.4s, v3.4s, v17.4s
-        fadd            v18.4s, v18.4s, v22.4s
-        fadd            v19.4s, v19.4s, v23.4s
-        ext             v2.16b, v2.16b, v2.16b, #4
-        ext             v3.16b, v3.16b, v3.16b, #4
-        fmla            v4.4s, v2.4s, v18.4s
-        fmla            v4.4s, v3.4s, v19.4s
-        st1             {v4.d}[0], [x0], #8
-        st1             {v4.d}[1], [x1], #8
-        subs            w4, w4, #1
-        b.gt            1b
+        ld1         {v0.4S,v1.4S}, [x2]
+        ld1         {v6.4S,v7.4S}, [x3]
+        fneg        v2.4S, v1.4S
+        fneg        v3.4S, v7.4S
+        zip1        v16.4S, v0.4S, v0.4S
+        zip2        v17.4S, v0.4S, v0.4S
+        zip1        v18.4S, v2.4S, v1.4S
+        zip2        v19.4S, v2.4S, v1.4S
+        zip1        v20.4S, v6.4S, v6.4S
+        zip2        v21.4S, v6.4S, v6.4S
+        zip1        v22.4S, v3.4S, v7.4S
+        zip2        v23.4S, v3.4S, v7.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v16.4S, v16.4S, v20.4S
+        fadd        v17.4S, v17.4S, v21.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v4.4S, v2.4S, v16.4S
+        fmla        v4.4S, v3.4S, v17.4S
+        fadd        v18.4S, v18.4S, v22.4S
+        fadd        v19.4S, v19.4S, v23.4S
+        ext         v2.16B, v2.16B, v2.16B, #4
+        ext         v3.16B, v3.16B, v3.16B, #4
+        fmla        v4.4S, v2.4S, v18.4S
+        fmla        v4.4S, v3.4S, v19.4S
+        st1         {v4.D}[0], [x0], #8
+        st1         {v4.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
        ret
 endfunc

 function ff_ps_hybrid_analysis_neon, export=1
-        lsl             x3, x3, #3
-        ld2             {v0.4s,v1.4s}, [x1], #32
-        ld2             {v2.2s,v3.2s}, [x1], #16
-        ld1             {v24.2s},      [x1], #8
-        ld2             {v4.2s,v5.2s}, [x1], #16
-        ld2             {v6.4s,v7.4s}, [x1]
-        rev64           v6.4s, v6.4s
-        rev64           v7.4s, v7.4s
-        ext             v6.16b, v6.16b, v6.16b, #8
-        ext             v7.16b, v7.16b, v7.16b, #8
-        rev64           v4.2s, v4.2s
-        rev64           v5.2s, v5.2s
-        mov             v2.d[1], v3.d[0]
-        mov             v4.d[1], v5.d[0]
-        mov             v5.d[1], v2.d[0]
-        mov             v3.d[1], v4.d[0]
-        fadd            v16.4s, v0.4s, v6.4s
-        fadd            v17.4s, v1.4s, v7.4s
-        fsub            v18.4s, v1.4s, v7.4s
-        fsub            v19.4s, v0.4s, v6.4s
-        fadd            v22.4s, v2.4s, v4.4s
-        fsub            v23.4s, v5.4s, v3.4s
-        trn1            v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5}
-        trn2            v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7}
-1:      ld2             {v2.4s,v3.4s}, [x2], #32
-        ld2             {v4.2s,v5.2s}, [x2], #16
-        ld1             {v6.2s},       [x2], #8
-        add             x2, x2, #8
-        mov             v4.d[1], v5.d[0]
-        mov             v6.s[1], v6.s[0]
-        fmul            v6.2s, v6.2s, v24.2s
-        fmul            v0.4s, v2.4s, v16.4s
-        fmul            v1.4s, v2.4s, v17.4s
-        fmls            v0.4s, v3.4s, v18.4s
-        fmla            v1.4s, v3.4s, v19.4s
-        fmla            v0.4s, v4.4s, v20.4s
-        fmla            v1.4s, v4.4s, v21.4s
-        faddp           v0.4s, v0.4s, v1.4s
-        faddp           v0.4s, v0.4s, v0.4s
-        fadd            v0.2s, v0.2s, v6.2s
-        st1             {v0.2s}, [x0], x3
-        subs            w4, w4, #1
-        b.gt            1b
+        lsl         x3, x3, #3
+        ld2         {v0.4S,v1.4S}, [x1], #32
+        ld2         {v2.2S,v3.2S}, [x1], #16
+        ld1         {v24.2S},      [x1], #8
+        ld2         {v4.2S,v5.2S}, [x1], #16
+        ld2         {v6.4S,v7.4S}, [x1]
+        rev64       v6.4S, v6.4S
+        rev64       v7.4S, v7.4S
+        ext         v6.16B, v6.16B, v6.16B, #8
+        ext         v7.16B, v7.16B, v7.16B, #8
+        rev64       v4.2S, v4.2S
+        rev64       v5.2S, v5.2S
+        mov         v2.D[1], v3.D[0]
+        mov         v4.D[1], v5.D[0]
+        mov         v5.D[1], v2.D[0]
+        mov         v3.D[1], v4.D[0]
+        fadd        v16.4S, v0.4S, v6.4S
+        fadd        v17.4S, v1.4S, v7.4S
+        fsub        v18.4S, v1.4S, v7.4S
+        fsub        v19.4S, v0.4S, v6.4S
+        fadd        v22.4S, v2.4S, v4.4S
+        fsub        v23.4S, v5.4S, v3.4S
+        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2         {v2.4S,v3.4S}, [x2], #32
+        ld2         {v4.2S,v5.2S}, [x2], #16
+        ld1         {v6.2S},       [x2], #8
+        add         x2, x2, #8
+        mov         v4.D[1], v5.D[0]
+        mov         v6.S[1], v6.S[0]
+        fmul        v6.2S, v6.2S, v24.2S
+        fmul        v0.4S, v2.4S, v16.4S
+        fmul        v1.4S, v2.4S, v17.4S
+        fmls        v0.4S, v3.4S, v18.4S
+        fmla        v1.4S, v3.4S, v19.4S
+        fmla        v0.4S, v4.4S, v20.4S
+        fmla        v1.4S, v4.4S, v21.4S
+        faddp       v0.4S, v0.4S, v1.4S
+        faddp       v0.4S, v0.4S, v0.4S
+        fadd        v0.2S, v0.2S, v6.2S
+        st1         {v0.2S}, [x0], x3
+        subs        w4, w4, #1
+        b.gt        1b
        ret
 endfunc
@@ -353,18 +353,18 @@ function fft\n\()_neon, align=6
 endfunc
 .endm

-        def_fft         32,    16,     8
-        def_fft         64,    32,    16
-        def_fft         128,    64,    32
-        def_fft         256,   128,    64
-        def_fft         512,   256,   128
-        def_fft         1024,   512,   256
-        def_fft         2048,  1024,   512
-        def_fft         4096,  2048,  1024
-        def_fft         8192,  4096,  2048
-        def_fft         16384,  8192,  4096
-        def_fft         32768, 16384,  8192
-        def_fft         65536, 32768, 16384
+        def_fft    32,    16,     8
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384

 function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
@@ -36,11 +36,11 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, uxtw
-        ld1r            {v22.8h}, [x6]
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8h,   #28
+        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -53,139 +53,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v0.8b,  w4
-        dup             v1.8b,  w12
-        ld1             {v4.8b, v5.8b}, [x1], x2
-        dup             v2.8b,  w6
-        dup             v3.8b,  w7
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-1:      ld1             {v6.8b, v7.8b}, [x1], x2
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v5.8b,  v1.8b
-        ext             v7.8b,  v6.8b,  v7.8b,  #1
-        ld1             {v4.8b, v5.8b}, [x1], x2
-        umlal           v16.8h, v6.8b,  v2.8b
+        dup             v0.8B,  w4
+        dup             v1.8B,  w12
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        dup             v2.8B,  w6
+        dup             v3.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+1:      ld1             {v6.8B, v7.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-        umlal           v16.8h, v7.8b,  v3.8b
-        umull           v17.8h, v6.8b,  v0.8b
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        umlal           v16.8H, v7.8B,  v3.8B
+        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
-        umlal           v17.8h, v7.8b, v1.8b
-        umlal           v17.8h, v4.8b, v2.8b
-        umlal           v17.8h, v5.8b, v3.8b
+        umlal           v17.8H, v7.8B, v1.8B
+        umlal           v17.8H, v4.8B, v2.8B
+        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v0.8b, w4
+        dup             v0.8B, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v1.8b, w12
+        dup             v1.8B, w12
        b.eq            4f

-        ld1             {v4.8b}, [x1], x2
-3:      ld1             {v6.8b}, [x1], x2
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v6.8b,  v1.8b
-        ld1             {v4.8b}, [x1], x2
-        umull           v17.8h, v6.8b,  v0.8b
-        umlal           v17.8h, v4.8b,  v1.8b
+        ld1             {v4.8B}, [x1], x2
+3:      ld1             {v6.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v6.8B,  v1.8B
+        ld1             {v4.8B}, [x1], x2
+        umull           v17.8H, v6.8B,  v0.8B
+        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8b, v5.8b}, [x1], x2
-        ld1             {v6.8b, v7.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b,  #1
-        ext             v7.8b,  v6.8b,  v7.8b,  #1
+4:      ld1             {v4.8B, v5.8B}, [x1], x2
+        ld1             {v6.8B, v7.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8h, v4.8b, v0.8b
-        umlal           v16.8h, v5.8b, v1.8b
-        umull           v17.8h, v6.8b, v0.8b
-        umlal           v17.8h, v7.8b, v1.8b
+        umull           v16.8H, v4.8B, v0.8B
+        umlal           v16.8H, v5.8B, v1.8B
+        umull           v17.8H, v6.8B, v0.8B
+        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.8b}, [x1], x2
-        ld1             {v5.8b}, [x1], x2
+5:      ld1             {v4.8B}, [x1], x2
+        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
-        umull           v16.8h, v4.8b, v0.8b
-        umull           v17.8h, v5.8b, v0.8b
+        umull           v16.8H, v4.8B, v0.8B
+        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
-        rshrn           v16.8b, v16.8h, #6
-        rshrn           v17.8b, v17.8h, #6
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
  .else
-        add             v16.8h, v16.8h, v22.8h
-        add             v17.8h, v17.8h, v22.8h
-        shrn            v16.8b, v16.8h, #6
-        shrn            v17.8b, v17.8h, #6
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.8b}, [x8], x2
-        ld1             {v21.8b}, [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
-        urhadd          v17.8b, v17.8b, v21.8b
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
  .endif
-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -206,11 +206,11 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
-        add             x6,  x6,  w9, uxtw
-        ld1r            {v22.8h}, [x6]
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
-        movi            v22.8h,   #28
+        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
@@ -223,133 +223,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        add             w4,  w4,  #64
        b.eq            2f

-        dup             v24.8b,  w4
-        dup             v25.8b,  w12
-        ld1             {v4.8b}, [x1], x2
-        dup             v26.8b,  w6
-        dup             v27.8b,  w7
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        trn1            v0.2s,  v24.2s, v25.2s
-        trn1            v2.2s,  v26.2s, v27.2s
-        trn1            v4.2s,  v4.2s,  v5.2s
-1:      ld1             {v6.8b}, [x1], x2
-        ext             v7.8b,  v6.8b,  v7.8b, #1
-        trn1            v6.2s,  v6.2s,  v7.2s
-        umull           v18.8h, v4.8b,  v0.8b
-        umlal           v18.8h, v6.8b,  v2.8b
-        ld1             {v4.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        trn1            v4.2s,  v4.2s,  v5.2s
+        dup             v24.8B,  w4
+        dup             v25.8B,  w12
+        ld1             {v4.8B}, [x1], x2
+        dup             v26.8B,  w6
+        dup             v27.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v0.2S,  v24.2S, v25.2S
+        trn1            v2.2S,  v26.2S, v27.2S
+        trn1            v4.2S,  v4.2S,  v5.2S
+1:      ld1             {v6.8B}, [x1], x2
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umlal           v18.8H, v6.8B,  v2.8B
+        ld1             {v4.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
-        umull           v19.8h, v6.8b,  v0.8b
-        umlal           v19.8h, v4.8b,  v2.8b
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        umull           v19.8H, v6.8B,  v0.8B
+        umlal           v19.8H, v4.8B,  v2.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

 2:      adds            w12, w12, w6
-        dup             v30.8b, w4
+        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
-        dup             v31.8b, w12
-        trn1            v0.2s,  v30.2s, v31.2s
-        trn2            v1.2s,  v30.2s, v31.2s
+        dup             v31.8B, w12
+        trn1            v0.2S,  v30.2S, v31.2S
+        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

-        ext             v1.8b,  v0.8b,  v1.8b, #4
-        ld1             {v4.s}[0], [x1], x2
-3:      ld1             {v4.s}[1], [x1], x2
-        umull           v18.8h, v4.8b,  v0.8b
-        ld1             {v4.s}[0], [x1], x2
-        umull           v19.8h, v4.8b,  v1.8b
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        ext             v1.8B,  v0.8B,  v1.8B, #4
+        ld1             {v4.S}[0], [x1], x2
+3:      ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v0.8B
+        ld1             {v4.S}[0], [x1], x2
+        umull           v19.8H, v4.8B,  v1.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

-4:      ld1             {v4.8b}, [x1], x2
-        ld1             {v6.8b}, [x1], x2
-        ext             v5.8b,  v4.8b,  v5.8b, #1
-        ext             v7.8b,  v6.8b,  v7.8b, #1
-        trn1            v4.2s,  v4.2s,  v5.2s
-        trn1            v6.2s,  v6.2s,  v7.2s
-        umull           v18.8h, v4.8b,  v0.8b
-        umull           v19.8h, v6.8b,  v0.8b
+4:      ld1             {v4.8B}, [x1], x2
+        ld1             {v6.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
-        trn1            v30.2d, v18.2d, v19.2d
-        trn2            v31.2d, v18.2d, v19.2d
-        add             v18.8h, v30.8h, v31.8h
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

-5:      ld1             {v4.s}[0], [x1], x2
-        ld1             {v4.s}[1], [x1], x2
-        umull           v18.8h, v4.8b,  v30.8b
+5:      ld1             {v4.S}[0], [x1], x2
+        ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
-        rshrn           v16.8b, v18.8h, #6
+        rshrn           v16.8B, v18.8H, #6
  .else
-        add             v18.8h, v18.8h, v22.8h
-        shrn            v16.8b, v18.8h, #6
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
-        ld1             {v20.s}[0], [x8], x2
-        ld1             {v20.s}[1], [x8], x2
-        urhadd          v16.8b, v16.8b, v20.8b
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
-        st1             {v16.s}[0], [x0], x2
-        st1             {v16.s}[1], [x0], x2
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
 endfunc
@@ -370,51 +370,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
-        dup             v0.8b,  w4
-        dup             v2.8b,  w12
-        dup             v1.8b,  w6
-        dup             v3.8b,  w7
-        trn1            v0.4h,  v0.4h,  v2.4h
-        trn1            v1.4h,  v1.4h,  v3.4h
+        dup             v0.8B,  w4
+        dup             v2.8B,  w12
+        dup             v1.8B,  w6
+        dup             v3.8B,  w7
+        trn1            v0.4H,  v0.4H,  v2.4H
+        trn1            v1.4H,  v1.4H,  v3.4H
 1:
-        ld1             {v4.s}[0],  [x1], x2
-        ld1             {v4.s}[1],  [x1], x2
-        rev64           v5.2s,  v4.2s
-        ld1             {v5.s}[1],  [x1]
-        ext             v6.8b,  v4.8b,  v5.8b,  #1
-        ext             v7.8b,  v5.8b,  v4.8b,  #1
-        trn1            v4.4h,  v4.4h,  v6.4h
-        trn1            v5.4h,  v5.4h,  v7.4h
-        umull           v16.8h, v4.8b,  v0.8b
-        umlal           v16.8h, v5.8b,  v1.8b
+        ld1             {v4.S}[0],  [x1], x2
+        ld1             {v4.S}[1],  [x1], x2
+        rev64           v5.2S,  v4.2S
+        ld1             {v5.S}[1],  [x1]
+        ext             v6.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v5.8B,  v4.8B,  #1
+        trn1            v4.4H,  v4.4H,  v6.4H
+        trn1            v5.4H,  v5.4H,  v7.4H
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
-        ld1             {v18.h}[0], [x0], x2
-        ld1             {v18.h}[2], [x0]
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
-        rev64           v17.4s, v16.4s
-        add             v16.8h, v16.8h, v17.8h
-        rshrn           v16.8b, v16.8h, #6
+        rev64           v17.4S, v16.4S
+        add             v16.8H, v16.8H, v17.8H
+        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
-        urhadd          v16.8b, v16.8b, v18.8b
+        urhadd          v16.8B, v16.8B, v18.8B
  .endif
-        st1             {v16.h}[0], [x0], x2
-        st1             {v16.h}[2], [x0], x2
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

 2:
-        ld1             {v16.h}[0], [x1], x2
-        ld1             {v16.h}[1], [x1], x2
+        ld1             {v16.H}[0], [x1], x2
+        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
-        ld1             {v18.h}[0], [x0], x2
-        ld1             {v18.h}[1], [x0]
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
-        urhadd          v16.8b, v16.8b, v18.8b
+        urhadd          v16.8B, v16.8B, v18.8B
  .endif
-        st1             {v16.h}[0], [x0], x2
-        st1             {v16.h}[1], [x0], x2
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
@@ -27,114 +27,114 @@
 // lowpass_const r
 // Builds the filter-weight word in the scratch register r and copies it into
 // v6.s[0]: movz sets r to 20 << 16, movk then fills the low 16 bits with 5.
 // After the mov, v6.h[0] = 5 and v6.h[1] = 20.
 // Overwrites r and the low 32 bits of v6.
 .macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
-        mov             v6.s[0], \r
+        mov             v6.S[0], \r
 .endm

 // lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 // Horizontal 6-tap filter over two rows of bytes at once. Row A is the 8-byte
 // pair r0:r1 and row B the pair r2:r3; each "ext #k" yields the 8 bytes that
 // start k bytes into its pair. For every output position i the 16-bit value
 //     (p[i] + p[i+5]) + v6.h[1] * (p[i+2] + p[i+3]) - v6.h[0] * (p[i+1] + p[i+4])
 // is written to d0 (row A) and d1 (row B). v6.h[0] and v6.h[1] are only read
 // here; lowpass_const loads them with 5 and 20.
 // When narrow is non-zero, d0 and d1 are narrowed back to 8 bytes each with
 // sqrshrun #5 (rounding right shift by 5, saturated to unsigned 8 bit).
 // d0 may be the same register as r0 or r1, but not r2 or r3: row B is still
 // being read after d0 has been written.
 //trashes v0-v5
 .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
-        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
-        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
-        uaddl           v2.8h,      v2.8b,     v3.8b
-        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
-        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
-        uaddl           v4.8h,      v4.8b,     v5.8b
-        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
-        uaddl           \d0\().8h,  \r0\().8b, v1.8b
-        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
-        mla             \d0\().8h,  v2.8h,     v6.h[1]
-        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
-        uaddl           v0.8h,      v0.8b,     v1.8b
-        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
-        mls             \d0\().8h,  v4.8h,     v6.h[0]
-        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
-        uaddl           v1.8h,      v1.8b,     v3.8b
-        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
-        uaddl           \d1\().8h,  \r2\().8b, v2.8b
-        mla             \d1\().8h,  v0.8h,     v6.h[1]
-        mls             \d1\().8h,  v1.8h,     v6.h[0]
+        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,      v2.8B,     v3.8B
+        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,      v4.8B,     v5.8B
+        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H,  \r0\().8B, v1.8B
+        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
+        mla             \d0\().8H,  v2.8H,     v6.H[1]
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
+        uaddl           v0.8H,      v0.8B,     v1.8B
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
+        mls             \d0\().8H,  v4.8H,     v6.H[0]
+        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
+        uaddl           v1.8H,      v1.8B,     v3.8B
+        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
+        uaddl           \d1\().8H,  \r2\().8B, v2.8B
+        mla             \d1\().8H,  v0.8H,     v6.H[1]
+        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
-        sqrshrun        \d0\().8b,  \d0\().8h, #5
-        sqrshrun        \d1\().8b,  \d1\().8h, #5
+        sqrshrun        \d0\().8B,  \d0\().8H, #5
+        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
 .endm

 // lowpass_8H r0, r1
 // In-place 6-tap filter on two independent rows, without narrowing. r0 and r1
 // each hold one row of bytes. "ext vN.16b, r, r, #k" rotates that register by
 // k bytes, so the low 8 bytes of the result are p[k] .. p[k+7]; bytes
 // p[0] .. p[12] of each row are therefore consumed. On exit the register holds
 // eight 16-bit values
 //     (p[i] + p[i+5]) + v6.h[1] * (p[i+2] + p[i+3]) - v6.h[0] * (p[i+1] + p[i+4])
 // in its .8h lanes. v6.h[0] and v6.h[1] are only read (5 and 20 when set up
 // by lowpass_const).
 //trashes v0-v5, v7, v30-v31
 .macro  lowpass_8H      r0,  r1
-        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
-        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
-        uaddl           v0.8h,      v0.8b,      v1.8b
-        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
-        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
-        uaddl           v2.8h,      v2.8b,      v3.8b
-        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
-        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
-        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
-        mla             \r0\().8h,  v0.8h,      v6.h[1]
-        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
-        uaddl           v4.8h,      v4.8b,      v5.8b
-        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
-        mls             \r0\().8h,  v2.8h,      v6.h[0]
-        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
-        uaddl           v7.8h,      v7.8b,      v0.8b
-        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
-        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
-        mla             \r1\().8h,  v4.8h,      v6.h[1]
-        mls             \r1\().8h,  v7.8h,      v6.h[0]
+        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
+        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
+        uaddl           v0.8H,      v0.8B,      v1.8B
+        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
+        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
+        uaddl           v2.8H,      v2.8B,      v3.8B
+        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
+        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
+        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
+        mla             \r0\().8H,  v0.8H,      v6.H[1]
+        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
+        uaddl           v4.8H,      v4.8B,      v5.8B
+        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
+        mls             \r0\().8H,  v2.8H,      v6.H[0]
+        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
+        uaddl           v7.8H,      v7.8B,      v0.8B
+        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
+        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
+        mla             \r1\().8H,  v4.8H,      v6.H[1]
+        mls             \r1\().8H,  v7.8H,      v6.H[0]
 .endm

 // lowpass_8_1 r0, r1, d0, narrow=1
 // Single-row 6-tap filter. The row is the 8-byte pair r0:r1 and each
 // "ext #k" yields the 8 bytes starting k bytes into that pair. d0 receives
 // eight 16-bit values
 //     (p[i] + p[i+5]) + v6.h[1] * (p[i+2] + p[i+3]) - v6.h[0] * (p[i+1] + p[i+4])
 // with v6.h[0] and v6.h[1] only read (5 and 20 when set up by lowpass_const).
 // When narrow is non-zero, d0 is narrowed to 8 bytes with sqrshrun #5
 // (rounding right shift by 5, saturated to unsigned 8 bit).
 // trashes v2-v5, v30
 .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
-        ext             v2.8b,     \r0\().8b, \r1\().8b, #2
-        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
-        uaddl           v2.8h,     v2.8b,     v3.8b
-        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
-        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
-        uaddl           v4.8h,     v4.8b,     v5.8b
-        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
-        uaddl           \d0\().8h, \r0\().8b, v30.8b
-        mla             \d0\().8h, v2.8h,     v6.h[1]
-        mls             \d0\().8h, v4.8h,     v6.h[0]
+        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,     v2.8B,     v3.8B
+        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,     v4.8B,     v5.8B
+        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H, \r0\().8B, v30.8B
+        mla             \d0\().8H, v2.8H,     v6.H[1]
+        mls             \d0\().8H, v4.8H,     v6.H[0]
  .if \narrow
-        sqrshrun        \d0\().8b, \d0\().8h, #5
+        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
 .endm

 // lowpass_8.16 r0, r1, r2
 // 6-tap filter over 16-bit samples. r0:r1 form one row of sixteen 16-bit
 // values x[0..15]; the ext byte offsets 2, 4, 6, 8 and 10 select x[i+1] ..
 // x[i+5]. Pair sums are widened to signed 32 bit (saddl for lanes 0-3, saddl2
 // for lanes 4-7) and the weights are applied with shifts rather than multiplies:
 //     20 * a = (a << 4) + (a << 2)        5 * b = b + (b << 2)
 // giving (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4]).
 // The 32-bit result is rounded and shifted right by 10 into 16 bit
 // (rshrn/rshrn2) and then saturated to unsigned 8 bit into r2.8b (sqxtun).
 // r2 may be the same register as r0.
 // trashes v0-v7; this includes v6, so weights loaded by lowpass_const are lost.
 // Also overwrites r1 (it is reused to hold x[i+5]).
 .macro  lowpass_8.16    r0,  r1,  r2
-        ext             v1.16b,     \r0\().16b, \r1\().16b, #4
-        ext             v0.16b,     \r0\().16b, \r1\().16b, #6
-        saddl           v5.4s,      v1.4h,      v0.4h
-        ext             v2.16b,     \r0\().16b, \r1\().16b, #2
-        saddl2          v1.4s,      v1.8h,      v0.8h
-        ext             v3.16b,     \r0\().16b, \r1\().16b, #8
-        saddl           v6.4s,      v2.4h,      v3.4h
-        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10
-        saddl2          v2.4s,      v2.8h,      v3.8h
-        saddl           v0.4s,      \r0\().4h,  \r1\().4h
-        saddl2          v4.4s,      \r0\().8h,  \r1\().8h
+        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
+        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
+        saddl           v5.4S,      v1.4H,      v0.4H
+        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
+        saddl2          v1.4S,      v1.8H,      v0.8H
+        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
+        saddl           v6.4S,      v2.4H,      v3.4H
+        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
+        saddl2          v2.4S,      v2.8H,      v3.8H
+        saddl           v0.4S,      \r0\().4H,  \r1\().4H
+        saddl2          v4.4S,      \r0\().8H,  \r1\().8H

-        shl             v3.4s,  v5.4s,  #4
-        shl             v5.4s,  v5.4s,  #2
-        shl             v7.4s,  v6.4s,  #2
-        add             v5.4s,  v5.4s,  v3.4s
-        add             v6.4s,  v6.4s,  v7.4s
+        shl             v3.4S,  v5.4S,  #4
+        shl             v5.4S,  v5.4S,  #2
+        shl             v7.4S,  v6.4S,  #2
+        add             v5.4S,  v5.4S,  v3.4S
+        add             v6.4S,  v6.4S,  v7.4S

-        shl             v3.4s,  v1.4s,  #4
-        shl             v1.4s,  v1.4s,  #2
-        shl             v7.4s,  v2.4s,  #2
-        add             v1.4s,  v1.4s,  v3.4s
-        add             v2.4s,  v2.4s,  v7.4s
+        shl             v3.4S,  v1.4S,  #4
+        shl             v1.4S,  v1.4S,  #2
+        shl             v7.4S,  v2.4S,  #2
+        add             v1.4S,  v1.4S,  v3.4S
+        add             v2.4S,  v2.4S,  v7.4S

-        add             v5.4s,  v5.4s,  v0.4s
-        sub             v5.4s,  v5.4s,  v6.4s
+        add             v5.4S,  v5.4S,  v0.4S
+        sub             v5.4S,  v5.4S,  v6.4S

-        add             v1.4s,  v1.4s,  v4.4s
-        sub             v1.4s,  v1.4s,  v2.4s
+        add             v1.4S,  v1.4S,  v4.4S
+        sub             v1.4S,  v1.4S,  v2.4S

-        rshrn           v5.4h,  v5.4s,  #10
-        rshrn2          v5.8h,  v1.4s,  #10
+        rshrn           v5.4H,  v5.4S,  #10
+        rshrn2          v5.8H,  v1.4S,  #10

-        sqxtun          \r2\().8b,  v5.8h
+        sqxtun          \r2\().8B,  v5.8H
 .endm

 function put_h264_qpel16_h_lowpass_neon_packed
@@ -163,19 +163,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
 endfunc

 // <type>_h264_qpel8_h_lowpass_neon
 // Filters two rows per loop pass: loads 16 bytes per row from x1 (post-
 // incremented by x2), runs lowpass_8 with narrowing so each row becomes
 // 8 bytes, and stores 8 bytes per row to x0 (post-incremented by x3).
 // x12 is the remaining row count; it is decremented by 2 per pass and the
 // loop ends when it reaches zero, so it is presumably even and non-zero on
 // entry (TODO confirm against callers).
 // When type is avg, each filtered row is combined with the 8 bytes already
 // at the destination using urhadd (rounding halving add) before the store;
 // x0 is rewound by one stride after reading the two destination rows.
 // NOTE(review): lowpass_8 reads its weights from v6, which is not set up
 // in this function -- the caller is expected to have done so.
 function \type\()_h264_qpel8_h_lowpass_neon
-1:      ld1             {v28.8b, v29.8b}, [x1], x2
-        ld1             {v16.8b, v17.8b}, [x1], x2
+1:      ld1             {v28.8B, v29.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
-        ld1             {v2.8b},    [x0], x3
-        urhadd          v28.8b, v28.8b,  v2.8b
-        ld1             {v3.8b},    [x0]
-        urhadd          v16.8b, v16.8b, v3.8b
+        ld1             {v2.8B},    [x0], x3
+        urhadd          v28.8B, v28.8B,  v2.8B
+        ld1             {v3.8B},    [x0]
+        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3
  .endif
-        st1             {v28.8b},    [x0], x3
-        st1             {v16.8b},    [x0], x3
+        st1             {v28.8B},    [x0], x3
+        st1             {v16.8B},    [x0], x3
        b.ne            1b
        ret
 endfunc
@@ -200,23 +200,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
 endfunc

 // <type>_h264_qpel8_h_lowpass_l2_neon
 // Like the plain h_lowpass loop, but each filtered row is additionally
 // averaged with a row from a second source. Per pass it loads two 16-byte rows
 // from x1 and two 8-byte rows from x3 (both post-incremented by x2), filters
 // the x1 rows with lowpass_8 (narrowed to 8 bytes), combines them with the
 // x3 rows using urhadd (rounding halving add), and stores 8 bytes per row to
 // x0, also post-incremented by x2.
 // x12 is the remaining row count, decremented by 2 per pass; the loop ends
 // when it reaches zero, so it is presumably even and non-zero on entry
 // (TODO confirm against callers).
 // When type is avg, the result is further urhadd-ed with the bytes already at
 // the destination; x0 is rewound by one stride after reading them.
 // NOTE(review): lowpass_8 reads its weights from v6, which is not set up
 // in this function -- the caller is expected to have done so.
 function \type\()_h264_qpel8_h_lowpass_l2_neon
-1:      ld1             {v26.8b, v27.8b}, [x1], x2
-        ld1             {v16.8b, v17.8b}, [x1], x2
-        ld1             {v28.8b},     [x3], x2
-        ld1             {v29.8b},     [x3], x2
+1:      ld1             {v26.8B, v27.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
+        ld1             {v28.8B},     [x3], x2
+        ld1             {v29.8B},     [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
-        urhadd          v26.8b, v26.8b, v28.8b
-        urhadd          v27.8b, v27.8b, v29.8b
+        urhadd          v26.8B, v26.8B, v28.8B
+        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
-        ld1             {v2.8b},      [x0], x2
-        urhadd          v26.8b, v26.8b, v2.8b
-        ld1             {v3.8b},      [x0]
-        urhadd          v27.8b, v27.8b, v3.8b
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v26.8B, v26.8B, v2.8B
+        ld1             {v3.8B},      [x0]
+        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v26.8b},     [x0], x2
-        st1             {v27.8b},     [x0], x2
+        st1             {v26.8B},     [x0], x2
+        st1             {v27.8B},     [x0], x2
        b.ne            1b
        ret
 endfunc
@@ -257,19 +257,19 @@ function \type\()_h264_qpel16_v_lowpass_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_neon
-        ld1             {v16.8b}, [x1], x3
-        ld1             {v18.8b}, [x1], x3
-        ld1             {v20.8b}, [x1], x3
-        ld1             {v22.8b}, [x1], x3
-        ld1             {v24.8b}, [x1], x3
-        ld1             {v26.8b}, [x1], x3
-        ld1             {v28.8b}, [x1], x3
-        ld1             {v30.8b}, [x1], x3
-        ld1             {v17.8b}, [x1], x3
-        ld1             {v19.8b}, [x1], x3
-        ld1             {v21.8b}, [x1], x3
-        ld1             {v23.8b}, [x1], x3
-        ld1             {v25.8b}, [x1]
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -280,33 +280,33 @@ function \type\()_h264_qpel8_v_lowpass_neon
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
-        ld1             {v24.8b},  [x0], x2
-        urhadd          v16.8b, v16.8b, v24.8b
-        ld1             {v25.8b}, [x0], x2
-        urhadd          v17.8b, v17.8b, v25.8b
-        ld1             {v26.8b}, [x0], x2
-        urhadd          v18.8b, v18.8b, v26.8b
-        ld1             {v27.8b}, [x0], x2
-        urhadd          v19.8b, v19.8b, v27.8b
-        ld1             {v28.8b}, [x0], x2
-        urhadd          v20.8b, v20.8b, v28.8b
-        ld1             {v29.8b}, [x0], x2
-        urhadd          v21.8b, v21.8b, v29.8b
-        ld1             {v30.8b}, [x0], x2
-        urhadd          v22.8b, v22.8b, v30.8b
-        ld1             {v31.8b}, [x0], x2
-        urhadd          v23.8b, v23.8b, v31.8b
+        ld1             {v24.8B},  [x0], x2
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x2
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x2
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x2
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x2
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x2
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x2
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x2
+        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8b}, [x0], x2
-        st1             {v17.8b}, [x0], x2
-        st1             {v18.8b}, [x0], x2
-        st1             {v19.8b}, [x0], x2
-        st1             {v20.8b}, [x0], x2
-        st1             {v21.8b}, [x0], x2
-        st1             {v22.8b}, [x0], x2
-        st1             {v23.8b}, [x0], x2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        st1             {v18.8B}, [x0], x2
+        st1             {v19.8B}, [x0], x2
+        st1             {v20.8B}, [x0], x2
+        st1             {v21.8B}, [x0], x2
+        st1             {v22.8B}, [x0], x2
+        st1             {v23.8B}, [x0], x2

        ret
 endfunc
@@ -334,19 +334,19 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
 endfunc

 function \type\()_h264_qpel8_v_lowpass_l2_neon
-        ld1             {v16.8b}, [x1], x3
-        ld1             {v18.8b}, [x1], x3
-        ld1             {v20.8b}, [x1], x3
-        ld1             {v22.8b}, [x1], x3
-        ld1             {v24.8b}, [x1], x3
-        ld1             {v26.8b}, [x1], x3
-        ld1             {v28.8b}, [x1], x3
-        ld1             {v30.8b}, [x1], x3
-        ld1             {v17.8b}, [x1], x3
-        ld1             {v19.8b}, [x1], x3
-        ld1             {v21.8b}, [x1], x3
-        ld1             {v23.8b}, [x1], x3
-        ld1             {v25.8b}, [x1]
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
@@ -356,51 +356,51 @@ function \type\()_h264_qpel8_v_lowpass_l2_neon
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

-        ld1             {v24.8b},  [x12], x2
-        ld1             {v25.8b},  [x12], x2
-        ld1             {v26.8b},  [x12], x2
-        ld1             {v27.8b},  [x12], x2
-        ld1             {v28.8b},  [x12], x2
-        urhadd          v16.8b, v24.8b, v16.8b
-        urhadd          v17.8b, v25.8b, v17.8b
-        ld1             {v29.8b},  [x12], x2
-        urhadd          v18.8b, v26.8b, v18.8b
-        urhadd          v19.8b, v27.8b, v19.8b
-        ld1             {v30.8b}, [x12], x2
-        urhadd          v20.8b, v28.8b, v20.8b
-        urhadd          v21.8b, v29.8b, v21.8b
-        ld1             {v31.8b}, [x12], x2
-        urhadd          v22.8b, v30.8b, v22.8b
-        urhadd          v23.8b, v31.8b, v23.8b
+        ld1             {v24.8B},  [x12], x2
+        ld1             {v25.8B},  [x12], x2
+        ld1             {v26.8B},  [x12], x2
+        ld1             {v27.8B},  [x12], x2
+        ld1             {v28.8B},  [x12], x2
+        urhadd          v16.8B, v24.8B, v16.8B
+        urhadd          v17.8B, v25.8B, v17.8B
+        ld1             {v29.8B},  [x12], x2
+        urhadd          v18.8B, v26.8B, v18.8B
+        urhadd          v19.8B, v27.8B, v19.8B
+        ld1             {v30.8B}, [x12], x2
+        urhadd          v20.8B, v28.8B, v20.8B
+        urhadd          v21.8B, v29.8B, v21.8B
+        ld1             {v31.8B}, [x12], x2
+        urhadd          v22.8B, v30.8B, v22.8B
+        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
-        ld1             {v24.8b}, [x0], x3
-        urhadd          v16.8b, v16.8b, v24.8b
-        ld1             {v25.8b}, [x0], x3
-        urhadd          v17.8b, v17.8b, v25.8b
-        ld1             {v26.8b}, [x0], x3
-        urhadd          v18.8b, v18.8b, v26.8b
-        ld1             {v27.8b}, [x0], x3
-        urhadd          v19.8b, v19.8b, v27.8b
-        ld1             {v28.8b}, [x0], x3
-        urhadd          v20.8b, v20.8b, v28.8b
-        ld1             {v29.8b}, [x0], x3
-        urhadd          v21.8b, v21.8b, v29.8b
-        ld1             {v30.8b}, [x0], x3
-        urhadd          v22.8b, v22.8b, v30.8b
-        ld1             {v31.8b}, [x0], x3
-        urhadd          v23.8b, v23.8b, v31.8b
+        ld1             {v24.8B}, [x0], x3
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x3
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x3
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x3
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x3
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x3
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x3
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x3
+        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif

-        st1             {v16.8b}, [x0], x3
-        st1             {v17.8b}, [x0], x3
-        st1             {v18.8b}, [x0], x3
-        st1             {v19.8b}, [x0], x3
-        st1             {v20.8b}, [x0], x3
-        st1             {v21.8b}, [x0], x3
-        st1             {v22.8b}, [x0], x3
-        st1             {v23.8b}, [x0], x3
+        st1             {v16.8B}, [x0], x3
+        st1             {v17.8B}, [x0], x3
+        st1             {v18.8B}, [x0], x3
+        st1             {v19.8B}, [x0], x3
+        st1             {v20.8B}, [x0], x3
+        st1             {v21.8B}, [x0], x3
+        st1             {v22.8B}, [x0], x3
+        st1             {v23.8B}, [x0], x3

        ret
 endfunc
@@ -411,19 +411,19 @@ endfunc

 function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
-        ld1             {v16.8h}, [x1], x3
-        ld1             {v17.8h}, [x1], x3
-        ld1             {v18.8h}, [x1], x3
-        ld1             {v19.8h}, [x1], x3
-        ld1             {v20.8h}, [x1], x3
-        ld1             {v21.8h}, [x1], x3
-        ld1             {v22.8h}, [x1], x3
-        ld1             {v23.8h}, [x1], x3
-        ld1             {v24.8h}, [x1], x3
-        ld1             {v25.8h}, [x1], x3
-        ld1             {v26.8h}, [x1], x3
-        ld1             {v27.8h}, [x1], x3
-        ld1             {v28.8h}, [x1]
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v25.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v27.8H}, [x1], x3
+        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
@@ -447,7 +447,7 @@ function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

-        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
 endfunc
@@ -457,33 +457,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
-        ld1             {v0.8b},      [x0], x2
-        urhadd          v16.8b, v16.8b, v0.8b
-        ld1             {v1.8b},      [x0], x2
-        urhadd          v17.8b, v17.8b, v1.8b
-        ld1             {v2.8b},      [x0], x2
-        urhadd          v18.8b, v18.8b, v2.8b
-        ld1             {v3.8b},      [x0], x2
-        urhadd          v19.8b, v19.8b, v3.8b
-        ld1             {v4.8b},      [x0], x2
-        urhadd          v20.8b, v20.8b, v4.8b
-        ld1             {v5.8b},      [x0], x2
-        urhadd          v21.8b, v21.8b, v5.8b
-        ld1             {v6.8b},      [x0], x2
-        urhadd          v22.8b, v22.8b, v6.8b
-        ld1             {v7.8b},      [x0], x2
-        urhadd          v23.8b, v23.8b, v7.8b
+        ld1             {v0.8B},      [x0], x2
+        urhadd          v16.8B, v16.8B, v0.8B
+        ld1             {v1.8B},      [x0], x2
+        urhadd          v17.8B, v17.8B, v1.8B
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v18.8B, v18.8B, v2.8B
+        ld1             {v3.8B},      [x0], x2
+        urhadd          v19.8B, v19.8B, v3.8B
+        ld1             {v4.8B},      [x0], x2
+        urhadd          v20.8B, v20.8B, v4.8B
+        ld1             {v5.8B},      [x0], x2
+        urhadd          v21.8B, v21.8B, v5.8B
+        ld1             {v6.8B},      [x0], x2
+        urhadd          v22.8B, v22.8B, v6.8B
+        ld1             {v7.8B},      [x0], x2
+        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

-        st1             {v16.8b},     [x0], x2
-        st1             {v17.8b},     [x0], x2
-        st1             {v18.8b},     [x0], x2
-        st1             {v19.8b},     [x0], x2
-        st1             {v20.8b},     [x0], x2
-        st1             {v21.8b},     [x0], x2
-        st1             {v22.8b},     [x0], x2
-        st1             {v23.8b},     [x0], x2
+        st1             {v16.8B},     [x0], x2
+        st1             {v17.8B},     [x0], x2
+        st1             {v18.8B},     [x0], x2
+        st1             {v19.8B},     [x0], x2
+        st1             {v20.8B},     [x0], x2
+        st1             {v21.8B},     [x0], x2
+        st1             {v22.8B},     [x0], x2
+        st1             {v23.8B},     [x0], x2

        ret             x10
 endfunc
@@ -497,45 +497,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

-        ld1             {v0.8b, v1.8b},  [x2], #16
-        ld1             {v2.8b, v3.8b},  [x2], #16
-        urhadd          v0.8b,  v0.8b,  v16.8b
-        urhadd          v1.8b,  v1.8b,  v17.8b
-        ld1             {v4.8b, v5.8b},  [x2], #16
-        urhadd          v2.8b,  v2.8b,  v18.8b
-        urhadd          v3.8b,  v3.8b,  v19.8b
-        ld1             {v6.8b, v7.8b},  [x2], #16
-        urhadd          v4.8b,  v4.8b,  v20.8b
-        urhadd          v5.8b,  v5.8b,  v21.8b
-        urhadd          v6.8b,  v6.8b,  v22.8b
-        urhadd          v7.8b,  v7.8b,  v23.8b
+        ld1             {v0.8B, v1.8B},  [x2], #16
+        ld1             {v2.8B, v3.8B},  [x2], #16
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v4.8B, v5.8B},  [x2], #16
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v6.8B, v7.8B},  [x2], #16
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
-        ld1             {v16.8b},     [x0], x3
-        urhadd          v0.8b,  v0.8b,  v16.8b
-        ld1             {v17.8b},     [x0], x3
-        urhadd          v1.8b,  v1.8b,  v17.8b
-        ld1             {v18.8b},     [x0], x3
-        urhadd          v2.8b,  v2.8b,  v18.8b
-        ld1             {v19.8b},     [x0], x3
-        urhadd          v3.8b,  v3.8b,  v19.8b
-        ld1             {v20.8b},     [x0], x3
-        urhadd          v4.8b,  v4.8b,  v20.8b
-        ld1             {v21.8b},     [x0], x3
-        urhadd          v5.8b,  v5.8b,  v21.8b
-        ld1             {v22.8b},     [x0], x3
-        urhadd          v6.8b,  v6.8b,  v22.8b
-        ld1             {v23.8b},     [x0], x3
-        urhadd          v7.8b,  v7.8b,  v23.8b
+        ld1             {v16.8B},     [x0], x3
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        ld1             {v17.8B},     [x0], x3
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v18.8B},     [x0], x3
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        ld1             {v19.8B},     [x0], x3
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v20.8B},     [x0], x3
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        ld1             {v21.8B},     [x0], x3
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        ld1             {v22.8B},     [x0], x3
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        ld1             {v23.8B},     [x0], x3
+        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif
-        st1             {v0.8b},      [x0], x3
-        st1             {v1.8b},      [x0], x3
-        st1             {v2.8b},      [x0], x3
-        st1             {v3.8b},      [x0], x3
-        st1             {v4.8b},      [x0], x3
-        st1             {v5.8b},      [x0], x3
-        st1             {v6.8b},      [x0], x3
-        st1             {v7.8b},      [x0], x3
+        st1             {v0.8B},      [x0], x3
+        st1             {v1.8B},      [x0], x3
+        st1             {v2.8B},      [x0], x3
+        st1             {v3.8B},      [x0], x3
+        st1             {v4.8B},      [x0], x3
+        st1             {v5.8B},      [x0], x3
+        st1             {v6.8B},      [x0], x3
+        st1             {v7.8B},      [x0], x3

        ret             x10
 endfunc
@@ -579,8 +579,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm

-        h264_qpel16_hv  put
-        h264_qpel16_hv  avg
+        h264_qpel16_hv put
+        h264_qpel16_hv avg

 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -758,8 +758,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel8      put
-        h264_qpel8      avg
+        h264_qpel8 put
+        h264_qpel8 avg

 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -930,5 +930,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel16     put
-        h264_qpel16     avg
+        h264_qpel16 put
+        h264_qpel16 avg
@@ -26,295 +26,295 @@
  .if \avg
        mov             x12, x0
  .endif
-1:      ld1             {v0.16b},  [x1], x2
-        ld1             {v1.16b},  [x1], x2
-        ld1             {v2.16b},  [x1], x2
-        ld1             {v3.16b},  [x1], x2
+1:      ld1             {v0.16B},  [x1], x2
+        ld1             {v1.16B},  [x1], x2
+        ld1             {v2.16B},  [x1], x2
+        ld1             {v3.16B},  [x1], x2
  .if \avg
-        ld1             {v4.16b},  [x12], x2
-        urhadd          v0.16b,  v0.16b,  v4.16b
-        ld1             {v5.16b},  [x12], x2
-        urhadd          v1.16b,  v1.16b,  v5.16b
-        ld1             {v6.16b},  [x12], x2
-        urhadd          v2.16b,  v2.16b,  v6.16b
-        ld1             {v7.16b},  [x12], x2
-        urhadd          v3.16b,  v3.16b,  v7.16b
+        ld1             {v4.16B},  [x12], x2
+        urhadd          v0.16B,  v0.16B,  v4.16B
+        ld1             {v5.16B},  [x12], x2
+        urhadd          v1.16B,  v1.16B,  v5.16B
+        ld1             {v6.16B},  [x12], x2
+        urhadd          v2.16B,  v2.16B,  v6.16B
+        ld1             {v7.16B},  [x12], x2
+        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
-        st1             {v0.16b},  [x0], x2
-        st1             {v1.16b},  [x0], x2
-        st1             {v2.16b},  [x0], x2
-        st1             {v3.16b},  [x0], x2
+        st1             {v0.16B},  [x0], x2
+        st1             {v1.16B},  [x0], x2
+        st1             {v2.16B},  [x0], x2
+        st1             {v3.16B},  [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_x2     rnd=1, avg=0
-1:      ld1             {v0.16b, v1.16b}, [x1], x2
-        ld1             {v2.16b, v3.16b}, [x1], x2
+1:      ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
-        ext             v1.16b,  v0.16b,  v1.16b,  #1
-        avg             v0.16b,  v0.16b,  v1.16b
-        ext             v3.16b,  v2.16b,  v3.16b,  #1
-        avg             v2.16b,  v2.16b,  v3.16b
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        avg             v0.16B,  v0.16B,  v1.16B
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
-        ld1             {v1.16b}, [x0], x2
-        ld1             {v3.16b}, [x0]
-        urhadd          v0.16b,  v0.16b,  v1.16b
-        urhadd          v2.16b,  v2.16b,  v3.16b
+        ld1             {v1.16B}, [x0], x2
+        ld1             {v3.16B}, [x0]
+        urhadd          v0.16B,  v0.16B,  v1.16B
+        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.16b}, [x0], x2
-        st1             {v2.16b}, [x0], x2
+        st1             {v0.16B}, [x0], x2
+        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16b}, [x1], x2
-        ld1             {v1.16b}, [x1], x2
+        ld1             {v0.16B}, [x1], x2
+        ld1             {v1.16B}, [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v2.16b,  v0.16b,  v1.16b
-        ld1             {v0.16b}, [x1], x2
-        avg             v3.16b,  v0.16b,  v1.16b
-        ld1             {v1.16b}, [x1], x2
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
+        ld1             {v1.16B}, [x1], x2
  .if \avg
-        ld1             {v4.16b}, [x0], x2
-        ld1             {v5.16b}, [x0]
-        urhadd          v2.16b,  v2.16b,  v4.16b
-        urhadd          v3.16b,  v3.16b,  v5.16b
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16b}, [x0], x2
-        st1             {v3.16b}, [x0], x2
+        st1             {v2.16B}, [x0], x2
+        st1             {v3.16B}, [x0], x2
        b.ne            1b

-        avg             v2.16b,  v0.16b,  v1.16b
-        ld1             {v0.16b}, [x1], x2
-        avg             v3.16b,  v0.16b,  v1.16b
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
-        ld1             {v4.16b}, [x0], x2
-        ld1             {v5.16b}, [x0]
-        urhadd          v2.16b,  v2.16b,  v4.16b
-        urhadd          v3.16b,  v3.16b,  v5.16b
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
-        st1             {v2.16b},     [x0], x2
-        st1             {v3.16b},     [x0], x2
+        st1             {v2.16B},     [x0], x2
+        st1             {v3.16B},     [x0], x2

        ret
 .endm

 .macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16b, v1.16b}, [x1], x2
-        ld1             {v4.16b, v5.16b}, [x1], x2
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v4.16B, v5.16B}, [x1], x2
 NRND    movi            v26.8H, #1
-        ext             v1.16b,  v0.16b,  v1.16b,  #1
-        ext             v5.16b,  v4.16b,  v5.16b,  #1
-        uaddl           v16.8h,  v0.8b,   v1.8b
-        uaddl2          v20.8h,  v0.16b,  v1.16b
-        uaddl           v18.8h,  v4.8b,   v5.8b
-        uaddl2          v22.8h,  v4.16b,  v5.16b
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        ext             v5.16B,  v4.16B,  v5.16B,  #1
+        uaddl           v16.8H,  v0.8B,   v1.8B
+        uaddl2          v20.8H,  v0.16B,  v1.16B
+        uaddl           v18.8H,  v4.8B,   v5.8B
+        uaddl2          v22.8H,  v4.16B,  v5.16B
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16b, v1.16b}, [x1], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16b, v0.16b,  v1.16b,  #1
-        add             v1.8h,   v20.8h,  v22.8h
-        mshrn           v28.8b,  v24.8h,  #2
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16b, v1.8h,   #2
+        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
-        ld1             {v16.16b},        [x0]
-        urhadd          v28.16b, v28.16b, v16.16b
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
  .endif
-        uaddl           v16.8h,  v0.8b,   v30.8b
-        ld1             {v2.16b, v3.16b}, [x1], x2
-        uaddl2          v20.8h,  v0.16b,  v30.16b
-        st1             {v28.16b},        [x0], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        ld1             {v2.16B, v3.16B}, [x1], x2
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v3.16b,  v2.16b,  v3.16b,  #1
-        add             v0.8h,   v20.8h,  v22.8h
-        mshrn           v30.8b,  v24.8h,  #2
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16b, v0.8h,   #2
+        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
-        ld1             {v18.16b},        [x0]
-        urhadd          v30.16b, v30.16b, v18.16b
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
  .endif
-        uaddl           v18.8h,   v2.8b,  v3.8b
-        uaddl2          v22.8h,   v2.16b, v3.16b
-        st1             {v30.16b},        [x0], x2
+        uaddl           v18.8H,   v2.8B,  v3.8B
+        uaddl2          v22.8H,   v2.16B, v3.16B
+        st1             {v30.16B},        [x0], x2
        b.gt            1b

-        ld1             {v0.16b, v1.16b}, [x1], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16b, v0.16b,  v1.16b,  #1
-        add             v1.8h,   v20.8h,  v22.8h
-        mshrn           v28.8b,  v24.8h,  #2
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
 NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16b, v1.8h,   #2
+        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
-        ld1             {v16.16b},        [x0]
-        urhadd          v28.16b, v28.16b, v16.16b
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
  .endif
-        uaddl           v16.8h,  v0.8b,   v30.8b
-        uaddl2          v20.8h,  v0.16b,  v30.16b
-        st1             {v28.16b},        [x0], x2
-        add             v24.8h,  v16.8h,  v18.8h
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
 NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v0.8h,   v20.8h,  v22.8h
-        mshrn           v30.8b,  v24.8h,  #2
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
 NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16b, v0.8h,   #2
+        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
-        ld1             {v18.16b},        [x0]
-        urhadd          v30.16b, v30.16b, v18.16b
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
  .endif
-        st1             {v30.16b},        [x0], x2
+        st1             {v30.16B},        [x0], x2

        ret
 .endm

 .macro  pixels8         rnd=1, avg=0
-1:      ld1             {v0.8b}, [x1], x2
-        ld1             {v1.8b}, [x1], x2
-        ld1             {v2.8b}, [x1], x2
-        ld1             {v3.8b}, [x1], x2
+1:      ld1             {v0.8B}, [x1], x2
+        ld1             {v1.8B}, [x1], x2
+        ld1             {v2.8B}, [x1], x2
+        ld1             {v3.8B}, [x1], x2
  .if \avg
-        ld1             {v4.8b}, [x0], x2
-        urhadd          v0.8b,  v0.8b,  v4.8b
-        ld1             {v5.8b}, [x0], x2
-        urhadd          v1.8b,  v1.8b,  v5.8b
-        ld1             {v6.8b}, [x0], x2
-        urhadd          v2.8b,  v2.8b,  v6.8b
-        ld1             {v7.8b}, [x0], x2
-        urhadd          v3.8b,  v3.8b,  v7.8b
+        ld1             {v4.8B}, [x0], x2
+        urhadd          v0.8B,  v0.8B,  v4.8B
+        ld1             {v5.8B}, [x0], x2
+        urhadd          v1.8B,  v1.8B,  v5.8B
+        ld1             {v6.8B}, [x0], x2
+        urhadd          v2.8B,  v2.8B,  v6.8B
+        ld1             {v7.8B}, [x0], x2
+        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
-        st1             {v0.8b}, [x0], x2
-        st1             {v1.8b}, [x0], x2
-        st1             {v2.8b}, [x0], x2
-        st1             {v3.8b}, [x0], x2
+        st1             {v0.8B}, [x0], x2
+        st1             {v1.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
+        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels8_x2      rnd=1, avg=0
-1:      ld1             {v0.8b, v1.8b}, [x1], x2
-        ext             v1.8b,  v0.8b,  v1.8b,  #1
-        ld1             {v2.8b, v3.8b}, [x1], x2
-        ext             v3.8b,  v2.8b,  v3.8b,  #1
+1:      ld1             {v0.8B, v1.8B}, [x1], x2
+        ext             v1.8B,  v0.8B,  v1.8B,  #1
+        ld1             {v2.8B, v3.8B}, [x1], x2
+        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
-        avg             v0.8b,   v0.8b,   v1.8b
-        avg             v2.8b,   v2.8b,   v3.8b
+        avg             v0.8B,   v0.8B,   v1.8B
+        avg             v2.8B,   v2.8B,   v3.8B
  .if \avg
-        ld1             {v4.8b},     [x0], x2
-        ld1             {v5.8b},     [x0]
-        urhadd          v0.8b,   v0.8b,   v4.8b
-        urhadd          v2.8b,   v2.8b,   v5.8b
+        ld1             {v4.8B},     [x0], x2
+        ld1             {v5.8B},     [x0]
+        urhadd          v0.8B,   v0.8B,   v4.8B
+        urhadd          v2.8B,   v2.8B,   v5.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v0.8b}, [x0], x2
-        st1             {v2.8b}, [x0], x2
+        st1             {v0.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
 .endm

 .macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.8b},  [x1], x2
-        ld1             {v1.8b},  [x1], x2
+        ld1             {v0.8B},  [x1], x2
+        ld1             {v1.8B},  [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v4.8b,  v0.8b,  v1.8b
-        ld1             {v0.8b},  [x1], x2
-        avg             v5.8b,  v0.8b,  v1.8b
-        ld1             {v1.8b},  [x1], x2
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
+        ld1             {v1.8B},  [x1], x2
  .if \avg
-        ld1             {v2.8b},     [x0], x2
-        ld1             {v3.8b},     [x0]
-        urhadd          v4.8b,  v4.8b,  v2.8b
-        urhadd          v5.8b,  v5.8b,  v3.8b
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8b},     [x0], x2
-        st1             {v5.8b},     [x0], x2
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
        b.ne            1b

-        avg             v4.8b,  v0.8b,  v1.8b
-        ld1             {v0.8b},  [x1], x2
-        avg             v5.8b,  v0.8b,  v1.8b
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
-        ld1             {v2.8b},     [x0], x2
-        ld1             {v3.8b},     [x0]
-        urhadd          v4.8b,  v4.8b,  v2.8b
-        urhadd          v5.8b,  v5.8b,  v3.8b
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
-        st1             {v4.8b},     [x0], x2
-        st1             {v5.8b},     [x0], x2
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2

        ret
 .endm

 .macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
-        ld1             {v0.16b},     [x1], x2
-        ld1             {v1.16b},     [x1], x2
+        ld1             {v0.16B},     [x1], x2
+        ld1             {v1.16B},     [x1], x2
 NRND    movi            v19.8H, #1
-        ext             v4.16b,  v0.16b,  v4.16b,  #1
-        ext             v6.16b,  v1.16b,  v6.16b,  #1
-        uaddl           v16.8h,  v0.8b,  v4.8b
-        uaddl           v17.8h,  v1.8b,  v6.8b
+        ext             v4.16B,  v0.16B,  v4.16B,  #1
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        uaddl           v17.8H,  v1.8B,  v6.8B
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16b},     [x1], x2
-        add             v18.8h, v16.8h,  v17.8h
-        ext             v4.16b,  v0.16b,  v4.16b,  #1
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
+        ext             v4.16B,  v0.16B,  v4.16B,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8h,  v0.8b,  v4.8b
-        mshrn           v5.8b,  v18.8h, #2
-        ld1             {v1.16b},     [x1], x2
-        add             v18.8h, v16.8h,  v17.8h
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        ld1             {v1.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
  .if \avg
-        ld1             {v7.8b},     [x0]
-        urhadd          v5.8b,  v5.8b,  v7.8b
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8b},     [x0], x2
-        mshrn           v7.8b,  v18.8h, #2
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
  .if \avg
-        ld1             {v5.8b},     [x0]
-        urhadd          v7.8b,  v7.8b,  v5.8b
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
-        ext             v6.16b,  v1.16b,  v6.16b,  #1
-        uaddl           v17.8h,  v1.8b,   v6.8b
-        st1             {v7.8b},     [x0], x2
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v17.8H,  v1.8B,   v6.8B
+        st1             {v7.8B},     [x0], x2
        b.gt            1b

-        ld1             {v0.16b},     [x1], x2
-        add             v18.8h, v16.8h, v17.8h
-        ext             v4.16b, v0.16b, v4.16b,  #1
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H, v17.8H
+        ext             v4.16B, v0.16B, v4.16B,  #1
 NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8h,  v0.8b, v4.8b
-        mshrn           v5.8b,  v18.8h, #2
-        add             v18.8h, v16.8h, v17.8h
+        uaddl           v16.8H,  v0.8B, v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        add             v18.8H, v16.8H, v17.8H
  .if \avg
-        ld1             {v7.8b},     [x0]
-        urhadd          v5.8b,  v5.8b,  v7.8b
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
 NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8b},     [x0], x2
-        mshrn           v7.8b,  v18.8h, #2
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
  .if \avg
-        ld1             {v5.8b},     [x0]
-        urhadd          v7.8b,  v7.8b,  v5.8b
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
-        st1             {v7.8b},     [x0], x2
+        st1             {v7.8B},     [x0], x2

        ret
 .endm
@@ -19,7 +19,6 @@
 #ifndef AVCODEC_AARCH64_IDCT_H
 #define AVCODEC_AARCH64_IDCT_H

-#include <stddef.h>
 #include <stdint.h>

 void ff_simple_idct_neon(int16_t *data);
@@ -17,133 +17,133 @@
 */

 .macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
-        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
-        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
-        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
-        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
-        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
-        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
-        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
+        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
+        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
+        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
+        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B

-        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
-        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
-        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
-        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
-        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
-        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
-        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
-        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h
+        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
+        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
+        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
+        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
+        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
+        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
+        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
+        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H

-        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
-        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
+        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S

-        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
-        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
+        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S

-        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
-        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s
+        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
+        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S

-        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
-        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
+        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
+        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
 .endm

 .macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-        trn1            \t0\().16b, \r0\().16b, \r1\().16b
-        trn2            \t1\().16b, \r0\().16b, \r1\().16b
-        trn1            \r1\().16b, \r2\().16b, \r3\().16b
-        trn2            \r3\().16b, \r2\().16b, \r3\().16b
-        trn1            \r0\().16b, \r4\().16b, \r5\().16b
-        trn2            \r5\().16b, \r4\().16b, \r5\().16b
-        trn1            \r2\().16b, \r6\().16b, \r7\().16b
-        trn2            \r7\().16b, \r6\().16b, \r7\().16b
+        trn1            \t0\().16B, \r0\().16B, \r1\().16B
+        trn2            \t1\().16B, \r0\().16B, \r1\().16B
+        trn1            \r1\().16B, \r2\().16B, \r3\().16B
+        trn2            \r3\().16B, \r2\().16B, \r3\().16B
+        trn1            \r0\().16B, \r4\().16B, \r5\().16B
+        trn2            \r5\().16B, \r4\().16B, \r5\().16B
+        trn1            \r2\().16B, \r6\().16B, \r7\().16B
+        trn2            \r7\().16B, \r6\().16B, \r7\().16B

-        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
-        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
-        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
-        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
-        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
-        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
-        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
-        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h
+        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
+        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
+        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
+        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
+        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
+        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
+        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H

-        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
-        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
+        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
+        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S

-        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
-        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
+        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
+        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S

-        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
-        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s
+        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
+        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S

-        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
-        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
+        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
 .endm

 .macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().16b, \r0\().16b,  \r1\().16b
-        trn2            \t5\().16b, \r0\().16b,  \r1\().16b
-        trn1            \t6\().16b, \r2\().16b,  \r3\().16b
-        trn2            \t7\().16b, \r2\().16b,  \r3\().16b
+        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
+        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
+        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
+        trn2            \t7\().16B, \r2\().16B,  \r3\().16B

-        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
-        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
-        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
-        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
+        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
+        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
+        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
+        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
 .endm

 .macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
-        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
-        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
-        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B

-        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
-        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
-        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
-        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
+        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
+        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
+        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
+        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
 .endm

 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
-        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
-        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
-        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
-        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h
-        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
-        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
-        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
-        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
+        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
+        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
+        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
+        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
+        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
+        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
 .endm

 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
-        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
-        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
-        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
-        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
-        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
-        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
-        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
+        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
+        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
+        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
+        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
+        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
+        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H

-        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
-        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
-        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
-        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
-        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
-        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
-        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
-        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s
+        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
+        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
+        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
+        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
+        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
+        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
+        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S

-        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
-        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
+        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
+        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D

-        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
-        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
+        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
+        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D

-        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
-        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d
+        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
+        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D

-        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
-        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d
+        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
+        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D

 .endm
@@ -33,81 +33,81 @@ const tab_x2, align=4
 endconst

 function ff_opus_deemphasis_neon, export=1
-        movrel          x4, tab_st
-        ld1             {v4.4s}, [x4]
-        movrel          x4, tab_x0
-        ld1             {v5.4s}, [x4]
-        movrel          x4, tab_x1
-        ld1             {v6.4s}, [x4]
-        movrel          x4, tab_x2
-        ld1             {v7.4s}, [x4]
+        movrel  x4, tab_st
+        ld1    {v4.4s}, [x4]
+        movrel  x4, tab_x0
+        ld1    {v5.4s}, [x4]
+        movrel  x4, tab_x1
+        ld1    {v6.4s}, [x4]
+        movrel  x4, tab_x2
+        ld1    {v7.4s}, [x4]

-        fmul            v0.4s, v4.4s, v0.s[0]
+        fmul v0.4s, v4.4s, v0.s[0]

-1:      ld1             {v1.4s, v2.4s}, [x1], #32
+1:      ld1  {v1.4s, v2.4s}, [x1], #32

-        fmla            v0.4s, v5.4s, v1.s[0]
-        fmul            v3.4s, v7.4s, v2.s[2]
+        fmla v0.4s, v5.4s, v1.s[0]
+        fmul v3.4s, v7.4s, v2.s[2]

-        fmla            v0.4s, v6.4s, v1.s[1]
-        fmla            v3.4s, v6.4s, v2.s[1]
+        fmla v0.4s, v6.4s, v1.s[1]
+        fmla v3.4s, v6.4s, v2.s[1]

-        fmla            v0.4s, v7.4s, v1.s[2]
-        fmla            v3.4s, v5.4s, v2.s[0]
+        fmla v0.4s, v7.4s, v1.s[2]
+        fmla v3.4s, v5.4s, v2.s[0]

-        fadd            v1.4s, v1.4s, v0.4s
-        fadd            v2.4s, v2.4s, v3.4s
+        fadd v1.4s, v1.4s, v0.4s
+        fadd v2.4s, v2.4s, v3.4s

-        fmla            v2.4s, v4.4s, v1.s[3]
+        fmla v2.4s, v4.4s, v1.s[3]

-        st1             {v1.4s, v2.4s}, [x0], #32
-        fmul            v0.4s, v4.4s, v2.s[3]
+        st1  {v1.4s, v2.4s}, [x0], #32
+        fmul v0.4s, v4.4s, v2.s[3]

-        subs            w2, w2, #8
-        b.gt            1b
+        subs w2, w2, #8
+        b.gt 1b

-        mov             s0, v2.s[3]
+        mov s0, v2.s[3]

        ret
 endfunc

 function ff_opus_postfilter_neon, export=1
-        ld1             {v0.4s}, [x2]
-        dup             v1.4s, v0.s[1]
-        dup             v2.4s, v0.s[2]
-        dup             v0.4s, v0.s[0]
+        ld1 {v0.4s}, [x2]
+        dup v1.4s, v0.s[1]
+        dup v2.4s, v0.s[2]
+        dup v0.4s, v0.s[0]

-        add             w1, w1, #2
-        sub             x1, x0, x1, lsl #2
+        add w1, w1, #2
+        sub x1, x0, x1, lsl #2

-        ld1             {v3.4s}, [x1]
-        fmul            v3.4s, v3.4s, v2.4s
+        ld1 {v3.4s}, [x1]
+        fmul v3.4s, v3.4s, v2.4s

-1:      add             x1, x1, #4
-        ld1             {v4.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v5.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v6.4s}, [x1]
-        add             x1, x1, #4
-        ld1             {v7.4s}, [x1]
+1:      add x1, x1, #4
+        ld1 {v4.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v5.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v6.4s}, [x1]
+        add x1, x1, #4
+        ld1 {v7.4s}, [x1]

-        fmla            v3.4s, v7.4s, v2.4s
-        fadd            v6.4s, v6.4s, v4.4s
+        fmla v3.4s, v7.4s, v2.4s
+        fadd v6.4s, v6.4s, v4.4s

-        ld1             {v4.4s}, [x0]
-        fmla            v4.4s, v5.4s, v0.4s
+        ld1 {v4.4s}, [x0]
+        fmla v4.4s, v5.4s, v0.4s

-        fmul            v6.4s, v6.4s, v1.4s
-        fadd            v6.4s, v6.4s, v3.4s
+        fmul v6.4s, v6.4s, v1.4s
+        fadd v6.4s, v6.4s, v3.4s

-        fadd            v4.4s, v4.4s, v6.4s
-        fmul            v3.4s, v7.4s, v2.4s
+        fadd v4.4s, v4.4s, v6.4s
+        fmul v3.4s, v7.4s, v2.4s

-        st1             {v4.4s}, [x0], #16
+        st1  {v4.4s}, [x0], #16

-        subs            w3, w3, #4
-        b.gt            1b
+        subs w3, w3, #4
+        b.gt 1b

        ret
 endfunc
@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64
-1:      ld1             {v0.4s}, [x0]
-        ld1             {v1.4s}, [x1], #16
-        fadd            v0.4s, v0.4s, v1.4s
-        ld1             {v2.4s}, [x2], #16
-        fadd            v0.4s, v0.4s, v2.4s
-        ld1             {v3.4s}, [x3], #16
-        fadd            v0.4s, v0.4s, v3.4s
-        ld1             {v4.4s}, [x4], #16
-        fadd            v0.4s, v0.4s, v4.4s
-        st1             {v0.4s}, [x0], #16
+1:      ld1             {v0.4S}, [x0]
+        ld1             {v1.4S}, [x1], #16
+        fadd            v0.4S, v0.4S, v1.4S
+        ld1             {v2.4S}, [x2], #16
+        fadd            v0.4S, v0.4S, v2.4S
+        ld1             {v3.4S}, [x3], #16
+        fadd            v0.4S, v0.4S, v3.4S
+        ld1             {v4.4S}, [x4], #16
+        fadd            v0.4S, v0.4S, v4.4S
+        st1             {v0.4S}, [x0], #16
        subs            x5, x5, #4
        b.gt            1b
        ret
 endfunc

 function ff_sbr_sum_square_neon, export=1
-        movi            v0.4s, #0
-1:      ld1             {v1.4s}, [x0], #16
-        fmla            v0.4s, v1.4s, v1.4s
+        movi            v0.4S, #0
+1:      ld1             {v1.4S}, [x0], #16
+        fmla            v0.4S, v1.4S, v1.4S
        subs            w1, w1, #2
        b.gt            1b
-        faddp           v0.4s, v0.4s, v0.4s
-        faddp           v0.4s, v0.4s, v0.4s
+        faddp           v0.4S, v0.4S, v0.4S
+        faddp           v0.4S, v0.4S, v0.4S
        ret
 endfunc

 function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0
-        movi            v5.4s, #1<<7, lsl #24
-        ld2             {v0.4s, v1.4s}, [x0], #32
-        eor             v1.16b, v1.16b, v5.16b
-        ld2             {v2.4s, v3.4s}, [x0], #32
+        movi            v5.4S, #1<<7, lsl #24
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
 .rept 3
-        st2             {v0.4s, v1.4s}, [x1], #32
-        eor             v3.16b, v3.16b, v5.16b
-        ld2             {v0.4s, v1.4s}, [x0], #32
-        st2             {v2.4s, v3.4s}, [x1], #32
-        eor             v1.16b, v1.16b, v5.16b
-        ld2             {v2.4s, v3.4s}, [x0], #32
+        st2             {v0.4S, v1.4S}, [x1], #32
+        eor             v3.16B, v3.16B, v5.16B
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
 .endr
-        eor             v3.16b, v3.16b, v5.16b
-        st2             {v0.4s, v1.4s}, [x1], #32
-        st2             {v2.4s, v3.4s}, [x1], #32
+        eor             v3.16B, v3.16B, v5.16B
+        st2             {v0.4S, v1.4S}, [x1], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
        ret
 endfunc

@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x2, x0, #64*4
        mov             x3, #-16
        mov             x4, #-4
-        movi            v6.4s, #1<<7, lsl #24
-        ld1             {v0.2s}, [x0], #8
-        st1             {v0.2s}, [x2], #8
+        movi            v6.4S, #1<<7, lsl #24
+        ld1             {v0.2S}, [x0], #8
+        st1             {v0.2S}, [x2], #8
 .rept 7
-        ld1             {v1.4s}, [x1], x3
-        ld1             {v2.4s}, [x0], #16
-        eor             v1.16b, v1.16b, v6.16b
-        rev64           v1.4s, v1.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        st2             {v1.4s, v2.4s}, [x2], #32
+        ld1             {v1.4S}, [x1], x3
+        ld1             {v2.4S}, [x0], #16
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st2             {v1.4S, v2.4S}, [x2], #32
 .endr
        add             x1, x1, #8
-        ld1             {v1.2s}, [x1], x4
-        ld1             {v2.2s}, [x0], #8
-        ld1             {v1.s}[3], [x1]
-        ld1             {v2.s}[2], [x0]
-        eor             v1.16b, v1.16b, v6.16b
-        rev64           v1.4s, v1.4s
-        st2             {v1.2s, v2.2s}, [x2], #16
-        st2             {v1.s, v2.s}[2], [x2]
+        ld1             {v1.2S}, [x1], x4
+        ld1             {v2.2S}, [x0], #8
+        ld1             {v1.S}[3], [x1]
+        ld1             {v2.S}[2], [x0]
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        st2             {v1.2S, v2.2S}, [x2], #16
+        st2             {v1.S, v2.S}[2], [x2]
        ret
 endfunc

@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4
        mov             x3, #-16
        mov             x4, #32
-        movi            v6.4s, #1<<7, lsl #24
-1:      ld1             {v0.4s}, [x2], x3
-        ld1             {v1.4s}, [x1], #16
-        eor             v0.16b, v0.16b, v6.16b
-        rev64           v0.4s, v0.4s
-        ext             v0.16b, v0.16b, v0.16b, #8
-        st2             {v0.4s, v1.4s}, [x0], #32
+        movi            v6.4S, #1<<7, lsl #24
+1:      ld1             {v0.4S}, [x2], x3
+        ld1             {v1.4S}, [x1], #16
+        eor             v0.16B, v0.16B, v6.16B
+        rev64           v0.4S, v0.4S
+        ext             v0.16B, v0.16B, v0.16B, #8
+        st2             {v0.4S, v1.4S}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
        add             x2, x0, #60*4
        mov             x3, #-32
        mov             x4, #32
-        movi            v2.4s, #1<<7, lsl #24
-1:      ld2             {v0.4s, v1.4s}, [x1], x3
-        eor             v0.16b, v0.16b, v2.16b
-        rev64           v1.4s, v1.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v0.4s}, [x2]
-        st1             {v1.4s}, [x0], #16
+        movi            v2.4S, #1<<7, lsl #24
+1:      ld2             {v0.4S, v1.4S}, [x1], x3
+        eor             v0.16B, v0.16B, v2.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st1             {v0.4S}, [x2]
+        st1             {v1.4S}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x3, x0, #124*4
        mov             x4, #64
        mov             x5, #-16
-1:      ld1             {v0.4s}, [x1], #16
-        ld1             {v1.4s}, [x2], x5
-        rev64           v2.4s, v0.4s
-        ext             v2.16b, v2.16b, v2.16b, #8
-        rev64           v3.4s, v1.4s
-        ext             v3.16b, v3.16b, v3.16b, #8
-        fadd            v1.4s, v1.4s, v2.4s
-        fsub            v0.4s, v0.4s, v3.4s
-        st1             {v0.4s}, [x0], #16
-        st1             {v1.4s}, [x3], x5
+1:      ld1             {v0.4S}, [x1], #16
+        ld1             {v1.4S}, [x2], x5
+        rev64           v2.4S, v0.4S
+        ext             v2.16B, v2.16B, v2.16B, #8
+        rev64           v3.4S, v1.4S
+        ext             v3.16B, v3.16B, v3.16B, #8
+        fadd            v1.4S, v1.4S, v2.4S
+        fsub            v0.4S, v0.4S, v3.4S
+        st1             {v0.4S}, [x0], #16
+        st1             {v1.4S}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
-        ld1             {v7.4s}, [x6]
-        dup             v1.4s, v0.s[0]
-        mov             v2.8b, v1.8b
-        mov             v2.s[2], v7.s[0]
-        mov             v2.s[3], v7.s[0]
-        fmul            v1.4s, v1.4s, v2.4s
-        ld1             {v0.d}[0], [x3]
-        ld1             {v0.d}[1], [x2]
-        fmul            v0.4s, v0.4s, v1.4s
-        fmul            v1.4s, v0.4s, v7.4s
-        rev64           v0.4s, v0.4s
+        ld1             {v7.4S}, [x6]
+        dup             v1.4S, v0.S[0]
+        mov             v2.8B, v1.8B
+        mov             v2.S[2], v7.S[0]
+        mov             v2.S[3], v7.S[0]
+        fmul            v1.4S, v1.4S, v2.4S
+        ld1             {v0.D}[0], [x3]
+        ld1             {v0.D}[1], [x2]
+        fmul            v0.4S, v0.4S, v1.4S
+        fmul            v1.4S, v0.4S, v7.4S
+        rev64           v0.4S, v0.4S
        sub             x7, x5, x4
        add             x0, x0, x4, lsl #3
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16
-1:      ld1             {v2.4s}, [x1], #16
-        ld1             {v3.2s}, [x1]
-        fmul            v4.4s, v2.4s, v1.4s
-        fmul            v5.4s, v2.4s, v0.4s
-        faddp           v4.4s, v4.4s, v4.4s
-        faddp           v5.4s, v5.4s, v5.4s
-        faddp           v4.4s, v4.4s, v4.4s
-        faddp           v5.4s, v5.4s, v5.4s
-        mov             v4.s[1], v5.s[0]
-        fadd            v4.2s, v4.2s, v3.2s
-        st1             {v4.2s}, [x0], #8
+1:      ld1             {v2.4S}, [x1], #16
+        ld1             {v3.2S}, [x1]
+        fmul            v4.4S, v2.4S, v1.4S
+        fmul            v5.4S, v2.4S, v0.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        mov             v4.S[1], v5.S[0]
+        fadd            v4.2S, v4.2S, v3.2S
+        st1             {v4.2S}, [x0], #8
        sub             x1, x1, #8
        subs            x7, x7, #1
        b.gt            1b
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x4, w4
        mov             x5, #40*2*4
        add             x1, x1, x4, lsl #3
-1:      ld1             {v0.2s}, [x1], x5
-        ld1             {v1.s}[0], [x2], #4
-        fmul            v2.4s, v0.4s, v1.s[0]
-        st1             {v2.2s}, [x0], #8
+1:      ld1             {v0.2S}, [x1], x5
+        ld1             {v1.S}[0], [x2], #4
+        fmul            v2.4S, v0.4S, v1.S[0]
+        st1             {v2.2S}, [x0], #8
        subs            x3, x3, #1
        b.gt            1b
        ret
@@ -227,46 +227,46 @@ endfunc
 function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38
        movrel          x3, factors
-        ld1             {v0.4s}, [x3]
-        movi            v1.4s, #0
-        movi            v2.4s, #0
-        movi            v3.4s, #0
-        ld1             {v4.2s}, [x0], #8
-        ld1             {v5.2s}, [x0], #8
-        fmul            v16.2s, v4.2s, v4.2s
-        fmul            v17.2s, v5.2s, v4.s[0]
-        fmul            v18.2s, v5.2s, v4.s[1]
-1:      ld1             {v5.d}[1], [x0], #8
-        fmla            v1.2s, v4.2s, v4.2s
-        fmla            v2.4s, v5.4s, v4.s[0]
-        fmla            v3.4s, v5.4s, v4.s[1]
-        mov             v4.d[0], v5.d[0]
-        mov             v5.d[0], v5.d[1]
+        ld1             {v0.4S}, [x3]
+        movi            v1.4S, #0
+        movi            v2.4S, #0
+        movi            v3.4S, #0
+        ld1             {v4.2S}, [x0], #8
+        ld1             {v5.2S}, [x0], #8
+        fmul            v16.2S, v4.2S, v4.2S
+        fmul            v17.2S, v5.2S, v4.S[0]
+        fmul            v18.2S, v5.2S, v4.S[1]
+1:      ld1             {v5.D}[1], [x0], #8
+        fmla            v1.2S, v4.2S, v4.2S
+        fmla            v2.4S, v5.4S, v4.S[0]
+        fmla            v3.4S, v5.4S, v4.S[1]
+        mov             v4.D[0], v5.D[0]
+        mov             v5.D[0], v5.D[1]
        subs            x2, x2, #1
        b.gt            1b
-        fmul            v19.2s, v4.2s, v4.2s
-        fmul            v20.2s, v5.2s, v4.s[0]
-        fmul            v21.2s, v5.2s, v4.s[1]
-        fadd            v22.4s, v2.4s, v20.4s
-        fsub            v22.4s, v22.4s, v17.4s
-        fadd            v23.4s, v3.4s, v21.4s
-        fsub            v23.4s, v23.4s, v18.4s
-        rev64           v23.4s, v23.4s
-        fmul            v23.4s, v23.4s, v0.4s
-        fadd            v22.4s, v22.4s, v23.4s
-        st1             {v22.4s}, [x1], #16
-        fadd            v23.2s, v1.2s, v19.2s
-        fsub            v23.2s, v23.2s, v16.2s
-        faddp           v23.2s, v23.2s, v23.2s
-        st1             {v23.s}[0], [x1]
+        fmul            v19.2S, v4.2S, v4.2S
+        fmul            v20.2S, v5.2S, v4.S[0]
+        fmul            v21.2S, v5.2S, v4.S[1]
+        fadd            v22.4S, v2.4S, v20.4S
+        fsub            v22.4S, v22.4S, v17.4S
+        fadd            v23.4S, v3.4S, v21.4S
+        fsub            v23.4S, v23.4S, v18.4S
+        rev64           v23.4S, v23.4S
+        fmul            v23.4S, v23.4S, v0.4S
+        fadd            v22.4S, v22.4S, v23.4S
+        st1             {v22.4S}, [x1], #16
+        fadd            v23.2S, v1.2S, v19.2S
+        fsub            v23.2S, v23.2S, v16.2S
+        faddp           v23.2S, v23.2S, v23.2S
+        st1             {v23.S}[0], [x1]
        add             x1, x1, #8
-        rev64           v3.2s, v3.2s
-        fmul            v3.2s, v3.2s, v0.2s
-        fadd            v2.2s, v2.2s, v3.2s
-        st1             {v2.2s}, [x1]
+        rev64           v3.2S, v3.2S
+        fmul            v3.2S, v3.2S, v0.2S
+        fadd            v2.2S, v2.2S, v3.2S
+        st1             {v2.2S}, [x1]
        add             x1, x1, #16
-        faddp           v1.2s, v1.2s, v1.2s
-        st1             {v1.s}[0], [x1]
+        faddp           v1.2S, v1.2S, v1.2S
+        st1             {v1.S}[0], [x1]
        ret
 endfunc

@@ -278,25 +278,25 @@ endfunc
 1:      and             x3, x3, #0x1ff
        add             x8, x7, x3, lsl #3
        add             x3, x3, #2
-        ld1             {v2.4s}, [x0]
-        ld1             {v3.2s}, [x1], #8
-        ld1             {v4.2s}, [x2], #8
-        ld1             {v5.4s}, [x8]
-        mov             v6.16b, v2.16b
-        zip1            v3.4s, v3.4s, v3.4s
-        zip1            v4.4s, v4.4s, v4.4s
-        fmla            v6.4s, v1.4s, v3.4s
-        fmla            v2.4s, v5.4s, v4.4s
-        fcmeq           v7.4s, v3.4s, #0
-        bif             v2.16b, v6.16b, v7.16b
-        st1             {v2.4s}, [x0], #16
+        ld1             {v2.4S}, [x0]
+        ld1             {v3.2S}, [x1], #8
+        ld1             {v4.2S}, [x2], #8
+        ld1             {v5.4S}, [x8]
+        mov             v6.16B, v2.16B
+        zip1            v3.4S, v3.4S, v3.4S
+        zip1            v4.4S, v4.4S, v4.4S
+        fmla            v6.4S, v1.4S, v3.4S
+        fmla            v2.4S, v5.4S, v4.4S
+        fcmeq           v7.4S, v3.4S, #0
+        bif             v2.16B, v6.16B, v7.16B
+        st1             {v2.4S}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
 .endm

 function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc

 function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
-        ld1             {v1.4s}, [x9]
+        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
 endfunc
@@ -54,7 +54,7 @@ endconst
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
-        ld1             {v0.2d}, [x3]
+        ld1             {v0.2D}, [x3]
 .endm

 .macro idct_end
@@ -74,146 +74,146 @@ endconst
 .endm

 .macro idct_col4_top y1, y2, y3, y4, i, l
-        smull\i         v7.4s,  \y3\l, z2
-        smull\i         v16.4s, \y3\l, z6
-        smull\i         v17.4s, \y2\l, z1
-        add             v19.4s, v23.4s, v7.4s
-        smull\i         v18.4s, \y2\l, z3
-        add             v20.4s, v23.4s, v16.4s
-        smull\i         v5.4s,  \y2\l, z5
-        sub             v21.4s, v23.4s, v16.4s
-        smull\i         v6.4s,  \y2\l, z7
-        sub             v22.4s, v23.4s, v7.4s
+        smull\i         v7.4S,  \y3\l, z2
+        smull\i         v16.4S, \y3\l, z6
+        smull\i         v17.4S, \y2\l, z1
+        add             v19.4S, v23.4S, v7.4S
+        smull\i         v18.4S, \y2\l, z3
+        add             v20.4S, v23.4S, v16.4S
+        smull\i         v5.4S,  \y2\l, z5
+        sub             v21.4S, v23.4S, v16.4S
+        smull\i         v6.4S,  \y2\l, z7
+        sub             v22.4S, v23.4S, v7.4S

-        smlal\i         v17.4s, \y4\l, z3
-        smlsl\i         v18.4s, \y4\l, z7
-        smlsl\i         v5.4s,  \y4\l, z1
-        smlsl\i         v6.4s,  \y4\l, z5
+        smlal\i         v17.4S, \y4\l, z3
+        smlsl\i         v18.4S, \y4\l, z7
+        smlsl\i         v5.4S,  \y4\l, z1
+        smlsl\i         v6.4S,  \y4\l, z5
 .endm

 .macro idct_row4_neon y1, y2, y3, y4, pass
-        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
-        movi            v23.4s, #1<<2, lsl #8
-        orr             v5.16b, \y1\().16b, \y2\().16b
-        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
-        orr             v6.16b, \y3\().16b, \y4\().16b
-        orr             v5.16b, v5.16b, v6.16b
-        mov             x3, v5.d[1]
-        smlal           v23.4s, \y1\().4h, z4
+        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
+        movi            v23.4S, #1<<2, lsl #8
+        orr             v5.16B, \y1\().16B, \y2\().16B
+        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
+        orr             v6.16B, \y3\().16B, \y4\().16B
+        orr             v5.16B, v5.16B, v6.16B
+        mov             x3, v5.D[1]
+        smlal           v23.4S, \y1\().4H, z4

-        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4h
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H

        cmp             x3, #0
        b.eq            \pass\()f

-        smull2          v7.4s, \y1\().8h, z4
-        smlal2          v17.4s, \y2\().8h, z5
-        smlsl2          v18.4s, \y2\().8h, z1
-        smull2          v16.4s, \y3\().8h, z2
-        smlal2          v5.4s, \y2\().8h, z7
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v7.4s
-        sub             v21.4s, v21.4s, v7.4s
-        add             v22.4s, v22.4s, v7.4s
-        smlal2          v6.4s, \y2\().8h, z3
-        smull2          v7.4s, \y3\().8h, z6
-        smlal2          v17.4s, \y4\().8h, z7
-        smlsl2          v18.4s, \y4\().8h, z5
-        smlal2          v5.4s, \y4\().8h, z3
-        smlsl2          v6.4s, \y4\().8h, z1
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v16.4s
-        add             v21.4s, v21.4s, v16.4s
-        sub             v22.4s, v22.4s, v7.4s
+        smull2          v7.4S, \y1\().8H, z4
+        smlal2          v17.4S, \y2\().8H, z5
+        smlsl2          v18.4S, \y2\().8H, z1
+        smull2          v16.4S, \y3\().8H, z2
+        smlal2          v5.4S, \y2\().8H, z7
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+        smlal2          v6.4S, \y2\().8H, z3
+        smull2          v7.4S, \y3\().8H, z6
+        smlal2          v17.4S, \y4\().8H, z7
+        smlsl2          v18.4S, \y4\().8H, z5
+        smlal2          v5.4S, \y4\().8H, z3
+        smlsl2          v6.4S, \y4\().8H, z1
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+        sub             v22.4S, v22.4S, v7.4S

 \pass:  add             \y3\().4S, v19.4S, v17.4S
-        add             \y4\().4s, v20.4s, v18.4s
-        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
-        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
-        add             v7.4s, v21.4s, v5.4s
-        add             v16.4s, v22.4s, v6.4s
-        shrn            \y3\().4h, v7.4s, #ROW_SHIFT
-        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
-        sub             v22.4s, v22.4s, v6.4s
-        sub             v19.4s, v19.4s, v17.4s
-        sub             v21.4s, v21.4s, v5.4s
-        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
-        sub             v20.4s, v20.4s, v18.4s
-        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
-        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
-        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT
+        add             \y4\().4S, v20.4S, v18.4S
+        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
+        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
+        add             v7.4S, v21.4S, v5.4S
+        add             v16.4S, v22.4S, v6.4S
+        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
+        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
+        sub             v22.4S, v22.4S, v6.4S
+        sub             v19.4S, v19.4S, v17.4S
+        sub             v21.4S, v21.4S, v5.4S
+        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
+        sub             v20.4S, v20.4S, v18.4S
+        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
+        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
+        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT

-        trn1            v16.8h, \y1\().8h, \y2\().8h
-        trn2            v17.8h, \y1\().8h, \y2\().8h
-        trn1            v18.8h, \y3\().8h, \y4\().8h
-        trn2            v19.8h, \y3\().8h, \y4\().8h
-        trn1            \y1\().4s, v16.4s, v18.4s
-        trn1            \y2\().4s, v17.4s, v19.4s
-        trn2            \y3\().4s, v16.4s, v18.4s
-        trn2            \y4\().4s, v17.4s, v19.4s
+        trn1            v16.8H, \y1\().8H, \y2\().8H
+        trn2            v17.8H, \y1\().8H, \y2\().8H
+        trn1            v18.8H, \y3\().8H, \y4\().8H
+        trn2            v19.8H, \y3\().8H, \y4\().8H
+        trn1            \y1\().4S, v16.4S, v18.4S
+        trn1            \y2\().4S, v17.4S, v19.4S
+        trn2            \y3\().4S, v16.4S, v18.4S
+        trn2            \y4\().4S, v17.4S, v19.4S
 .endm

 .macro declare_idct_col4_neon i, l
 function idct_col4_neon\i
-        dup             v23.4h, z4c
+        dup             v23.4H, z4c
 .if \i == 1
-        add             v23.4h, v23.4h, v24.4h
+        add             v23.4H, v23.4H, v24.4H
 .else
-        mov             v5.d[0], v24.d[1]
-        add             v23.4h, v23.4h, v5.4h
+        mov             v5.D[0], v24.D[1]
+        add             v23.4H, v23.4H, v5.4H
 .endif
-        smull           v23.4s, v23.4h, z4
+        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

-        mov             x4, v28.d[\i - 1]
-        mov             x5, v29.d[\i - 1]
+        mov             x4, v28.D[\i - 1]
+        mov             x5, v29.D[\i - 1]
        cmp             x4, #0
        b.eq            1f

-        smull\i         v7.4s,  v28\l,  z4
-        add             v19.4s, v19.4s, v7.4s
-        sub             v20.4s, v20.4s, v7.4s
-        sub             v21.4s, v21.4s, v7.4s
-        add             v22.4s, v22.4s, v7.4s
+        smull\i         v7.4S,  v28\l,  z4
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S

-1:      mov             x4, v30.d[\i - 1]
+1:      mov             x4, v30.D[\i - 1]
        cmp             x5, #0
        b.eq            2f

-        smlal\i         v17.4s, v29\l, z5
-        smlsl\i         v18.4s, v29\l, z1
-        smlal\i         v5.4s,  v29\l, z7
-        smlal\i         v6.4s,  v29\l, z3
+        smlal\i         v17.4S, v29\l, z5
+        smlsl\i         v18.4S, v29\l, z1
+        smlal\i         v5.4S,  v29\l, z7
+        smlal\i         v6.4S,  v29\l, z3

-2:      mov             x5, v31.d[\i - 1]
+2:      mov             x5, v31.D[\i - 1]
        cmp             x4, #0
        b.eq            3f

-        smull\i         v7.4s,  v30\l, z6
-        smull\i         v16.4s, v30\l, z2
-        add             v19.4s, v19.4s, v7.4s
-        sub             v22.4s, v22.4s, v7.4s
-        sub             v20.4s, v20.4s, v16.4s
-        add             v21.4s, v21.4s, v16.4s
+        smull\i         v7.4S,  v30\l, z6
+        smull\i         v16.4S, v30\l, z2
+        add             v19.4S, v19.4S, v7.4S
+        sub             v22.4S, v22.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S

 3:      cmp             x5, #0
        b.eq            4f

-        smlal\i         v17.4s, v31\l, z7
-        smlsl\i         v18.4s, v31\l, z5
-        smlal\i         v5.4s,  v31\l, z3
-        smlsl\i         v6.4s,  v31\l, z1
+        smlal\i         v17.4S, v31\l, z7
+        smlsl\i         v18.4S, v31\l, z5
+        smlal\i         v5.4S,  v31\l, z3
+        smlsl\i         v6.4S,  v31\l, z1

-4:      addhn           v7.4h, v19.4s, v17.4s
-        addhn2          v7.8h, v20.4s, v18.4s
-        subhn           v18.4h, v20.4s, v18.4s
-        subhn2          v18.8h, v19.4s, v17.4s
+4:      addhn           v7.4H, v19.4S, v17.4S
+        addhn2          v7.8H, v20.4S, v18.4S
+        subhn           v18.4H, v20.4S, v18.4S
+        subhn2          v18.8H, v19.4S, v17.4S

-        addhn           v16.4h, v21.4s, v5.4s
-        addhn2          v16.8h, v22.4s, v6.4s
-        subhn           v17.4h, v22.4s, v6.4s
-        subhn2          v17.8h, v21.4s, v5.4s
+        addhn           v16.4H, v21.4S, v5.4S
+        addhn2          v16.8H, v22.4S, v6.4S
+        subhn           v17.4H, v22.4S, v6.4S
+        subhn2          v17.8H, v21.4S, v5.4S

        ret
 endfunc
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
-        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
-        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
-        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16
+        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
-        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
-        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
-        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16
+        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16

-        zip1            v16.4s, v1.4s, v2.4s
-        zip2            v17.4s, v1.4s, v2.4s
+        zip1            v16.4S, v1.4S, v2.4S
+        zip2            v17.4S, v1.4S, v2.4S

-        st1             {v16.d}[0], [x0], x1
-        st1             {v16.d}[1], [x0], x1
+        st1             {v16.D}[0], [x0], x1
+        st1             {v16.D}[1], [x0], x1

-        zip1            v18.4s, v3.4s, v4.4s
-        zip2            v19.4s, v3.4s, v4.4s
+        zip1            v18.4S, v3.4S, v4.4S
+        zip2            v19.4S, v3.4S, v4.4S

-        st1             {v17.d}[0], [x0], x1
-        st1             {v17.d}[1], [x0], x1
-        st1             {v18.d}[0], [x0], x1
-        st1             {v18.d}[1], [x0], x1
-        st1             {v19.d}[0], [x0], x1
-        st1             {v19.d}[1], [x0], x1
+        st1             {v17.D}[0], [x0], x1
+        st1             {v17.D}[1], [x0], x1
+        st1             {v18.D}[0], [x0], x1
+        st1             {v18.D}[1], [x0], x1
+        st1             {v19.D}[0], [x0], x1
+        st1             {v19.D}[1], [x0], x1

        idct_end
 endfunc
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

-        sshr            v1.8h, v7.8h, #COL_SHIFT-16
-        sshr            v2.8h, v16.8h, #COL_SHIFT-16
-        sshr            v3.8h, v17.8h, #COL_SHIFT-16
-        sshr            v4.8h, v18.8h, #COL_SHIFT-16
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8h, v7.8h, #COL_SHIFT-16
-        sshr            v16.8h, v16.8h, #COL_SHIFT-16
-        sshr            v17.8h, v17.8h, #COL_SHIFT-16
-        sshr            v18.8h, v18.8h, #COL_SHIFT-16
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        mov             x9,  x0
-        ld1             {v19.d}[0], [x0], x1
-        zip1            v23.2d, v1.2d, v7.2d
-        zip2            v24.2d, v1.2d, v7.2d
-        ld1             {v19.d}[1], [x0], x1
-        zip1            v25.2d, v2.2d, v16.2d
-        zip2            v26.2d, v2.2d, v16.2d
-        ld1             {v20.d}[0], [x0], x1
-        zip1            v27.2d, v3.2d, v17.2d
-        zip2            v28.2d, v3.2d, v17.2d
-        ld1             {v20.d}[1], [x0], x1
-        zip1            v29.2d, v4.2d, v18.2d
-        zip2            v30.2d, v4.2d, v18.2d
-        ld1             {v21.d}[0], [x0], x1
-        uaddw           v23.8h, v23.8h, v19.8b
-        uaddw2          v24.8h, v24.8h, v19.16b
-        ld1             {v21.d}[1], [x0], x1
-        sqxtun          v23.8b, v23.8h
-        sqxtun2         v23.16b, v24.8h
-        ld1             {v22.d}[0], [x0], x1
-        uaddw           v24.8h, v25.8h, v20.8b
-        uaddw2          v25.8h, v26.8h, v20.16b
-        ld1             {v22.d}[1], [x0], x1
-        sqxtun          v24.8b, v24.8h
-        sqxtun2         v24.16b, v25.8h
-        st1             {v23.d}[0], [x9], x1
-        uaddw           v25.8h, v27.8h, v21.8b
-        uaddw2          v26.8h, v28.8h, v21.16b
-        st1             {v23.d}[1], [x9], x1
-        sqxtun          v25.8b, v25.8h
-        sqxtun2         v25.16b, v26.8h
-        st1             {v24.d}[0], [x9], x1
-        uaddw           v26.8h, v29.8h, v22.8b
-        uaddw2          v27.8h, v30.8h, v22.16b
-        st1             {v24.d}[1], [x9], x1
-        sqxtun          v26.8b, v26.8h
-        sqxtun2         v26.16b, v27.8h
-        st1             {v25.d}[0], [x9], x1
-        st1             {v25.d}[1], [x9], x1
-        st1             {v26.d}[0], [x9], x1
-        st1             {v26.d}[1], [x9], x1
+        ld1             {v19.D}[0], [x0], x1
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        ld1             {v19.D}[1], [x0], x1
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        ld1             {v20.D}[0], [x0], x1
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        ld1             {v20.D}[1], [x0], x1
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        ld1             {v21.D}[0], [x0], x1
+        uaddw           v23.8H, v23.8H, v19.8B
+        uaddw2          v24.8H, v24.8H, v19.16B
+        ld1             {v21.D}[1], [x0], x1
+        sqxtun          v23.8B, v23.8H
+        sqxtun2         v23.16B, v24.8H
+        ld1             {v22.D}[0], [x0], x1
+        uaddw           v24.8H, v25.8H, v20.8B
+        uaddw2          v25.8H, v26.8H, v20.16B
+        ld1             {v22.D}[1], [x0], x1
+        sqxtun          v24.8B, v24.8H
+        sqxtun2         v24.16B, v25.8H
+        st1             {v23.D}[0], [x9], x1
+        uaddw           v25.8H, v27.8H, v21.8B
+        uaddw2          v26.8H, v28.8H, v21.16B
+        st1             {v23.D}[1], [x9], x1
+        sqxtun          v25.8B, v25.8H
+        sqxtun2         v25.16B, v26.8H
+        st1             {v24.D}[0], [x9], x1
+        uaddw           v26.8H, v29.8H, v22.8B
+        uaddw2          v27.8H, v30.8H, v22.16B
+        st1             {v24.D}[1], [x9], x1
+        sqxtun          v26.8B, v26.8H
+        sqxtun2         v26.16B, v27.8H
+        st1             {v25.D}[0], [x9], x1
+        st1             {v25.D}[1], [x9], x1
+        st1             {v26.D}[0], [x9], x1
+        st1             {v26.D}[1], [x9], x1

        idct_end
 endfunc
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
        sub             x2, x2, #128
        bl              idct_col4_neon1

-        sshr            v1.8h, v7.8h, #COL_SHIFT-16
-        sshr            v2.8h, v16.8h, #COL_SHIFT-16
-        sshr            v3.8h, v17.8h, #COL_SHIFT-16
-        sshr            v4.8h, v18.8h, #COL_SHIFT-16
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

-        sshr            v7.8h, v7.8h, #COL_SHIFT-16
-        sshr            v16.8h, v16.8h, #COL_SHIFT-16
-        sshr            v17.8h, v17.8h, #COL_SHIFT-16
-        sshr            v18.8h, v18.8h, #COL_SHIFT-16
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16

-        zip1            v23.2d, v1.2d, v7.2d
-        zip2            v24.2d, v1.2d, v7.2d
-        st1             {v23.2d,v24.2d}, [x2], #32
-        zip1            v25.2d, v2.2d, v16.2d
-        zip2            v26.2d, v2.2d, v16.2d
-        st1             {v25.2d,v26.2d}, [x2], #32
-        zip1            v27.2d, v3.2d, v17.2d
-        zip2            v28.2d, v3.2d, v17.2d
-        st1             {v27.2d,v28.2d}, [x2], #32
-        zip1            v29.2d, v4.2d, v18.2d
-        zip2            v30.2d, v4.2d, v18.2d
-        st1             {v29.2d,v30.2d}, [x2], #32
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        st1             {v23.2D,v24.2D}, [x2], #32
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        st1             {v25.2D,v26.2D}, [x2], #32
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        st1             {v27.2D,v28.2D}, [x2], #32
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        st1             {v29.2D,v30.2D}, [x2], #32

        idct_end
 endfunc
@@ -330,32 +330,32 @@ endfunc
        //   v17: hev

        // convert to signed value:
-        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
-        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

-        movi            v20.8h, #3
-        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
-        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
-        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
-        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
-        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
-        mul             v19.8h, v19.8h, v20.8h
+        movi           v20.8h, #3
+        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul            v19.8h, v19.8h, v20.8h

-        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
-        movi            v22.16b, #4
-        movi            v23.16b, #3
+        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi           v22.16b, #4
+        movi           v23.16b, #3
    .if \inner
-        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
-        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
-        saddw2          v19.8h,  v19.8h, v20.16b
-        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
-        sqxtn2          v18.16b, v19.8h
+        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2         v19.8h,  v19.8h, v20.16b
+        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
-        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
-        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
-        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit
+        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
        //   P0 = s2u(PS0 + c2);

    .if \simple
-        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
-        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
-        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
-        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
-        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
-        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
-        and             v20.16b, v18.16b, v17.16b           // w & hev
-        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
-        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
-        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        and            v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
@@ -424,35 +424,35 @@ endfunc
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
-        movi            v17.8h,  #63
-        sshll           v22.8h,  v18.8b, #3
-        sshll2          v23.8h,  v18.16b, #3
-        saddw           v22.8h,  v22.8h, v18.8b
-        saddw2          v23.8h,  v23.8h, v18.16b
-        add             v16.8h,  v17.8h, v22.8h
-        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
-        add             v19.8h,  v16.8h, v22.8h
-        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
-        add             v22.8h,  v19.8h, v22.8h
-        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
-        sqshrn          v16.8b,  v16.8h,  #7
-        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
-        sqshrn          v19.8b,  v19.8h, #7
-        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
-        sqshrn          v22.8b,  v22.8h, #7
-        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
-        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
-        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
-        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
-        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
-        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
-        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
-        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
-        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
-        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
-        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
-        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
-        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+        movi           v17.8h,  #63
+        sshll          v22.8h,  v18.8b, #3
+        sshll2         v23.8h,  v18.16b, #3
+        saddw          v22.8h,  v22.8h, v18.8b
+        saddw2         v23.8h,  v23.8h, v18.16b
+        add            v16.8h,  v17.8h, v22.8h
+        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add            v19.8h,  v16.8h, v22.8h
+        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add            v22.8h,  v19.8h, v22.8h
+        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn         v16.8b,  v16.8h,  #7
+        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn         v19.8b,  v19.8h, #7
+        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn         v22.8b,  v22.8h, #7
+        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
 .endm

@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
-        ld1             {v0.d}[0],     [x0], x2  // P3
-        ld1             {v0.d}[1],     [x1], x2  // P3
-        ld1             {v1.d}[0],     [x0], x2  // P2
-        ld1             {v1.d}[1],     [x1], x2  // P2
-        ld1             {v2.d}[0],     [x0], x2  // P1
-        ld1             {v2.d}[1],     [x1], x2  // P1
-        ld1             {v3.d}[0],     [x0], x2  // P0
-        ld1             {v3.d}[1],     [x1], x2  // P0
-        ld1             {v4.d}[0],     [x0], x2  // Q0
-        ld1             {v4.d}[1],     [x1], x2  // Q0
-        ld1             {v5.d}[0],     [x0], x2  // Q1
-        ld1             {v5.d}[1],     [x1], x2  // Q1
-        ld1             {v6.d}[0],     [x0], x2  // Q2
-        ld1             {v6.d}[1],     [x1], x2  // Q2
-        ld1             {v7.d}[0],     [x0]      // Q3
-        ld1             {v7.d}[1],     [x1]      // Q3
+        ld1          {v0.d}[0],     [x0], x2  // P3
+        ld1          {v0.d}[1],     [x1], x2  // P3
+        ld1          {v1.d}[0],     [x0], x2  // P2
+        ld1          {v1.d}[1],     [x1], x2  // P2
+        ld1          {v2.d}[0],     [x0], x2  // P1
+        ld1          {v2.d}[1],     [x1], x2  // P1
+        ld1          {v3.d}[0],     [x0], x2  // P0
+        ld1          {v3.d}[1],     [x1], x2  // P0
+        ld1          {v4.d}[0],     [x0], x2  // Q0
+        ld1          {v4.d}[1],     [x1], x2  // Q0
+        ld1          {v5.d}[0],     [x0], x2  // Q1
+        ld1          {v5.d}[1],     [x1], x2  // Q1
+        ld1          {v6.d}[0],     [x0], x2  // Q2
+        ld1          {v6.d}[1],     [x1], x2  // Q2
+        ld1          {v7.d}[0],     [x0]      // Q3
+        ld1          {v7.d}[1],     [x1]      // Q3

-        dup             v22.16b, w3                 // flim_E
-        dup             v23.16b, w4                 // flim_I
+        dup          v22.16b, w3                 // flim_E
+        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
-        sub             x0,  x0,  x2,  lsl #2
-        sub             x1,  x1,  x2,  lsl #2
-        sub             x0,  x0,  x2,  lsl #1
-        sub             x1,  x1,  x2,  lsl #1
+        sub          x0,  x0,  x2,  lsl #2
+        sub          x1,  x1,  x2,  lsl #2
+        sub          x0,  x0,  x2,  lsl #1
+        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

-        st1             {v1.d}[0],     [x0], x2  // P2
-        st1             {v1.d}[1],     [x1], x2  // P2
-        st1             {v2.d}[0],     [x0], x2  // P1
-        st1             {v2.d}[1],     [x1], x2  // P1
-        st1             {v3.d}[0],     [x0], x2  // P0
-        st1             {v3.d}[1],     [x1], x2  // P0
-        st1             {v4.d}[0],     [x0], x2  // Q0
-        st1             {v4.d}[1],     [x1], x2  // Q0
-        st1             {v5.d}[0],     [x0], x2  // Q1
-        st1             {v5.d}[1],     [x1], x2  // Q1
-        st1             {v6.d}[0],     [x0]      // Q2
-        st1             {v6.d}[1],     [x1]      // Q2
+        st1          {v1.d}[0],     [x0], x2  // P2
+        st1          {v1.d}[1],     [x1], x2  // P2
+        st1          {v2.d}[0],     [x0], x2  // P1
+        st1          {v2.d}[1],     [x1], x2  // P1
+        st1          {v3.d}[0],     [x0], x2  // P0
+        st1          {v3.d}[1],     [x1], x2  // P0
+        st1          {v4.d}[0],     [x0], x2  // Q0
+        st1          {v4.d}[1],     [x1], x2  // Q0
+        st1          {v5.d}[0],     [x0], x2  // Q1
+        st1          {v5.d}[1],     [x1], x2  // Q1
+        st1          {v6.d}[0],     [x0]      // Q2
+        st1          {v6.d}[1],     [x1]      // Q2

        ret
 endfunc
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x1,  x1,  #4

        // Load pixels:
-        ld1             {v0.d}[0],     [x0], x2 // load u
-        ld1             {v0.d}[1],     [x1], x2 // load v
-        ld1             {v1.d}[0],     [x0], x2
-        ld1             {v1.d}[1],     [x1], x2
-        ld1             {v2.d}[0],     [x0], x2
-        ld1             {v2.d}[1],     [x1], x2
-        ld1             {v3.d}[0],     [x0], x2
-        ld1             {v3.d}[1],     [x1], x2
-        ld1             {v4.d}[0],     [x0], x2
-        ld1             {v4.d}[1],     [x1], x2
-        ld1             {v5.d}[0],     [x0], x2
-        ld1             {v5.d}[1],     [x1], x2
-        ld1             {v6.d}[0],     [x0], x2
-        ld1             {v6.d}[1],     [x1], x2
-        ld1             {v7.d}[0],     [x0], x2
-        ld1             {v7.d}[1],     [x1], x2
+        ld1          {v0.d}[0],     [x0], x2 // load u
+        ld1          {v0.d}[1],     [x1], x2 // load v
+        ld1          {v1.d}[0],     [x0], x2
+        ld1          {v1.d}[1],     [x1], x2
+        ld1          {v2.d}[0],     [x0], x2
+        ld1          {v2.d}[1],     [x1], x2
+        ld1          {v3.d}[0],     [x0], x2
+        ld1          {v3.d}[1],     [x1], x2
+        ld1          {v4.d}[0],     [x0], x2
+        ld1          {v4.d}[1],     [x1], x2
+        ld1          {v5.d}[0],     [x0], x2
+        ld1          {v5.d}[1],     [x1], x2
+        ld1          {v6.d}[0],     [x0], x2
+        ld1          {v6.d}[1],     [x1], x2
+        ld1          {v7.d}[0],     [x0], x2
+        ld1          {v7.d}[1],     [x1], x2

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

-        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
-        st1             {v0.d}[0],     [x0], x2 // load u
-        st1             {v0.d}[1],     [x1], x2 // load v
-        st1             {v1.d}[0],     [x0], x2
-        st1             {v1.d}[1],     [x1], x2
-        st1             {v2.d}[0],     [x0], x2
-        st1             {v2.d}[1],     [x1], x2
-        st1             {v3.d}[0],     [x0], x2
-        st1             {v3.d}[1],     [x1], x2
-        st1             {v4.d}[0],     [x0], x2
-        st1             {v4.d}[1],     [x1], x2
-        st1             {v5.d}[0],     [x0], x2
-        st1             {v5.d}[1],     [x1], x2
-        st1             {v6.d}[0],     [x0], x2
-        st1             {v6.d}[1],     [x1], x2
-        st1             {v7.d}[0],     [x0]
-        st1             {v7.d}[1],     [x1]
+        st1          {v0.d}[0],     [x0], x2 // store u
+        st1          {v0.d}[1],     [x1], x2 // store v
+        st1          {v1.d}[0],     [x0], x2
+        st1          {v1.d}[1],     [x1], x2
+        st1          {v2.d}[0],     [x0], x2
+        st1          {v2.d}[1],     [x1], x2
+        st1          {v3.d}[0],     [x0], x2
+        st1          {v3.d}[1],     [x1], x2
+        st1          {v4.d}[0],     [x0], x2
+        st1          {v4.d}[1],     [x1], x2
+        st1          {v5.d}[0],     [x0], x2
+        st1          {v5.d}[1],     [x1], x2
+        st1          {v6.d}[0],     [x0], x2
+        st1          {v6.d}[1],     [x1], x2
+        st1          {v7.d}[0],     [x0]
+        st1          {v7.d}[1],     [x1]

        ret

@@ -230,9 +230,6 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        // reduced dst stride
 .if \size >= 16
        sub             x1,  x1,  x5
-.elseif \size == 4
-        add             x12, x2,  #8
-        add             x13, x7,  #8
 .endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
@@ -251,14 +248,9 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
-.elseif \size == 8
+.else
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
-.else // \size == 4
-        ld1             {v4.8b},  [x2]
-        ld1             {v16.8b}, [x7]
-        ld1             {v5.s}[0],  [x12], x3
-        ld1             {v17.s}[0], [x13], x3
 .endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
@@ -104,26 +104,26 @@ static int aasc_decode_frame(AVCodecContext *avctx,
        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
        break;
    case MKTAG('A', 'A', 'S', 'C'):
-        switch (compr) {
-        case 0:
-            stride = (avctx->width * psize + psize) & ~psize;
-            if (buf_size < stride * avctx->height)
-                return AVERROR_INVALIDDATA;
-            for (i = avctx->height - 1; i >= 0; i--) {
-                memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
-                buf += stride;
-                buf_size -= stride;
-            }
-            break;
-        case 1:
-            bytestream2_init(&s->gb, buf, buf_size);
-            ff_msrle_decode(avctx, s->frame, 8, &s->gb);
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+    switch (compr) {
+    case 0:
+        stride = (avctx->width * psize + psize) & ~psize;
+        if (buf_size < stride * avctx->height)
            return AVERROR_INVALIDDATA;
+        for (i = avctx->height - 1; i >= 0; i--) {
+            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
+            buf += stride;
+            buf_size -= stride;
        }
        break;
+    case 1:
+        bytestream2_init(&s->gb, buf, buf_size);
+        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+        return AVERROR_INVALIDDATA;
+    }
+        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
        return -1;
@@ -75,7 +75,6 @@
 #define AC3_DYNAMIC_RANGE1      0

 typedef int                     INTFLOAT;
-typedef unsigned int            UINTFLOAT;
 typedef int16_t                 SHORTFLOAT;

 #else /* USE_FIXED */
@@ -95,7 +94,6 @@ typedef int16_t                 SHORTFLOAT;
 #define AC3_DYNAMIC_RANGE1      1.0f

 typedef float                   INTFLOAT;
-typedef float                   UINTFLOAT;
 typedef float                   SHORTFLOAT;

 #endif /* USE_FIXED */
@@ -179,9 +179,7 @@ int av_ac3_parse_header(const uint8_t *buf, size_t size,
    AC3HeaderInfo hdr;
    int err;

-    err = init_get_bits8(&gb, buf, size);
-    if (err < 0)
-        return AVERROR_INVALIDDATA;
+    init_get_bits8(&gb, buf, size);
    err = ff_ac3_parse_header(&gb, &hdr);
    if (err < 0)
        return AVERROR_INVALIDDATA;
@@ -43,7 +43,7 @@ int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)

    FF_ALLOC_OR_GOTO(s->avctx, s->windowed_samples, AC3_WINDOW_SIZE *
                     sizeof(*s->windowed_samples), alloc_fail);
-    FF_ALLOCZ_ARRAY_OR_GOTO(s->avctx, s->planar_samples, s->channels, sizeof(*s->planar_samples),
+    FF_ALLOC_ARRAY_OR_GOTO(s->avctx, s->planar_samples, s->channels, sizeof(*s->planar_samples),
                     alloc_fail);
    for (ch = 0; ch < s->channels; ch++) {
        FF_ALLOCZ_OR_GOTO(s->avctx, s->planar_samples[ch],
@@ -423,8 +423,8 @@ static int decode_inter_plane(AGMContext *s, GetBitContext *gb, int size,
                int map = s->map[x];

                if (orig_mv_x >= -32) {
-                    if (y * 8 + mv_y < 0 || y * 8 + mv_y + 8 > h ||
-                        x * 8 + mv_x < 0 || x * 8 + mv_x + 8 > w)
+                    if (y * 8 + mv_y < 0 || y * 8 + mv_y + 8 >= h ||
+                        x * 8 + mv_x < 0 || x * 8 + mv_x + 8 >= w)
                        return AVERROR_INVALIDDATA;

                    copy_block8(frame->data[plane] + (s->blocks_h - 1 - y) * 8 * frame->linesize[plane] + x * 8,
@@ -470,7 +470,8 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
        }
    }

-    ctx->slice_data = av_calloc(ctx->slice_width, AIC_BAND_COEFFS * sizeof(*ctx->slice_data));
+    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
+                                * sizeof(*ctx->slice_data));
    if (!ctx->slice_data) {
        av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");

@@ -302,9 +302,6 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
        decorr_shift       = get_bits(&alac->gb, 8);
        decorr_left_weight = get_bits(&alac->gb, 8);

-        if (channels == 2 && decorr_left_weight && decorr_shift > 31)
-            return AVERROR_INVALIDDATA;
-
        for (ch = 0; ch < channels; ch++) {
            prediction_type[ch]   = get_bits(&alac->gb, 4);
            lpc_quant[ch]         = get_bits(&alac->gb, 4);
@@ -29,12 +29,12 @@ static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
    int i;

    for (i = 0; i < nb_samples; i++) {
-        uint32_t a, b;
+        int32_t a, b;

        a = buffer[0][i];
        b = buffer[1][i];

-        a -= (int)(b * decorr_left_weight) >> decorr_shift;
+        a -= (b * decorr_left_weight) >> decorr_shift;
        b += a;

        buffer[0][i] = b;
@@ -679,7 +679,9 @@ extern AVCodec ff_xsub_decoder;
 /* external libraries */
 extern AVCodec ff_aac_at_encoder;
 extern AVCodec ff_aac_at_decoder;
+extern AVCodec ff_aac_mf_encoder;
 extern AVCodec ff_ac3_at_decoder;
+extern AVCodec ff_ac3_mf_encoder;
 extern AVCodec ff_adpcm_ima_qt_at_decoder;
 extern AVCodec ff_alac_at_encoder;
 extern AVCodec ff_alac_at_decoder;
@@ -691,6 +693,7 @@ extern AVCodec ff_ilbc_at_decoder;
 extern AVCodec ff_mp1_at_decoder;
 extern AVCodec ff_mp2_at_decoder;
 extern AVCodec ff_mp3_at_decoder;
+extern AVCodec ff_mp3_mf_encoder;
 extern AVCodec ff_pcm_alaw_at_encoder;
 extern AVCodec ff_pcm_alaw_at_decoder;
 extern AVCodec ff_pcm_mulaw_at_encoder;
@@ -754,8 +757,6 @@ extern AVCodec ff_idf_decoder;

 /* external libraries, that shouldn't be used by default if one of the
 * above is available */
-extern AVCodec ff_aac_mf_encoder;
-extern AVCodec ff_ac3_mf_encoder;
 extern AVCodec ff_h263_v4l2m2m_encoder;
 extern AVCodec ff_libaom_av1_decoder;
 extern AVCodec ff_libopenh264_encoder;
@@ -788,7 +789,6 @@ extern AVCodec ff_mjpeg_cuvid_decoder;
 extern AVCodec ff_mjpeg_qsv_encoder;
 extern AVCodec ff_mjpeg_qsv_decoder;
 extern AVCodec ff_mjpeg_vaapi_encoder;
-extern AVCodec ff_mp3_mf_encoder;
 extern AVCodec ff_mpeg1_cuvid_decoder;
 extern AVCodec ff_mpeg2_cuvid_decoder;
 extern AVCodec ff_mpeg2_qsv_encoder;
@@ -762,7 +762,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
            }

            for (k = 2; k < opt_order; k++)
-                quant_cof[k] = (quant_cof[k] * (1U << 14)) + (add_base << 13);
+                quant_cof[k] = (quant_cof[k] * (1 << 14)) + (add_base << 13);
        }
    }

@@ -1016,10 +1016,6 @@ static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
    ALSSpecificConfig *sconf = &ctx->sconf;

    *bd->shift_lsbs = 0;
-
-    if (get_bits_left(gb) < 7)
-        return AVERROR_INVALIDDATA;
-
    // read block type flag and read the samples accordingly
    if (get_bits1(gb)) {
        ret = read_var_block_data(ctx, bd);
@@ -1632,7 +1628,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
    AVCodecContext *avctx    = ctx->avctx;
    GetBitContext *gb = &ctx->gb;
    unsigned int div_blocks[32];                ///< block sizes.
-    int c;
+    unsigned int c;
    unsigned int js_blocks[2];
    uint32_t bs_info = 0;
    int ret;
@@ -1810,17 +1806,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
    else
        ctx->cur_frame_length = sconf->frame_length;

-    ctx->highest_decoded_channel = -1;
+    ctx->highest_decoded_channel = 0;
    // decode the frame data
    if ((invalid_frame = read_frame_data(ctx, ra_frame)) < 0)
        av_log(ctx->avctx, AV_LOG_WARNING,
               "Reading frame data failed. Skipping RA unit.\n");

-    if (ctx->highest_decoded_channel == -1) {
-        av_log(ctx->avctx, AV_LOG_WARNING,
-               "No channel data decoded.\n");
+    if (ctx->highest_decoded_channel == 0)
        return AVERROR_INVALIDDATA;
-    }

    ctx->frame_id++;

@@ -2116,8 +2109,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
        ctx->nbits  = av_malloc_array(ctx->cur_frame_length, sizeof(*ctx->nbits));
        ctx->mlz    = av_mallocz(sizeof(*ctx->mlz));

-        if (!ctx->larray || !ctx->nbits || !ctx->mlz || !ctx->acf || !ctx->shift_value
-            || !ctx->last_shift_value || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
+        if (!ctx->mlz || !ctx->acf || !ctx->shift_value || !ctx->last_shift_value
+            || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
            av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
            ret = AVERROR(ENOMEM);
            goto fail;
@@ -2128,10 +2121,6 @@ static av_cold int decode_init(AVCodecContext *avctx)

        for (c = 0; c < avctx->channels; ++c) {
            ctx->raw_mantissa[c] = av_mallocz_array(ctx->cur_frame_length, sizeof(**ctx->raw_mantissa));
-            if (!ctx->raw_mantissa[c]) {
-                av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
-                return AVERROR(ENOMEM);
-            }
        }
    }

@@ -431,8 +431,7 @@ static int decode_frame(AVCodecContext *avctx,
                    s->args[s->nb_args] = FFMAX(s->args[s->nb_args], 0) * 10 + buf[0] - '0';
                break;
            case ';':
-                if (s->nb_args < MAX_NB_ARGS)
-                    s->nb_args++;
+                s->nb_args++;
                if (s->nb_args < MAX_NB_ARGS)
                    s->args[s->nb_args] = 0;
                break;
@@ -475,11 +474,6 @@ static av_cold int decode_close(AVCodecContext *avctx)
    return 0;
 }

-static const AVCodecDefault ansi_defaults[] = {
-    { "max_pixels", "640*480" },
-    { NULL },
-};
-
 AVCodec ff_ansi_decoder = {
    .name           = "ansi",
    .long_name      = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"),
@@ -491,5 +485,4 @@ AVCodec ff_ansi_decoder = {
    .decode         = decode_frame,
    .capabilities   = AV_CODEC_CAP_DR1,
    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
-    .defaults       = ansi_defaults,
 };
@@ -102,7 +102,7 @@ typedef struct APEFilter {
    int16_t *historybuffer; ///< filter memory
    int16_t *delay;         ///< filtered values

-    uint32_t avg;
+    int avg;
 } APEFilter;

 typedef struct APERice {
@@ -852,7 +852,7 @@ static av_always_inline int filter_fast_3320(APEPredictor *p,
    }

    predictionA = p->buf[delayA] * 2U - p->buf[delayA - 1];
-    p->lastA[filter] = decoded + (unsigned)((int32_t)(predictionA  * p->coeffsA[filter][0]) >> 9);
+    p->lastA[filter] = decoded + ((int32_t)(predictionA  * p->coeffsA[filter][0]) >> 9);

    if ((decoded ^ predictionA) > 0)
        p->coeffsA[filter][0]++;
@@ -882,8 +882,8 @@ static av_always_inline int filter_3800(APEPredictor *p,
        return predictionA;
    }
    d2 =  p->buf[delayA];
-    d1 = (p->buf[delayA] - (unsigned)p->buf[delayA - 1]) * 2;
-    d0 =  p->buf[delayA] + ((p->buf[delayA - 2] - (unsigned)p->buf[delayA - 1]) * 8);
+    d1 = (p->buf[delayA] - p->buf[delayA - 1]) * 2U;
+    d0 =  p->buf[delayA] + ((p->buf[delayA - 2] - p->buf[delayA - 1]) * 8U);
    d3 =  p->buf[delayB] * 2U - p->buf[delayB - 1];
    d4 =  p->buf[delayB];

@@ -903,7 +903,7 @@ static av_always_inline int filter_3800(APEPredictor *p,
    p->coeffsB[filter][0] += (((d3 >> 29) & 4) - 2) * sign;
    p->coeffsB[filter][1] -= (((d4 >> 30) & 2) - 1) * sign;

-    p->filterB[filter] = p->lastA[filter] + (unsigned)(predictionB >> shift);
+    p->filterB[filter] = p->lastA[filter] + (predictionB >> shift);
    p->filterA[filter] = p->filterB[filter] + (unsigned)((int)(p->filterA[filter] * 31U) >> 5);

    return p->filterA[filter];
@@ -928,7 +928,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift, int len
            dotprod += delay[j] * (unsigned)coeffs[j];
            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
        }
-        buffer[i] -= (unsigned)(dotprod >> shift);
+        buffer[i] -= dotprod >> shift;
        for (j = 0; j < order - 1; j++)
            delay[j] = delay[j + 1];
        delay[order - 1] = buffer[i];
@@ -952,7 +952,7 @@ static void long_filter_ehigh_3830(int32_t *buffer, int length)
        for (j = 7; j > 0; j--)
            delay[j] = delay[j - 1];
        delay[0] = buffer[i];
-        buffer[i] -= (unsigned)(dotprod >> 9);
+        buffer[i] -= dotprod >> 9;
    }
 }

@@ -1061,13 +1061,13 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
                                                  const int delayA)
 {
    int32_t predictionA, sign;
-    uint32_t d0, d1, d2, d3;
+    int32_t d0, d1, d2, d3;

    p->buf[delayA]     = p->lastA[filter];
    d0 = p->buf[delayA    ];
-    d1 = p->buf[delayA    ] - (unsigned)p->buf[delayA - 1];
-    d2 = p->buf[delayA - 1] - (unsigned)p->buf[delayA - 2];
-    d3 = p->buf[delayA - 2] - (unsigned)p->buf[delayA - 3];
+    d1 = p->buf[delayA    ] - p->buf[delayA - 1];
+    d2 = p->buf[delayA - 1] - p->buf[delayA - 2];
+    d3 = p->buf[delayA - 2] - p->buf[delayA - 3];

    predictionA = d0 * p->coeffsA[filter][0] +
                  d1 * p->coeffsA[filter][1] +
@@ -1078,10 +1078,10 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
    p->filterA[filter] = p->lastA[filter] + ((int)(p->filterA[filter] * 31U) >> 5);

    sign = APESIGN(decoded);
-    p->coeffsA[filter][0] += (((int32_t)d0 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][1] += (((int32_t)d1 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][2] += (((int32_t)d2 < 0) * 2 - 1) * sign;
-    p->coeffsA[filter][3] += (((int32_t)d3 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][0] += ((d0 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][1] += ((d1 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][2] += ((d2 < 0) * 2 - 1) * sign;
+    p->coeffsA[filter][3] += ((d3 < 0) * 2 - 1) * sign;

    return p->filterA[filter];
 }
@@ -1309,7 +1309,7 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
            absres = res < 0 ? -(unsigned)res : res;
            if (absres)
                *f->adaptcoeffs = APESIGN(res) *
-                                  (8 << ((absres > f->avg * 3LL) + (absres > (f->avg + f->avg / 3))));
+                                  (8 << ((absres > f->avg * 3) + (absres > f->avg * 4 / 3)));
                /* equivalent to the following code
                    if (absres <= f->avg * 4 / 3)
                        *f->adaptcoeffs = APESIGN(res) * 8;
@@ -1559,7 +1559,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
        for (ch = 0; ch < s->channels; ch++) {
            sample8 = (uint8_t *)frame->data[ch];
            for (i = 0; i < blockstodecode; i++)
-                *sample8++ = (s->decoded[ch][i] + 0x80U) & 0xff;
+                *sample8++ = (s->decoded[ch][i] + 0x80) & 0xff;
        }
        break;
    case 16:
@@ -1573,7 +1573,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
        for (ch = 0; ch < s->channels; ch++) {
            sample24 = (int32_t *)frame->data[ch];
            for (i = 0; i < blockstodecode; i++)
-                *sample24++ = s->decoded[ch][i] * 256U;
+                *sample24++ = s->decoded[ch][i] * 256;
        }
        break;
    }
@@ -1581,24 +1581,13 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
    s->samples -= blockstodecode;

    if (avctx->err_recognition & AV_EF_CRCCHECK &&
-        s->fileversion >= 3900) {
+        s->fileversion >= 3900 && s->bps < 24) {
        uint32_t crc = s->CRC_state;
        const AVCRC *crc_tab = av_crc_get_table(AV_CRC_32_IEEE_LE);
-        int stride = s->bps == 24 ? 4 : (s->bps>>3);
-        int offset = s->bps == 24;
-        int bytes  = s->bps >> 3;
-
        for (i = 0; i < blockstodecode; i++) {
            for (ch = 0; ch < s->channels; ch++) {
-#if HAVE_BIGENDIAN
-                uint8_t *smp_native = frame->data[ch] + i*stride;
-                uint8_t smp[4];
-                for(int j = 0; j<stride; j++)
-                    smp[j] = smp_native[stride-j-1];
-#else
-                uint8_t *smp = frame->data[ch] + i*stride;
-#endif
-                crc = av_crc(crc_tab, crc, smp+offset, bytes);
+                uint8_t *smp = frame->data[ch] + (i*(s->bps >> 3));
+                crc = av_crc(crc_tab, crc, smp, s->bps >> 3);
            }
        }

@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
        vmov.32         r0,  d3[0]
        bx              lr
 endfunc
+
@@ -229,7 +229,7 @@ A .endif
  .endif

        // Begin loop
-1:
+01:
  .if TOTAL_TAPS == 0
        // Things simplify a lot in this case
        // In fact this could be pipelined further if it's worth it...
@@ -241,7 +241,7 @@ A .endif
        str     ST0, [PST, #-4]!
        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST0, [PSAMP], #4 * MAX_CHANNELS
-        bne     1b
+        bne     01b
  .else
    .if \fir_taps & 1
      .set LOAD_REG, 1
@@ -333,7 +333,7 @@ T       orr     AC0, AC0, AC1
        str     ST3, [PST, #-4]!
        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST3, [PSAMP], #4 * MAX_CHANNELS
-        bne     1b
+        bne     01b
  .endif
        b       99f

@@ -279,13 +279,11 @@ function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r1,  r1,  r5
 .endif
        @ size >= 16 loads two qwords and increments r2,
-        @ size 4 loads 1 d word, increments r2 and loads 1 32-bit lane
-        @ for size 8 it's enough with one qword and no postincrement
+        @ for size 4/8 it's enough with one qword and no
+        @ postincrement
 .if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
-.elseif \size == 4
-        sub             r3,  r3,  #8
 .endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
@@ -297,14 +295,9 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
-.elseif \size == 8
+.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
-.else @ size == 4
-        vld1.8          {d18}, [r2]!
-        vld1.8          {d24}, [r7]!
-        vld1.32         {d19[0]}, [r2]
-        vld1.32         {d25[0]}, [r7]
 .endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
@@ -362,10 +362,6 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
    ff_atrac_generate_tables();

    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    if (!q->fdsp) {
-        atrac1_decode_end(avctx);
-        return AVERROR(ENOMEM);
-    }

    q->bands[0] = q->low;
    q->bands[1] = q->mid;
@@ -45,10 +45,6 @@ static const enum AVPixelFormat pix_fmts_12bit[2][2] = {
    { AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV420P12 },
 };

-static const enum AVPixelFormat pix_fmts_rgb[3] = {
-    AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12,
-};
-
 static int av1_parser_parse(AVCodecParserContext *ctx,
                            AVCodecContext *avctx,
                            const uint8_t **out_data, int *out_size,
@@ -57,8 +53,6 @@ static int av1_parser_parse(AVCodecParserContext *ctx,
    AV1ParseContext *s = ctx->priv_data;
    CodedBitstreamFragment *td = &s->temporal_unit;
    CodedBitstreamAV1Context *av1 = s->cbc->priv_data;
-    AV1RawSequenceHeader *seq;
-    AV1RawColorConfig *color;
    int ret;

    *out_data = data;
@@ -92,12 +86,11 @@ static int av1_parser_parse(AVCodecParserContext *ctx,
        goto end;
    }

-    seq = av1->sequence_header;
-    color = &seq->color_config;
-
    for (int i = 0; i < td->nb_units; i++) {
        CodedBitstreamUnit *unit = &td->units[i];
        AV1RawOBU *obu = unit->content;
+        AV1RawSequenceHeader *seq = av1->sequence_header;
+        AV1RawColorConfig *color = &seq->color_config;
        AV1RawFrameHeader *frame;
        int frame_type;

@@ -134,6 +127,9 @@ static int av1_parser_parse(AVCodecParserContext *ctx,
            ctx->key_frame = frame_type == AV1_FRAME_KEY;
        }

+        avctx->profile = seq->seq_profile;
+        avctx->level   = seq->seq_level_idx[0];
+
        switch (frame_type) {
        case AV1_FRAME_KEY:
        case AV1_FRAME_INTRA_ONLY:
@@ -147,44 +143,33 @@ static int av1_parser_parse(AVCodecParserContext *ctx,
            break;
        }
        ctx->picture_structure = AV_PICTURE_STRUCTURE_FRAME;
-    }

-    switch (av1->bit_depth) {
-    case 8:
-        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY8
-                                         : pix_fmts_8bit [color->subsampling_x][color->subsampling_y];
-        break;
-    case 10:
-        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY10
-                                         : pix_fmts_10bit[color->subsampling_x][color->subsampling_y];
-        break;
-    case 12:
-        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY12
-                                         : pix_fmts_12bit[color->subsampling_x][color->subsampling_y];
-        break;
-    }
-    av_assert2(ctx->format != AV_PIX_FMT_NONE);
+        switch (av1->bit_depth) {
+        case 8:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY8
+                                             : pix_fmts_8bit [color->subsampling_x][color->subsampling_y];
+            break;
+        case 10:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY10
+                                             : pix_fmts_10bit[color->subsampling_x][color->subsampling_y];
+            break;
+        case 12:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY12
+                                             : pix_fmts_12bit[color->subsampling_x][color->subsampling_y];
+            break;
+        }
+        av_assert2(ctx->format != AV_PIX_FMT_NONE);

-    if (!color->subsampling_x && !color->subsampling_y &&
-        color->matrix_coefficients       == AVCOL_SPC_RGB &&
-        color->color_primaries           == AVCOL_PRI_BT709 &&
-        color->transfer_characteristics  == AVCOL_TRC_IEC61966_2_1)
-        ctx->format = pix_fmts_rgb[color->high_bitdepth + color->twelve_bit];
+        avctx->colorspace = (enum AVColorSpace) color->matrix_coefficients;
+        avctx->color_primaries = (enum AVColorPrimaries) color->color_primaries;
+        avctx->color_trc = (enum AVColorTransferCharacteristic) color->transfer_characteristics;
+        avctx->color_range = color->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;

-    avctx->pix_fmt = ctx->format;
-
-    avctx->profile = seq->seq_profile;
-    avctx->level   = seq->seq_level_idx[0];
-
-    avctx->colorspace = (enum AVColorSpace) color->matrix_coefficients;
-    avctx->color_primaries = (enum AVColorPrimaries) color->color_primaries;
-    avctx->color_trc = (enum AVColorTransferCharacteristic) color->transfer_characteristics;
-    avctx->color_range = color->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
-
-    if (ctx->width != avctx->width || ctx->height != avctx->height) {
-        ret = ff_set_dimensions(avctx, ctx->width, ctx->height);
-        if (ret < 0)
-            goto end;
+        if (ctx->width != avctx->width || ctx->height != avctx->height) {
+            ret = ff_set_dimensions(avctx, ctx->width, ctx->height);
+            if (ret < 0)
+                goto end;
+        }
    }

    if (avctx->framerate.num)
@@ -1294,10 +1294,6 @@ typedef struct AVCodecContext {
     *   this callback and filled with the extra buffers if there are more
     *   buffers than buf[] can hold. extended_buf will be freed in
     *   av_frame_unref().
-     *   Decoders will generally initialize the whole buffer before it is output
-     *   but it can in rare error conditions happen that uninitialized data is passed
-     *   through. \important The buffers returned by get_buffer* should thus not contain sensitive
-     *   data.
     *
     * If AV_CODEC_CAP_DR1 is not set then get_buffer2() must call
     * avcodec_default_get_buffer2() instead of providing buffers allocated by
@@ -54,8 +54,6 @@ static av_cold int init(AVCodecContext *avctx)
        }

        a->mjpeg_avctx = avcodec_alloc_context3(codec);
-        if (!a->mjpeg_avctx)
-            return AVERROR(ENOMEM);

        av_dict_set(&thread_opt, "threads", "1", 0); // Is this needed ?
        a->mjpeg_avctx->refcounted_frames = 1;
@@ -171,5 +169,5 @@ AVCodec ff_avrn_decoder = {
    .close          = end,
    .decode         = decode_frame,
    .max_lowres     = 3,
-    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
@@ -867,7 +867,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,

    binkb_init_bundles(c);
    ref_start = frame->data[plane_idx];
-    ref_end   = frame->data[plane_idx] + ((bh - 1) * frame->linesize[plane_idx] + bw - 1) * 8;
+    ref_end   = frame->data[plane_idx] + (bh * frame->linesize[plane_idx] + bw) * 8;

    for (i = 0; i < 64; i++)
        coordmap[i] = (i & 7) + (i >> 3) * stride;
@@ -923,7 +923,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8*stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->hdsp.put_pixels_tab[1][0](dst, ref, stride, 8);
@@ -939,7 +939,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8 * stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->hdsp.put_pixels_tab[1][0](dst, ref, stride, 8);
@@ -971,7 +971,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
                yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
                ref = dst + xoff + yoff * stride;
-                if (ref < ref_start || ref > ref_end) {
+                if (ref < ref_start || ref + 8 * stride > ref_end) {
                    av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
                } else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
                    c->hdsp.put_pixels_tab[1][0](dst, ref, stride, 8);
@@ -1084,7 +1084,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
        for (bx = 0; bx < bw; bx++, dst += 8, prev += 8) {
            blk = get_value(c, BINK_SRC_BLOCK_TYPES);
            // 16x16 block type on odd line means part of the already decoded block, so skip it
-            if (((by & 1) || (bx & 1)) && blk == SCALED_BLOCK) {
+            if ((by & 1) && blk == SCALED_BLOCK) {
                bx++;
                dst  += 8;
                prev += 8;
@@ -1381,8 +1381,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
    ff_hpeldsp_init(&c->hdsp, avctx->flags);
    ff_binkdsp_init(&c->binkdsp);

-    if ((ret = init_bundles(c)) < 0)
+    if ((ret = init_bundles(c)) < 0) {
+        free_bundles(c);
        return ret;
+    }

    if (c->version == 'b') {
        if (!binkb_initialised) {
@@ -1422,5 +1424,4 @@ AVCodec ff_bink_decoder = {
    .decode         = decode_frame,
    .flush          = flush,
    .capabilities   = AV_CODEC_CAP_DR1,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
@@ -40,6 +40,8 @@
 #include "rdft.h"
 #include "wma_freqs.h"

+static float quant_table[96];
+
 #define MAX_CHANNELS 2
 #define BINK_BLOCK_MAX_SIZE (MAX_CHANNELS << 11)

@@ -56,7 +58,6 @@ typedef struct BinkAudioContext {
    float root;
    DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
    float previous[MAX_CHANNELS][BINK_BLOCK_MAX_SIZE / 16];  ///< coeffs from previous audio block
-    float quant_table[96];
    AVPacket *pkt;
    union {
        RDFTContext rdft;
@@ -115,7 +116,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
        s->root = s->frame_len / (sqrt(s->frame_len) * 32768.0);
    for (i = 0; i < 96; i++) {
        /* constant is result of 0.066399999/log10(M_E) */
-        s->quant_table[i] = expf(i * 0.15289164787221953823f) * s->root;
+        quant_table[i] = expf(i * 0.15289164787221953823f) * s->root;
    }

    /* calculate number of bands */
@@ -196,7 +197,7 @@ static int decode_block(BinkAudioContext *s, float **out, int use_dct)
            return AVERROR_INVALIDDATA;
        for (i = 0; i < s->num_bands; i++) {
            int value = get_bits(gb, 8);
-            quant[i]  = s->quant_table[FFMIN(value, 95)];
+            quant[i]  = quant_table[FFMIN(value, 95)];
        }

        k = 0;
@@ -129,7 +129,7 @@ static int alloc_table(VLC *vlc, int size, int use_static)

 typedef struct VLCcode {
    uint8_t bits;
-    VLC_TYPE symbol;
+    uint16_t symbol;
    /** codeword, with the first bit-to-be-read in the msb
     * (even if intended for a little-endian bitstream reader) */
    uint32_t code;
@@ -162,9 +162,9 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
    uint32_t code;
    volatile VLC_TYPE (* volatile table)[2]; // the double volatile is needed to prevent an internal compiler error in gcc 4.2

+    table_size = 1 << table_nb_bits;
    if (table_nb_bits > 30)
       return AVERROR(EINVAL);
-    table_size = 1 << table_nb_bits;
    table_index = alloc_table(vlc, table_size, flags & INIT_VLC_USE_NEW_STATIC);
    ff_dlog(NULL, "new table index=%d size=%d\n", table_index, table_size);
    if (table_index < 0)
@@ -693,11 +693,11 @@ static int cbs_insert_unit(CodedBitstreamContext *ctx,
            memmove(units + position + 1, units + position,
                    (frag->nb_units - position) * sizeof(*units));
    } else {
-        units = av_malloc_array(frag->nb_units*2 + 1, sizeof(*units));
+        units = av_malloc_array(frag->nb_units + 1, sizeof(*units));
        if (!units)
            return AVERROR(ENOMEM);

-        frag->nb_units_allocated = 2*frag->nb_units_allocated + 1;
+        ++frag->nb_units_allocated;

        if (position > 0)
            memcpy(units, frag->units, position * sizeof(*units));
@@ -36,7 +36,7 @@ static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc,
        position = get_bits_count(gbc);

    zeroes = 0;
-    while (zeroes < 32) {
+    while (1) {
        if (get_bits_left(gbc) < 1) {
            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
                   "%s: bitstream ended.\n", name);
@@ -49,18 +49,7 @@ static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc,
    }

    if (zeroes >= 32) {
-        // The spec allows at least thirty-two zero bits followed by a
-        // one to mean 2^32-1, with no constraint on the number of
-        // zeroes.  The libaom reference decoder does not match this,
-        // instead reading thirty-two zeroes but not the following one
-        // to mean 2^32-1.  These two interpretations are incompatible
-        // and other implementations may follow one or the other.
-        // Therefore we reject thirty-two zeroes because the intended
-        // behaviour is not clear.
-        av_log(ctx->log_ctx, AV_LOG_ERROR, "Thirty-two zero bits in "
-               "%s uvlc code: considered invalid due to conflicting "
-               "standard and reference decoder behaviour.\n", name);
-        return AVERROR_INVALIDDATA;
+        value = MAX_UINT_BITS(32);
    } else {
        if (get_bits_left(gbc) < zeroes) {
            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
@@ -136,9 +125,8 @@ static int cbs_av1_write_uvlc(CodedBitstreamContext *ctx, PutBitContext *pbc,
        put_bits(pbc, 1, 1);
    } else {
        zeroes = av_log2(value + 1);
-        v = value - (1U << zeroes) + 1;
-        put_bits(pbc, zeroes, 0);
-        put_bits(pbc, 1, 1);
+        v = value - (1 << zeroes) + 1;
+        put_bits(pbc, zeroes + 1, 1);
        put_bits(pbc, zeroes, v);
    }

@@ -394,7 +382,7 @@ static int cbs_av1_write_increment(CodedBitstreamContext *ctx, PutBitContext *pb
    }

    if (len > 0)
-        put_bits(pbc, len, (1U << len) - 1 - (value != range_max));
+        put_bits(pbc, len, (1 << len) - 1 - (value != range_max));

    return 0;
 }
@@ -723,11 +711,10 @@ static size_t cbs_av1_get_payload_bytes_left(GetBitContext *gbc)

 #define infer(name, value) do { \
        if (current->name != (value)) { \
-            av_log(ctx->log_ctx, AV_LOG_ERROR, \
+            av_log(ctx->log_ctx, AV_LOG_WARNING, "Warning: " \
                   "%s does not match inferred value: " \
                   "%"PRId64", but should be %"PRId64".\n", \
                   #name, (int64_t)current->name, (int64_t)(value)); \
-            return AVERROR_INVALIDDATA; \
        } \
    } while (0)

@@ -158,8 +158,8 @@ typedef struct AV1RawFrameHeader {
    uint8_t  use_superres;
    uint8_t  coded_denom;
    uint8_t  render_and_frame_size_different;
-    uint16_t render_width_minus_1;
-    uint16_t render_height_minus_1;
+    uint8_t  render_width_minus_1;
+    uint8_t  render_height_minus_1;

    uint8_t found_ref[AV1_REFS_PER_FRAME];

@@ -429,7 +429,6 @@ typedef struct CodedBitstreamAV1Context {
    int operating_point_idc;

    int bit_depth;
-    int order_hint;
    int frame_width;
    int frame_height;
    int upscaled_width;
@@ -366,7 +366,7 @@ static int FUNC(set_frame_refs)(CodedBitstreamContext *ctx, RWContext *rw,
    for (i = 0; i < AV1_NUM_REF_FRAMES; i++)
        shifted_order_hints[i] = cur_frame_hint +
                                 cbs_av1_get_relative_dist(seq, priv->ref[i].order_hint,
-                                                           priv->order_hint);
+                                                           current->order_hint);

    latest_order_hint = shifted_order_hints[current->last_frame_idx];
    earliest_order_hint = shifted_order_hints[current->golden_frame_idx];
@@ -541,7 +541,7 @@ static int FUNC(frame_size_with_refs)(CodedBitstreamContext *ctx, RWContext *rw,
            }

            priv->upscaled_width = ref->upscaled_width;
-            priv->frame_width    = priv->upscaled_width;
+            priv->frame_width    = ref->frame_width;
            priv->frame_height   = ref->frame_height;
            priv->render_width   = ref->render_width;
            priv->render_height  = ref->render_height;
@@ -993,7 +993,7 @@ static int FUNC(skip_mode_params)(CodedBitstreamContext *ctx, RWContext *rw,
        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
            ref_hint = priv->ref[current->ref_frame_idx[i]].order_hint;
            dist = cbs_av1_get_relative_dist(seq, ref_hint,
-                                             priv->order_hint);
+                                             current->order_hint);
            if (dist < 0) {
                if (forward_idx < 0 ||
                    cbs_av1_get_relative_dist(seq, ref_hint,
@@ -1261,10 +1261,10 @@ static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
        flag(show_existing_frame);

        if (current->show_existing_frame) {
-            AV1ReferenceFrameState *ref;
+            AV1ReferenceFrameState *frame;

            fb(3, frame_to_show_map_idx);
-            ref = &priv->ref[current->frame_to_show_map_idx];
+            frame = &priv->ref[current->frame_to_show_map_idx];

            if (seq->decoder_model_info_present_flag &&
                !seq->timing_info.equal_picture_interval) {
@@ -1275,24 +1275,12 @@ static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
            if (seq->frame_id_numbers_present_flag)
                fb(id_len, display_frame_id);

-            infer(frame_type, ref->frame_type);
-            if (current->frame_type == AV1_FRAME_KEY) {
+            if (frame->frame_type == AV1_FRAME_KEY)
                infer(refresh_frame_flags, all_frames);
-
-                // Section 7.21
-                infer(current_frame_id, ref->frame_id);
-                priv->upscaled_width  = ref->upscaled_width;
-                priv->frame_width     = ref->frame_width;
-                priv->frame_height    = ref->frame_height;
-                priv->render_width    = ref->render_width;
-                priv->render_height   = ref->render_height;
-                priv->bit_depth       = ref->bit_depth;
-                priv->order_hint      = ref->order_hint;
-            } else
+            else
                infer(refresh_frame_flags, 0);

-            // Section 7.20
-            goto update_refs;
+            return 0;
        }

        fb(2, frame_type);
@@ -1378,7 +1366,6 @@ static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
        fb(order_hint_bits, order_hint);
    else
        infer(order_hint, 0);
-    priv->order_hint = current->order_hint;

    if (frame_is_intra || current->error_resilient_mode)
        infer(primary_ref_frame, AV1_PRIMARY_REF_NONE);
@@ -1394,7 +1381,7 @@ static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
                    int in_temporal_layer = (op_pt_idc >>  priv->temporal_id    ) & 1;
                    int in_spatial_layer  = (op_pt_idc >> (priv->spatial_id + 8)) & 1;
                    if (seq->operating_point_idc[i] == 0 ||
-                        (in_temporal_layer && in_spatial_layer)) {
+                        in_temporal_layer || in_spatial_layer) {
                        fbs(seq->decoder_model_info.buffer_removal_time_length_minus_1 + 1,
                            buffer_removal_time[i], 1, i);
                    }
@@ -1554,16 +1541,6 @@ static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,

    CHECK(FUNC(film_grain_params)(ctx, rw, current));

-    av_log(ctx->log_ctx, AV_LOG_DEBUG, "Frame %d:  size %dx%d  "
-           "upscaled %d  render %dx%d  subsample %dx%d  "
-           "bitdepth %d  tiles %dx%d.\n", priv->order_hint,
-           priv->frame_width, priv->frame_height, priv->upscaled_width,
-           priv->render_width, priv->render_height,
-           seq->color_config.subsampling_x + 1,
-           seq->color_config.subsampling_y + 1, priv->bit_depth,
-           priv->tile_rows, priv->tile_cols);
-
-update_refs:
    for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
        if (current->refresh_frame_flags & (1 << i)) {
            priv->ref[i] = (AV1ReferenceFrameState) {
@@ -1578,11 +1555,20 @@ update_refs:
                .subsampling_x  = seq->color_config.subsampling_x,
                .subsampling_y  = seq->color_config.subsampling_y,
                .bit_depth      = priv->bit_depth,
-                .order_hint     = priv->order_hint,
+                .order_hint     = current->order_hint,
            };
        }
    }

+    av_log(ctx->log_ctx, AV_LOG_DEBUG, "Frame %d:  size %dx%d  "
+           "upscaled %d  render %dx%d  subsample %dx%d  "
+           "bitdepth %d  tiles %dx%d.\n", current->order_hint,
+           priv->frame_width, priv->frame_height, priv->upscaled_width,
+           priv->render_width, priv->render_height,
+           seq->color_config.subsampling_x + 1,
+           seq->color_config.subsampling_y + 1, priv->bit_depth,
+           priv->tile_rows, priv->tile_cols);
+
    return 0;
 }

@@ -408,11 +408,10 @@ static int cbs_h2645_read_more_rbsp_data(GetBitContext *gbc)

 #define infer(name, value) do { \
        if (current->name != (value)) { \
-            av_log(ctx->log_ctx, AV_LOG_ERROR, \
+            av_log(ctx->log_ctx, AV_LOG_WARNING, "Warning: " \
                   "%s does not match inferred value: " \
                   "%"PRId64", but should be %"PRId64".\n", \
                   #name, (int64_t)current->name, (int64_t)(value)); \
-            return AVERROR_INVALIDDATA; \
        } \
    } while (0)

@@ -728,7 +728,7 @@ static int FUNC(sps_scc_extension)(CodedBitstreamContext *ctx, RWContext *rw,

        flag(sps_palette_predictor_initializer_present_flag);
        if (current->sps_palette_predictor_initializer_present_flag) {
-            ue(sps_num_palette_predictor_initializer_minus1, 0, 127);
+            ue(sps_num_palette_predictor_initializer_minus1, 0, 128);
            for (comp = 0; comp < (current->chroma_format_idc ? 3 : 1); comp++) {
                int bit_depth = comp == 0 ? current->bit_depth_luma_minus8 + 8
                                          : current->bit_depth_chroma_minus8 + 8;
@@ -744,32 +744,6 @@ static int FUNC(sps_scc_extension)(CodedBitstreamContext *ctx, RWContext *rw,
    return 0;
 }

-static int FUNC(vui_parameters_default)(CodedBitstreamContext *ctx,
-                                        RWContext *rw, H265RawVUI *current,
-                                        H265RawSPS *sps)
-{
-    infer(aspect_ratio_idc, 0);
-
-    infer(video_format,             5);
-    infer(video_full_range_flag,    0);
-    infer(colour_primaries,         2);
-    infer(transfer_characteristics, 2);
-    infer(matrix_coefficients,      2);
-
-    infer(chroma_sample_loc_type_top_field,    0);
-    infer(chroma_sample_loc_type_bottom_field, 0);
-
-    infer(tiles_fixed_structure_flag,    0);
-    infer(motion_vectors_over_pic_boundaries_flag, 1);
-    infer(min_spatial_segmentation_idc,  0);
-    infer(max_bytes_per_pic_denom,       2);
-    infer(max_bits_per_min_cu_denom,     1);
-    infer(log2_max_mv_length_horizontal, 15);
-    infer(log2_max_mv_length_vertical,   15);
-
-    return 0;
-}
-
 static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
                     H265RawSPS *current)
 {
@@ -934,8 +908,6 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
    flag(vui_parameters_present_flag);
    if (current->vui_parameters_present_flag)
        CHECK(FUNC(vui_parameters)(ctx, rw, &current->vui, current));
-    else
-        CHECK(FUNC(vui_parameters_default)(ctx, rw, &current->vui, current));

    flag(sps_extension_present_flag);
    if (current->sps_extension_present_flag) {
@@ -149,7 +149,6 @@ static int cbs_jpeg_split_fragment(CodedBitstreamContext *ctx,
            break;
        } else if (marker == JPEG_MARKER_SOS) {
            next_marker = -1;
-            end = start;
            for (i = start; i + 1 < frag->data_size; i++) {
                if (frag->data[i] != 0xff)
                    continue;
@@ -166,13 +165,13 @@ static int cbs_jpeg_split_fragment(CodedBitstreamContext *ctx,
            }
        } else {
            i = start;
-            if (i > frag->data_size - 2) {
+            if (i + 2 > frag->data_size) {
                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
                       "truncated at %02x marker.\n", marker);
                return AVERROR_INVALIDDATA;
            }
            length = AV_RB16(frag->data + i);
-            if (length > frag->data_size - i) {
+            if (i + length > frag->data_size) {
                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
                       "truncated at %02x marker segment.\n", marker);
                return AVERROR_INVALIDDATA;
@@ -422,7 +422,7 @@ static int cbs_vp9_split_fragment(CodedBitstreamContext *ctx,
    superframe_header = frag->data[frag->data_size - 1];

    if ((superframe_header & 0xe0) == 0xc0) {
-        VP9RawSuperframeIndex sfi = {0};
+        VP9RawSuperframeIndex sfi;
        GetBitContext gbc;
        size_t index_size, pos;
        int i;
@@ -239,7 +239,7 @@ static void cdg_scroll(CDGraphicsContext *cc, uint8_t *data,
    for (y = FFMAX(0, vinc); y < FFMIN(CDG_FULL_HEIGHT + vinc, CDG_FULL_HEIGHT); y++)
        memcpy(out + FFMAX(0, hinc) + stride * y,
               in + FFMAX(0, hinc) - hinc + (y - vinc) * stride,
-               FFABS(stride) - FFABS(hinc));
+               FFMIN(stride + hinc, stride));

    if (vinc > 0)
        cdg_fill_wrapper(0, 0, out,
@@ -65,11 +65,11 @@ int ff_celp_lp_synthesis_filter(int16_t *out, const int16_t *filter_coeffs,
    int i,n;

    for (n = 0; n < buffer_length; n++) {
-        int sum = rounder, sum1;
+        int sum = -rounder, sum1;
        for (i = 1; i <= filter_length; i++)
-            sum -= (unsigned)(filter_coeffs[i-1] * out[n-i]);
+            sum += (unsigned)(filter_coeffs[i-1] * out[n-i]);

-        sum1 = ((sum >> 12) + in[n]) >> shift;
+        sum1 = ((-sum >> 12) + in[n]) >> shift;
        sum  = av_clip_int16(sum1);

        if (stop_on_overflow && sum != sum1)
@@ -78,7 +78,7 @@ int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length);
 *
 * @return value << offset, if offset>=0; value >> -offset - otherwise
 */
-static inline unsigned bidir_sal(unsigned value, int offset)
+static inline int bidir_sal(int value, int offset)
 {
    if(offset < 0) return value >> -offset;
    else           return value <<  offset;
@@ -503,10 +503,6 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                avpriv_report_missing_feature(avctx, "Transform type of %"PRIu16, data);
                ret = AVERROR_PATCHWELCOME;
                break;
-            } else if (data == 1) {
-                av_log(avctx, AV_LOG_ERROR, "unsupported transform type\n");
-                ret = AVERROR_PATCHWELCOME;
-                break;
            }
            av_log(avctx, AV_LOG_DEBUG, "Transform-type? %"PRIu16"\n", data);
        } else if (abstag >= 0x4000 && abstag <= 0x40ff) {
@@ -611,12 +607,6 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
            s->peak.level   = 0;
        } else if (tag == -74 && s->peak.offset) {
            s->peak.level = data;
-            if (s->peak.offset < 4 - bytestream2_tell(&s->peak.base) ||
-                s->peak.offset > 4 + bytestream2_get_bytes_left(&s->peak.base)
-            ) {
-                ret = AVERROR_INVALIDDATA;
-                goto end;
-            }
            bytestream2_seek(&s->peak.base, s->peak.offset - 4, SEEK_CUR);
        } else
            av_log(avctx, AV_LOG_DEBUG,  "Unknown tag %i data %x\n", tag, data);
@@ -665,8 +665,8 @@ static av_cold int clv_decode_init(AVCodecContext *avctx)
    }

    c->tile_shift = av_log2(c->tile_size);
-    if (1U << c->tile_shift != c->tile_size || c->tile_shift < 1 || c->tile_shift > 30) {
-        av_log(avctx, AV_LOG_ERROR, "Tile size: %d, is not power of 2 > 1 and < 2^31\n", c->tile_size);
+    if (1U << c->tile_shift != c->tile_size) {
+        av_log(avctx, AV_LOG_ERROR, "Tile size: %d, is not power of 2.\n", c->tile_size);
        return AVERROR_INVALIDDATA;
    }

@@ -91,3 +91,4 @@ AVCodec ff_cljr_decoder = {
    .decode         = decode_frame,
    .capabilities   = AV_CODEC_CAP_DR1,
 };
+
@@ -1084,10 +1084,6 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
    ff_audiodsp_init(&q->adsp);

    while (bytestream2_get_bytes_left(&gb)) {
-        if (s >= FFMIN(MAX_SUBPACKETS, avctx->block_align)) {
-            avpriv_request_sample(avctx, "subpackets > %d", FFMIN(MAX_SUBPACKETS, avctx->block_align));
-            return AVERROR_PATCHWELCOME;
-        }
        /* 8 for mono, 16 for stereo, ? for multichannel
           Swap to right endianness so we don't need to care later on. */
        q->subpacket[s].cookversion      = bytestream2_get_be32(&gb);
@@ -1219,6 +1215,10 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)

        q->num_subpackets++;
        s++;
+        if (s > FFMIN(MAX_SUBPACKETS, avctx->block_align)) {
+            avpriv_request_sample(avctx, "subpackets > %d", FFMIN(MAX_SUBPACKETS, avctx->block_align));
+            return AVERROR_PATCHWELCOME;
+        }
    }

    /* Try to catch some obviously faulty streams, otherwise it might be exploitable */
@@ -111,7 +111,6 @@ static int cpia_decode_frame(AVCodecContext *avctx,
        // Read line length, two byte little endian
        linelength = AV_RL16(src);
        src += 2;
-        src_size -= 2;

        if (src_size < linelength) {
            frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
@@ -71,9 +71,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
    int buf_size = avpkt->size;
    CamStudioContext *c = avctx->priv_data;
    int ret;
-    int bpp = avctx->bits_per_coded_sample / 8;
-    int bugdelta = FFALIGN(avctx->width * bpp, 4)       * avctx->height
-                 -        (avctx->width     & ~3) * bpp * avctx->height;

    if (buf_size < 2) {
        av_log(avctx, AV_LOG_ERROR, "coded frame too small\n");
@@ -87,7 +84,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
    switch ((buf[0] >> 1) & 7) {
    case 0: { // lzo compression
        int outlen = c->decomp_size, inlen = buf_size - 2;
-        if (av_lzo1x_decode(c->decomp_buf, &outlen, &buf[2], &inlen) || (outlen && outlen != bugdelta)) {
+        if (av_lzo1x_decode(c->decomp_buf, &outlen, &buf[2], &inlen) || outlen) {
            av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
            return AVERROR_INVALIDDATA;
        }
@@ -96,7 +93,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
    case 1: { // zlib compression
 #if CONFIG_ZLIB
        unsigned long dlen = c->decomp_size;
-        if (uncompress(c->decomp_buf, &dlen, &buf[2], buf_size - 2) != Z_OK || (dlen != c->decomp_size && dlen != c->decomp_size - bugdelta)) {
+        if (uncompress(c->decomp_buf, &dlen, &buf[2], buf_size - 2) != Z_OK) {
            av_log(avctx, AV_LOG_ERROR, "error during zlib decompression\n");
            return AVERROR_INVALIDDATA;
        }
@@ -88,7 +88,7 @@ typedef struct CuvidContext
    CUVIDDECODECAPS caps8, caps10, caps12;

    CUVIDPARSERPARAMS cuparseinfo;
-    CUVIDEOFORMATEX *cuparse_ext;
+    CUVIDEOFORMATEX cuparse_ext;

    CudaFunctions *cudl;
    CuvidFunctions *cvdl;
@@ -684,7 +684,6 @@ static av_cold int cuvid_decode_end(AVCodecContext *avctx)
    av_buffer_unref(&ctx->hwdevice);

    av_freep(&ctx->key_frame);
-    av_freep(&ctx->cuparse_ext);

    cuvid_free_functions(&ctx->cvdl);

@@ -794,8 +793,6 @@ static av_cold int cuvid_decode_init(AVCodecContext *avctx)
    CUVIDSOURCEDATAPACKET seq_pkt;
    CUcontext cuda_ctx = NULL;
    CUcontext dummy;
-    uint8_t *extradata;
-    int extradata_size;
    int ret = 0;

    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_CUDA,
@@ -892,8 +889,11 @@ static av_cold int cuvid_decode_init(AVCodecContext *avctx)
    ctx->cudl = device_hwctx->internal->cuda_dl;

    memset(&ctx->cuparseinfo, 0, sizeof(ctx->cuparseinfo));
+    memset(&ctx->cuparse_ext, 0, sizeof(ctx->cuparse_ext));
    memset(&seq_pkt, 0, sizeof(seq_pkt));

+    ctx->cuparseinfo.pExtVideoInfo = &ctx->cuparse_ext;
+
    switch (avctx->codec->id) {
 #if CONFIG_H264_CUVID_DECODER
    case AV_CODEC_ID_H264:
@@ -947,26 +947,17 @@ static av_cold int cuvid_decode_init(AVCodecContext *avctx)

    if (avctx->codec->bsfs) {
        const AVCodecParameters *par = avctx->internal->bsf->par_out;
-        extradata = par->extradata;
-        extradata_size = par->extradata_size;
-    } else {
-        extradata = avctx->extradata;
-        extradata_size = avctx->extradata_size;
+        ctx->cuparse_ext.format.seqhdr_data_length = par->extradata_size;
+        memcpy(ctx->cuparse_ext.raw_seqhdr_data,
+               par->extradata,
+               FFMIN(sizeof(ctx->cuparse_ext.raw_seqhdr_data), par->extradata_size));
+    } else if (avctx->extradata_size > 0) {
+        ctx->cuparse_ext.format.seqhdr_data_length = avctx->extradata_size;
+        memcpy(ctx->cuparse_ext.raw_seqhdr_data,
+               avctx->extradata,
+               FFMIN(sizeof(ctx->cuparse_ext.raw_seqhdr_data), avctx->extradata_size));
    }

-    ctx->cuparse_ext = av_mallocz(sizeof(*ctx->cuparse_ext)
-            + FFMAX(extradata_size - (int)sizeof(ctx->cuparse_ext->raw_seqhdr_data), 0));
-    if (!ctx->cuparse_ext) {
-        ret = AVERROR(ENOMEM);
-        goto error;
-    }
-
-    if (extradata_size > 0)
-        memcpy(ctx->cuparse_ext->raw_seqhdr_data, extradata, extradata_size);
-    ctx->cuparse_ext->format.seqhdr_data_length = extradata_size;
-
-    ctx->cuparseinfo.pExtVideoInfo = ctx->cuparse_ext;
-
    ctx->key_frame = av_mallocz(ctx->nb_surfaces * sizeof(int));
    if (!ctx->key_frame) {
        ret = AVERROR(ENOMEM);
@@ -995,8 +986,8 @@ static av_cold int cuvid_decode_init(AVCodecContext *avctx)
    if (ret < 0)
        goto error;

-    seq_pkt.payload = ctx->cuparse_ext->raw_seqhdr_data;
-    seq_pkt.payload_size = ctx->cuparse_ext->format.seqhdr_data_length;
+    seq_pkt.payload = ctx->cuparse_ext.raw_seqhdr_data;
+    seq_pkt.payload_size = ctx->cuparse_ext.format.seqhdr_data_length;

    if (seq_pkt.payload && seq_pkt.payload_size) {
        ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &seq_pkt));
@@ -1055,8 +1046,8 @@ static void cuvid_flush(AVCodecContext *avctx)
    if (ret < 0)
        goto error;

-    seq_pkt.payload = ctx->cuparse_ext->raw_seqhdr_data;
-    seq_pkt.payload_size = ctx->cuparse_ext->format.seqhdr_data_length;
+    seq_pkt.payload = ctx->cuparse_ext.raw_seqhdr_data;
+    seq_pkt.payload_size = ctx->cuparse_ext.format.seqhdr_data_length;

    if (seq_pkt.payload && seq_pkt.payload_size) {
        ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &seq_pkt));
@@ -328,7 +328,7 @@ static void dmix_add_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t le
    int i;

    for (i = 0; i < len; i++)
-        dst[i] += (unsigned)mul15(src[i], coeff);
+        dst[i] += mul15(src[i], coeff);
 }

 static void dmix_scale_c(int32_t *dst, int scale, ptrdiff_t len)
@@ -1858,8 +1858,7 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
    int ret;

    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
-        if ((unsigned)avctx->width > INT_MAX - STRIDE_ALIGN ||
-            (ret = av_image_check_size2(FFALIGN(avctx->width, STRIDE_ALIGN), avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx)) < 0 || avctx->pix_fmt<0) {
+        if ((ret = av_image_check_size2(FFALIGN(avctx->width, STRIDE_ALIGN), avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx)) < 0 || avctx->pix_fmt<0) {
            av_log(avctx, AV_LOG_ERROR, "video_get_buffer: image parameters invalid\n");
            ret = AVERROR(EINVAL);
            goto fail;
@@ -215,7 +215,7 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
            int64_t pts = AV_RB32(cur_pu + 13);
            if (s->last_pts == 0 && s->last_dts == 0)
                s->dts = pts - 1;
-            else if (s->last_dts != AV_NOPTS_VALUE)
+            else
                s->dts = s->last_dts + 1;
            s->pts = pts;
            if (!avctx->has_b_frames && (cur_pu[4] & 0x03))
@@ -1431,8 +1431,8 @@ static void global_mv(DiracContext *s, DiracBlock *block, int x, int y, int ref)
    int *c      = s->globalmc[ref].perspective;

    int64_t m   = (1<<ep) - (c[0]*(int64_t)x + c[1]*(int64_t)y);
-    int64_t mx  = m * (uint64_t)((A[0][0] * (int64_t)x + A[0][1]*(int64_t)y) + (1LL<<ez) * b[0]);
-    int64_t my  = m * (uint64_t)((A[1][0] * (int64_t)x + A[1][1]*(int64_t)y) + (1LL<<ez) * b[1]);
+    int64_t mx  = m * (int64_t)((A[0][0] * (int64_t)x + A[0][1]*(int64_t)y) + (1LL<<ez) * b[0]);
+    int64_t my  = m * (int64_t)((A[1][0] * (int64_t)x + A[1][1]*(int64_t)y) + (1LL<<ez) * b[1]);

    block->u.mv[ref][0] = (mx + (1<<(ez+ep))) >> (ez+ep);
    block->u.mv[ref][1] = (my + (1<<(ez+ep))) >> (ez+ep);
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .3.9
 .2.git