avfilter/boxblur: Fix off by one errors

Fixes: ada-2-poc.mkv Found-by: Claude and Ada Logics. This issue was found by Anthropic from using agents to study security of open source projects, and I am from Ada Logics helping validate the found issues and report to maintainers. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
avformat/iamf_writer: reject unset frame size
2026-05-16 17:22:45 +00:00 · 2026-05-16 13:55:23 -03:00 · 2026-05-16 13:55:23 -03:00 · 2026-05-16 13:55:22 -03:00 · 2026-05-16 13:55:22 -03:00 · 2026-05-16 13:55:22 -03:00
973 changed files with 42125 additions and 15037 deletions
@@ -57,6 +57,7 @@ libavcodec/.*png.* @Traneptora
 libavcodec/.*prores.* @lynne
 libavcodec/rangecoder.* @michaelni
 libavcodec/ratecontrol.* @michaelni
+libavcodec/rkmpp* @quink
 libavcodec/rv60.* @pross
 libavcodec/sgirle.* @pross
 libavcodec/.*siren.* @lynne
@@ -109,6 +110,7 @@ libavfilter/vf_find_rect.* @michaelni
 libavfilter/vf_icc.* @haasn
 libavfilter/vf_libplacebo.* @haasn
 libavfilter/vf_libvmaf.* @kylophone
+libavfilter/vf_mpdecimate.* @dana-feng
 libavfilter/vf_premultiply.* @haasn
 libavfilter/vf_scale.* @haasn
 libavfilter/vf_scale_vt.* @quink
@@ -139,6 +141,8 @@ libavformat/electronicarts.* @pross
 libavformat/.*exif.* @Traneptora
 libavformat/filmstrip.* @pross
 libavformat/frm.* @pross
+libavformat/hls.* @kasper93
+libavformat/hxvs.* @quink
 libavformat/iamf.* @jamrial
 libavformat/icecast.c @ePirat
 libavformat/ico.* @pross
@@ -153,6 +157,7 @@ libavformat/mlv.* @pross
 libavformat/mm.* @pross
 libavformat/msp.* @pross
 libavformat/mv.* @pross
+libavformat/ogg.* @toots
 libavformat/pp_bnk.* @zane
 libavformat/rm.* @pross
 libavformat/sauce.* @pross
@@ -193,6 +198,7 @@ libavutil/aarch64/.* @lynne @mstorsjo
 libavutil/arm/.* @mstorsjo
 libavutil/ppc/.* @sean_mcg
 libavutil/riscv/.* @Courmisch
+libavutil/wasm/.* @quink
 libavutil/x86/.* @lynne

 # swresample
@@ -226,8 +232,16 @@ doc/.* @GyanD
 # tests
 # =====
 tests/checkasm/riscv/.* @Courmisch
+libavutil/tests/buffer.* @MarcosAsh
+libavutil/tests/hdr_dynamic_vivid_metadata.* @MarcosAsh
+libavutil/tests/tdrdi.* @MarcosAsh
+libavutil/tests/timestamp.* @MarcosAsh
 tests/ref/.*drawvg.* @ayosec
+tests/ref/fate/buffer @MarcosAsh
+tests/ref/fate/hdr_dynamic_vivid_metadata @MarcosAsh
 tests/ref/fate/sub-mcc.* @programmerjake
+tests/ref/fate/tdrdi @MarcosAsh
+tests/ref/fate/timestamp @MarcosAsh

 # Forgejo
 # =======
@@ -20,9 +20,9 @@ repos:
    - id: trailing-whitespace
 - repo: local
  hooks:
-    - id: aarch64-asm-indent
-      name: fix aarch64 assembly indentation
-      files: ^.*/aarch64/.*\.S$
+    - id: arm-asm-indent
+      name: fix arm/aarch64 assembly indentation
+      files: ^.*/(arm|aarch64)/.*\.S$
      language: script
      entry: ./tools/check_arm_indent.sh --apply
      pass_filenames: false
@@ -69,6 +69,7 @@ pEvents
 PixelX
 Psot
 quater
+re-use
 readd
 recuse
 redY
@@ -22,6 +22,7 @@ jobs:
        with:
          configuration-path: .forgejo/labeler/labeler.yml
          repo-token: ${{ secrets.AUTOLABELER_TOKEN }}
+          sync-labels: true
      - name: Label by title-match
        uses: actions/github-script@v8
        with:
@@ -26,6 +26,8 @@
 *.spv
 *.spv.c
 *.spv.gz
+*.gen.c
+*.gen.S
 *.ptx
 *.ptx.c
 *.ptx.gz
@@ -1,4 +1,5 @@
 # Note to Github users
+
 Patches should be submitted to [Forgejo](https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls) or the [ffmpeg-devel mailing list](https://ffmpeg.org/mailman/listinfo/ffmpeg-devel) using `git format-patch` or `git send-email`. Github pull requests should be avoided because they are not part of our review process and **will be ignored**.

 See [https://ffmpeg.org/developer.html#Contributing](https://ffmpeg.org/developer.html#Contributing) for more information.
@@ -2,6 +2,14 @@ Entries are sorted chronologically from oldest to youngest within each release,
 releases are sorted from youngest to oldest.

 version <next>:
+- Extend AMF Color Converter (vf_vpp_amf) HDR capabilities
+- LCEVC track muxing support in MP4 muxer
+- Playdate video encoder and muxer
+- Add v360_vulkan filter
+- HE-AAC 960 decoding (DAB+)
+- transpose_cuda filter
+- Add AMF Frame Rate Converter (vf_frc_amf) filter
+- SMPTE 2094-50 metadata support and passthrough


 version 8.1:
@@ -1,4 +1,4 @@
-## Installing FFmpeg
+# Installing FFmpeg

 0. If you like to include source plugins, merge them before configure
 for example run tools/merge-all-source-plugins
@@ -14,15 +14,11 @@ path when launching `configure`, e.g. `/ffmpegdir/ffmpeg/configure`.

 3. Type `make install` to install all binaries and libraries you built.

-NOTICE
------
+## NOTICE

- - Non system dependencies (e.g. libx264, libvpx) are disabled by default.
+- Non system dependencies (e.g. libx264, libvpx) are disabled by default.

-NOTICE for Package Maintainers
------------------------------
+## NOTICE for Package Maintainers

- - It is recommended to build FFmpeg twice, first with minimal external dependencies so
-   that 3rd party packages, which depend on FFmpegs libavutil/libavfilter/libavcodec/libavformat
-   can then be built. And last build FFmpeg with full dependencies (which may in turn depend on
-   some of these 3rd party packages). This avoids circular dependencies during build.
+- It is recommended to build FFmpeg twice: first with minimal external dependencies, so that 3rd party packages which depend on FFmpeg's libavutil/libavfilter/libavcodec/libavformat
+can then be built, and then again with full dependencies (which may in turn depend on some of these 3rd party packages). This avoids circular dependencies during build.
@@ -93,6 +93,7 @@ Other:
  hash                                  Reimar Doeffinger
  hwcontext_cuda*                       Timo Rothenpieler
  hwcontext_d3d12va*                    Wu Jianhua
+  hwcontext_oh*                         Zhao Zhili
  hwcontext_vulkan*                 [2] Lynne
  intfloat*                             Michael Niedermayer
  integer.c, integer.h                  Michael Niedermayer
@@ -158,7 +159,7 @@ Codecs:
  asv*                                  Michael Niedermayer
  atrac3plus*                           Maxim Poliakovski
  audiotoolbox*                         rcombs
-  avs2*                                 Huiwen Ren
+  avs2*                                 Huiwen Ren, Zhao Zhili
  bgmc.c, bgmc.h                        Thilo Borgmann
  binkaudio.c                           Peter Ross
  cavs*                                 Stefan Gehrer
@@ -232,6 +233,7 @@ Codecs:
  msvideo1.c                            Mike Melanson
  nuv.c                                 Reimar Doeffinger
  nvdec*, nvenc*                        Timo Rothenpieler
+  oh*                                   Zhao Zhili
  omx.c                                 Martin Storsjo, Aman Gupta
  opus*                                 Rostislav Pehlivanov
  pcx.c                                 Ivo van Poorten
@@ -243,6 +245,7 @@ Codecs:
  qtrle.c                               Mike Melanson
  ra144.c, ra144.h, ra288.c, ra288.h    Roberto Togni
  resample2.c                           Michael Niedermayer
+  rkmppenc*                             Zhao Zhili
  rl2.c                                 Sascha Sommer
  rpza.c                                Roberto Togni
  rtjpeg.c, rtjpeg.h                    Reimar Doeffinger
@@ -317,6 +320,10 @@ libavfilter
 ===========

 Generic parts:
+
+  Framework and orphaned filters        Nicolas George
+  (except hardware acceleration)
+
  graphdump.c                           Nicolas George

  motion_estimation.c                   Davinder Singh
@@ -348,7 +355,9 @@ Filters:
  vf_minterpolate.c                     Davinder Singh
  vf_readvitc.c                         Tobias Rapp (CC t.rapp at noa-archive dot com)
  vf_scale.c                        [2] Michael Niedermayer
+  vf_scale_vt.c                         Zhao Zhili
  vf_tonemap_opencl.c                   Ruiling Song
+  vf_transpose_vt.c                     Zhao Zhili
  vf_yadif.c                        [2] Michael Niedermayer
  vf_xfade_vulkan.c                 [2] Marvin Scholz (CC <epirat07@gmail.com>)

@@ -406,7 +415,9 @@ Muxers/Demuxers:
  flvenc.c                              Michael Niedermayer, Steven Liu
  gxf.c                                 Reimar Doeffinger
  gxfenc.c                              Baptiste Coudurier
+  hls.c                                 Kacper Michajłow
  hlsenc.c                              Christian Suloway, Steven Liu
+  hxvs.c                                Zhao Zhili
  iamf*                             [2] James Almer
  idcin.c                               Mike Melanson
  idroqdec.c                            Mike Melanson
@@ -442,9 +453,9 @@ Muxers/Demuxers:
  nsvdec.c                              Francois Revol
  nut*                                  Michael Niedermayer
  nuv.c                                 Reimar Doeffinger
-  oggdec.c, oggdec.h                    David Conrad
-  oggenc.c                              Baptiste Coudurier
-  oggparse*.c                           David Conrad
+  oggdec.c, oggdec.h                    David Conrad, Romain Beauxis
+  oggenc.c                              Baptiste Coudurier, Romain Beauxis
+  oggparse*.c                           David Conrad, Romain Beauxis
  oma.c                                 Maxim Poliakovski
  pp_bnk.c                              Zane van Iperen
  psxstr.c                              Mike Melanson
@@ -1,5 +1,4 @@
-FFmpeg README
-=============
+# FFmpeg README

 FFmpeg is a collection of libraries and tools to process multimedia content
 such as audio, video, subtitles and related metadata.
@@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
+static inline __device__ float rintf(float a) { return __builtin_rintf(a); }

 static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
@@ -390,7 +390,7 @@ Toolchain options:
  --tempprefix=PATH        force fixed dir/prefix instead of mktemp for checks
  --toolchain=NAME         set tool defaults according to NAME
                           (<tool>[-sanitizer[-...]], e.g. clang-asan-ubsan
-                           tools: gcc, clang, msvc, icl, gcov, llvm-cov,
+                           tools: gcc, clang, llvm, msvc, icl, gcov, llvm-cov,
                                  valgrind-memcheck, valgrind-massif, hardened
                           sanitizers: asan, fuzz, lsan, msan, tsan, ubsan)
  --nm=NM                  use nm tool NM [$nm_default]
@@ -415,6 +415,7 @@ Toolchain options:
  --pkg-config-flags=FLAGS pass additional flags to pkgconf []
  --ranlib=RANLIB          use ranlib RANLIB [$ranlib_default]
  --doxygen=DOXYGEN        use DOXYGEN to generate API doc [$doxygen_default]
+  --makeinfo=MAKEINFO      use MAKEINFO to generate documentation [$makeinfo_default]
  --host-cc=HOSTCC         use host C compiler HOSTCC
  --host-cflags=HCFLAGS    use HCFLAGS when compiling for host
  --host-cppflags=HCPPFLAGS use HCPPFLAGS when compiling for host
@@ -482,6 +483,8 @@ Optimization options (experts only):
  --disable-arm-crc        disable ARM/AArch64 CRC optimizations
  --disable-dotprod        disable DOTPROD optimizations
  --disable-i8mm           disable I8MM optimizations
+  --disable-pmull          disable PMULL optimizations
+  --disable-eor3           disable EOR3 optimizations
  --disable-sve            disable SVE optimizations
  --disable-sve2           disable SVE2 optimizations
  --disable-sme            disable SME optimizations
@@ -1079,6 +1082,10 @@ hostcc_o(){
    eval printf '%s\\n' $HOSTCC_O
 }

+hostld_o(){
+    eval printf '%s\\n' $HOSTLD_O
+}
+
 glslc_o(){
    eval printf '%s\\n' $GLSLC_O
 }
@@ -1294,7 +1301,14 @@ test_ld(){
    test_$type $($cflags_filter $flags) || return
    flags=$($ldflags_filter $flags)
    libs=$($ldflags_filter $libs)
-    test_cmd $ld $LDFLAGS $LDEXEFLAGS $flags $(ld_o $TMPE) $TMPO $libs $extralibs
+    log $ld $LDFLAGS $LDEXEFLAGS $flags $(ld_o $TMPE) $TMPO $libs $extralibs
+    output=$($ld $LDFLAGS $LDEXEFLAGS $flags $(ld_o $TMPE) $TMPO $libs $extralibs 2>&1)
+    ret=$?
+    echo "$output" >> $logfile
+    # link.exe and lld-link exit 0 even for unrecognized options, emitting
+    # only a warning (LNK4044 / "ignoring unknown argument").  Treat such
+    # output as failure so check_ldflags rejects those flags correctly.
+    test $ret -eq 0 && ! echo "$output" | grep -qE 'LNK4044|lld-link: warning: ignoring unknown argument'
 }

 check_ld(){
@@ -1833,6 +1847,16 @@ test_host_cc(){
    test_cmd $host_cc $host_cflags "$@" $HOSTCC_C $(hostcc_o $TMPO) $TMPC
 }

+test_host_ld(){
+    log test_host_ld "$@"
+    flags=$(filter_out '-l*|*.so' $@)
+    libs=$(filter '-l*|*.so' $@)
+    test_host_cc $($host_cflags_filter $flags) || return
+    flags=$($host_ldflags_filter $flags)
+    libs=$($host_ldflags_filter $libs)
+    test_cmd $host_ld $host_ldflags $flags $(hostld_o $TMPE) $TMPO $libs $host_extralibs
+}
+
 test_host_cpp(){
    log test_host_cpp "$@"
    cat > $TMPC
@@ -1897,6 +1921,27 @@ check_host_cpp_condition(){
    test_host_cpp_condition "$@" && enable $name
 }

+check_host_lib(){
+    log check_host_lib "$@"
+    headers="$1"
+    funcs="$2"
+    shift 2
+    {
+        for hdr in $headers; do
+            print_include $hdr
+        done
+        echo "#include <stdint.h>"
+        for func in $funcs; do
+            echo "long check_$func(void) { return (long) $func; }"
+        done
+        echo "int main(void) { int ret = 0;"
+        for func in $funcs; do
+            echo " ret |= ((intptr_t)check_$func) & 0xFFFF;"
+        done
+        echo "return ret; }"
+    } | test_host_ld "$@" && append host_extralibs "$@"
+}
+
 cp_if_changed(){
    cmp -s "$1" "$2" && { test "$quiet" != "yes" && echo "$2 is unchanged"; } && return
    mkdir -p "$(dirname $2)"
@@ -2299,6 +2344,8 @@ ARCH_EXT_LIST_ARM="
    arm_crc
    dotprod
    i8mm
+    pmull
+    eor3
    neon
    vfp
    vfpv3
@@ -2450,6 +2497,8 @@ HEADERS_LIST="
    valgrind_valgrind_h
    windows_h
    winsock2_h
+    spirv_headers_spirv_h
+    spirv_unified1_spirv_h
 "

 INTRINSICS_LIST="
@@ -2575,6 +2624,8 @@ TOOLCHAIN_FEATURES="
    as_archext_crc_directive
    as_archext_dotprod_directive
    as_archext_i8mm_directive
+    as_archext_sha3_directive
+    as_archext_aes_directive
    as_archext_sve_directive
    as_archext_sve2_directive
    as_archext_sme_directive
@@ -2666,7 +2717,6 @@ HAVE_LIST="
    gzip
    ioctl_posix
    libdrm_getfb2
-    makeinfo
    makeinfo_html
    opencl_d3d11
    opencl_drm_arm
@@ -2696,7 +2746,9 @@ CONFIG_EXTRA="
    cabac
    cbs
    cbs_apv
+    cbs_apv_lavf
    cbs_av1
+    cbs_av1_lavf
    cbs_h264
    cbs_h265
    cbs_h266
@@ -2848,6 +2900,7 @@ CMDLINE_SET="
    cxx
    dep_cc
    doxygen
+    makeinfo
    env
    extra_version
    gas
@@ -2918,6 +2971,8 @@ setend_deps="arm"
 arm_crc_deps="aarch64"
 dotprod_deps="aarch64 neon"
 i8mm_deps="aarch64 neon"
+pmull_deps="aarch64 neon"
+eor3_deps="aarch64 neon"
 sve_deps="aarch64 neon"
 sve2_deps="aarch64 neon sve"
 sme_deps="aarch64 neon sve sve2"
@@ -3233,6 +3288,7 @@ nuv_decoder_select="idctdsp"
 opus_decoder_deps="swresample"
 opus_encoder_select="audio_frame_queue"
 pdv_decoder_select="inflate_wrapper"
+pdv_encoder_select="deflate_wrapper"
 png_decoder_select="inflate_wrapper"
 png_encoder_select="deflate_wrapper llvidencdsp"
 prores_decoder_select="blockdsp idctdsp"
@@ -3522,6 +3578,8 @@ scale_cuda_filter_deps="ffnvcodec"
 scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 thumbnail_cuda_filter_deps="ffnvcodec"
 thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+transpose_cuda_filter_deps="ffnvcodec"
+transpose_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
@@ -3693,6 +3751,7 @@ vvc_parser_select="cbs_h266"
 # bitstream_filters
 aac_adtstoasc_bsf_select="adts_header mpeg4audio"
 ahx_to_mp2_bsf_deps="lgpl_gpl"
+apv_metadata_bsf_select="cbs_apv"
 av1_frame_merge_bsf_select="cbs_av1"
 av1_frame_split_bsf_select="cbs_av1"
 av1_metadata_bsf_select="cbs_av1"
@@ -3786,6 +3845,7 @@ libkvazaar_encoder_deps="libkvazaar"
 liblc3_decoder_deps="liblc3"
 liblc3_encoder_deps="liblc3"
 liblc3_encoder_select="audio_frame_queue"
+liblcevc_dec_select="cbs_lcevc"
 libmodplug_demuxer_deps="libmodplug"
 libmp3lame_encoder_deps="libmp3lame"
 libmp3lame_encoder_select="audio_frame_queue mpegaudioheader"
@@ -3906,7 +3966,7 @@ mlp_demuxer_select="mlp_parser"
 mmf_muxer_select="riffenc"
 mov_demuxer_select="iso_media riffdec"
 mov_demuxer_suggest="iamfdec zlib"
-mov_muxer_select="iso_media iso_writer riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf ac3_parser"
+mov_muxer_select="cbs_apv_lavf cbs_av1_lavf iso_media iso_writer riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf ac3_parser"
 mov_muxer_suggest="iamfenc"
 mp3_demuxer_select="mpegaudio_parser"
 mp3_muxer_select="mpegaudioheader"
@@ -4091,15 +4151,15 @@ avgblur_vulkan_filter_deps="vulkan spirv_compiler"
 azmq_filter_deps="libzmq"
 blackdetect_vulkan_filter_deps="vulkan spirv_library"
 blackframe_filter_deps="gpl"
-blend_vulkan_filter_deps="vulkan spirv_library"
+blend_vulkan_filter_deps="vulkan spirv_compiler"
 boxblur_filter_deps="gpl"
 boxblur_opencl_filter_deps="opencl gpl"
 bs2b_filter_deps="libbs2b"
 bwdif_cuda_filter_deps="ffnvcodec"
 bwdif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 bwdif_vulkan_filter_deps="vulkan spirv_compiler"
-chromaber_vulkan_filter_deps="vulkan spirv_library"
-color_vulkan_filter_deps="vulkan spirv_library"
+chromaber_vulkan_filter_deps="vulkan spirv_compiler"
+color_vulkan_filter_deps="vulkan spirv_compiler"
 colorkey_opencl_filter_deps="opencl"
 colormatrix_filter_deps="gpl"
 convolution_opencl_filter_deps="opencl"
@@ -4128,7 +4188,7 @@ elbg_filter_deps="avcodec"
 eq_filter_deps="gpl"
 erosion_opencl_filter_deps="opencl"
 find_rect_filter_deps="avcodec avformat gpl"
-flip_vulkan_filter_deps="vulkan spirv_library"
+flip_vulkan_filter_deps="vulkan spirv_compiler"
 flite_filter_deps="libflite threads"
 framerate_filter_select="scene_sad"
 freezedetect_filter_select="scene_sad"
@@ -4137,15 +4197,15 @@ frei0r_filter_deps="frei0r"
 frei0r_src_filter_deps="frei0r"
 fspp_filter_deps="gpl"
 fsync_filter_deps="avformat"
-gblur_vulkan_filter_deps="vulkan spirv_library"
-hflip_vulkan_filter_deps="vulkan spirv_library"
+gblur_vulkan_filter_deps="vulkan spirv_compiler"
+hflip_vulkan_filter_deps="vulkan spirv_compiler"
 histeq_filter_deps="gpl"
 hqdn3d_filter_deps="gpl"
 iccdetect_filter_deps="lcms2"
 iccgen_filter_deps="lcms2"
 identity_filter_select="scene_sad"
 interlace_filter_deps="gpl"
-interlace_vulkan_filter_deps="vulkan spirv_library"
+interlace_vulkan_filter_deps="vulkan spirv_compiler"
 kerndeint_filter_deps="gpl"
 ladspa_filter_deps="ladspa libdl"
 lcevc_filter_deps="liblcevc_dec"
@@ -4176,7 +4236,7 @@ overlay_opencl_filter_deps="opencl"
 overlay_qsv_filter_deps="libmfx"
 overlay_qsv_filter_select="qsvvpp"
 overlay_vaapi_filter_deps="vaapi VAProcPipelineCaps_blend_flags"
-overlay_vulkan_filter_deps="vulkan spirv_library"
+overlay_vulkan_filter_deps="vulkan spirv_compiler"
 owdenoise_filter_deps="gpl"
 pad_opencl_filter_deps="opencl"
 pan_filter_deps="swresample"
@@ -4197,10 +4257,11 @@ scale2ref_filter_deps="swscale"
 scale_filter_deps="swscale"
 sr_amf_filter_deps="amf"
 vpp_amf_filter_deps="amf"
+frc_amf_filter_deps="amf windows_h"
 scale_qsv_filter_deps="libmfx"
 scale_qsv_filter_select="qsvvpp"
 scdet_filter_select="scene_sad"
-scdet_vulkan_filter_deps="vulkan spirv_library"
+scdet_vulkan_filter_deps="vulkan spirv_compiler"
 select_filter_select="scene_sad"
 sharpness_vaapi_filter_deps="vaapi"
 showcqt_filter_deps="avformat swscale"
@@ -4226,11 +4287,12 @@ tonemap_opencl_filter_deps="opencl const_nan"
 transpose_opencl_filter_deps="opencl"
 transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
 transpose_vt_filter_deps="videotoolbox VTPixelRotationSessionCreate"
-transpose_vulkan_filter_deps="vulkan spirv_library"
+transpose_vulkan_filter_deps="vulkan spirv_compiler"
 unsharp_opencl_filter_deps="opencl"
 uspp_filter_deps="gpl avcodec"
+v360_vulkan_filter_deps="vulkan spirv_compiler"
 vaguedenoiser_filter_deps="gpl"
-vflip_vulkan_filter_deps="vulkan spirv_library"
+vflip_vulkan_filter_deps="vulkan spirv_compiler"
 vidstabdetect_filter_deps="libvidstab"
 vidstabtransform_filter_deps="libvidstab"
 libvmaf_filter_deps="libvmaf"
@@ -4244,7 +4306,7 @@ scale_vulkan_filter_deps="vulkan spirv_compiler spirv_library"
 vpp_qsv_filter_deps="libmfx"
 vpp_qsv_filter_select="qsvvpp"
 xfade_opencl_filter_deps="opencl"
-xfade_vulkan_filter_deps="vulkan spirv_library"
+xfade_vulkan_filter_deps="vulkan spirv_compiler"
 yadif_cuda_filter_deps="ffnvcodec"
 yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 yadif_videotoolbox_filter_deps="metal corevideo videotoolbox"
@@ -4328,7 +4390,7 @@ podpages_deps="perl"
 manpages_deps="perl pod2man"
 htmlpages_deps="perl"
 htmlpages_deps_any="makeinfo_html texi2html"
-txtpages_deps="perl makeinfo"
+txtpages_deps="perl makeinfo_command"
 doc_deps_any="manpages htmlpages podpages txtpages"

 # default parameters
@@ -4352,6 +4414,7 @@ stdcxx_default="c++17"
 cxx_default="g++"
 host_cc_default="gcc"
 doxygen_default="doxygen"
+makeinfo_default="makeinfo"
 install="install"
 ln_s_default="ln -s -f"
 glslc_default="glslc"
@@ -4461,7 +4524,7 @@ GLSLC_O='-o $@'
 NVCC_C='-c'
 NVCC_O='-o $@'

-host_extralibs='-lm'
+host_extralibs=
 host_cflags_filter=echo
 host_ldflags_filter=echo

@@ -4478,7 +4541,7 @@ mkdir -p ffbuild
 if test -f configure; then
    source_path=.
 elif test -f src/configure; then
-    source_path=src
+    source_path=./src
 else
    source_path=$(cd $(dirname "$0"); pwd)
    case "$source_path" in
@@ -4866,6 +4929,16 @@ case "$toolchain" in
        cc_default="clang"
        cxx_default="clang++"
    ;;
+    llvm|llvm-*)
+        cc_default="clang"
+        cxx_default="clang++"
+        ar_default="llvm-ar"
+        nm_default="llvm-nm -g"
+        ranlib_default="llvm-ranlib"
+        strip_default="llvm-strip"
+        windres_default="llvm-windres"
+        test "$toolchain" != "llvm" && add_sanitizers "${toolchain#llvm-}"
+    ;;
    gcc-*)
        add_sanitizers "${toolchain#gcc-}"
        cc_default="gcc"
@@ -4990,7 +5063,7 @@ if enabled cuda_nvcc; then
 fi

 set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
-    target_exec x86asmexe glslc metalcc metallib stdc stdcxx
+    target_exec x86asmexe glslc metalcc metallib stdc stdcxx makeinfo
 enabled cross_compile || host_cc_default=$cc
 set_default host_cc

@@ -5007,7 +5080,7 @@ elif is_in -static $cc $LDFLAGS && ! is_in --static $pkg_config $pkg_config_flag
 Note: When building a static binary, add --pkg-config-flags=\"--static\"."
 fi

-if test $doxygen != $doxygen_default && \
+if test "$doxygen" != "$doxygen_default" && \
  ! $doxygen --version >/dev/null 2>&1; then
    warn "Specified doxygen \"$doxygen\" not found, API documentation will fail to build."
 fi
@@ -6307,7 +6380,7 @@ link_name=$(mktemp -u $TMPDIR/name_XXXXXXXX)
 mkdir "$link_dest"
 $ln_s "$link_dest" "$link_name"
 touch "$link_dest/test_file"
-if [ "$source_path" != "." ] && [ "$source_path" != "src" ] && ([ ! -d src ] || [ -L src ]) && [ -e "$link_name/test_file" ]; then
+if [ "$source_path" != "." ] && [ "$source_path" != "./src" ] && ([ ! -d src ] || [ -L src ]) && [ -e "$link_name/test_file" ]; then
    # create link to source path
    [ -e src ] && rm src
    $ln_s "$source_path" src
@@ -6561,8 +6634,10 @@ if enabled aarch64; then
    # internal assembler in clang 3.3 does not support this instruction
    enabled neon && check_insn neon 'ext   v0.8B, v0.8B, v1.8B, #1'

-    archext_list="arm_crc dotprod i8mm sve sve2 sme sme_i16i64 sme2"
+    archext_list="arm_crc dotprod i8mm pmull eor3 sve sve2 sme sme_i16i64 sme2"
    enabled arm_crc && check_archext_name_insn arm_crc crc 'crc32x w0, w0, x0'
+    enabled pmull   && check_archext_name_insn pmull aes 'pmull v0.1q, v0.1d, v0.1d'
+    enabled eor3    && check_archext_name_insn eor3 sha3 'eor3 v0.16b, v1.16b, v2.16b, v3.16b'
    enabled dotprod && check_archext_insn dotprod 'udot v0.4s, v0.16b, v0.16b'
    enabled i8mm    && check_archext_insn i8mm    'usdot v0.4s, v0.16b, v0.16b'
    enabled sve     && check_archext_insn sve     'whilelt p0.s, x0, x1'
@@ -6711,7 +6786,7 @@ elif enabled ppc; then
    fi

    if enabled vsx; then
-        check_cflags -mvsx &&
+        check_cflags -mvsx
        check_cc vsx altivec.h "int v[4] = { 0 };
                                vector signed int v1 = vec_vsx_ld(0, v);"
    fi
@@ -6769,7 +6844,6 @@ EOF
        x86asmexe_probe=$1
        if test_cmd $x86asmexe_probe -v; then
            x86asmexe=$x86asmexe_probe
-            x86asm_debug="-g -F dwarf"
            X86ASM_DEPFLAGS='-MD $(@:.o=.d)'
        fi
        check_x86asm x86asm "movbe ecx, [5]"
@@ -6783,9 +6857,7 @@ EOF
        disabled x86asm && die "nasm not found or too old. Please install/update nasm or use --disable-x86asm for a build without hand-optimized assembly."
        X86ASMFLAGS="-f $objformat"
        test -n "$extern_prefix"  && append X86ASMFLAGS "-DPREFIX"
-        case "$objformat" in
-            elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
-        esac
+        enabled debug && append X86ASMFLAGS "-g"

        enabled avx512    && check_x86asm avx512_external    "vmovdqa32 [eax]{k1}{z}, zmm0"
        enabled avx512icl && check_x86asm avx512icl_external "vpdpwssds zmm31{k1}{z}, zmm29, zmm28"
@@ -7205,6 +7277,7 @@ enabled zlib_gzip && enabled gzip || disable resource_compression
 check_lib libdl dlfcn.h "dlopen dlsym" || check_lib libdl dlfcn.h "dlopen dlsym" -ldl

 check_lib libm math.h sin -lm
+check_host_lib math.h sin -lm

 atan2f_args=2
 copysign_args=2
@@ -7559,10 +7632,14 @@ enabled schannel &&

 enabled schannel && check_cc dtls_protocol "windows.h security.h schnlsp.h" "int i = SECPKG_ATTR_DTLS_MTU;" -DSECURITY_WIN32

-makeinfo --version > /dev/null 2>&1 && enable makeinfo  || disable makeinfo
-enabled makeinfo \
-    && [ 0$(makeinfo --version | grep "texinfo" | sed 's/.*texinfo[^0-9]*\([0-9]*\)\..*/\1/') -ge 5 ] \
+$makeinfo --version > /dev/null 2>&1 && enable makeinfo_command || disable makeinfo_command makeinfo_html
+if enabled makeinfo_command; then
+  [ 0$($makeinfo --version | grep "texinfo" | sed 's/.*texinfo[^0-9]*\([0-9]*\)\..*/\1/') -ge 5 ] \
    && enable makeinfo_html || disable makeinfo_html
+elif test "$makeinfo" != "$makeinfo_default" ; then
+  warn "Specified makeinfo \"$makeinfo\" not found."
+fi
+
 disabled makeinfo_html && texi2html --help 2> /dev/null | grep -q 'init-file' && enable texi2html || disable texi2html
 perl -v            > /dev/null 2>&1 && enable perl      || disable perl
 pod2man --help     > /dev/null 2>&1 && enable pod2man   || disable pod2man
@@ -7792,6 +7869,12 @@ else
    disable libglslang libshaderc spirv_library spirv_compiler
 fi

+if enabled vulkan; then
+    check_headers spirv-headers/spirv.h ||
+    check_headers spirv/unified1/spirv.h ||
+    { requested vulkan && warn "spirv-headers not found, swscale SPIR-V backend unavailable"; }
+fi
+
 if enabled x86; then
    case $target_os in
        freebsd|mingw32*|mingw64*|win32|win64|linux|cygwin*)
@@ -7832,7 +7915,7 @@ fi

 enabled amf &&
    check_cpp_condition amf "AMF/core/Version.h" \
-        "(AMF_VERSION_MAJOR << 48 | AMF_VERSION_MINOR << 32 | AMF_VERSION_RELEASE << 16 | AMF_VERSION_BUILD_NUM) >= 0x0001000400240000"
+        "(AMF_VERSION_MAJOR << 48 | AMF_VERSION_MINOR << 32 | AMF_VERSION_RELEASE << 16 | AMF_VERSION_BUILD_NUM) >= 0x1000500000000"

 # Funny iconv installations are not unusual, so check it after all flags have been set
 if enabled libc_iconv; then
@@ -7881,6 +7964,15 @@ check_warning -Wempty-body
 # This roughly matches the default thread stack size on Musl, which is 128 KiB,
 # leaving some headroom for caller frames.
 check_warning -Wstack-usage=122880
+
+# GCC accepts warning option level 5 here to warn about all fallthroughs
+# that are not explicitly marked with the appropriate attribute
+if enabled gcc; then
+    check_warning -Wimplicit-fallthrough=5
+else
+    check_warning -Wimplicit-fallthrough
+fi
+
 check_c_warning -Wmissing-prototypes
 check_c_warning -Wstrict-prototypes
 check_c_warning -Wunterminated-string-initialization
@@ -8004,11 +8096,6 @@ fi

 enabled ftrapv && check_cflags -ftrapv

-test_cc -mno-red-zone <<EOF && noredzone_flags="-mno-red-zone"
-int x;
-EOF
-
-
 if enabled icc; then
    # Just warnings, no remarks
    check_allcflags -w1
@@ -8400,6 +8487,8 @@ if enabled aarch64; then
    echo "NEON enabled              ${neon-no}"
    echo "DOTPROD enabled           ${dotprod-no}"
    echo "I8MM enabled              ${i8mm-no}"
+    echo "PMULL enabled             ${pmull-no}"
+    echo "EOR3 enabled              ${eor3-no}"
    echo "SVE enabled               ${sve-no}"
    echo "SVE2 enabled              ${sve2-no}"
    echo "SME enabled               ${sme-no}"
@@ -8449,7 +8538,7 @@ echo "safe bitstream reader     ${safe_bitstream_reader-no}"
 echo "texi2html enabled         ${texi2html-no}"
 echo "perl enabled              ${perl-no}"
 echo "pod2man enabled           ${pod2man-no}"
-echo "makeinfo enabled          ${makeinfo-no}"
+echo "makeinfo enabled          ${makeinfo_command-no}"
 echo "makeinfo supports HTML    ${makeinfo_html-no}"
 echo "experimental features     ${unstable-no}"
 echo "xmllint enabled           ${xmllint-no}"
@@ -8598,6 +8687,7 @@ LD_PATH=$LD_PATH
 DLLTOOL=$dlltool
 WINDRES=$windres
 DOXYGEN=$doxygen
+MAKEINFO=$makeinfo
 LDFLAGS=$LDFLAGS
 LDEXEFLAGS=$LDEXEFLAGS
 LDSOFLAGS=$LDSOFLAGS
@@ -8666,7 +8756,6 @@ SLIB_INSTALL_EXTRA_LIB=${SLIB_INSTALL_EXTRA_LIB}
 SLIB_INSTALL_EXTRA_SHLIB=${SLIB_INSTALL_EXTRA_SHLIB}
 VERSION_SCRIPT_POSTPROCESS_CMD=${VERSION_SCRIPT_POSTPROCESS_CMD}
 SAMPLES:=${samples:-\$(FATE_SAMPLES)}
-NOREDZONE_FLAGS=$noredzone_flags
 LIBFUZZER_PATH=$libfuzzer_path
 IGNORE_TESTS=$ignore_tests
 VERSION_TRACKING=$version_tracking
@@ -2,6 +2,43 @@ The last version increases of all libraries were on 2025-03-28

 API changes, most recent first:

+2026-05-16 - xxxxxxxxxxx - lavf 62.16.100 - avformat.h
+  Add AVFMT_FIXED_FRAMESIZE.
+
+2026-05-16 - xxxxxxxxxxx - lavc 62.33.100 - avcodec.h
+  Add AV_CODEC_FLAG2_FIXED_FRAME_SIZE.
+
+2026-05-12 - xxxxxxxxxx - lavu 60.31.100 - frame.h
+  Add IAMF frame side data types to enum AVFrameSideDataType:
+  - AV_FRAME_DATA_IAMF_MIX_GAIN_PARAM
+  - AV_FRAME_DATA_IAMF_DEMIXING_INFO_PARAM
+  - AV_FRAME_DATA_IAMF_RECON_GAIN_INFO_PARAM
+
+2026-05-05 - xxxxxxxxxxx - lavf 62.15.100 - avformat.h
+  Add av_program_copy().
+
+2026-05-05 - xxxxxxxxxxx - lavf 62.14.100 - avformat.h
+  Add av_program_add_stream_index2().
+
+2026-04-14 - 7faa6ee2aa - lavc 62.30.100 - packet.h
+  Add AV_PKT_DATA_DYNAMIC_HDR_SMPTE_2094_APP5 side data type.
+
+2026-04-09 - 6ba6db4f19 - lavu 60.30.100 - hdr_dynamic_metadata.h frame.h
+  Add AVDynamicHDRSmpte2094App5 struct and functions.
+  Add AV_FRAME_DATA_DYNAMIC_HDR_SMPTE_2094_APP5 side data type.
+
+2026-03-14 - xxxxxxxxxx - lavu 60.29.100 - hwcontext_vulkan.h
+  Deprecate AVVulkanDeviceContext.lock_queue and
+  AVVulkanDeviceContext.unlock_queue without replacement.
+
+2026-03-12 - xxxxxxxxxx - lsws 9.7.100 - swscale.h
+  Add enum SwsScaler, and SwsContext.scaler/scaler_sub.
+
+2026-03-11 - 910000fe59d - lavu 60.28.100 - hwcontext_amf.h
+  Add av_amf_display_mastering_meta_to_hdrmeta(), av_amf_light_metadata_to_hdrmeta().
+  Add av_amf_extract_hdr_metadata(), av_amf_attach_hdr_metadata().
+  Add av_amf_get_color_profile().
+
 2026-03-07 - c23d56b173a - lavc 62.26.100 - codec_desc.h
  Add AV_CODEC_PROP_ENHANCEMENT.

@@ -0,0 +1,40 @@
+This document is work in progress
+
+*What is CVSS*
+    The Common Vulnerability Scoring System (CVSS) is an open, industry-standard framework used to measure and communicate the severity of software vulnerabilities, ranging from 0.0 to 10.0.
+
+*Why we need this Document*
+    It is important that FFmpeg CVEs have consistent and correct CVSS scores, not only for the obvious reason that one can recognize the severity of an issue at first glance,
+    but also because these numbers form the basis of rewards paid in bug bounty systems. Inconsistent CVSS scores could lead to unfair payouts.
+
+*What is this Document*
+    Prior to 2026, FFmpeg had no guidelines about CVSS.
+    This document describes how to select the CVSS for an FFmpeg-related CVE. It currently only covers the Base Score.
+
+*What is the CVSS Base Score*
+    AV Attack Vector    (Network, Adjacent, Local, Physical)
+    AC Attack Complexity (Low, High)
+    PR Privileges Required (None, Low, High)
+    UI User Interaction (None, Required)
+    S  Scope (Unchanged, Changed)
+    C  Confidentiality (None, Low, High)
+    I  Integrity (None, Low, High)
+    A  Availability (None, Low, High)
+
+
+*Things people have set incorrectly*
+
+Below are general guidelines and in specific cases other things may apply.
+
+Attack Vector.
+    Quote from https://www.first.org/cvss/v3.1/user-guide
+        "Specifically, analysts should only score for Network or Adjacent when a vulnerability is bound to the network stack.
+         Vulnerabilities which require user interaction to download or receive malicious content (which could also be delivered locally, e.g., via USB drives) should be scored as Local."
+
+Availability.
+    FFmpeg Crashes -> AVAILABILITY IMPACT: Low
+    FFmpeg is frequently used as a short-lived, single-run process instead of a continuously running service that handles ongoing streams of user input. In that usage model, a crash usually causes only limited disruption.
+
+User Interaction
+    Please consider if an attacker can actually set the parameters required for an attack.
+    In general arbitrary filter parameters cannot be set by an attacker and require the user/account owner/admin to set them
@@ -54,7 +54,7 @@ TEXIDEP = perl $(SRC_PATH)/doc/texidep.pl $(SRC_PATH) $< $@ >$(@:%=%.d)
 doc/%.txt: TAG = TXT
 doc/%.txt: doc/%.texi
 	$(Q)$(TEXIDEP)
-	$(M)makeinfo --force --no-headers -o $@ $< 2>/dev/null
+	$(M)$(MAKEINFO) --force --no-headers -o $@ $< 2>/dev/null

 GENTEXI  = format codec
 GENTEXI := $(GENTEXI:%=doc/avoptions_%.texi)
@@ -69,11 +69,11 @@ doc/%-all.html: TAG = HTML
 ifdef HAVE_MAKEINFO_HTML
 doc/%.html: doc/%.texi $(SRC_PATH)/doc/t2h.pm $(GENTEXI)
 	$(Q)$(TEXIDEP)
-	$(M)makeinfo --html -I doc --no-split -D config-not-all --init-file=$(SRC_PATH)/doc/t2h.pm --output $@ $<
+	$(M)$(MAKEINFO) --html -I doc --no-split -D config-not-all --init-file=$(SRC_PATH)/doc/t2h.pm --output $@ $<

 doc/%-all.html: doc/%.texi $(SRC_PATH)/doc/t2h.pm $(GENTEXI)
 	$(Q)$(TEXIDEP)
-	$(M)makeinfo --html -I doc --no-split -D config-all --init-file=$(SRC_PATH)/doc/t2h.pm --output $@ $<
+	$(M)$(MAKEINFO) --html -I doc --no-split -D config-all --init-file=$(SRC_PATH)/doc/t2h.pm --output $@ $<
 else
 doc/%.html: doc/%.texi $(SRC_PATH)/doc/t2h.init $(GENTEXI)
 	$(Q)$(TEXIDEP)
@@ -7,11 +7,6 @@ V
    Disable the default terse mode, the full command issued by make and its
    output will be shown on the screen.

-DBG
-    Preprocess x86 external assembler files to a .dbg.asm file in the object
-    directory, which then gets compiled. Helps in developing those assembler
-    files.
-
 DESTDIR
    Destination directory for the install targets, useful to prepare packages
    or install FFmpeg in cross-environments.
@@ -646,6 +646,8 @@ Do not skip samples and export skip information as frame side data.
 Do not reset ASS ReadOrder field on flush.
@item icc_profiles
 Generate/parse embedded ICC profiles from/to colorimetry tags.
+@item fixed_frame_size
+Force audio encoders to use a fixed frame size.
@end table

@item export_side_data @var{flags} (@emph{decoding/encoding,audio,video,subtitles})
@@ -225,6 +225,25 @@ AVStream *stream;
 AVStream* stream;
@end example

+@item
+When sensible, prefer a narrow variable scope, especially in for loops:
+
+@example c, good
+// Good
+for (unsigned i = 0; i < submix->nb_elements; i++) @{
+    // Do something...
+@}
+@end example
+
+@example c, bad
+// Bad style
+unsigned i;
+//...
+for (i = 0; i < submix->nb_elements; i++) @{
+    // Do something...
+@}
+@end example
+
@end itemize

 If you work on a file that does not follow these guidelines consistently,
@@ -368,7 +387,7 @@ symbols. If in doubt, just avoid names starting with @code{_} altogether.
 Casts should be used only when necessary. Unneeded parentheses
 should also be avoided if they don't make the code easier to understand.
@item
-Where applicable, SI units shall be used. For example timeouts should use seconds as the fundamental unit not micro seconds.
+Where applicable, SI units shall be used. For example timeouts should use seconds as the fundamental unit not microseconds.
 That means a bare value like @samp{1.0} must mean 1 second, @samp{50m} means 50 milliseconds. For weight, gram shall be used.
@end itemize

@@ -471,13 +490,18 @@ ask/discuss it on the developer mailing list.

@subheading Cosmetic changes should be kept in separate patches.
 We refuse source indentation and other cosmetic changes if they are mixed
-with functional changes, such commits will be rejected and removed. Every
+with functional changes, such commits will be rejected and removed. However,
+indentation changes that can be ignored by @code{git diff --ignore-all-space}
+(e.g. changes in whitespace amount, leading/trailing spaces) may be mixed with
+functional changes, since reviewers can use @code{git diff -w} or
+@code{git log -p --ignore-all-space} to review only the functional parts of
+the change. Forgejo's pull request interface also provides a
+``Hide whitespace changes'' option for this purpose. Every
 developer has his own indentation style, you should not change it. Of course
 if you (re)write something, you can use your own style, even though we would
 prefer if the indentation throughout FFmpeg was consistent (Many projects
 force a given indentation style - we do not.). If you really need to make
-indentation changes (try to avoid this), separate them strictly from real
-changes.
+non-whitespace cosmetic changes, separate them strictly from real changes.

 NOTE: If you had to put if()@{ .. @} over a large (> 5 lines) chunk of code,
 then either do NOT change the indentation of the inner part within (do not
@@ -1045,6 +1045,10 @@ Other values include 0 for mono and stereo, 1 for surround sound with masking
 and LFE bandwidth optimizations, and 255 for independent streams with an
 unspecified channel layout.

+@item dtx (N.A.)
+Allow discontinuous transmission when set to 1. The default value is 0
+(disabled).
+
@item apply_phase_inv (N.A.) (requires libopus >= 1.2)
 If set to 0, disables the use of phase inversion for intensity stereo,
 improving the quality of mono downmixes, but slightly reducing normal stereo
@@ -42,7 +42,11 @@ static void pgm_save(unsigned char *buf, int wrap, int xsize, int ysize,
    FILE *f;
    int i;

-    f = fopen(filename,"wb");
+    f = fopen(filename, "wb");
+    if (!f) {
+        fprintf(stderr, "Could not open %s\n", filename);
+        return;
+    }
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    for (i = 0; i < ysize; i++)
        fwrite(buf + i * wrap, 1, xsize, f);
@@ -336,15 +336,17 @@ int main (int argc, char **argv)

    if (video_stream) {
        printf("Play the output video file with the command:\n"
-               "ffplay -f rawvideo -pix_fmt %s -video_size %dx%d %s\n",
+               "ffplay -f rawvideo -pixel_format %s -video_size %dx%d %s\n",
               av_get_pix_fmt_name(pix_fmt), width, height,
               video_dst_filename);
    }

    if (audio_stream) {
        enum AVSampleFormat sfmt = audio_dec_ctx->sample_fmt;
-        int n_channels = audio_dec_ctx->ch_layout.nb_channels;
+        AVChannelLayout mono = AV_CHANNEL_LAYOUT_MONO;
+        AVChannelLayout *ch_layout = &audio_dec_ctx->ch_layout;
        const char *fmt;
+        char buf[64];

        if (av_sample_fmt_is_planar(sfmt)) {
            const char *packed = av_get_sample_fmt_name(sfmt);
@@ -352,15 +354,18 @@ int main (int argc, char **argv)
                   "(%s). This example will output the first channel only.\n",
                   packed ? packed : "?");
            sfmt = av_get_packed_sample_fmt(sfmt);
-            n_channels = 1;
+            ch_layout = &mono;
        }

        if ((ret = get_format_from_sample_fmt(&fmt, sfmt)) < 0)
            goto end;

+        if ((ret = av_channel_layout_describe(ch_layout, buf, sizeof(buf))) < 0)
+            goto end;
+
        printf("Play the output audio file with the command:\n"
-               "ffplay -f %s -ac %d -ar %d %s\n",
-               fmt, n_channels, audio_dec_ctx->sample_rate,
+               "ffplay -f %s -ch_layout %s -sample_rate %d %s\n",
+               fmt, buf, audio_dec_ctx->sample_rate,
               audio_dst_filename);
    }

@@ -218,10 +218,10 @@ int main(int argc, char **argv)
        samples = (uint16_t*)frame->data[0];

        for (j = 0; j < c->frame_size; j++) {
-            samples[2*j] = (int)(sin(t) * 10000);
+            samples[c->ch_layout.nb_channels*j] = (int)(sin(t) * 10000);

            for (k = 1; k < c->ch_layout.nb_channels; k++)
-                samples[2*j + k] = samples[2*j];
+                samples[c->ch_layout.nb_channels*j + k] = samples[c->ch_layout.nb_channels*j];
            t += tincr;
        }
        encode(c, frame, pkt, f);
@@ -132,8 +132,9 @@ static int decode_write(AVCodecContext *avctx, AVPacket *packet)
            goto fail;
        }

-        if ((ret = fwrite(buffer, 1, size, output_file)) < 0) {
+        if (fwrite(buffer, 1, size, output_file) != size) {
            fprintf(stderr, "Failed to dump raw data.\n");
+            ret = -1;
            goto fail;
        }

@@ -232,6 +233,10 @@ int main(int argc, char *argv[])

    /* open the file to dump raw data */
    output_file = fopen(argv[3], "w+b");
+    if (!output_file) {
+        fprintf(stderr, "Cannot open output file '%s'\n", argv[3]);
+        return -1;
+    }

    /* actual decoding and dump the raw data */
    while (ret >= 0) {
@@ -39,6 +39,7 @@
 #include <libavutil/error.h>
 #include <libavutil/hwcontext.h>
 #include <libavutil/hwcontext_qsv.h>
+#include <libavutil/imgutils.h>
 #include <libavutil/mem.h>

 static int get_format(AVCodecContext *avctx, const enum AVPixelFormat *pix_fmts)
@@ -88,9 +89,16 @@ static int decode_packet(AVCodecContext *decoder_ctx,
            goto fail;
        }

-        for (i = 0; i < FF_ARRAY_ELEMS(sw_frame->data) && sw_frame->data[i]; i++)
-            for (j = 0; j < (sw_frame->height >> (i > 0)); j++)
-                avio_write(output_ctx, sw_frame->data[i] + j * sw_frame->linesize[i], sw_frame->width);
+        for (i = 0; i < FF_ARRAY_ELEMS(sw_frame->data) && sw_frame->data[i]; i++) {
+            int h = sw_frame->height >> (i > 0);
+            int linesize = av_image_get_linesize(sw_frame->format, sw_frame->width, i);
+            if (linesize < 0) {
+                ret = linesize;
+                goto fail;
+            }
+            for (j = 0; j < h; j++)
+                avio_write(output_ctx, sw_frame->data[i] + j * sw_frame->linesize[i], linesize);
+        }

 fail:
        av_frame_unref(sw_frame);
@@ -430,7 +430,9 @@ int main(int argc, char **argv)

 end:
    avformat_close_input(&ifmt_ctx);
-    avformat_close_input(&ofmt_ctx);
+    if (ofmt_ctx && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE))
+        avio_closep(&ofmt_ctx->pb);
+    avformat_free_context(ofmt_ctx);
    avcodec_free_context(&decoder_ctx);
    avcodec_free_context(&encoder_ctx);
    av_buffer_unref(&hw_device_ctx);
@@ -184,7 +184,7 @@ end:
    avformat_close_input(&ifmt_ctx);

    /* close output */
-    if (ofmt_ctx && !(ofmt->flags & AVFMT_NOFILE))
+    if (ofmt_ctx && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE))
        avio_closep(&ofmt_ctx->pb);
    avformat_free_context(ofmt_ctx);

@@ -177,7 +177,7 @@ static int open_output_file(const char *filename)
                enc_ctx->width = dec_ctx->width;
                enc_ctx->sample_aspect_ratio = dec_ctx->sample_aspect_ratio;

-                ret = avcodec_get_supported_config(dec_ctx, NULL,
+                ret = avcodec_get_supported_config(enc_ctx, NULL,
                                                   AV_CODEC_CONFIG_PIX_FORMAT, 0,
                                                   (const void**)&pix_fmts, NULL);

@@ -195,7 +195,7 @@ static int open_output_file(const char *filename)
                if (ret < 0)
                    return ret;

-                ret = avcodec_get_supported_config(dec_ctx, NULL,
+                ret = avcodec_get_supported_config(enc_ctx, NULL,
                                                   AV_CODEC_CONFIG_SAMPLE_FORMAT, 0,
                                                   (const void**)&sample_fmts, NULL);

@@ -96,7 +96,6 @@ static int encode_write(AVCodecContext *avctx, AVFrame *frame, FILE *fout)

 end:
    av_packet_free(&enc_pkt);
-    ret = ((ret == AVERROR(EAGAIN)) ? 0 : -1);
    return ret;
 }

@@ -118,7 +117,7 @@ int main(int argc, char *argv[])
    height = atoi(argv[2]);
    size   = width * height;

-    if (!(fin = fopen(argv[3], "r"))) {
+    if (!(fin = fopen(argv[3], "rb"))) {
        fprintf(stderr, "Fail to open input file : %s\n", strerror(errno));
        return -1;
    }
@@ -198,7 +197,8 @@ int main(int argc, char *argv[])
            goto close;
        }

-        if ((err = (encode_write(avctx, hw_frame, fout))) < 0) {
+        err = encode_write(avctx, hw_frame, fout);
+        if (err != AVERROR(EAGAIN) && err < 0) {
            fprintf(stderr, "Failed to encode.\n");
            goto close;
        }
@@ -294,7 +294,9 @@ int main(int argc, char **argv)

 end:
    avformat_close_input(&ifmt_ctx);
-    avformat_close_input(&ofmt_ctx);
+    if (ofmt_ctx && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE))
+        avio_closep(&ofmt_ctx->pb);
+    avformat_free_context(ofmt_ctx);
    avcodec_free_context(&decoder_ctx);
    avcodec_free_context(&encoder_ctx);
    av_buffer_unref(&hw_device_ctx);
@@ -1569,6 +1569,27 @@ Set whether on display the image should be vertically flipped.

 See the @code{-display_rotation} option for more details.

+@item -mastering_display[:@var{stream_specifier}] @var{G(%u,%u)B(%u,%u)R(%u,%u)WP(%u,%u)L(%u,%u)} (@emph{input,per-stream})
+Set video mastering display metadata.
+
+@var{G(%u,%u)B(%u,%u)R(%u,%u)WP(%u,%u)L(%u,%u)} is a string specifying
+X,Y display primaries for GBR channels and white point (WP) in units of
+0.00002, and max-min luminance (L) values in units of 0.0001 candela per
+square meter. The values are unsigned integers representing the numerator
+of a rational with an implicit denominator of 50000 for GBR and (WP), and
+implicit denominator 10000 for (L).
+
+This option overrides the mastering display metadata stored in the file,
+if any.
+
+@item -content_light[:@var{stream_specifier}] @var{%u,%u} (@emph{input,per-stream})
+Set video content light metadata.
+
+@var{%u,%u} is a string specifying max content light level and maximum picture
+average light level.
+
+This option overrides the content light metadata stored in the file, if any.
+
@item -vn (@emph{input/output})
 As an input option, blocks all video streams of a file from being filtered or
 being automatically selected or mapped for any output. See @code{-discard}
@@ -357,6 +357,7 @@
  <xsd:complexType name="streamGroupComponentType">
    <xsd:sequence>
      <xsd:element name="subcomponents" type="ffprobe:streamGroupSubComponentList" minOccurs="0" maxOccurs="1"/>
+      <xsd:element name="side_data_list" type="ffprobe:packetSideDataListType" minOccurs="0" maxOccurs="1"/>
      <xsd:element name="component_entry" type="ffprobe:streamGroupEntryType" minOccurs="0" maxOccurs="unbounded"/>
    </xsd:sequence>
  </xsd:complexType>
@@ -7743,6 +7743,11 @@ The file path of the downloaded whisper.cpp model (mandatory).
 The language to use for transcription ('auto' for auto-detect).
 Default value: @code{"auto"}

+@item translate
+If enabled, translate the transcription from the source language to English. A
+multilingual model is required to enable this option.
+Default value: @code{"false"}
+
@item queue
 The maximum size that will be queued into the filter before processing the audio
 with whisper. Using a small value the audio stream will be processed more often,
@@ -7775,8 +7780,8 @@ Default value: @code{"text"}

@item max_len
 Maximum segment length in characters. When set to a value greater than 0,
-transcription segments will be split to not exceed this length. This is useful
-for generating subtitles with shorter lines.
+transcription segments will be split by word to not exceed this length. This is
+useful for generating subtitles with shorter lines.
 Default value: @code{"0"}

@item vad_model
@@ -14744,6 +14749,73 @@ This flag is enabled by default.
@end table
@end table

+@section frc_amf
+
+Double the frame rate, using Frame Rate Converter (FRC) provided by
+AMD Advanced Media Framework library for hardware acceleration.
+
+The filter accepts the following options:
+
+@table @option
+@item engine_type
+Specify the engine used to run shaders.
+@table @samp
+@item dx11
+DirectX 11.
+
+@item dx12
+DirectX 12 (Default value).
+@end table
+
+@item enable
+Boolean value: enable/disable FRC. Dynamic value, can be altered at runtime
+without re-initializing the filter. (Default value: enabled).
+
+@item fallback_mode
+Fallback behavior in case of low interpolation confidence.
+@table @samp
+@item duplicate
+Duplicate frame.
+
+@item blend
+Blend two frames together (Default value).
+@end table
+
+@item indicator
+Boolean value: show FRC indicator square in the top left corner of the video
+(Default value: disabled).
+
+@item profile
+Level of hierarchical motion search.
+@table @samp
+@item low
+Fewer levels of hierarchical motion search.
+Only recommended for extremely low resolutions.
+
+@item high
+Recommended for any resolution up to 1440p. (Default value)
+
+@item super
+More levels of hierarchical motion search. Recommended for resolutions 1440p
+or higher.
+@end table
+
+@item mv_search_mode
+Performance mode of the motion search.
+@table @samp
+@item native
+Conduct motion search on the full resolution of source images.
+
+@item performance
+Conduct motion search on the down scaled source images.
+Recommended for APU or low end GPU for better performance.
+@end table
+
+@item use_future_frame
+Boolean value: enable dependency on future frame, improves quality at the cost
+of latency (Default value: enabled).
+@end table
+
@section framestep

 Select one frame every N-th frame.
@@ -25648,6 +25720,22 @@ Work the same as the identical @ref{scale} filter options.
@item reset_sar
 Works the same as the identical @ref{scale} filter option.

+@item in_color_range
+Override input color range.
+
+@item out_color_range
+Specify output color range.
+
+The accepted values for in_color_range and out_color_range are:
+@table @samp
+@item studio
+Studio (or restricted, or MPEG) color range.
+
+@item full
+Full (or JPEG) color range.
+
+@end table
+
@anchor{color_profile}
@item color_profile
 Specify all color properties at once.
@@ -25665,10 +25753,13 @@ BT.2020

@end table

-@item trc
+@item in_trc
+Override input transfer characteristics.
+
+@item out_trc
 Specify output transfer characteristics.

-The accepted values are:
+The accepted values for in_trc and out_trc are:
@table @samp
@item bt709
 BT.709
@@ -25720,10 +25811,13 @@ ARIB_STD_B67

@end table

-@item primaries
+@item in_primaries
+Override input color primaries.
+
+@item out_primaries
 Specify output color primaries.

-The accepted values are:
+The accepted values for in_primaries and out_primaries are:
@table @samp
@item bt709
 BT.709
@@ -25775,6 +25869,13 @@ Upscale to 4K and change color profile to bt2020.
@example
 vpp_amf=4096:2160:color_profile=bt2020
@end example
+
+@item
+Override input primaries and input transfer characteristics, change both to bt709.
+
+@example
+vpp_amf=color_profile=bt2020:in_trc=smpte2084:in_primaries=bt2020:out_trc=bt709:out_primaries=bt709
+@end example
@end itemize

@anchor{vstack}
@@ -27390,6 +27491,56 @@ Thumbnails are extracted from every @var{n}=150-frame batch, selecting one per b

@end itemize

+@subsection transpose_cuda
+
+Transpose rows with columns in the input video and optionally flip it.
+For more in-depth examples see the @ref{transpose} video filter, which shares mostly the same options.
+
+It accepts the following parameters:
+
+@table @option
+
+@item dir
+Specify the transposition direction.
+
+Can assume the following values:
+@table @samp
+@item cclock_flip
+Rotate by 90 degrees counterclockwise and vertically flip. (default)
+
+@item clock
+Rotate by 90 degrees clockwise.
+
+@item cclock
+Rotate by 90 degrees counterclockwise.
+
+@item clock_flip
+Rotate by 90 degrees clockwise and vertically flip.
+
+@item reversal
+Rotate by 180 degrees.
+
+@item hflip
+Flip horizontally.
+
+@item vflip
+Flip vertically.
+@end table
+
+@item passthrough
+Do not apply the transposition if the input geometry matches the one
+specified by the given value. It accepts the following values:
+@table @samp
+@item none
+Always apply transposition. (default)
+@item portrait
+Preserve portrait geometry (when @var{height} >= @var{width}).
+@item landscape
+Preserve landscape geometry (when @var{width} >= @var{height}).
+@end table
+
+@end table
+
@subsection yadif_cuda

 Deinterlace the input video using the @ref{yadif} algorithm, but implemented
@@ -1759,7 +1759,7 @@ See also the @ref{framehash} and @ref{md5} muxers.
 Animated GIF muxer.

 Note that the GIF format has a very large time base: the delay between two frames can
-therefore not be smaller than one centi second.
+therefore not be smaller than one centisecond.

@subsection Options
@table @option
@@ -3069,7 +3069,7 @@ Default is @code{0x0001}.
 Set the @samp{original_network_id}. This is unique identifier of a
 network in DVB. Its main use is in the unique identification of a service
 through the path @samp{Original_Network_ID, Transport_Stream_ID}. Default
-is @code{0x0001}.
+is @code{0xff01}.

@item mpegts_service_id @var{integer}
 Set the @samp{service_id}, also known as program in DVB. Default is
@@ -3096,6 +3096,8 @@ MPEG2 Digital HDTV service.
 Advanced Codec Digital SDTV service.
@item advanced_codec_digital_hdtv
 Advanced Codec Digital HDTV service.
+@item hevc_digital_hdtv
+HEVC Digital Television service.
@end table

@item mpegts_pmt_start_pid @var{integer}
@@ -3272,6 +3274,19 @@ ogg files can be safely chained.

@end table

+@section pdv
+
+Playdate Video muxer.
+
+This muxer writes the Playdate video container used by Panic's Playdate SDK.
+It requires a seekable output and a single PDV video stream.
+
+@table @option
+@item max_frames @var{frames}
+Reserve space for at most @var{frames} video frames in the file header. This
+option is mandatory.
+@end table
+
@anchor{rcwtenc}
@section rcwt

@@ -3951,6 +3966,10 @@ This muxer supports the following options:
 Set the timeout in milliseconds for ICE and DTLS handshake.
 Default value is 5000.

+@item timeout @var{integer}
+Set timeout in seconds for socket I/O operations. Applicable only for HTTP output.
+Default value is -1.
+
@item pkt_size @var{integer}
 Set the maximum size, in bytes, of RTP packets that send out.
 Default value is 1200.
@@ -20,7 +20,7 @@ architecture-specific versions. It is recommended to look at older
 revisions of the interesting files (web frontends for the various FFmpeg
 branches are listed at http://ffmpeg.org/download.html).
 Alternatively, look into the other architecture-specific versions in
-the x86/, ppc/, alpha/ subdirectories. Even if you don't exactly
+the x86/, ppc/, aarch64/ subdirectories. Even if you don't exactly
 comprehend the instructions, it could help understanding the functions
 and how they can be optimized.

@@ -191,11 +191,6 @@ __asm__() block.
 Use external asm (nasm) or inline asm (__asm__()), do not use intrinsics.
 The latter requires a good optimizing compiler which gcc is not.

-When debugging a x86 external asm compilation issue, if lost in the macro
-expansions, add DBG=1 to your make command-line: the input file will be
-preprocessed, stripped of the debug/empty lines, then compiled, showing the
-actual lines causing issues.
-
 Inline asm vs. external asm
 ---------------------------
 Both inline asm (__asm__("..") in a .c file, handled by a compiler such as gcc)
@@ -11,48 +11,88 @@ For programmatic use, they can be set explicitly in the

@table @option

+@anchor{scaler}
+@item scaler, scaler_sub
+Choose the scaling algorithm to use. Default value is @samp{auto} for both.
+It accepts the following values:
+
+@table @samp
+@item auto
+Automatic choice. For @samp{scaler_sub}, this means the same algorithm as
+@samp{scaler}. For @samp{scaler}, this defaults to the scaler flag selected
+by @samp{sws_flags}.
+
+@item bilinear
+Bilinear filter. (AKA triangle filter)
+
+@item bicubic
+2-tap cubic BC-spline (AKA Mitchell-Netravali spline). The B and C parameters
+can be configured by setting @code{param0} and @code{param1}, defaulting to
+0.0 and 0.6 respectively.
+
+@item point, neighbor
+Point sampling (AKA nearest neighbor).
+
+@item area
+Area averaging. Equivalent to @samp{bilinear} for upscaling.
+
+@item gaussian
+2-tap Gaussian filter approximation. The sharpness parameter can be configured
+by setting @code{param0}, defaulting to 3.0.
+
+@item sinc
+Unwindowed sinc filter.
+
+@item lanczos
+Lanczos resampling (sinc windowed sinc). The number of filter taps can
+be configured by setting @code{param0}, defaulting to 3.
+
+@item spline
+Unwindowed natural bicubic spline.
+@end table
+
@anchor{sws_flags}
@item sws_flags
 Set the scaler flags. This is also used to set the scaling
-algorithm. Only a single algorithm should be selected. Default
-value is @samp{bicubic}.
+algorithm, though this usage is deprecated in favor of setting @samp{scaler}.
+Only a single algorithm may be selected. Default value is @samp{bicubic}.

 It accepts the following values:
@table @samp
@item fast_bilinear
-Select fast bilinear scaling algorithm.
+Select fast bilinear scaling algorithm. (Deprecated)

@item bilinear
-Select bilinear scaling algorithm.
+Select bilinear scaling algorithm. (Deprecated)

@item bicubic
-Select bicubic scaling algorithm.
+Select bicubic scaling algorithm. (Deprecated)

@item experimental
-Select experimental scaling algorithm.
+Select experimental scaling algorithm. (Deprecated)

@item neighbor
-Select nearest neighbor rescaling algorithm.
+Select nearest neighbor rescaling algorithm. (Deprecated)

@item area
-Select averaging area rescaling algorithm.
+Select averaging area rescaling algorithm. (Deprecated)

@item bicublin
 Select bicubic scaling algorithm for the luma component, bilinear for
-chroma components.
+chroma components. (Deprecated)

@item gauss
-Select Gaussian rescaling algorithm.
+Select Gaussian rescaling algorithm. (Deprecated)

@item sinc
-Select sinc rescaling algorithm.
+Select sinc rescaling algorithm. (Deprecated)

@item lanczos
 Select Lanczos rescaling algorithm. The default width (alpha) is 3 and can be
-changed by setting @code{param0}.
+changed by setting @code{param0}. (Deprecated)

@item spline
-Select natural bicubic spline rescaling algorithm.
+Select natural bicubic spline rescaling algorithm. (Deprecated)

@item print_info
 Enable printing/debug logging.
@@ -55,7 +55,8 @@ sub get_formatting_function($$) {

 # determine texinfo version
 my $package_version = ff_get_conf('PACKAGE_VERSION');
-$package_version =~ s/\+dev$//;
+$package_version =~ s/\+nc$//;
+$package_version =~ s/\+?dev$//;
 my $program_version_num = version->declare($package_version)->numify;
 my $program_version_6_8 = $program_version_num >= 6.008000;

@@ -119,29 +120,8 @@ sub ffmpeg_heading_command($$$$$)
    }

    my $heading_level;
-    # node is used as heading if there is nothing else.
-    if ($cmdname eq 'node') {
-        if (!$output_unit or
-            (((!$output_unit->{'extra'}->{'section'}
-              and $output_unit->{'extra'}->{'node'}
-              and $output_unit->{'extra'}->{'node'} eq $command)
-             or
-             ((($output_unit->{'extra'}->{'unit_command'}
-                and $output_unit->{'extra'}->{'unit_command'} eq $command)
-               or
-               ($output_unit->{'unit_command'}
-                and $output_unit->{'unit_command'} eq $command))
-              and $command->{'extra'}
-              and not $command->{'extra'}->{'associated_section'}))
-             # bogus node may not have been normalized
-            and defined($command->{'extra'}->{'normalized'}))) {
-            if ($command->{'extra'}->{'normalized'} eq 'Top') {
-                $heading_level = 0;
-            } else {
-                $heading_level = 3;
-            }
-        }
-    } else {
+    # Never use node for heading
+    if ($cmdname ne 'node') {
        if (defined($command->{'extra'})
            and defined($command->{'extra'}->{'section_level'})) {
          $heading_level = $command->{'extra'}->{'section_level'};
@@ -153,58 +133,58 @@ sub ffmpeg_heading_command($$$$$)
        }
    }

-    my $heading = $self->command_text($command);
-    # $heading not defined may happen if the command is a @node, for example
-    # if there is an error in the node.
-    if (defined($heading) and $heading ne '' and defined($heading_level)) {
-
-        if ($root_commands{$cmdname}
-            and $sectioning_commands{$cmdname}) {
-            my $content_href = $self->command_contents_href($command, 'contents',
-                                                            $self->{'current_filename'});
-            if ($content_href) {
-                my $this_href = $content_href =~ s/^\#toc-/\#/r;
-                $heading .= '<span class="pull-right">'.
-                              '<a class="anchor hidden-xs" '.
-                                 "href=\"$this_href\" aria-hidden=\"true\">".
-            ($ENV{"FA_ICONS"} ? '<i class="fa fa-link"></i>'
-                              : '#').
-                              '</a> '.
-                              '<a class="anchor hidden-xs"'.
-                                 "href=\"$content_href\" aria-hidden=\"true\">".
-            ($ENV{"FA_ICONS"} ? '<i class="fa fa-navicon"></i>'
-                              : 'TOC').
-                              '</a>'.
-                            '</span>';
+    if (defined($heading_level)) {
+        my $heading = $self->command_text($command);
+        # empty heading corresponds to an empty @top
+        if ($heading ne '') {
+            if ($root_commands{$cmdname}
+                and $sectioning_commands{$cmdname}) {
+                my $content_href = $self->command_contents_href($command, 'contents',
+                                                                $self->{'current_filename'});
+                if ($content_href) {
+                    my $this_href = $content_href =~ s/^\#toc-/\#/r;
+                    $heading .= '<span class="pull-right">'.
+                                  '<a class="anchor hidden-xs" '.
+                                     "href=\"$this_href\" aria-hidden=\"true\">".
+                ($ENV{"FA_ICONS"} ? '<i class="fa fa-link"></i>'
+                                  : '#').
+                                  '</a> '.
+                                  '<a class="anchor hidden-xs"'.
+                                     "href=\"$content_href\" aria-hidden=\"true\">".
+                ($ENV{"FA_ICONS"} ? '<i class="fa fa-navicon"></i>'
+                                  : 'TOC').
+                                  '</a>'.
+                                '</span>';
+                }
            }
-        }
-
-        my $in_preformatted;
-        if ($program_version_num >= 7.001090) {
-          $in_preformatted = $self->in_preformatted_context();
-        } else {
-          $in_preformatted = $self->in_preformatted();
-        }
-        if ($in_preformatted) {
-            $result .= $heading."\n";
-        } else {
-            # if the level was changed, set the command name right
-            if ($cmdname ne 'node'
-                and $heading_level ne $Texinfo::Common::command_structuring_level{$cmdname}) {
-                $cmdname
-                    = $Texinfo::Common::level_to_structuring_command{$cmdname}->[$heading_level];
-            }
-            if ($program_version_num >= 7.000000) {
-                $result .= &{get_formatting_function($self,'format_heading_text')}($self,
-                     $cmdname, [$cmdname], $heading,
-                     $heading_level +$self->get_conf('CHAPTER_HEADER_LEVEL') -1,
-                     $heading_id, $command);

+            my $in_preformatted;
+            if ($program_version_num >= 7.001090) {
+              $in_preformatted = $self->in_preformatted_context();
            } else {
-              $result .= &{get_formatting_function($self,'format_heading_text')}(
-                        $self, $cmdname, $heading,
-                        $heading_level +
-                        $self->get_conf('CHAPTER_HEADER_LEVEL') - 1, $command);
+              $in_preformatted = $self->in_preformatted();
+            }
+            if ($in_preformatted) {
+                $result .= $heading."\n";
+            } else {
+                # if the level was changed, set the command name right
+                if ($cmdname ne 'node'
+                    and $heading_level ne $Texinfo::Common::command_structuring_level{$cmdname}) {
+                    $cmdname
+                        = $Texinfo::Common::level_to_structuring_command{$cmdname}->[$heading_level];
+                }
+                if ($program_version_num >= 7.000000) {
+                    $result .= &{get_formatting_function($self,'format_heading_text')}($self,
+                         $cmdname, [$cmdname], $heading,
+                         $heading_level +$self->get_conf('CHAPTER_HEADER_LEVEL') -1,
+                         $heading_id, $command);
+
+                } else {
+                  $result .= &{get_formatting_function($self,'format_heading_text')}(
+                            $self, $cmdname, $heading,
+                            $heading_level +
+                            $self->get_conf('CHAPTER_HEADER_LEVEL') - 1, $command);
+                }
            }
        }
    }
@@ -1,14 +1,17 @@
-The basis transforms used for FFT and various other derived functions are based
-on the following unrollings.
+# Transforms
+
+The basis transforms used for FFT and various other derived functions are based on the following unrollings.
 The functions can be easily adapted to double precision floats as well.

-# Parity permutation
+## Parity permutation
+
 The basis transforms described here all use the following permutation:

 ``` C
 void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
                                         int basis, int dual_stride);
 ```
+
 Parity means even and odd complex numbers will be split, e.g. the even
 coefficients will come first, after which the odd coefficients will be
 placed. For example, a 4-point transform's coefficients after reordering:
@@ -33,7 +36,8 @@ register or 0. This allows to reuse SSE functions as dual-transform
 functions in AVX mode.
 If length is smaller than basis/2 this function will not do anything.

-# 4-point FFT transform
+## 4-point FFT transform
+
 The only permutation this transform needs is to swap the `z[1]` and `z[2]`
 elements when performing an inverse transform, which in the assembly code is
 hardcoded with the function itself being templated and duplicated for each
@@ -80,7 +84,8 @@ static void fft4(FFTComplex *z)
 }
 ```

-# 8-point AVX FFT transform
+## 8-point AVX FFT transform
+
 Input must be pre-permuted using the parity lookup table, generated via
 `ff_tx_gen_split_radix_parity_revtab`.

@@ -193,7 +198,8 @@ This theme continues throughout the document. Note that in the actual assembly c
 the paths are interleaved to improve unit saturation and CPU dependency tracking, so
 to more clearly see them, you'll need to deinterleave the instructions.

-# 8-point SSE/ARM64 FFT transform
+## 8-point SSE/ARM64 FFT transform
+
 Input must be pre-permuted using the parity lookup table, generated via
 `ff_tx_gen_split_radix_parity_revtab`.

@@ -305,7 +311,8 @@ static void fft8(FFTComplex *z)
 Most functions here are highly tuned to use x86's addsub instruction to save on
 external sign mask loading.

-# 16-point AVX FFT transform
+## 16-point AVX FFT transform
+
 This version expects the output of the 8 and 4-point transforms to follow the
 even/odd convention established above.

@@ -445,7 +452,8 @@ static void fft16(FFTComplex *z)
 }
 ```

-# AVX split-radix synthesis
+## AVX split-radix synthesis
+
 To create larger transforms, the following unrolling of the C split-radix
 function is used.

@@ -705,8 +713,8 @@ beginning to overlap, particularly `[o1]` with `[0]` after the second iteration.
 To iterate further, set `z = &z[16]` via `z += 8` for the second iteration. After
 the 4th iteration, the layout resets, so repeat the same.

+## 15-point AVX FFT transform

-# 15-point AVX FFT transform
 The 15-point transform is based on the following unrolling. The input
 must be permuted via the following loop:

@@ -2,14 +2,6 @@
 # common bits used by all libraries
 #

-DEFAULT_X86ASMD=.dbg
-
-ifeq ($(DBG),1)
-X86ASMD=$(DEFAULT_X86ASMD)
-else
-X86ASMD=
-endif
-
 ifndef SUBDIR

 LINK = $(LD) $(1)
@@ -105,10 +97,6 @@ COMPILE_LASX = $(call COMPILE,CC,LASXFLAGS)
 %_host.o: %.c
 	$(COMPILE_HOSTC)

-%$(DEFAULT_X86ASMD).asm: %.asm
-	$(DEPX86ASM) $(X86ASMFLAGS) -M -o $@ $< > $(@:.asm=.d)
-	$(X86ASM) $(X86ASMFLAGS) -e $< | sed '/^%/d;/^$$/d;' > $@
-
 %.o: %.asm
 	$(COMPILE_X86ASM)
 	-$(if $(ASMSTRIPFLAGS), $(STRIP) $(ASMSTRIPFLAGS) $@)
@@ -197,7 +185,7 @@ endif
 clean::
 	$(RM) $(BIN2CEXE) $(CLEANSUFFIXES:%=ffbuild/%)

-%.c %.h %.pc %.ver %.version: TAG = GEN
+%.c %.h %.S %.pc %.ver %.version: TAG = GEN

 # Dummy rule to stop make trying to rebuild removed or renamed headers
 %.h %_template.c:
@@ -266,7 +254,7 @@ $(TOOLOBJS): | tools

 OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS))

-CLEANSUFFIXES     = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c  *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb
+CLEANSUFFIXES     = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.gen.c *.gen.S *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *~ *.ilk *.pdb
 LIBSUFFIXES       = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a

 define RULES
@@ -276,4 +264,4 @@ endef

 $(eval $(RULES))

-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d) $(HOBJS:.o=.d) $(SHLIBOBJS:.o=.d) $(STLIBOBJS:.o=.d) $(SPVOBJS:.spv.o=.d)) $(OBJS:.o=$(DEFAULT_X86ASMD).d)
+-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d) $(HOBJS:.o=.d) $(SHLIBOBJS:.o=.d) $(STLIBOBJS:.o=.d) $(SPVOBJS:.spv.o=.d))
@@ -1267,7 +1267,7 @@ unsigned stream_specifier_match(const StreamSpecifier *ss,
                break;
            }
        }
-        // fall-through
+        av_fallthrough;
    case STREAM_LIST_GROUP_IDX:
        if (ss->stream_list == STREAM_LIST_GROUP_IDX &&
            ss->list_id >= 0 && ss->list_id < s->nb_stream_groups)
@@ -253,7 +253,6 @@ void term_init(void)
 /* read a key without blocking */
 static int read_key(void)
 {
-    unsigned char ch = -1;
 #if HAVE_TERMIOS_H
    int n = 1;
    struct timeval tv;
@@ -265,6 +264,7 @@ static int read_key(void)
    tv.tv_usec = 0;
    n = select(1, &rfds, NULL, NULL, &tv);
    if (n > 0) {
+        unsigned char ch;
        n = read(0, &ch, 1);
        if (n == 1)
            return ch;
@@ -289,6 +289,7 @@ static int read_key(void)
        }
        //Read it
        if(nchars != 0) {
+            unsigned char ch;
            if (read(0, &ch, 1) == 1)
                return ch;
            return 0;
@@ -300,7 +301,7 @@ static int read_key(void)
    if(kbhit())
        return(getch());
 #endif
-    return ch;
+    return -1;
 }

 static int decode_interrupt_cb(void *ctx)
@@ -218,6 +218,8 @@ typedef struct OptionsContext {
    SpecifierOptList display_rotations;
    SpecifierOptList display_hflips;
    SpecifierOptList display_vflips;
+    SpecifierOptList mastering_displays;
+    SpecifierOptList content_lights;
    SpecifierOptList rc_overrides;
    SpecifierOptList intra_matrices;
    SpecifierOptList inter_matrices;
@@ -28,6 +28,7 @@
 #include "libavutil/display.h"
 #include "libavutil/error.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/mastering_display_metadata.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/parseutils.h"
@@ -68,6 +69,8 @@ typedef struct DemuxStream {
    int                      autorotate;
    int                      apply_cropping;
    int                      force_display_matrix;
+    int                      force_mastering_display;
+    int                      force_content_light;
    int                      drop_changed;


@@ -1248,6 +1251,125 @@ static int add_display_matrix_to_stream(const OptionsContext *o,
    return 0;
 }

+static int add_mastering_display_to_stream(const OptionsContext *o,
+                                           AVFormatContext *ctx, InputStream *ist)
+{
+    AVStream *st = ist->st;
+    DemuxStream *ds = ds_from_ist(ist);
+    AVMasteringDisplayMetadata *master_display;
+    AVPacketSideData *sd;
+    const char *p = NULL;
+    const int chroma_den = 50000; /* denominator given to every primary and white point value below */
+    const int luma_den = 10000;   /* denominator given to max/min luminance below */
+    size_t size;
+    int ret;
+    /* Apply the per-stream mastering_displays option, if one was given, as coded side data on ist->st. */
+    opt_match_per_stream_str(ist, &o->mastering_displays, ctx, st, &p);
+    /* p is still NULL unless the lookup above set it; in that case there is nothing to do. */
+    if (!p)
+        return 0;
+    /* size receives the allocation size that is later handed to av_packet_side_data_add(). */
+    master_display = av_mastering_display_metadata_alloc_size(&size);
+    if (!master_display)
+        return AVERROR(ENOMEM);
+    /* Read ten unsigned numerators: G -> display_primaries[1], B -> [2], R -> [0], then WP, then L(max,min). */
+    ret = sscanf(p,
+                 "G(%u,%u)B(%u,%u)R(%u,%u)WP(%u,%u)L(%u,%u)",
+                 (unsigned*)&master_display->display_primaries[1][0].num,
+                 (unsigned*)&master_display->display_primaries[1][1].num,
+                 (unsigned*)&master_display->display_primaries[2][0].num,
+                 (unsigned*)&master_display->display_primaries[2][1].num,
+                 (unsigned*)&master_display->display_primaries[0][0].num,
+                 (unsigned*)&master_display->display_primaries[0][1].num,
+                 (unsigned*)&master_display->white_point[0].num,
+                 (unsigned*)&master_display->white_point[1].num,
+                 (unsigned*)&master_display->max_luminance.num,
+                 (unsigned*)&master_display->min_luminance.num);
+    /* Reject short input, any primary/white point above UINT16_MAX, luminance above INT_MAX, or min > max. */
+    if (ret != 10 ||
+        (unsigned)(master_display->display_primaries[1][0].num | master_display->display_primaries[1][1].num |
+                   master_display->display_primaries[2][0].num | master_display->display_primaries[2][1].num |
+                   master_display->display_primaries[0][0].num | master_display->display_primaries[0][1].num |
+                   master_display->white_point[0].num | master_display->white_point[1].num) > UINT16_MAX || /* OR-ing exposes any bit above bit 15 */
+        (unsigned)(master_display->max_luminance.num  | master_display->min_luminance.num) > INT_MAX ||
+                   master_display->min_luminance.num  > master_display->max_luminance.num) {
+        av_freep(&master_display);
+        av_log(ist, AV_LOG_ERROR, "Failed to parse mastering display option\n");
+        return AVERROR(EINVAL);
+    }
+    /* The numerators are validated; fill in the fixed denominators. */
+    master_display->display_primaries[1][0].den = chroma_den;
+    master_display->display_primaries[1][1].den = chroma_den;
+    master_display->display_primaries[2][0].den = chroma_den;
+    master_display->display_primaries[2][1].den = chroma_den;
+    master_display->display_primaries[0][0].den = chroma_den;
+    master_display->display_primaries[0][1].den = chroma_den;
+    master_display->white_point[0].den = chroma_den;
+    master_display->white_point[1].den = chroma_den;
+    master_display->max_luminance.den = luma_den;
+    master_display->min_luminance.den = luma_den;
+    /* Both groups of fields were parsed, so mark both as present. */
+    master_display->has_primaries = 1;
+    master_display->has_luminance = 1;
+    /* Attach the struct to the stream's coded side data; it is freed here only if that fails. */
+    sd = av_packet_side_data_add(&st->codecpar->coded_side_data,
+                                 &st->codecpar->nb_coded_side_data,
+                                 AV_PKT_DATA_MASTERING_DISPLAY_METADATA,
+                                 (uint8_t *)master_display, size, 0);
+    if (!sd) {
+        av_freep(&master_display);
+        return AVERROR(ENOMEM);
+    }
+    /* ist_add() checks this flag to append "mastering_display_metadata" to side_data_prefer_packet. */
+    ds->force_mastering_display = 1;
+
+    return 0;
+}
+
+static int add_content_light_to_stream(const OptionsContext *o,
+                                       AVFormatContext *ctx, InputStream *ist)
+{
+    AVStream *st = ist->st;
+    DemuxStream *ds = ds_from_ist(ist);
+    AVContentLightMetadata *cll;
+    AVPacketSideData *sd;
+    const char *p = NULL;
+    size_t size;
+    int ret;
+    /* Apply the per-stream content_lights option, if one was given, as coded side data on ist->st. */
+    opt_match_per_stream_str(ist, &o->content_lights, ctx, st, &p);
+    /* p is still NULL unless the lookup above set it; in that case there is nothing to do. */
+    if (!p)
+        return 0;
+    /* size receives the allocation size that is later handed to av_packet_side_data_add(). */
+    cll = av_content_light_metadata_alloc(&size);
+    if (!cll)
+        return AVERROR(ENOMEM);
+    /* Expected syntax is "MaxCLL,MaxFALL", both parsed as unsigned integers. */
+    ret = sscanf(p, "%u,%u",
+                 (unsigned*)&cll->MaxCLL,
+                 (unsigned*)&cll->MaxFALL);
+    /* Reject short input or any value above UINT16_MAX (OR-ing exposes any bit above bit 15). */
+    if (ret != 2 || (unsigned)(cll->MaxCLL | cll->MaxFALL) > UINT16_MAX) {
+        av_freep(&cll);
+        av_log(ist, AV_LOG_ERROR, "Failed to parse content light option\n");
+        return AVERROR(EINVAL);
+    }
+    /* Attach the struct to the stream's coded side data; it is freed here only if that fails. */
+    sd = av_packet_side_data_add(&st->codecpar->coded_side_data,
+                                 &st->codecpar->nb_coded_side_data,
+                                 AV_PKT_DATA_CONTENT_LIGHT_LEVEL,
+                                 (uint8_t *)cll, size, 0);
+    if (!sd) {
+        av_freep(&cll);
+        return AVERROR(ENOMEM);
+    }
+    /* ist_add() checks this flag to append "content_light_level" to side_data_prefer_packet. */
+    ds->force_content_light = 1;
+
+    return 0;
+}
+
 static const char *input_stream_item_name(void *obj)
 {
    const DemuxStream *ds = obj;
@@ -1301,6 +1423,7 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona
    const char *bsfs = NULL;
    char *next;
    const char *discard_str = NULL;
+    AVBPrint bp;
    int ret;

    ds  = demux_stream_alloc(d, st);
@@ -1366,6 +1489,14 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona
        if (ret < 0)
            return ret;

+        ret = add_mastering_display_to_stream(o, ic, ist);
+        if (ret < 0)
+            return ret;
+
+        ret = add_content_light_to_stream(o, ic, ist);
+        if (ret < 0)
+            return ret;
+
        opt_match_per_stream_str(ist, &o->hwaccels, ic, st, &hwaccel);
        opt_match_per_stream_str(ist, &o->hwaccel_output_formats, ic, st,
                                       &hwaccel_output_format);
@@ -1483,15 +1614,26 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona
    av_dict_set_int(&ds->decoder_opts, "apply_cropping",
                    ds->apply_cropping && ds->apply_cropping != CROP_CONTAINER, 0);

+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
    if (ds->force_display_matrix) {
-        char buf[32];
        if (av_dict_get(ds->decoder_opts, "side_data_prefer_packet", NULL, 0))
-            buf[0] = ',';
-        else
-            buf[0] = '\0';
-        av_strlcat(buf, "displaymatrix", sizeof(buf));
-        av_dict_set(&ds->decoder_opts, "side_data_prefer_packet", buf, AV_DICT_APPEND);
+            av_bprintf(&bp, ",");
+        av_bprintf(&bp, "displaymatrix");
    }
+    if (ds->force_mastering_display) {
+        if (bp.len || av_dict_get(ds->decoder_opts, "side_data_prefer_packet", NULL, 0))
+            av_bprintf(&bp, ",");
+        av_bprintf(&bp, "mastering_display_metadata");
+    }
+    if (ds->force_content_light) {
+        if (bp.len || av_dict_get(ds->decoder_opts, "side_data_prefer_packet", NULL, 0))
+            av_bprintf(&bp, ",");
+        av_bprintf(&bp, "content_light_level");
+    }
+    if (bp.len)
+        av_dict_set(&ds->decoder_opts, "side_data_prefer_packet", bp.str, AV_DICT_APPEND);
+    av_bprint_finalize(&bp, NULL);
+
    /* Attached pics are sparse, therefore we would not want to delay their decoding
     * till EOF. */
    if (ist->st->disposition & AV_DISPOSITION_ATTACHED_PIC)
@@ -227,6 +227,9 @@ int enc_open(void *opaque, const AVFrame *frame)
                   frame->ch_layout.nb_channels > 0);
        enc_ctx->sample_fmt     = frame->format;
        enc_ctx->sample_rate    = frame->sample_rate;
+        if (!enc_ctx->frame_size && (!(enc->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) ||
+                                      (enc_ctx->flags2 & AV_CODEC_FLAG2_FIXED_FRAME_SIZE)))
+            enc_ctx->frame_size = frame->nb_samples;
        ret = av_channel_layout_copy(&enc_ctx->ch_layout, &frame->ch_layout);
        if (ret < 0)
            return ret;
@@ -27,6 +27,7 @@
 #include "libavfilter/buffersink.h"
 #include "libavfilter/buffersrc.h"

+#include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 #include "libavutil/bprint.h"
@@ -1681,7 +1682,10 @@ static int configure_output_video_filter(FilterGraphPriv *fgp, AVFilterGraph *gr
        av_frame_side_data_remove(&ofp->side_data, &ofp->nb_side_data, AV_FRAME_DATA_DISPLAYMATRIX);
    }

-    if ((ofp->width || ofp->height) && (ofp->flags & OFILTER_FLAG_AUTOSCALE)) {
+    if ((ofp->width || ofp->height) && (ofp->flags & OFILTER_FLAG_AUTOSCALE) &&
+        // skip inserting the auto-scale filter when the output is a hardware pixel format
+        !(ofp->format != AV_PIX_FMT_NONE &&
+          av_pix_fmt_desc_get(ofp->format)->flags & AV_PIX_FMT_FLAG_HWACCEL)) {
        char args[255];
        AVFilterContext *filter;
        const AVDictionaryEntry *e = NULL;
@@ -2569,6 +2573,7 @@ static void video_sync_process(OutputFilterPriv *ofp, AVFrame *frame,
            delta0 = 0;
            ofp->next_pts = llrint(sync_ipts);
        }
+        av_fallthrough;
    case VSYNC_CFR:
        // FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c
        if (frame_drop_threshold && delta < frame_drop_threshold && fps->frame_number) {
@@ -2836,8 +2841,10 @@ static int fg_output_step(OutputFilterPriv *ofp, FilterGraphThread *fgt,
    if (!fgt->got_frame) {
        ret = clone_side_data(&fd->side_data, &fd->nb_side_data,
                              ofp->side_data, ofp->nb_side_data, 0);
-        if (ret < 0)
+        if (ret < 0) {
+            av_frame_unref(frame);
            return ret;
+        }
    }

    fd->wallclock[LATENCY_PROBE_FILTER_POST] = av_gettime_relative();
@@ -3181,7 +3188,7 @@ static int send_frame(FilterGraph *fg, FilterGraphThread *fgt,
                const char *color_space_name = av_color_space_name(frame->colorspace);
                const char *color_range_name = av_color_range_name(frame->color_range);
                const char *alpha_mode = av_alpha_mode_name(frame->alpha_mode);
-                av_bprintf(&reason, "video parameters changed to %s(%s, %s), %dx%d, %s alpha,",
+                av_bprintf(&reason, "video parameters changed to %s(%s, %s), %dx%d, %s alpha, ",
                        unknown_if_null(pixel_format_name), unknown_if_null(color_range_name),
                        unknown_if_null(color_space_name), frame->width, frame->height,
                        unknown_if_null(alpha_mode));
@@ -3337,8 +3344,6 @@ static int filter_thread(void *arg)

        o = (intptr_t)fgt.frame->opaque;

-        o = (intptr_t)fgt.frame->opaque;
-
        // message on the control stream
        if (input_idx == fg->nb_inputs) {
            FilterCommand *fc;
@@ -1531,6 +1531,8 @@ static int ost_add(Muxer *mux, const OptionsContext *o, enum AVMediaType type,

    if (oc->oformat->flags & AVFMT_GLOBALHEADER && ost->enc)
        ost->enc->enc_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+    if (oc->oformat->flags & AVFMT_FIXED_FRAMESIZE && ost->enc)
+        ost->enc->enc_ctx->flags2 |= AV_CODEC_FLAG2_FIXED_FRAME_SIZE;

    opt_match_per_stream_int(ost, &o->copy_initial_nonkeyframes,
                             oc, st, &ms->copy_initial_nonkeyframes);
@@ -2127,7 +2129,8 @@ static int setup_sync_queues(Muxer *mux, AVFormatContext *oc,
        nb_interleaved += IS_INTERLEAVED(type);
        nb_av_enc      += IS_AV_ENC(ost, type);
        nb_audio_fs    += (ost->enc && type == AVMEDIA_TYPE_AUDIO &&
-                           !(ost->enc->enc_ctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE));
+                           (!(ost->enc->enc_ctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) ||
+                            (ost->enc->enc_ctx->flags2 & AV_CODEC_FLAG2_FIXED_FRAME_SIZE)));

        limit_frames        |=  ms->max_frames < INT64_MAX;
        limit_frames_av_enc |= (ms->max_frames < INT64_MAX) && IS_AV_ENC(ost, type);
@@ -2552,6 +2555,8 @@ static int of_map_group(Muxer *mux, AVDictionary **dict, AVBPrint *bp, const cha
        }
        break;
    }
+    case AV_STREAM_GROUP_PARAMS_LCEVC:
+        break;
    default:
        av_log(mux, AV_LOG_ERROR, "Unsupported mapped group type %d.\n", stg->type);
        ret = AVERROR(EINVAL);
@@ -2574,6 +2579,8 @@ static int of_parse_group_token(Muxer *mux, const char *token, char *ptr)
                { .i64 = AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT },    .unit = "type" },
            { "iamf_mix_presentation", NULL, 0, AV_OPT_TYPE_CONST,
                { .i64 = AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION }, .unit = "type" },
+            { "lcevc", NULL, 0, AV_OPT_TYPE_CONST,
+                { .i64 = AV_STREAM_GROUP_PARAMS_LCEVC }, .unit = "type" },
        { NULL },
    };
    const AVClass class = {
@@ -2648,6 +2655,10 @@ static int of_parse_group_token(Muxer *mux, const char *token, char *ptr)
        ret = avformat_stream_group_add_stream(stg, oc->streams[idx]);
        if (ret < 0)
            goto end;
+        OutputStream *ost = mux->of.streams[idx];
+        if (ost->enc && (type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT ||
+                         type == AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION))
+            ost->enc->enc_ctx->flags2 |= AV_CODEC_FLAG2_FIXED_FRAME_SIZE;
    }
    while (e = av_dict_get(dict, "stg", e, 0)) {
        char *endptr;
@@ -2672,8 +2683,6 @@ static int of_parse_group_token(Muxer *mux, const char *token, char *ptr)
        ret = of_parse_iamf_submixes(mux, stg, ptr);
        break;
    default:
-        av_log(mux, AV_LOG_FATAL, "Unknown group type %d.\n", type);
-        ret = AVERROR(EINVAL);
        break;
    }

@@ -645,6 +645,9 @@ static int opt_map(void *optctx, const char *opt, const char *arg)
            for (i = 0; i < o->nb_stream_maps; i++) {
                m = &o->stream_maps[i];
                if (file_idx == m->file_index &&
+                    !m->linklabel &&
+                    m->stream_index >= 0 &&
+                    m->stream_index < input_files[m->file_index]->nb_streams &&
                    stream_specifier_match(&ss,
                                           input_files[m->file_index]->ctx,
                                           input_files[m->file_index]->ctx->streams[m->stream_index],
@@ -1940,6 +1943,12 @@ const OptionDef options[] = {
        { .off = OFFSET(display_vflips) },
        "set display vertical flip for stream(s) "
        "(overrides any display rotation if it is not set)"},
+    { "mastering_display",          OPT_TYPE_STRING, OPT_VIDEO | OPT_PERSTREAM | OPT_INPUT | OPT_EXPERT,
+        { .off = OFFSET(mastering_displays) },
+        "set SMPTE2084 mastering display color volume info" },
+    { "content_light",              OPT_TYPE_STRING, OPT_VIDEO | OPT_PERSTREAM | OPT_INPUT | OPT_EXPERT,
+        { .off = OFFSET(content_lights) },
+        "set SMPTE2084 Max CLL and Max FALL values" },
    { "vn",                         OPT_TYPE_BOOL,   OPT_VIDEO | OPT_OFFSET | OPT_INPUT | OPT_OUTPUT,
        { .off = OFFSET(video_disable) },
        "disable video" },
@@ -435,7 +435,7 @@ static void task_init(Scheduler *sch, SchTask *task, enum SchedulerNodeType type
    task->func_arg  = func_arg;
 }

-static int64_t trailing_dts(const Scheduler *sch, int count_finished)
+static int64_t trailing_dts(const Scheduler *sch)
 {
    int64_t min_dts = INT64_MAX;

@@ -445,7 +445,7 @@ static int64_t trailing_dts(const Scheduler *sch, int count_finished)
        for (unsigned j = 0; j < mux->nb_streams; j++) {
            const SchMuxStream *ms = &mux->streams[j];

-            if (ms->source_finished && !count_finished)
+            if (ms->source_finished)
                continue;
            if (ms->last_dts == AV_NOPTS_VALUE)
                return AV_NOPTS_VALUE;
@@ -457,6 +457,26 @@ static int64_t trailing_dts(const Scheduler *sch, int count_finished)
    return min_dts == INT64_MAX ? AV_NOPTS_VALUE : min_dts;
 }

+static int64_t progressing_dts(const Scheduler *sch, int count_finished)
+{
+    int64_t max_dts = INT64_MIN; /* sentinel: no stream has contributed a timestamp yet */
+    /* Find the largest last_dts over every stream of every muxer. */
+    for (unsigned i = 0; i < sch->nb_mux; i++) {
+        const SchMux *mux = &sch->mux[i];
+
+        for (unsigned j = 0; j < mux->nb_streams; j++) {
+            const SchMuxStream *ms = &mux->streams[j];
+            /* Streams whose source has finished only count when count_finished is non-zero. */
+            if (ms->source_finished && !count_finished)
+                continue;
+            if (ms->last_dts != AV_NOPTS_VALUE) /* streams with no timestamp are ignored, not fatal */
+                max_dts = FFMAX(max_dts, ms->last_dts);
+        }
+    }
+    /* Sentinel untouched means nothing contributed: report AV_NOPTS_VALUE. */
+    return max_dts == INT64_MIN ? AV_NOPTS_VALUE : max_dts;
+}
+
 void sch_remove_filtergraph(Scheduler *sch, int idx)
 {
    SchFilterGraph *fg = &sch->filters[idx];
@@ -1399,9 +1419,9 @@ static void schedule_update_locked(Scheduler *sch)
    if (atomic_load(&sch->terminate))
        return;

-    dts = trailing_dts(sch, 0);
+    dts = trailing_dts(sch);

-    atomic_store(&sch->last_dts, dts);
+    atomic_store(&sch->last_dts, progressing_dts(sch, 0));

    // initialize our internal state
    for (unsigned type = 0; type < 2; type++)
@@ -2768,7 +2788,7 @@ int sch_stop(Scheduler *sch, int64_t *finish_ts)
    }

    if (finish_ts)
-        *finish_ts = trailing_dts(sch, 1);
+        *finish_ts = progressing_dts(sch, 1);

    sch->state = SCH_STATE_STOPPED;

@@ -3007,6 +3007,7 @@ static int read_thread(void *arg)
        // initial metadata as update.
        st->event_flags &= ~AVSTREAM_EVENT_FLAG_METADATA_UPDATED;
    }
+    ic->event_flags &= ~AVFMT_EVENT_FLAG_METADATA_UPDATED;
    for (i = 0; i < AVMEDIA_TYPE_NB; i++) {
        if (wanted_stream_spec[i] && st_index[i] == -1) {
            av_log(NULL, AV_LOG_ERROR, "Stream specifier %s does not match any %s stream\n", wanted_stream_spec[i], av_get_media_type_string(i));
@@ -3175,16 +3176,24 @@ static int read_thread(void *arg)
            is->eof = 0;
        }

-        if (show_status && ic->streams[pkt->stream_index]->event_flags &
-            AVSTREAM_EVENT_FLAG_METADATA_UPDATED) {
-            fprintf(stderr, "\x1b[2K\r");
-            snprintf(metadata_description,
-                     sizeof(metadata_description),
-                     "\r  New metadata for stream %d",
-                     pkt->stream_index);
-            dump_dictionary(NULL, ic->streams[pkt->stream_index]->metadata,
-                               metadata_description, "    ", AV_LOG_INFO);
+        if (show_status) {
+            if (ic->event_flags & AVFMT_EVENT_FLAG_METADATA_UPDATED) {
+                fprintf(stderr, "\x1b[2K\r");
+                dump_dictionary(NULL, ic->metadata,
+                                "\r  New metadata", "    ", AV_LOG_INFO);
+            }
+            if (ic->streams[pkt->stream_index]->event_flags &
+                AVSTREAM_EVENT_FLAG_METADATA_UPDATED) {
+                fprintf(stderr, "\x1b[2K\r");
+                snprintf(metadata_description,
+                         sizeof(metadata_description),
+                         "\r  New metadata for stream %d",
+                         pkt->stream_index);
+                dump_dictionary(NULL, ic->streams[pkt->stream_index]->metadata,
+                                   metadata_description, "    ", AV_LOG_INFO);
+            }
        }
+        ic->event_flags &= ~AVFMT_EVENT_FLAG_METADATA_UPDATED;
        ic->streams[pkt->stream_index]->event_flags &= ~AVSTREAM_EVENT_FLAG_METADATA_UPDATED;

        /* check if packet is in play range specified by user, then queue, otherwise discard */
@@ -43,6 +43,7 @@

 #include "libavutil/bprint.h"
 #include "libavutil/mem.h"
+#include "libavutil/internal.h"

 #endif

@@ -115,14 +116,22 @@ static void hwctx_lock_queue(void *priv, uint32_t qf, uint32_t qidx)
 {
    AVHWDeviceContext *avhwctx = priv;
    const AVVulkanDeviceContext *hwctx = avhwctx->hwctx;
+#if FF_API_VULKAN_SYNC_QUEUES
+FF_DISABLE_DEPRECATION_WARNINGS
    hwctx->lock_queue(avhwctx, qf, qidx);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 }

 static void hwctx_unlock_queue(void *priv, uint32_t qf, uint32_t qidx)
 {
    AVHWDeviceContext *avhwctx = priv;
    const AVVulkanDeviceContext *hwctx = avhwctx->hwctx;
+#if FF_API_VULKAN_SYNC_QUEUES
+FF_DISABLE_DEPRECATION_WARNINGS
    hwctx->unlock_queue(avhwctx, qf, qidx);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 }

 static int add_instance_extension(const char **ext, unsigned num_ext,
@@ -283,7 +292,11 @@ static void placebo_lock_queue(struct AVHWDeviceContext *dev_ctx,
 {
    RendererContext *ctx = dev_ctx->user_opaque;
    pl_vulkan vk = ctx->placebo_vulkan;
+#if FF_API_VULKAN_SYNC_QUEUES
+FF_DISABLE_DEPRECATION_WARNINGS
    vk->lock_queue(vk, queue_family, index);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 }

 static void placebo_unlock_queue(struct AVHWDeviceContext *dev_ctx,
@@ -292,7 +305,11 @@ static void placebo_unlock_queue(struct AVHWDeviceContext *dev_ctx,
 {
    RendererContext *ctx = dev_ctx->user_opaque;
    pl_vulkan vk = ctx->placebo_vulkan;
+#if FF_API_VULKAN_SYNC_QUEUES
+FF_DISABLE_DEPRECATION_WARNINGS
    vk->unlock_queue(vk, queue_family, index);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 }

 static int get_decode_queue(VkRenderer *renderer, int *index, int *count)
@@ -386,8 +403,12 @@ static int create_vk_by_placebo(VkRenderer *renderer,
    device_ctx->user_opaque = ctx;

    vk_dev_ctx = device_ctx->hwctx;
+#if FF_API_VULKAN_SYNC_QUEUES
+FF_DISABLE_DEPRECATION_WARNINGS
    vk_dev_ctx->lock_queue = placebo_lock_queue;
    vk_dev_ctx->unlock_queue = placebo_unlock_queue;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif

    vk_dev_ctx->get_proc_addr = ctx->placebo_instance->get_proc_addr;

@@ -215,6 +215,8 @@ typedef enum {
    SECTION_ID_STREAM_GROUP_SUBPIECE,
    SECTION_ID_STREAM_GROUP_BLOCKS,
    SECTION_ID_STREAM_GROUP_BLOCK,
+    SECTION_ID_STREAM_GROUP_SIDE_DATA_LIST,
+    SECTION_ID_STREAM_GROUP_SIDE_DATA,
    SECTION_ID_STREAM_GROUP_STREAMS,
    SECTION_ID_STREAM_GROUP_STREAM,
    SECTION_ID_STREAM_GROUP_DISPOSITION,
@@ -298,7 +300,7 @@ static const AVTextFormatSection sections[] = {
    [SECTION_ID_STREAM_GROUP_STREAM_TAGS] =        { SECTION_ID_STREAM_GROUP_STREAM_TAGS, "tags", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS, { -1 }, .element_name = "tag", .unique_name = "stream_group_stream_tags" },
    [SECTION_ID_STREAM_GROUP] =                    { SECTION_ID_STREAM_GROUP, "stream_group", 0, { SECTION_ID_STREAM_GROUP_TAGS, SECTION_ID_STREAM_GROUP_DISPOSITION, SECTION_ID_STREAM_GROUP_COMPONENTS, SECTION_ID_STREAM_GROUP_STREAMS, -1 } },
    [SECTION_ID_STREAM_GROUP_COMPONENTS] =         { SECTION_ID_STREAM_GROUP_COMPONENTS, "components", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_COMPONENT, -1 }, .element_name = "component", .unique_name = "stream_group_components" },
-    [SECTION_ID_STREAM_GROUP_COMPONENT] =          { SECTION_ID_STREAM_GROUP_COMPONENT, "component", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS|AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE, { SECTION_ID_STREAM_GROUP_SUBCOMPONENTS, -1 }, .unique_name = "stream_group_component", .element_name = "component_entry", .get_type = get_stream_group_type },
+    [SECTION_ID_STREAM_GROUP_COMPONENT] =          { SECTION_ID_STREAM_GROUP_COMPONENT, "component", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS|AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE, { SECTION_ID_STREAM_GROUP_SIDE_DATA_LIST, SECTION_ID_STREAM_GROUP_SUBCOMPONENTS, -1 }, .unique_name = "stream_group_component", .element_name = "component_entry", .get_type = get_stream_group_type },
    [SECTION_ID_STREAM_GROUP_SUBCOMPONENTS] =      { SECTION_ID_STREAM_GROUP_SUBCOMPONENTS, "subcomponents", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_SUBCOMPONENT, -1 }, .element_name = "component" },
    [SECTION_ID_STREAM_GROUP_SUBCOMPONENT] =       { SECTION_ID_STREAM_GROUP_SUBCOMPONENT, "subcomponent", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS|AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE, { SECTION_ID_STREAM_GROUP_PIECES, -1 }, .element_name = "subcomponent_entry", .get_type = get_raw_string_type },
    [SECTION_ID_STREAM_GROUP_PIECES] =             { SECTION_ID_STREAM_GROUP_PIECES, "pieces", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_PIECE, -1 }, .element_name = "piece", .unique_name = "stream_group_pieces" },
@@ -307,6 +309,8 @@ static const AVTextFormatSection sections[] = {
    [SECTION_ID_STREAM_GROUP_SUBPIECE] =           { SECTION_ID_STREAM_GROUP_SUBPIECE, "subpiece", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS|AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE, { SECTION_ID_STREAM_GROUP_BLOCKS, -1 }, .element_name = "subpiece_entry", .get_type = get_raw_string_type },
    [SECTION_ID_STREAM_GROUP_BLOCKS] =             { SECTION_ID_STREAM_GROUP_BLOCKS, "blocks", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_BLOCK, -1 }, .element_name = "block" },
    [SECTION_ID_STREAM_GROUP_BLOCK] =              { SECTION_ID_STREAM_GROUP_BLOCK, "block", AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS|AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE, { -1 }, .element_name = "block_entry", .get_type = get_raw_string_type },
+    [SECTION_ID_STREAM_GROUP_SIDE_DATA_LIST] =     { SECTION_ID_STREAM_GROUP_SIDE_DATA_LIST, "side_data_list", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_SIDE_DATA, -1 }, .element_name = "side_data", .unique_name = "stream_group_side_data_list" },
+    [SECTION_ID_STREAM_GROUP_SIDE_DATA] =          { SECTION_ID_STREAM_GROUP_SIDE_DATA, "side_data", AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE|AV_TEXTFORMAT_SECTION_FLAG_HAS_VARIABLE_FIELDS, { -1 }, .unique_name = "stream_group_side_data", .element_name = "side_datum", .get_type = get_packet_side_data_type },
    [SECTION_ID_STREAM_GROUP_STREAMS] =            { SECTION_ID_STREAM_GROUP_STREAMS, "streams", AV_TEXTFORMAT_SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_GROUP_STREAM, -1 }, .unique_name = "stream_group_streams" },
    [SECTION_ID_STREAM_GROUP_STREAM] =             { SECTION_ID_STREAM_GROUP_STREAM, "stream", 0, { SECTION_ID_STREAM_GROUP_STREAM_DISPOSITION, SECTION_ID_STREAM_GROUP_STREAM_TAGS, -1 }, .unique_name = "stream_group_stream" },
    [SECTION_ID_STREAM_GROUP_DISPOSITION] =        { SECTION_ID_STREAM_GROUP_DISPOSITION, "disposition", 0, { -1 }, .unique_name = "stream_group_disposition" },
@@ -345,7 +349,7 @@ static const char unit_hertz_str[]          = "Hz"   ;
 static const char unit_byte_str[]           = "byte" ;
 static const char unit_bit_per_second_str[] = "bit/s";

-static int nb_streams;
+static unsigned int nb_streams;
 static uint64_t *nb_streams_packets;
 static uint64_t *nb_streams_frames;
 static int *selected_streams;
@@ -432,8 +436,8 @@ static void log_callback(void *ptr, int level, const char *fmt, va_list vl)

 #define print_list_fmt(k, f, n, m, ...) do {    \
    av_bprint_clear(&pbuf);                     \
-    for (int idx = 0; idx < n; idx++) {         \
-        for (int idx2 = 0; idx2 < m; idx2++) {  \
+    for (unsigned int idx = 0; idx < n; idx++) {         \
+        for (unsigned int idx2 = 0; idx2 < m; idx2++) {  \
            if (idx > 0 || idx2 > 0)            \
                av_bprint_chars(&pbuf, ' ', 1); \
            av_bprintf(&pbuf, f, __VA_ARGS__);  \
@@ -801,6 +805,62 @@ static void print_dynamic_hdr10_plus(AVTextFormatContext *tfc, const AVDynamicHD
    }
 }

+static void print_dynamic_hdr_smpte2094_app5(AVTextFormatContext *tfc, const AVDynamicHDRSmpte2094App5 *metadata)
+{ /* Print the fields of metadata with print_int(); optional groups are gated by the flags tested below. */
+    if (!metadata) /* nothing to print without metadata */
+        return;
+    print_int("application_version", metadata->application_version);
+    print_int("minimum_application_version", metadata->minimum_application_version);
+    print_int("has_custom_hdr_reference_white_flag", metadata->has_custom_hdr_reference_white_flag);
+    print_int("has_adaptive_tone_map_flag", metadata->has_adaptive_tone_map_flag);
+
+    if (metadata->has_custom_hdr_reference_white_flag) /* value is printed only when its flag is set */
+        print_int("hdr_reference_white", metadata->hdr_reference_white);
+
+    if (!metadata->has_adaptive_tone_map_flag) /* everything below is printed only when this flag is set */
+        return;
+
+    print_int("baseline_hdr_headroom", metadata->baseline_hdr_headroom);
+    print_int("use_reference_white_tone_mapping_flag", metadata->use_reference_white_tone_mapping_flag);
+
+    if (metadata->use_reference_white_tone_mapping_flag) /* when set, none of the remaining fields are printed */
+        return;
+
+    print_int("num_alternate_images", metadata->num_alternate_images);
+    print_int("gain_application_space_chromaticities_flag", metadata->gain_application_space_chromaticities_flag);
+    print_int("has_common_component_mix_params_flag", metadata->has_common_component_mix_params_flag);
+    print_int("has_common_curve_params_flag", metadata->has_common_curve_params_flag);
+
+    if (metadata->gain_application_space_chromaticities_flag == 3) { /* the 8 values are printed only for flag value 3 */
+        for (int i = 0; i < 8; i++)
+            print_int("gain_application_space_chromaticities", metadata->gain_application_space_chromaticities[i]);
+    }
+
+    for (int a = 0; a < metadata->num_alternate_images; a++) { /* one group of fields per alternate image */
+        print_int("alternate_hdr_headroom", metadata->alternate_hdr_headrooms[a]);
+
+        print_int("component_mixing_type", metadata->component_mixing_type[a]);
+        if (metadata->component_mixing_type[a] == 3) { /* the 6 coefficient flags/values are printed only for type 3 */
+            for (int k = 0; k < 6; k++) {
+                print_int("has_component_mixing_coefficient_flag", metadata->has_component_mixing_coefficient_flag[a][k]);
+                if (metadata->has_component_mixing_coefficient_flag[a][k]) /* coefficient is printed only when its flag is set */
+                    print_int("component_mixing_coefficient", metadata->component_mixing_coefficient[a][k]);
+            }
+        }
+
+        print_int("gain_curve_num_control_points_minus_1", metadata->gain_curve_num_control_points_minus_1[a]);
+        print_int("gain_curve_use_pchip_slope_flag", metadata->gain_curve_use_pchip_slope_flag[a]);
+        for (int c = 0; c <= metadata->gain_curve_num_control_points_minus_1[a]; c++) /* inclusive bound: the field holds the count minus 1 */
+            print_int("gain_curve_control_point_x", metadata->gain_curve_control_points_x[a][c]);
+        for (int c = 0; c <= metadata->gain_curve_num_control_points_minus_1[a]; c++) /* same inclusive bound for the y values */
+            print_int("gain_curve_control_point_y", metadata->gain_curve_control_points_y[a][c]);
+        if (!metadata->gain_curve_use_pchip_slope_flag[a]) { /* theta values are printed only when the pchip slope flag is clear */
+            for (int c = 0; c <= metadata->gain_curve_num_control_points_minus_1[a]; c++)
+                print_int("gain_curve_control_point_theta", metadata->gain_curve_control_points_theta[a][c]);
+        }
+    }
+}
+
 static void print_dynamic_hdr_vivid(AVTextFormatContext *tfc, const AVDynamicHDRVivid *metadata)
 {
    if (!metadata)
@@ -942,7 +1002,7 @@ static void print_film_grain_params(AVTextFormatContext *tfc,
            avtext_print_section_footer(tfc);
        }

-        for (int uv = 0; uv < 2; uv++) {
+        for (unsigned uv = 0; uv < 2; uv++) {
            if (!aom->num_uv_points[uv] && !aom->chroma_scaling_from_luma)
                continue;

@@ -1008,7 +1068,8 @@ static void print_film_grain_params(AVTextFormatContext *tfc,
 }

 static void print_pkt_side_data(AVTextFormatContext *tfc,
-                                AVCodecParameters *par,
+                                int width,
+                                int height,
                                const AVPacketSideData *sd,
                                SectionID id_data)
 {
@@ -1034,7 +1095,7 @@ static void print_pkt_side_data(AVTextFormatContext *tfc,
            print_int("padding", spherical->padding);
        } else if (spherical->projection == AV_SPHERICAL_EQUIRECTANGULAR_TILE) {
            size_t l, t, r, b;
-            av_spherical_tile_bounds(spherical, par->width, par->height,
+            av_spherical_tile_bounds(spherical, width, height,
                                     &l, &t, &r, &b);
            print_int("bound_left", l);
            print_int("bound_top", t);
@@ -1305,7 +1366,7 @@ static void show_packet(AVTextFormatContext *tfc, InputFile *ifile, AVPacket *pk

        avtext_print_section_header(tfc, NULL, SECTION_ID_PACKET_SIDE_DATA_LIST);
        for (int i = 0; i < pkt->side_data_elems; i++) {
-            print_pkt_side_data(tfc, st->codecpar, &pkt->side_data[i],
+            print_pkt_side_data(tfc, st->codecpar->width, st->codecpar->height, &pkt->side_data[i],
                                SECTION_ID_PACKET_SIDE_DATA);
            avtext_print_section_footer(tfc);
        }
@@ -1341,6 +1402,9 @@ static void show_subtitle(AVTextFormatContext *tfc, AVSubtitle *sub, AVStream *s
    fflush(stdout);
 }

+static void print_iamf_param_definition(AVTextFormatContext *tfc, const char *name,
+                                        const AVIAMFParamDefinition *param, SectionID section_id); /* forward declaration: print_frame_side_data() calls this before its definition appears */
+
 static void print_frame_side_data(AVTextFormatContext *tfc,
                                  const AVFrame *frame,
                                  const AVStream *stream)
@@ -1379,6 +1443,9 @@ static void print_frame_side_data(AVTextFormatContext *tfc,
        } else if (sd->type == AV_FRAME_DATA_DYNAMIC_HDR_PLUS) {
            AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus *)sd->data;
            print_dynamic_hdr10_plus(tfc, metadata);
+        } else if (sd->type == AV_FRAME_DATA_DYNAMIC_HDR_SMPTE_2094_APP5) {
+            AVDynamicHDRSmpte2094App5 *metadata = (AVDynamicHDRSmpte2094App5 *)sd->data;
+            print_dynamic_hdr_smpte2094_app5(tfc, metadata);
        } else if (sd->type == AV_FRAME_DATA_CONTENT_LIGHT_LEVEL) {
            print_context_light_level(tfc, (AVContentLightMetadata *)sd->data);
        } else if (sd->type == AV_FRAME_DATA_ICC_PROFILE) {
@@ -1400,6 +1467,11 @@ static void print_frame_side_data(AVTextFormatContext *tfc,
            print_int("view_id", *(int*)sd->data);
        } else if (sd->type == AV_FRAME_DATA_EXIF) {
            print_int("size", sd->size);
+        } else if (sd->type == AV_FRAME_DATA_IAMF_MIX_GAIN_PARAM ||
+            sd->type == AV_FRAME_DATA_IAMF_DEMIXING_INFO_PARAM ||
+            sd->type == AV_FRAME_DATA_IAMF_RECON_GAIN_INFO_PARAM) {
+            const AVIAMFParamDefinition *param = (AVIAMFParamDefinition *)sd->data;
+            print_iamf_param_definition(tfc, NULL, param, SECTION_ID_FRAME_SIDE_DATA);
        }
        avtext_print_section_footer(tfc);
    }
@@ -1696,12 +1768,10 @@ static int read_interval_packets(AVTextFormatContext *tfc, InputFile *ifile,
    }
    av_packet_unref(pkt);
    //Flush remaining frames that are cached in the decoder
-    for (i = 0; i < ifile->nb_streams; i++) {
+    for (int i = 0; i < ifile->nb_streams; i++) {
        pkt->stream_index = i;
        if (do_read_frames) {
            while (process_frame(tfc, ifile, frame, pkt, &(int){1}) > 0);
-            if (ifile->streams[i].dec_ctx)
-                avcodec_flush_buffers(ifile->streams[i].dec_ctx);
        }
    }

@@ -1715,17 +1785,33 @@ end:
    return ret;
 }

+static void flush_buffers(InputFile *ifile)
+{ /* Call avcodec_flush_buffers() on every stream of ifile that has a decoder context. */
+    int i;
+
+    if (!do_read_frames) /* does nothing unless do_read_frames is set */
+        return;
+    for (i = 0; i < ifile->nb_streams; i++) {
+        if (ifile->streams[i].dec_ctx) /* skip streams with no decoder context */
+            avcodec_flush_buffers(ifile->streams[i].dec_ctx);
+    }
+}
+
 static int read_packets(AVTextFormatContext *tfc, InputFile *ifile)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;
    int64_t cur_ts = fmt_ctx->start_time;

    if (read_intervals_nb == 0) {
        ReadInterval interval = (ReadInterval) { .has_start = 0, .has_end = 0 };
        ret = read_interval_packets(tfc, ifile, &interval, &cur_ts);
    } else {
-        for (i = 0; i < read_intervals_nb; i++) {
+        for (int i = 0; i < read_intervals_nb; i++) {
+            /* flushing buffers can reset parts of the private context which may be
+             * read by show_streams(), so only flush between each read_interval */
+            if (i)
+                flush_buffers(ifile);
            ret = read_interval_packets(tfc, ifile, &read_intervals[i], &cur_ts);
            if (ret < 0)
                break;
@@ -1738,7 +1824,7 @@ static int read_packets(AVTextFormatContext *tfc, InputFile *ifile)
 static void print_dispositions(AVTextFormatContext *tfc, uint32_t disposition, SectionID section_id)
 {
    avtext_print_section_header(tfc, NULL, section_id);
-    for (int i = 0; i < sizeof(disposition) * CHAR_BIT; i++) {
+    for (unsigned i = 0; i < sizeof(disposition) * CHAR_BIT; i++) {
        const char *disposition_str = av_disposition_to_string(1U << i);

        if (disposition_str)
@@ -1961,7 +2047,7 @@ static int show_stream(AVTextFormatContext *tfc, AVFormatContext *fmt_ctx, int s
    if (stream->codecpar->nb_coded_side_data) {
        avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_SIDE_DATA_LIST);
        for (int i = 0; i < stream->codecpar->nb_coded_side_data; i++) {
-            print_pkt_side_data(tfc, stream->codecpar, &stream->codecpar->coded_side_data[i],
+            print_pkt_side_data(tfc, stream->codecpar->width, stream->codecpar->height, &stream->codecpar->coded_side_data[i],
                                SECTION_ID_STREAM_SIDE_DATA);
            avtext_print_section_footer(tfc);
        }
@@ -1978,10 +2064,10 @@ static int show_stream(AVTextFormatContext *tfc, AVFormatContext *fmt_ctx, int s
 static int show_streams(AVTextFormatContext *tfc, InputFile *ifile)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;

    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAMS);
-    for (i = 0; i < ifile->nb_streams; i++)
+    for (int i = 0; i < ifile->nb_streams; i++)
        if (selected_streams[i]) {
            ret = show_stream(tfc, fmt_ctx, i, &ifile->streams[i], 0);
            if (ret < 0)
@@ -1995,7 +2081,7 @@ static int show_streams(AVTextFormatContext *tfc, InputFile *ifile)
 static int show_program(AVTextFormatContext *tfc, InputFile *ifile, AVProgram *program)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;

    avtext_print_section_header(tfc, NULL, SECTION_ID_PROGRAM);
    print_int("program_id", program->id);
@@ -2009,7 +2095,7 @@ static int show_program(AVTextFormatContext *tfc, InputFile *ifile, AVProgram *p
        goto end;

    avtext_print_section_header(tfc, NULL, SECTION_ID_PROGRAM_STREAMS);
-    for (i = 0; i < program->nb_stream_indexes; i++) {
+    for (unsigned i = 0; i < program->nb_stream_indexes; i++) {
        if (selected_streams[program->stream_index[i]]) {
            ret = show_stream(tfc, fmt_ctx, program->stream_index[i], &ifile->streams[program->stream_index[i]], IN_PROGRAM);
            if (ret < 0)
@@ -2026,10 +2112,10 @@ end:
 static int show_programs(AVTextFormatContext *tfc, InputFile *ifile)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;

    avtext_print_section_header(tfc, NULL, SECTION_ID_PROGRAMS);
-    for (i = 0; i < fmt_ctx->nb_programs; i++) {
+    for (unsigned i = 0; i < fmt_ctx->nb_programs; i++) {
        AVProgram *program = fmt_ctx->programs[i];
        if (!program)
            continue;
@@ -2053,7 +2139,7 @@ static void print_tile_grid_params(AVTextFormatContext *tfc, const AVStreamGroup
    print_int("width",             tile_grid->width);
    print_int("height",            tile_grid->height);
    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP_SUBCOMPONENTS);
-    for (int i = 0; i < tile_grid->nb_tiles; i++) {
+    for (unsigned i = 0; i < tile_grid->nb_tiles; i++) {
        avtext_print_section_header(tfc, "tile_offset", SECTION_ID_STREAM_GROUP_SUBCOMPONENT);
        print_int("stream_index",           tile_grid->offsets[i].idx);
        print_int("tile_horizontal_offset", tile_grid->offsets[i].horizontal);
@@ -2061,6 +2147,15 @@ static void print_tile_grid_params(AVTextFormatContext *tfc, const AVStreamGroup
        avtext_print_section_footer(tfc);
    }
    avtext_print_section_footer(tfc);
+    if (tile_grid->nb_coded_side_data) {
+        avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP_SIDE_DATA_LIST);
+        for (int i = 0; i < tile_grid->nb_coded_side_data; i++) {
+            print_pkt_side_data(tfc, tile_grid->width, tile_grid->height, &tile_grid->coded_side_data[i],
+                                SECTION_ID_STREAM_GROUP_SIDE_DATA);
+            avtext_print_section_footer(tfc);
+        }
+        avtext_print_section_footer(tfc);
+    }
    avtext_print_section_footer(tfc);
 }

@@ -2068,12 +2163,21 @@ static void print_iamf_param_definition(AVTextFormatContext *tfc, const char *na
                                        const AVIAMFParamDefinition *param, SectionID section_id)
 {
    SectionID subsection_id, parameter_section_id;
-    subsection_id = sections[section_id].children_ids[0];
-    av_assert0(subsection_id != -1);
+    if (section_id == SECTION_ID_FRAME_SIDE_DATA)
+        subsection_id = SECTION_ID_FRAME_SIDE_DATA_COMPONENT_LIST;
+    else {
+        av_assert0(sections[section_id].children_ids[0] != -1);
+        subsection_id = sections[section_id].children_ids[0];
+    }
+    av_assert0(sections[subsection_id].children_ids[0] != -1);
    parameter_section_id = sections[subsection_id].children_ids[0];
-    av_assert0(parameter_section_id != -1);
-    avtext_print_section_header(tfc, "IAMF Param Definition", section_id);
-    print_str("name",           name);
+
+    // When printing as part of side-data, skip opening a section
+    if (section_id != SECTION_ID_FRAME_SIDE_DATA)
+        avtext_print_section_header(tfc, "IAMF Param Definition", section_id);
+
+    if (name)
+        print_str("name",           name);
    print_int("nb_subblocks",   param->nb_subblocks);
    print_int("type",           param->type);
    print_int("parameter_id",   param->parameter_id);
@@ -2082,7 +2186,7 @@ static void print_iamf_param_definition(AVTextFormatContext *tfc, const char *na
    print_int("constant_subblock_duration",          param->constant_subblock_duration);
    if (param->nb_subblocks > 0)
        avtext_print_section_header(tfc, NULL, subsection_id);
-    for (int i = 0; i < param->nb_subblocks; i++) {
+    for (unsigned i = 0; i < param->nb_subblocks; i++) {
        const void *subblock = av_iamf_param_definition_get_subblock(param, i);
        switch(param->type) {
        case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: {
@@ -2116,7 +2220,9 @@ static void print_iamf_param_definition(AVTextFormatContext *tfc, const char *na
    }
    if (param->nb_subblocks > 0)
        avtext_print_section_footer(tfc); // subsection_id
-    avtext_print_section_footer(tfc); // section_id
+
+    if (section_id != SECTION_ID_FRAME_SIDE_DATA)
+        avtext_print_section_footer(tfc); // section_id
 }

 static void print_iamf_audio_element_params(AVTextFormatContext *tfc, const AVStreamGroup *stg,
@@ -2131,7 +2237,7 @@ static void print_iamf_audio_element_params(AVTextFormatContext *tfc, const AVSt
    print_int("audio_element_type", audio_element->audio_element_type);
    print_int("default_w",          audio_element->default_w);
    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP_SUBCOMPONENTS);
-    for (int i = 0; i < audio_element->nb_layers; i++) {
+    for (unsigned i = 0; i < audio_element->nb_layers; i++) {
        const AVIAMFLayer *layer = audio_element->layers[i];
        char val_str[128];
        avtext_print_section_header(tfc, "IAMF Audio Layer", SECTION_ID_STREAM_GROUP_SUBCOMPONENT);
@@ -2167,7 +2273,7 @@ static void print_iamf_submix_params(AVTextFormatContext *tfc, const AVIAMFSubmi
    print_int("nb_layouts",     submix->nb_layouts);
    print_q("default_mix_gain", submix->default_mix_gain, '/');
    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP_PIECES);
-    for (int i = 0; i < submix->nb_elements; i++) {
+    for (unsigned i = 0; i < submix->nb_elements; i++) {
        const AVIAMFSubmixElement *element = submix->elements[i];
        avtext_print_section_header(tfc, "IAMF Submix Element", SECTION_ID_STREAM_GROUP_PIECE);
        print_int("stream_id",                 element->audio_element_id);
@@ -2190,7 +2296,7 @@ static void print_iamf_submix_params(AVTextFormatContext *tfc, const AVIAMFSubmi
    if (submix->output_mix_config)
        print_iamf_param_definition(tfc, "output_mix_config", submix->output_mix_config,
                                    SECTION_ID_STREAM_GROUP_PIECE);
-    for (int i = 0; i < submix->nb_layouts; i++) {
+    for (unsigned i = 0; i < submix->nb_layouts; i++) {
        const AVIAMFSubmixLayout *layout = submix->layouts[i];
        char val_str[128];
        avtext_print_section_header(tfc, "IAMF Submix Layout", SECTION_ID_STREAM_GROUP_PIECE);
@@ -2220,7 +2326,7 @@ static void print_iamf_mix_presentation_params(AVTextFormatContext *tfc, const A
            print_str(annotation->key, annotation->value);
        avtext_print_section_footer(tfc); // SECTION_ID_STREAM_GROUP_SUBCOMPONENT
    }
-    for (int i = 0; i < mix_presentation->nb_submixes; i++)
+    for (unsigned i = 0; i < mix_presentation->nb_submixes; i++)
        print_iamf_submix_params(tfc, mix_presentation->submixes[i]);
    avtext_print_section_footer(tfc); // SECTION_ID_STREAM_GROUP_SUBCOMPONENTS
    avtext_print_section_footer(tfc); // SECTION_ID_STREAM_GROUP_COMPONENT
@@ -2242,7 +2348,7 @@ static int show_stream_group(AVTextFormatContext *tfc, InputFile *ifile, AVStrea
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
    AVBPrint pbuf;
-    int i, ret = 0;
+    int ret = 0;

    av_bprint_init(&pbuf, 1, AV_BPRINT_SIZE_UNLIMITED);
    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP);
@@ -2267,7 +2373,7 @@ static int show_stream_group(AVTextFormatContext *tfc, InputFile *ifile, AVStrea
        goto end;

    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUP_STREAMS);
-    for (i = 0; i < stg->nb_streams; i++) {
+    for (unsigned i = 0; i < stg->nb_streams; i++) {
        if (selected_streams[stg->streams[i]->index]) {
            ret = show_stream(tfc, fmt_ctx, stg->streams[i]->index, &ifile->streams[stg->streams[i]->index], IN_STREAM_GROUP);
            if (ret < 0)
@@ -2285,10 +2391,10 @@ end:
 static int show_stream_groups(AVTextFormatContext *tfc, InputFile *ifile)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;

    avtext_print_section_header(tfc, NULL, SECTION_ID_STREAM_GROUPS);
-    for (i = 0; i < fmt_ctx->nb_stream_groups; i++) {
+    for (unsigned i = 0; i < fmt_ctx->nb_stream_groups; i++) {
        AVStreamGroup *stg = fmt_ctx->stream_groups[i];

        ret = show_stream_group(tfc, ifile, stg);
@@ -2302,10 +2408,10 @@ static int show_stream_groups(AVTextFormatContext *tfc, InputFile *ifile)
 static int show_chapters(AVTextFormatContext *tfc, InputFile *ifile)
 {
    AVFormatContext *fmt_ctx = ifile->fmt_ctx;
-    int i, ret = 0;
+    int ret = 0;

    avtext_print_section_header(tfc, NULL, SECTION_ID_CHAPTERS);
-    for (i = 0; i < fmt_ctx->nb_chapters; i++) {
+    for (unsigned i = 0; i < fmt_ctx->nb_chapters; i++) {
        AVChapter *chapter = fmt_ctx->chapters[i];

        avtext_print_section_header(tfc, NULL, SECTION_ID_CHAPTER);
@@ -2425,7 +2531,7 @@ static const AVCodec *get_decoder_for_stream(AVFormatContext *fmt_ctx, AVStream
 static int open_input_file(InputFile *ifile, const char *filename,
                           const char *print_filename)
 {
-    int err, i;
+    int err;
    AVFormatContext *fmt_ctx = NULL;
    const AVDictionaryEntry *t = NULL;
    int scan_all_pmts_set = 0;
@@ -2466,7 +2572,7 @@ static int open_input_file(InputFile *ifile, const char *filename,

        err = avformat_find_stream_info(fmt_ctx, opts);

-        for (i = 0; i < orig_nb_streams; i++)
+        for (int i = 0; i < orig_nb_streams; i++)
            av_dict_free(&opts[i]);
        av_freep(&opts);

@@ -2484,7 +2590,7 @@ static int open_input_file(InputFile *ifile, const char *filename,
    ifile->nb_streams = fmt_ctx->nb_streams;

    /* bind a decoder to each input stream */
-    for (i = 0; i < fmt_ctx->nb_streams; i++) {
+    for (unsigned i = 0; i < fmt_ctx->nb_streams; i++) {
        InputStream *ist = &ifile->streams[i];
        AVStream *stream = fmt_ctx->streams[i];
        const AVCodec *codec;
@@ -2542,10 +2648,9 @@ static int open_input_file(InputFile *ifile, const char *filename,

 static void close_input_file(InputFile *ifile)
 {
-    int i;

    /* close decoder for each stream */
-    for (i = 0; i < ifile->nb_streams; i++)
+    for (int i = 0; i < ifile->nb_streams; i++)
        avcodec_free_context(&ifile->streams[i].dec_ctx);

    av_freep(&ifile->streams);
@@ -2558,7 +2663,7 @@ static int probe_file(AVTextFormatContext *tfc, const char *filename,
                      const char *print_filename)
 {
    InputFile ifile = { 0 };
-    int ret, i;
+    int ret;
    int section_id;

    do_analyze_frames = do_analyze_frames && do_show_streams;
@@ -2578,7 +2683,7 @@ static int probe_file(AVTextFormatContext *tfc, const char *filename,
    REALLOCZ_ARRAY_STREAM(streams_with_closed_captions,0,ifile.fmt_ctx->nb_streams);
    REALLOCZ_ARRAY_STREAM(streams_with_film_grain,0,ifile.fmt_ctx->nb_streams);

-    for (i = 0; i < ifile.fmt_ctx->nb_streams; i++) {
+    for (unsigned i = 0; i < ifile.fmt_ctx->nb_streams; i++) {
        if (stream_specifier) {
            ret = avformat_match_stream_specifier(ifile.fmt_ctx,
                                                  ifile.fmt_ctx->streams[i],
@@ -2796,9 +2901,9 @@ static inline void mark_section_show_entries(SectionID section_id,
 static int match_section(const char *section_name,
                         int show_all_entries, AVDictionary *entries)
 {
-    int i, ret = 0;
+    int ret = 0;

-    for (i = 0; i < FF_ARRAY_ELEMS(sections); i++) {
+    for (unsigned i = 0; i < FF_ARRAY_ELEMS(sections); i++) {
        const struct AVTextFormatSection *section = &sections[i];
        if (!strcmp(section_name, section->name) ||
            (section->unique_name && !strcmp(section_name, section->unique_name))) {
@@ -3238,7 +3343,7 @@ int main(int argc, char **argv)
    char *buf;
    char *f_name = NULL, *f_args = NULL;
    int ret, input_ret;
-    AVTextFormatDataDump data_dump_format_id;
+    AVTextFormatDataDump data_dump_format_id = AV_TEXTFORMAT_DATADUMP_XXD;

    init_dynload();

@@ -33,6 +33,7 @@

 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/dict.h"
 #include "libavutil/common.h"
@@ -68,6 +68,11 @@ enum show_muxdemuxers {
    SHOW_MUXERS,
 };

+enum show_codec { /* named values for the decoder/encoder selector passed to the codec listing and help helpers */
+    SHOW_DECODER, /* 0: stands in for the literal 0 previously passed for decoders */
+    SHOW_ENCODER, /* 1: stands in for the literal 1 previously passed for encoders */
+};
+
 static FILE *report_file;
 static int report_file_level = AV_LOG_DEBUG;

@@ -591,9 +596,9 @@ int show_help(void *optctx, const char *opt, const char *arg)
    if (!*topic) {
        show_help_default(topic, par);
    } else if (!strcmp(topic, "decoder")) {
-        show_help_codec(par, 0);
+        show_help_codec(par, SHOW_DECODER);
    } else if (!strcmp(topic, "encoder")) {
-        show_help_codec(par, 1);
+        show_help_codec(par, SHOW_ENCODER);
    } else if (!strcmp(topic, "demuxer")) {
        show_help_demuxer(par);
    } else if (!strcmp(topic, "muxer")) {
@@ -708,16 +713,16 @@ int show_codecs(void *optctx, const char *opt, const char *arg)

        /* print decoders/encoders when there's more than one or their
         * names are different from codec name */
-        while ((codec = next_codec_for_id(desc->id, &iter, 0))) {
+        while ((codec = next_codec_for_id(desc->id, &iter, SHOW_DECODER))) {
            if (strcmp(codec->name, desc->name)) {
-                print_codecs_for_id(desc->id, 0);
+                print_codecs_for_id(desc->id, SHOW_DECODER);
                break;
            }
        }
        iter = NULL;
-        while ((codec = next_codec_for_id(desc->id, &iter, 1))) {
+        while ((codec = next_codec_for_id(desc->id, &iter, SHOW_ENCODER))) {
            if (strcmp(codec->name, desc->name)) {
-                print_codecs_for_id(desc->id, 1);
+                print_codecs_for_id(desc->id, SHOW_ENCODER);
                break;
            }
        }
@@ -774,12 +779,12 @@ static int print_codecs(int encoder)

 int show_decoders(void *optctx, const char *opt, const char *arg)
 {
-    return print_codecs(0);
+    return print_codecs(SHOW_DECODER); /* print_codecs()'s encoder argument is SHOW_DECODER (0) here */
 }

 int show_encoders(void *optctx, const char *opt, const char *arg)
 {
-    return print_codecs(1);
+    return print_codecs(SHOW_ENCODER); /* print_codecs()'s encoder argument is SHOW_ENCODER (1) here */
 }

 int show_bsfs(void *optctx, const char *opt, const char *arg)
@@ -876,7 +881,7 @@ static int show_formats_devices(void *optctx, const char *opt, const char *arg,
        const char *name      = NULL;
        const char *long_name = NULL;

-        if (muxdemuxers !=SHOW_DEMUXERS) {
+        if (muxdemuxers != SHOW_DEMUXERS) {
            ofmt_opaque = NULL;
            while ((ofmt = av_muxer_iterate(&ofmt_opaque))) {
                is_dev = is_device(ofmt->priv_class);
@@ -26,6 +26,7 @@

 #include "avtextformat.h"

+#include "libavutil/attributes.h"
 #include "libavutil/bprint.h"
 #include "libavutil/opt.h"
 #include "tf_internal.h"
@@ -74,7 +75,7 @@ static char *ini_escape_str(AVBPrint *dst, const char *src)
        case '=':
        case ':':
            av_bprint_chars(dst, '\\', 1);
-            /* fallthrough */
+            av_fallthrough;
        default:
            if ((unsigned char)c < 32)
                av_bprintf(dst, "\\x00%02x", (unsigned char)c);
@@ -130,7 +130,7 @@ OBJS-$(CONFIG_IVIDSP)                  += ivi_dsp.o
 OBJS-$(CONFIG_JNI)                     += ffjni.o jni.o
 OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
 OBJS-$(CONFIG_LCMS2)                   += fflcms2.o
-OBJS-$(CONFIG_LIBLCEVC_DEC)            += lcevcdec.o
+OBJS-$(CONFIG_LIBLCEVC_DEC)            += lcevcdec.o lcevctab.o
 OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
 OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
 OBJS-$(CONFIG_LLVIDENCDSP)             += lossless_videoencdsp.o
@@ -626,6 +626,7 @@ OBJS-$(CONFIG_PBM_ENCODER)             += pnmenc.o
 OBJS-$(CONFIG_PCX_DECODER)             += pcx.o
 OBJS-$(CONFIG_PCX_ENCODER)             += pcxenc.o
 OBJS-$(CONFIG_PDV_DECODER)             += pdvdec.o
+OBJS-$(CONFIG_PDV_ENCODER)             += pdvenc.o
 OBJS-$(CONFIG_PFM_DECODER)             += pnmdec.o pnm.o
 OBJS-$(CONFIG_PFM_ENCODER)             += pnmenc.o
 OBJS-$(CONFIG_PGM_DECODER)             += pnmdec.o pnm.o
@@ -1127,7 +1128,7 @@ OBJS-$(CONFIG_FITS_DEMUXER)            += fits.o
 OBJS-$(CONFIG_TAK_DEMUXER)             += tak.o

 # libavformat dependencies for static builds
-STLIBOBJS-$(CONFIG_AVFORMAT)           += to_upper4.o
+STLIBOBJS-$(CONFIG_AVFORMAT)           += h2645_parse.o lcevctab.o to_upper4.o
 STLIBOBJS-$(CONFIG_ISO_MEDIA)          += mpegaudiotabs.o
 STLIBOBJS-$(CONFIG_FLV_MUXER)          += mpeg4audio_sample_rates.o
 STLIBOBJS-$(CONFIG_HLS_DEMUXER)        += ac3_channel_layout_tab.o
@@ -1277,7 +1278,7 @@ OBJS-$(CONFIG_IPU_PARSER)              += ipu_parser.o
 OBJS-$(CONFIG_JPEG2000_PARSER)         += jpeg2000_parser.o
 OBJS-$(CONFIG_JPEGXL_PARSER)           += jpegxl_parser.o jpegxl_parse.o
 OBJS-$(CONFIG_JPEGXS_PARSER)           += jpegxs_parser.o
-OBJS-$(CONFIG_LCEVC_PARSER)            += lcevc_parser.o
+OBJS-$(CONFIG_LCEVC_PARSER)            += lcevc_parser.o lcevctab.o
 OBJS-$(CONFIG_MISC4_PARSER)            += misc4_parser.o
 OBJS-$(CONFIG_MJPEG_PARSER)            += mjpeg_parser.o
 OBJS-$(CONFIG_MLP_PARSER)              += mlp_parse.o mlp_parser.o mlp.o
@@ -1379,7 +1380,6 @@ TESTPROGS-$(CONFIG_GOLOMB)                += golomb
 TESTPROGS-$(CONFIG_IDCTDSP)               += dct
 TESTPROGS-$(CONFIG_DXV_ENCODER)           += hashtable
 TESTPROGS-$(CONFIG_MJPEG_ENCODER)         += mjpegenc_huffman
-TESTPROGS-$(HAVE_MMX)                     += motion
 TESTPROGS-$(CONFIG_MPEGVIDEO)             += mpeg12framerate
 TESTPROGS-$(CONFIG_H264_METADATA_BSF)     += h264_levels
 TESTPROGS-$(CONFIG_HEVC_METADATA_BSF)     += h265_levels
@@ -676,6 +676,7 @@ ChannelElement *ff_aac_get_che(AACDecContext *ac, int type, int elem_id)
            ac->tags_mapped++;
            return ac->tag_che_map[type][elem_id] = ac->che[type][elem_id];
        }
+        av_fallthrough;
    case 13:
        if (ac->tags_mapped > 3 && ((type == TYPE_CPE && elem_id < 8) ||
                                    (type == TYPE_SCE && elem_id < 6) ||
@@ -683,17 +684,20 @@ ChannelElement *ff_aac_get_che(AACDecContext *ac, int type, int elem_id)
            ac->tags_mapped++;
            return ac->tag_che_map[type][elem_id] = ac->che[type][elem_id];
        }
+        av_fallthrough;
    case 12:
    case 7:
        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
        }
+        av_fallthrough;
    case 11:
        if (ac->tags_mapped == 3 && type == TYPE_SCE) {
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
        }
+        av_fallthrough;
    case 6:
        /* Some streams incorrectly code 5.1 audio as
         * SCE[0] CPE[0] CPE[1] SCE[1]
@@ -711,11 +715,13 @@ ChannelElement *ff_aac_get_che(AACDecContext *ac, int type, int elem_id)
            ac->tags_mapped++;
            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
        }
+        av_fallthrough;
    case 5:
        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
        }
+        av_fallthrough;
    case 4:
        /* Some streams incorrectly code 4.0 audio as
         * SCE[0] CPE[0] LFE[0]
@@ -739,6 +745,7 @@ ChannelElement *ff_aac_get_che(AACDecContext *ac, int type, int elem_id)
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
        }
+        av_fallthrough;
    case 3:
    case 2:
        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
@@ -750,11 +757,13 @@ ChannelElement *ff_aac_get_che(AACDecContext *ac, int type, int elem_id)
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
        }
+        av_fallthrough;
    case 1:
        if (!ac->tags_mapped && type == TYPE_SCE) {
            ac->tags_mapped++;
            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
        }
+        av_fallthrough;
    default:
        return NULL;
    }
@@ -889,12 +898,6 @@ static int decode_ga_specific_config(AACDecContext *ac, AVCodecContext *avctx,
    int tags = 0;

    m4ac->frame_length_short = get_bits1(gb);
-    if (m4ac->frame_length_short && m4ac->sbr == 1) {
-      avpriv_report_missing_feature(avctx, "SBR with 960 frame length");
-      if (ac) ac->warned_960_sbr = 1;
-      m4ac->sbr = 0;
-      m4ac->ps = 0;
-    }

    if (get_bits1(gb))       // dependsOnCoreCoder
        skip_bits(gb, 14);   // coreCoderDelay
@@ -1246,7 +1249,7 @@ av_cold int ff_aac_decode_init(AVCodecContext *avctx)
        ac->oc[1].m4ac.chan_config = i;

        if (ac->oc[1].m4ac.chan_config) {
-            int ret = ff_aac_set_default_channel_config(ac, avctx, layout_map,
+            ret = ff_aac_set_default_channel_config(ac, avctx, layout_map,
                                                        &layout_map_tags,
                                                        ac->oc[1].m4ac.chan_config);
            if (!ret)
@@ -1946,17 +1949,11 @@ static int decode_extension_payload(AACDecContext *ac, GetBitContext *gb, int cn
    switch (type) { // extension type
    case EXT_SBR_DATA_CRC:
        crc_flag++;
+        av_fallthrough;
    case EXT_SBR_DATA:
        if (!che) {
            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
            return res;
-        } else if (ac->oc[1].m4ac.frame_length_short) {
-            if (!ac->warned_960_sbr)
-              avpriv_report_missing_feature(ac->avctx,
-                                            "SBR with 960 frame length");
-            ac->warned_960_sbr = 1;
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
        } else if (!ac->oc[1].m4ac.sbr) {
            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
            skip_bits_long(gb, 8 * cnt - 4);
@@ -1977,7 +1974,8 @@ static int decode_extension_payload(AACDecContext *ac, GetBitContext *gb, int cn
            ac->avctx->profile = AV_PROFILE_AAC_HE;
        }

-        ac->proc.sbr_decode_extension(ac, che, gb, crc_flag, cnt, elem_type);
+        ac->proc.sbr_decode_extension(ac, che, gb, crc_flag, cnt, elem_type,
+                                      ac->oc[1].m4ac.frame_length_short);

        if (ac->oc[1].m4ac.ps == 1 && !ac->warned_he_aac_mono) {
            av_log(ac->avctx, AV_LOG_VERBOSE, "Treating HE-AAC mono as stereo.\n");
@@ -2087,6 +2085,7 @@ static void spectral_to_sample(AACDecContext *ac, int samples)
                    }
                    if (ac->oc[1].m4ac.sbr > 0) {
                        ac->proc.sbr_apply(ac, che, type,
+                                           ac->oc[1].m4ac.frame_length_short,
                                           che->ch[0].output,
                                           che->ch[1].output);
                    }
@@ -433,9 +433,9 @@ typedef struct AACDecProc {

    int (*sbr_ctx_alloc_init)(AACDecContext *ac, ChannelElement **che, int id_aac);
    int (*sbr_decode_extension)(AACDecContext *ac, ChannelElement *che,
-                                GetBitContext *gb, int crc, int cnt, int id_aac);
-    void (*sbr_apply)(AACDecContext *ac, ChannelElement *che,
-                      int id_aac, void /* INTFLOAT */ *L, void /* INTFLOAT */ *R);
+                                GetBitContext *gb, int crc, int cnt, int id_aac, int fl960);
+    void (*sbr_apply)(AACDecContext *ac, ChannelElement *che, int id_aac, int fl960,
+                      void /* INTFLOAT */ *L, void /* INTFLOAT */ *R);
    void (*sbr_ctx_close)(ChannelElement *che);
 } AACDecProc;

@@ -557,7 +557,6 @@ struct AACDecContext {

    OutputConfiguration oc[2];
    int warned_num_aac_frames;
-    int warned_960_sbr;
    unsigned warned_71_wide;
    int warned_gain_control;
    int warned_he_aac_mono;
@@ -215,6 +215,11 @@ static int decode_usac_element_pair(AACDecContext *ac,

    if (e->stereo_config_index) {
        e->mps.freq_res = get_bits(gb, 3); /* bsFreqRes */
+        if (!e->mps.freq_res)
+            return AVERROR_INVALIDDATA; /* value 0 is reserved */
+
+        int numBands = ((int[]){0,28,20,14,10,7,5,4})[e->mps.freq_res]; // ISO/IEC 23003-1:2007, 5.2, Table 39
+
        e->mps.fixed_gain = get_bits(gb, 3); /* bsFixedGainDMX */
        e->mps.temp_shape_config = get_bits(gb, 2); /* bsTempShapeConfig */
        e->mps.decorr_config = get_bits(gb, 2); /* bsDecorrConfig */
@@ -222,12 +227,21 @@ static int decode_usac_element_pair(AACDecContext *ac,
        e->mps.phase_coding = get_bits1(gb); /* bsPhaseCoding */

        e->mps.otts_bands_phase_present = get_bits1(gb);
-        if (e->mps.otts_bands_phase_present) /* bsOttBandsPhasePresent */
-            e->mps.otts_bands_phase = get_bits(gb, 5); /* bsOttBandsPhase */
+        int otts_bands_phase = ((int[]){0,10,10,7,5,3,2,2})[e->mps.freq_res]; // Table 109 — Default value of bsOttBandsPhase
+        if (e->mps.otts_bands_phase_present) { /* bsOttBandsPhasePresent */
+            otts_bands_phase = get_bits(gb, 5); /* bsOttBandsPhase */
+            if (otts_bands_phase > numBands)
+                return AVERROR_INVALIDDATA;
+        }
+        e->mps.otts_bands_phase = otts_bands_phase;

        e->mps.residual_coding = e->stereo_config_index >= 2; /* bsResidualCoding */
        if (e->mps.residual_coding) {
-            e->mps.residual_bands = get_bits(gb, 5); /* bsResidualBands */
+            int residual_bands = get_bits(gb, 5); /* bsResidualBands */
+            if (residual_bands > numBands)
+                return AVERROR_INVALIDDATA;
+            e->mps.residual_bands = residual_bands;
+
            e->mps.otts_bands_phase = FFMAX(e->mps.otts_bands_phase,
                                            e->mps.residual_bands);
            e->mps.pseudo_lr = get_bits1(gb); /* bsPseudoLr */
@@ -1293,7 +1307,8 @@ static void spectrum_decode(AACDecContext *ac, AACUSACConfig *usac,
        SingleChannelElement *sce = &cpe->ch[ch];
        AACUsacElemData *ue = &sce->ue;

-        spectrum_scale(ac, sce, ue);
+        if (!ue->core_mode)
+            spectrum_scale(ac, sce, ue);
    }

    if (nb_channels > 1 && us->common_window) {
@@ -1327,13 +1342,13 @@ static void spectrum_decode(AACDecContext *ac, AACUSACConfig *usac,

    /* Save coefficients and alpha values for prediction reasons */
    if (nb_channels > 1) {
-        AACUsacStereo *us = &cpe->us;
+        AACUsacStereo *us2 = &cpe->us;
        for (int ch = 0; ch < nb_channels; ch++) {
            SingleChannelElement *sce = &cpe->ch[ch];
            memcpy(sce->prev_coeffs, sce->coeffs, sizeof(sce->coeffs));
        }
-        memcpy(us->prev_alpha_q_re, us->alpha_q_re, sizeof(us->alpha_q_re));
-        memcpy(us->prev_alpha_q_im, us->alpha_q_im, sizeof(us->alpha_q_im));
+        memcpy(us2->prev_alpha_q_re, us2->alpha_q_re, sizeof(us2->alpha_q_re));
+        memcpy(us2->prev_alpha_q_im, us2->alpha_q_im, sizeof(us2->alpha_q_im));
    }

    for (int ch = 0; ch < nb_channels; ch++) {
@@ -1343,8 +1358,9 @@ static void spectrum_decode(AACDecContext *ac, AACUSACConfig *usac,
        if (sce->tns.present && ((nb_channels == 1) || (us->tns_on_lr)))
            ac->dsp.apply_tns(sce->coeffs, &sce->tns, &sce->ics, 1);

-        ac->oc[1].m4ac.frame_length_short ? ac->dsp.imdct_and_windowing_768(ac, sce) :
-                                            ac->dsp.imdct_and_windowing(ac, sce);
+        if (!sce->ue.core_mode)
+            ac->oc[1].m4ac.frame_length_short ? ac->dsp.imdct_and_windowing_768(ac, sce) :
+                                                ac->dsp.imdct_and_windowing(ac, sce);
    }
 }

@@ -1655,7 +1671,7 @@ static int decode_usac_core_coder(AACDecContext *ac, AACUSACConfig *usac,
    spectrum_decode(ac, usac, che, core_nb_channels);

    if (ac->oc[1].m4ac.sbr > 0) {
-        ac->proc.sbr_apply(ac, che, nb_channels == 2 ? TYPE_CPE : TYPE_SCE,
+        ac->proc.sbr_apply(ac, che, nb_channels == 2 ? TYPE_CPE : TYPE_SCE, 0,
                           che->ch[0].output,
                           che->ch[1].output);
    }
@@ -1719,8 +1735,8 @@ static int parse_audio_preroll(AACDecContext *ac, GetBitContext *gb)
        }

        /* Byte alignment is not guaranteed. */
-        for (int i = 0; i < au_len; i++)
-            tmp_buf[i] = get_bits(gb, 8);
+        for (int j = 0; j < au_len; j++)
+            tmp_buf[j] = get_bits(gb, 8);

        ret = init_get_bits8(&gbc, tmp_buf, au_len);
        if (ret < 0)
@@ -240,7 +240,7 @@ static void huff_data_2d(GetBitContext *gb, int16_t *part0_data[2], int16_t (*da
                   0, 2*esc_cnt, 0, (2*lav + 1));
        for (i = 0; i < esc_cnt; i++) {
            data[esc_idx[i]][0] = esc_data[0][i] - lav;
-            data[esc_idx[i]][0] = esc_data[0][i] - lav;
+            data[esc_idx[i]][1] = esc_data[1][i] - lav;
        }
    }
 }
@@ -464,10 +464,10 @@ static int ec_pair_dec(GetBitContext *gb,
    }

    if (pair) {
-        p_data[0] = data_pair[0];
-        p_data[1] = data_pair[1];
+        p_data[0] = data_diff[0];
+        p_data[1] = data_diff[1];
    } else {
-        p_data[0] = data_pair[0];
+        p_data[0] = data_diff[0];
        p_data[1] = NULL;
    }

@@ -480,7 +480,7 @@ static int ec_pair_dec(GetBitContext *gb,
    if (pair && (diff_freq[0] || diff_time_back))
        diff_freq[1] = !get_bits1(gb);

-    int time_pair;
+    int time_pair = 0;
    huff_decode(gb, p_data, data_type, diff_freq,
                nb_bands, &time_pair);

@@ -534,11 +534,11 @@ static int ec_pair_dec(GetBitContext *gb,
    }

    /* Decode LSBs */
-    attach_lsb(gb, p_data[0], quant_offset, attach_lsb_flag,
-               nb_bands, p_data[0]);
+    attach_lsb(gb, data_pair[0], quant_offset, attach_lsb_flag,
+               nb_bands, data_pair[0]);
    if (pair)
-        attach_lsb(gb, p_data[1], quant_offset, attach_lsb_flag,
-                   nb_bands, p_data[1]);
+        attach_lsb(gb, data_pair[1], quant_offset, attach_lsb_flag,
+                   nb_bands, data_pair[1]);

    memcpy(&set1[start_band], data_pair[0], 2*nb_bands);
    if (pair)
@@ -591,9 +591,6 @@ static int get_freq_strides(int16_t *freq_strides, int band_stride,
        }
    }

-    for (int i = 0; i <= data_bands; i++)
-        freq_strides[i] = av_clip_uintp2(freq_strides[i], 2);
-
    return data_bands;
 }

@@ -643,15 +640,16 @@ int ff_aac_ec_data_dec(GetBitContext *gb, AACMPSLosslessData *ld,
                fine_to_coarse(ld->last_data, data_type, start_band, end_band);
        }

-        int data_bands = get_freq_strides(ld->freq_res,
+        int16_t freq_stride_map[MPS_MAX_PARAM_BANDS + 1];
+        int data_bands = get_freq_strides(freq_stride_map,
                                          stride_table[ld->freq_res[set_idx]],
                                          start_band, end_band);

-        if (set_idx + data_pair > MPS_MAX_PARAM_SETS)
+        if (set_idx + data_pair >= MPS_MAX_PARAM_SETS)
            return AVERROR(EINVAL);

        for (int j = 0; j < data_bands; j++)
-            ld->last_data[start_band + j] = ld->last_data[ld->freq_res[j]];
+            ld->last_data[start_band + j] = ld->last_data[freq_stride_map[j]];

        int err = ec_pair_dec(gb,
                              ld->data[set_idx + 0], ld->data[set_idx + 1],
@@ -664,11 +662,11 @@ int ff_aac_ec_data_dec(GetBitContext *gb, AACMPSLosslessData *ld,
        if (data_type == MPS_IPD) {
            const int mask = ld->coarse_quant[set_idx] ? 0x7 : 0xF;
            for (int j = 0; j < data_bands; j++)
-                for (int k = ld->freq_res[j + 0]; k < ld->freq_res[j + 1]; k++)
+                for (int k = freq_stride_map[j + 0]; k < freq_stride_map[j + 1]; k++)
                    ld->last_data[k] = ld->data[set_idx + data_pair][start_band + j] & mask;
        } else {
            for (int j = 0; j < data_bands; j++)
-                for (int k = ld->freq_res[j + 0]; k < ld->freq_res[j + 1]; k++)
+                for (int k = freq_stride_map[j + 0]; k < freq_stride_map[j + 1]; k++)
                    ld->last_data[k] = ld->data[set_idx + data_pair][start_band + j];
        }

@@ -860,7 +858,7 @@ int ff_aac_map_index_data(AACMPSLosslessData *ld,
    for (int i = 0; i < nb_param_sets; i++) {
        if (ld->coarse_quant_no[i] == 1) {
            coarse_to_fine(tmp_idx_data[i], data_type, start_band,
-                           stop_band - start_band);
+                           stop_band);
            ld->coarse_quant_no[i] = 0;
        }
    }
@@ -84,9 +84,11 @@ void ff_aac_sbr_ctx_close_fixed(ChannelElement *che);

 /** Decode one SBR element. */
 int ff_aac_sbr_decode_extension(AACDecContext *ac, ChannelElement *che,
-                                GetBitContext *gb, int crc, int cnt, int id_aac);
+                                GetBitContext *gb, int crc, int cnt, int id_aac,
+                                int fl960);
 int ff_aac_sbr_decode_extension_fixed(AACDecContext *ac, ChannelElement *che,
-                                      GetBitContext *gb, int crc, int cnt, int id_aac);
+                                      GetBitContext *gb, int crc, int cnt, int id_aac,
+                                      int fl960);

 /** Due to channel allocation not being known upon SBR parameter transmission,
 * supply the parameters separately.
@@ -101,9 +103,11 @@ int ff_aac_sbr_decode_usac_data(AACDecContext *ac, ChannelElement *che,

 /** Apply one SBR element to one AAC element. */
 void ff_aac_sbr_apply(AACDecContext *ac, ChannelElement *che,
-                      int id_aac, void /* float */ *L, void /* float */ *R);
+                      int id_aac, int fl960,
+                      void /* float */ *L, void /* float */ *R);
 void ff_aac_sbr_apply_fixed(AACDecContext *ac, ChannelElement *che,
-                            int id_aac, void /* int */ *L, void /* int */ *R);
+                            int id_aac, int fl960,
+                            void /* int */ *L, void /* int */ *R);

 FF_VISIBILITY_POP_HIDDEN

@@ -636,12 +636,11 @@ static const int8_t ceil_log2[] = {
 };

 static int read_sbr_grid(AACDecContext *ac, SpectralBandReplication *sbr,
-                         GetBitContext *gb, SBRData *ch_data)
+                         GetBitContext *gb, SBRData *ch_data, int numTimeSlots)
 {
    int i;
    int bs_pointer = 0;
-    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
-    int abs_bord_trail = 16;
+    int abs_bord_trail = numTimeSlots;
    int num_rel_lead, num_rel_trail;
    unsigned bs_num_env_old = ch_data->bs_num_env;
    int bs_frame_class, bs_num_env;
@@ -991,15 +990,15 @@ static void read_sbr_extension(AACDecContext *ac, SpectralBandReplication *sbr,
 }

 static int read_sbr_single_channel_element(AACDecContext *ac,
-                                            SpectralBandReplication *sbr,
-                                            GetBitContext *gb)
+                                           SpectralBandReplication *sbr,
+                                           GetBitContext *gb, int numTimeSlots)
 {
    int ret;

    if (get_bits1(gb)) // bs_data_extra
        skip_bits(gb, 4); // bs_reserved

-    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], numTimeSlots))
        return -1;
    read_sbr_dtdf(sbr, gb, &sbr->data[0], 0);
    read_sbr_invf(sbr, gb, &sbr->data[0]);
@@ -1015,8 +1014,8 @@ static int read_sbr_single_channel_element(AACDecContext *ac,
 }

 static int read_sbr_channel_pair_element(AACDecContext *ac,
-                                          SpectralBandReplication *sbr,
-                                          GetBitContext *gb)
+                                         SpectralBandReplication *sbr,
+                                         GetBitContext *gb, int numTimeSlots)
 {
    int ret;

@@ -1024,7 +1023,7 @@ static int read_sbr_channel_pair_element(AACDecContext *ac,
        skip_bits(gb, 8); // bs_reserved

    if ((sbr->bs_coupling = get_bits1(gb))) {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], numTimeSlots))
            return -1;
        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
        read_sbr_dtdf(sbr, gb, &sbr->data[0], 0);
@@ -1041,8 +1040,8 @@ static int read_sbr_channel_pair_element(AACDecContext *ac,
        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[1], 1)) < 0)
            return ret;
    } else {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
-            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], numTimeSlots) ||
+            read_sbr_grid(ac, sbr, gb, &sbr->data[1], numTimeSlots))
            return -1;
        read_sbr_dtdf(sbr, gb, &sbr->data[0], 0);
        read_sbr_dtdf(sbr, gb, &sbr->data[1], 0);
@@ -1067,7 +1066,7 @@ static int read_sbr_channel_pair_element(AACDecContext *ac,
 }

 static unsigned int read_sbr_data(AACDecContext *ac, SpectralBandReplication *sbr,
-                                  GetBitContext *gb, int id_aac)
+                                  GetBitContext *gb, int id_aac, int numTimeSlots)
 {
    unsigned int cnt = get_bits_count(gb);

@@ -1075,12 +1074,12 @@ static unsigned int read_sbr_data(AACDecContext *ac, SpectralBandReplication *sb
    sbr->ready_for_dequant = 1;

    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
-        if (read_sbr_single_channel_element(ac, sbr, gb)) {
+        if (read_sbr_single_channel_element(ac, sbr, gb, numTimeSlots)) {
            sbr_turnoff(sbr);
            return get_bits_count(gb) - cnt;
        }
    } else if (id_aac == TYPE_CPE) {
-        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
+        if (read_sbr_channel_pair_element(ac, sbr, gb, numTimeSlots)) {
            sbr_turnoff(sbr);
            return get_bits_count(gb) - cnt;
        }
@@ -1133,12 +1132,13 @@ static void sbr_reset(AACDecContext *ac, SpectralBandReplication *sbr)
 */
 int AAC_RENAME(ff_aac_sbr_decode_extension)(AACDecContext *ac, ChannelElement *che,
                                            GetBitContext *gb_host, int crc,
-                                            int cnt, int id_aac)
+                                            int cnt, int id_aac, int fl960)
 {
    SpectralBandReplication *sbr = get_sbr(che);
    unsigned int num_sbr_bits = 0, num_align_bits;
    unsigned bytes_read;
    GetBitContext gbc = *gb_host, *gb = &gbc;
+    int numTimeSlots = fl960 ? 15 : 16;
    skip_bits_long(gb_host, cnt*8 - 4);

    sbr->reset = 0;
@@ -1166,7 +1166,7 @@ int AAC_RENAME(ff_aac_sbr_decode_extension)(AACDecContext *ac, ChannelElement *c
        sbr_reset(ac, sbr);

    if (sbr->start)
-        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
+        num_sbr_bits += read_sbr_data(ac, sbr, gb, id_aac, numTimeSlots);

    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
@@ -1272,7 +1272,7 @@ int ff_aac_sbr_decode_usac_data(AACDecContext *ac, ChannelElement *che,
    if (sbr_ch == 1) { /* sbr_single_channel_element */
        /* if (harmonicSBR) ... */

-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], 16))
            return -1;

        read_sbr_dtdf(sbr, gb, &sbr->data[0], indep_flag);
@@ -1291,7 +1291,7 @@ int ff_aac_sbr_decode_usac_data(AACDecContext *ac, ChannelElement *che,

        /* if (harmonicSBR) ... */

-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], 16))
            return -1;
        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);

@@ -1323,9 +1323,9 @@ int ff_aac_sbr_decode_usac_data(AACDecContext *ac, ChannelElement *che,

        /* if (harmonicSBR) ... */

-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0], 16))
            return -1;
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[1], 16))
            return -1;

        read_sbr_dtdf(sbr, gb, &sbr->data[0], indep_flag);
@@ -1369,16 +1369,17 @@ static void sbr_qmf_analysis(AVFloatDSPContext *dsp, AVTXContext *mdct,
                             av_tx_fn mdct_fn,
 #endif /* USE_FIXED */
                             SBRDSPContext *sbrdsp, const INTFLOAT *in, INTFLOAT *x,
-                             INTFLOAT z[320], INTFLOAT W[2][32][32][2], int buf_idx)
+                             INTFLOAT z[320], INTFLOAT W[2][32][32][2], int buf_idx,
+                             int numTimeSlots)
 {
    int i;
 #if USE_FIXED
    int j;
 #endif
-    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
-    memcpy(x+288, in,         1024*sizeof(x[0]));
-    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
-                               // are not supported
+    int nb = numTimeSlots * 64;
+    memcpy(x    , x+nb, (320-32)*sizeof(x[0]));
+    memcpy(x+288, in,         nb*sizeof(x[0]));
+    for (i = 0; i < numTimeSlots*2; i++) { // RATE*numTimeSlots = 2* 16 or 15
        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
        sbrdsp->sum64x5(z);
        sbrdsp->qmf_pre_shuffle(z);
@@ -1417,13 +1418,14 @@ static void sbr_qmf_synthesis(AVTXContext *mdct, av_tx_fn mdct_fn,
 #endif /* USE_FIXED */
                              INTFLOAT *out, INTFLOAT X[2][38][64],
                              INTFLOAT mdct_buf[2][64],
-                              INTFLOAT *v0, int *v_off, const unsigned int div)
+                              INTFLOAT *v0, int *v_off, int numTimeSlots,
+                              const unsigned int div)
 {
    int i, n;
    const INTFLOAT *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
    const int step = 128 >> div;
    INTFLOAT *v;
-    for (i = 0; i < 32; i++) {
+    for (i = 0; i < numTimeSlots*2; i++) {
        if (*v_off < step) {
            int saved_samples = (1280 - 128) >> div;
            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(INTFLOAT));
@@ -1463,11 +1465,11 @@ static void sbr_qmf_synthesis(AVTXContext *mdct, av_tx_fn mdct_fn,
 /// Generate the subband filtered lowband
 static int sbr_lf_gen(SpectralBandReplication *sbr,
                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
-                      int buf_idx)
+                      int buf_idx, int numTimeSlots)
 {
    int i, k;
    const int t_HFGen = 8;
-    const int i_f = 32;
+    const int i_f = numTimeSlots*2;
    memset(X_low, 0, 32*sizeof(*X_low));
    for (k = 0; k < sbr->kx[1]; k++) {
        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
@@ -1523,10 +1525,10 @@ static int sbr_hf_gen(AACDecContext *ac, SpectralBandReplication *sbr,
 /// Generate the subband filtered lowband
 static int sbr_x_gen(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
-                     const INTFLOAT X_low[32][40][2], int ch)
+                     const INTFLOAT X_low[32][40][2], int ch, int numTimeSlots)
 {
    int k, i;
-    const int i_f = 32;
+    const int i_f = numTimeSlots*2;
    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
    memset(X, 0, 2*sizeof(*X));
    for (k = 0; k < sbr->kx[0]; k++) {
@@ -1681,7 +1683,7 @@ static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2]
 }

 void AAC_RENAME(ff_aac_sbr_apply)(AACDecContext *ac, ChannelElement *che,
-                                  int id_aac, void *L_, void *R_)
+                                  int id_aac, int fl960, void *L_, void *R_)
 {
    INTFLOAT *L = L_, *R = R_;
    SpectralBandReplication *sbr = get_sbr(che);
@@ -1689,6 +1691,7 @@ void AAC_RENAME(ff_aac_sbr_apply)(AACDecContext *ac, ChannelElement *che,
    int ch;
    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
    int err;
+    int numTimeSlots = fl960 ? 15 : 16;

    if (id_aac != sbr->id_aac) {
        av_log(ac->avctx, id_aac == TYPE_LFE ? AV_LOG_VERBOSE : AV_LOG_WARNING,
@@ -1718,10 +1721,10 @@ void AAC_RENAME(ff_aac_sbr_apply)(AACDecContext *ac, ChannelElement *che,
        sbr_qmf_analysis(ac->fdsp, sbr->mdct_ana, sbr->mdct_ana_fn, &sbr->dsp,
                         ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
                         (INTFLOAT*)sbr->qmf_filter_scratch,
-                         sbr->data[ch].W, sbr->data[ch].Ypos);
+                         sbr->data[ch].W, sbr->data[ch].Ypos, numTimeSlots);
        sbr->c.sbr_lf_gen(sbr, sbr->X_low,
                          (const INTFLOAT (*)[32][32][2]) sbr->data[ch].W,
-                          sbr->data[ch].Ypos);
+                          sbr->data[ch].Ypos, numTimeSlots);
        sbr->data[ch].Ypos ^= 1;
        if (sbr->start) {
            sbr->c.sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
@@ -1749,9 +1752,9 @@ void AAC_RENAME(ff_aac_sbr_apply)(AACDecContext *ac, ChannelElement *che,

        /* synthesis */
        sbr->c.sbr_x_gen(sbr, sbr->X[ch],
-                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
-                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
-                  (const INTFLOAT (*)[40][2]) sbr->X_low, ch);
+                         (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
+                         (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
+                         (const INTFLOAT (*)[40][2]) sbr->X_low, ch, numTimeSlots);
    }

    if (ac->oc[1].m4ac.ps == 1) {
@@ -1767,13 +1770,13 @@ void AAC_RENAME(ff_aac_sbr_apply)(AACDecContext *ac, ChannelElement *che,
                      L, sbr->X[0], sbr->qmf_filter_scratch,
                      sbr->data[0].synthesis_filterbank_samples,
                      &sbr->data[0].synthesis_filterbank_samples_offset,
-                      downsampled);
+                      numTimeSlots, downsampled);
    if (nch == 2)
        sbr_qmf_synthesis(sbr->mdct, sbr->mdct_fn, &sbr->dsp, ac->fdsp,
                          R, sbr->X[1], sbr->qmf_filter_scratch,
                          sbr->data[1].synthesis_filterbank_samples,
                          &sbr->data[1].synthesis_filterbank_samples_offset,
-                          downsampled);
+                          numTimeSlots, downsampled);
 }

 static void aacsbr_func_ptr_init(AACSBRContext *c)
@@ -22,7 +22,8 @@ OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
 OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
                                           aarch64/sbrdsp_init_aarch64.o
 OBJS-$(CONFIG_AAC_ENCODER)              += aarch64/aacencdsp_init.o
-OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init_aarch64.o \
+                                           aarch64/synth_filter_init.o
 OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
@@ -65,7 +66,8 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o

 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o             \
+                                           aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
@@ -78,6 +80,8 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
                                           aarch64/hevcdsp_dequant_neon.o      \
                                           aarch64/hevcdsp_idct_neon.o         \
                                           aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcpred_neon.o             \
+                                           aarch64/hevcpred_init_aarch64.o     \
                                           aarch64/h26x/epel_neon.o            \
                                           aarch64/h26x/qpel_neon.o            \
                                           aarch64/h26x/sao_neon.o
@@ -0,0 +1,42 @@
+/*
+ * AArch64 NEON optimised DCA DSP functions
+ * Copyright (c) 2026 Jeongkeun Kim <variety0724@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/dcadsp.h"
+
+void ff_lfe_fir0_float_neon(float *pcm_samples, const int32_t *lfe_samples,
+                            const float *filter_coeff, ptrdiff_t npcmblocks);
+void ff_lfe_fir1_float_neon(float *pcm_samples, const int32_t *lfe_samples,
+                            const float *filter_coeff, ptrdiff_t npcmblocks);
+
+av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+
+    /* Leave the function pointers untouched unless NEON is usable. */
+    if (!have_neon(flags))
+        return;
+
+    /* Install the NEON versions of both float LFE FIR entries. */
+    s->lfe_fir_float[0] = ff_lfe_fir0_float_neon;
+    s->lfe_fir_float[1] = ff_lfe_fir1_float_neon;
+}
@@ -0,0 +1,101 @@
+/*
+ * AArch64 NEON optimised DCA LFE FIR filter functions
+ * Copyright (c) 2026 Jeongkeun Kim <variety0724@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_lfe_fir0_float_neon(float *pcm_samples, const int32_t *lfe_samples,
+//                             const float *filter_coeff, ptrdiff_t npcmblocks)
+// x0: pcm_samples, x1: lfe_samples, x2: filter_coeff, x3: npcmblocks
+//
+// Runs npcmblocks / 2 outer iterations. Each one takes the eight int32
+// samples lfe_samples[-7..0], converts them to float and writes 64 floats:
+//   pcm_samples[j]      = sum_k filter_coeff[8 * j + k]       * lfe_samples[-k]
+//   pcm_samples[32 + j] = sum_k filter_coeff[255 - 8 * j - k] * lfe_samples[-k]
+// for j = 0..31 and k = 0..7; afterwards lfe_samples advances by one sample
+// and pcm_samples by 64 floats.
+// Modifies x0, x1, x3-x8, v0-v7, v16, v17 and the flags.
+function ff_lfe_fir0_float_neon, export=1
+        lsr             x3, x3, #1              // outer iteration count
+        // The loops below are tested at the bottom only; without this check
+        // a count of zero would still read 8 samples and write 64 floats.
+        cbz             x3, 9f
+        sub             x1, x1, #(7*4)          // x1 -> lfe_samples[-7]
+.Louter0:
+        ld1             {v4.4s, v5.4s}, [x1]    // v4 = s[-7..-4], v5 = s[-3..0]
+        scvtf           v4.4s, v4.4s
+        scvtf           v5.4s, v5.4s
+
+        // Reverse the eight samples: v6 = s[0..-3], v7 = s[-4..-7]
+        ext             v6.16b, v5.16b, v5.16b, #8
+        rev64           v6.4s,  v6.4s
+        ext             v7.16b, v4.16b, v4.16b, #8
+        rev64           v7.4s,  v7.4s
+
+        mov             x4, x2                  // coefficients, walking upwards
+        add             x5, x2, #(248*4)        // coefficients, walking downwards
+        mov             x6, x0                  // first half of the output block
+        add             x7, x0, #(32*4)         // second half of the output block
+        mov             w8, #32
+.Linner0:
+        ld1             {v0.4s,  v1.4s},  [x4], #32
+        ld1             {v16.4s, v17.4s}, [x5]
+        sub             x5, x5, #32
+        subs            w8, w8, #1
+        // Ascending coefficients pair with the reversed samples, descending
+        // coefficients with the samples in memory order.
+        fmul            v2.4s, v0.4s,  v6.4s
+        fmul            v3.4s, v16.4s, v4.4s
+        fmla            v2.4s, v1.4s,  v7.4s
+        fmla            v3.4s, v17.4s, v5.4s
+        // Horizontal sums of the four lanes
+        faddp           v2.4s, v2.4s, v2.4s
+        faddp           v3.4s, v3.4s, v3.4s
+        faddp           s2, v2.2s
+        faddp           s3, v3.2s
+        str             s2, [x6], #4
+        str             s3, [x7], #4
+        b.gt            .Linner0
+
+        subs            x3, x3, #1
+        add             x1, x1, #4              // next LFE sample
+        add             x0, x0, #(64*4)         // next block of 64 outputs
+        b.gt            .Louter0
+9:
+        ret
+endfunc
+
+// void ff_lfe_fir1_float_neon(float *pcm_samples, const int32_t *lfe_samples,
+//                             const float *filter_coeff, ptrdiff_t npcmblocks)
+// x0: pcm_samples, x1: lfe_samples, x2: filter_coeff, x3: npcmblocks
+//
+// Runs npcmblocks / 4 outer iterations. Each one takes the four int32
+// samples lfe_samples[-3..0], converts them to float and writes 128 floats:
+//   pcm_samples[j]      = sum_k filter_coeff[4 * j + k]       * lfe_samples[-k]
+//   pcm_samples[64 + j] = sum_k filter_coeff[255 - 4 * j - k] * lfe_samples[-k]
+// for j = 0..63 and k = 0..3; afterwards lfe_samples advances by one sample
+// and pcm_samples by 128 floats.
+// Modifies x0, x1, x3-x8, v0, v2-v5, v16 and the flags.
+function ff_lfe_fir1_float_neon, export=1
+        lsr             x3, x3, #2              // outer iteration count
+        // The loops below are tested at the bottom only; without this check
+        // a count of zero would still read 4 samples and write 128 floats.
+        cbz             x3, 9f
+        sub             x1, x1, #(3*4)          // x1 -> lfe_samples[-3]
+.Louter1:
+        ld1             {v4.4s}, [x1]           // v4 = s[-3..0]
+        scvtf           v4.4s, v4.4s
+
+        // v5 = the same four samples in reverse order, s[0..-3]
+        ext             v5.16b, v4.16b, v4.16b, #8
+        rev64           v5.4s,  v5.4s
+
+        mov             x4, x2                  // coefficients, walking upwards
+        add             x5, x2, #(252*4)        // coefficients, walking downwards
+        mov             x6, x0                  // first half of the output block
+        add             x7, x0, #(64*4)         // second half of the output block
+        mov             w8, #64
+.Linner1:
+        ld1             {v0.4s},  [x4], #16
+        ld1             {v16.4s}, [x5]
+        sub             x5, x5, #16
+        subs            w8, w8, #1
+        // Ascending coefficients pair with the reversed samples, descending
+        // coefficients with the samples in memory order.
+        fmul            v2.4s, v0.4s,  v5.4s
+        fmul            v3.4s, v16.4s, v4.4s
+        // Horizontal sums of the four lanes
+        faddp           v2.4s, v2.4s, v2.4s
+        faddp           v3.4s, v3.4s, v3.4s
+        faddp           s2, v2.2s
+        faddp           s3, v3.2s
+        str             s2, [x6], #4
+        str             s3, [x7], #4
+        b.gt            .Linner1
+
+        subs            x3, x3, #1
+        add             x1, x1, #4              // next LFE sample
+        add             x0, x0, #(128*4)        // next block of 128 outputs
+        b.gt            .Louter1
+9:
+        ret
+endfunc
@@ -130,6 +130,10 @@ NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
        const uint8_t *src, ptrdiff_t srcstride,
        int height, intptr_t mx, intptr_t my, int width),);

+NEON8_FNPROTO(epel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
        const uint8_t *src, ptrdiff_t srcstride,
        int height, intptr_t mx, intptr_t my, int width),);
@@ -143,7 +147,7 @@ NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width),);

-NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+NEON8_FNPROTO(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width),);
@@ -222,12 +226,12 @@ NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width), _i8mm);

-NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width),);

-NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width), _i8mm);
@@ -1276,6 +1276,7 @@ function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
 endfunc

 function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -1292,10 +1293,12 @@ function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
        bl              X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -1312,6 +1315,7 @@ function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
        bl              X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

@@ -1744,6 +1748,398 @@ function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
        ret
 endfunc

+// epel_uni_h: horizontal EPEL filter with output to uint8_t
+// void put_hevc_epel_uni_h(uint8_t *dst, ptrdiff_t dststride,
+//                          const uint8_t *src, ptrdiff_t srcstride,
+//                          int height, intptr_t mx, intptr_t my, int width)
+// x0: dst, x1: dststride, x2: src, x3: srcstride, w4: height, x5: mx
+//
+// Every variant handles one row per loop iteration and runs until w4
+// reaches zero. Only x7 and v0-v7/v16-v31 are used as scratch, so no
+// callee-saved register and no stack memory is touched.
+
+// Load the four signed 8-bit taps selected by mx (x5, four bytes per entry
+// of epel_filters), widen them to 16 bit in v0.h[0..3] and step src back by
+// one pixel so that output pixel i is computed from src[i-1 .. i+2].
+// Clobbers x7 and v30.
+.macro EPEL_UNI_H_HEADER
+        movrel          x7, epel_filters
+        add             x7, x7, x5, lsl #2
+        ld1r            {v30.4s}, [x7]
+        sxtl            v0.8h, v30.8b
+        sub             x2, x2, #1
+.endm
+
+// Apply the 4-tap filter in v0.h[0..3] to eight neighbouring pixels.
+//   \dst      receives the eight 16-bit sums (not yet rounded or narrowed);
+//             must differ from every other operand
+//   \lo, \hi  sixteen consecutive source pixels already widened to 16 bit;
+//             output lane i is built from pixels i..i+3 of the \lo:\hi pair
+//   \t0-\t2   scratch vectors, clobbered
+.macro EPEL_UNI_H_FILTER8 dst, lo, hi, t0=v16, t1=v17, t2=v18
+        ext             \t0\().16b, \lo\().16b, \hi\().16b, #2
+        ext             \t1\().16b, \lo\().16b, \hi\().16b, #4
+        ext             \t2\().16b, \lo\().16b, \hi\().16b, #6
+        mul             \dst\().8h, \lo\().8h, v0.h[0]
+        mla             \dst\().8h, \t0\().8h, v0.h[1]
+        mla             \dst\().8h, \t1\().8h, v0.h[2]
+        mla             \dst\().8h, \t2\().8h, v0.h[3]
+.endm
+
+// Width 4: reads 8 source bytes per row, stores 4.
+function ff_hevc_put_hevc_epel_uni_h4_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v4.8b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl            v4.8h, v4.8b
+        // Only the low four lanes are used, so rotating v4 against itself
+        // is enough to obtain pixels 1..4, 2..5 and 3..6.
+        ext             v5.16b, v4.16b, v4.16b, #2
+        ext             v6.16b, v4.16b, v4.16b, #4
+        ext             v7.16b, v4.16b, v4.16b, #6
+        mul             v16.4h, v4.4h, v0.h[0]
+        mla             v16.4h, v5.4h, v0.h[1]
+        mla             v16.4h, v6.4h, v0.h[2]
+        mla             v16.4h, v7.4h, v0.h[3]
+        // Round, shift right by 6 and saturate to unsigned 8 bit
+        sqrshrun        v16.8b, v16.8h, #6
+        st1             {v16.s}[0], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Width 6: reads 16 source bytes per row, stores 4 + 2.
+function ff_hevc_put_hevc_epel_uni_h6_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl2           v4.8h, v3.16b
+        uxtl            v3.8h, v3.8b
+        EPEL_UNI_H_FILTER8 v16, v3, v4, v5, v6, v7
+        sqrshrun        v16.8b, v16.8h, #6
+        add             x7, x0, #4              // address of output bytes 4-5
+        st1             {v16.s}[0], [x0], x1
+        st1             {v16.h}[2], [x7]
+        b.ne            1b
+        ret
+endfunc
+
+// Width 8: reads 16 source bytes per row, stores 8.
+function ff_hevc_put_hevc_epel_uni_h8_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl2           v4.8h, v3.16b
+        uxtl            v3.8h, v3.8b
+        EPEL_UNI_H_FILTER8 v16, v3, v4, v5, v6, v7
+        sqrshrun        v16.8b, v16.8h, #6
+        st1             {v16.8b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Width 12: reads 16 source bytes per row, stores 8 + 4.
+function ff_hevc_put_hevc_epel_uni_h12_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl2           v4.8h, v3.16b
+        uxtl            v3.8h, v3.8b
+        // Shifted views for output pixels 0-7
+        ext             v5.16b, v3.16b, v4.16b, #2
+        ext             v6.16b, v3.16b, v4.16b, #4
+        ext             v7.16b, v3.16b, v4.16b, #6
+        // Shifted views for output pixels 8-11 (low four lanes only)
+        ext             v20.16b, v4.16b, v4.16b, #2
+        ext             v21.16b, v4.16b, v4.16b, #4
+        ext             v22.16b, v4.16b, v4.16b, #6
+        mul             v16.8h, v3.8h, v0.h[0]
+        mla             v16.8h, v5.8h, v0.h[1]
+        mla             v16.8h, v6.8h, v0.h[2]
+        mla             v16.8h, v7.8h, v0.h[3]
+        mul             v17.4h, v4.4h, v0.h[0]
+        mla             v17.4h, v20.4h, v0.h[1]
+        mla             v17.4h, v21.4h, v0.h[2]
+        mla             v17.4h, v22.4h, v0.h[3]
+        sqrshrun        v16.8b, v16.8h, #6
+        sqrshrun        v17.8b, v17.8h, #6
+        add             x7, x0, #8              // address of output bytes 8-11
+        st1             {v16.8b}, [x0], x1
+        st1             {v17.s}[0], [x7]
+        b.ne            1b
+        ret
+endfunc
+
+// Width 16: reads 32 source bytes per row, stores 16.
+function ff_hevc_put_hevc_epel_uni_h16_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v2.16b, v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl            v4.8h, v2.8b
+        uxtl2           v5.8h, v2.16b
+        uxtl            v6.8h, v3.8b
+        // All shifted views are formed first, then both halves are filtered
+        ext             v16.16b, v4.16b, v5.16b, #2
+        ext             v17.16b, v4.16b, v5.16b, #4
+        ext             v18.16b, v4.16b, v5.16b, #6
+        ext             v19.16b, v5.16b, v6.16b, #2
+        ext             v20.16b, v5.16b, v6.16b, #4
+        ext             v21.16b, v5.16b, v6.16b, #6
+        mul             v22.8h, v4.8h, v0.h[0]
+        mla             v22.8h, v16.8h, v0.h[1]
+        mla             v22.8h, v17.8h, v0.h[2]
+        mla             v22.8h, v18.8h, v0.h[3]
+        mul             v23.8h, v5.8h, v0.h[0]
+        mla             v23.8h, v19.8h, v0.h[1]
+        mla             v23.8h, v20.8h, v0.h[2]
+        mla             v23.8h, v21.8h, v0.h[3]
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v22.16b, v23.8h, #6
+        st1             {v22.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Width 24: reads 32 source bytes per row, stores 16 + 8.
+function ff_hevc_put_hevc_epel_uni_h24_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v1.16b, v2.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl            v3.8h, v1.8b
+        uxtl2           v4.8h, v1.16b
+        uxtl            v5.8h, v2.8b
+        uxtl2           v6.8h, v2.16b
+        // First 8 pixels
+        EPEL_UNI_H_FILTER8 v22, v3, v4
+        // Second 8 pixels
+        EPEL_UNI_H_FILTER8 v23, v4, v5
+        // Third 8 pixels
+        EPEL_UNI_H_FILTER8 v24, v5, v6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v22.16b, v23.8h, #6
+        sqrshrun        v23.8b, v24.8h, #6
+        add             x7, x0, #16             // address of output bytes 16-23
+        st1             {v22.16b}, [x0], x1
+        st1             {v23.8b}, [x7]
+        b.ne            1b
+        ret
+endfunc
+
+// Width 32: reads 48 source bytes per row, stores 32.
+function ff_hevc_put_hevc_epel_uni_h32_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v1.16b, v2.16b, v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        uxtl            v4.8h, v1.8b
+        uxtl2           v5.8h, v1.16b
+        uxtl            v6.8h, v2.8b
+        uxtl2           v7.8h, v2.16b
+        uxtl            v26.8h, v3.8b
+        // First 8 pixels
+        EPEL_UNI_H_FILTER8 v22, v4, v5
+        // Second 8 pixels
+        EPEL_UNI_H_FILTER8 v23, v5, v6
+        // Third 8 pixels
+        EPEL_UNI_H_FILTER8 v24, v6, v7
+        // Fourth 8 pixels
+        EPEL_UNI_H_FILTER8 v25, v7, v26
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v22.16b, v23.8h, #6
+        sqrshrun        v23.8b, v24.8h, #6
+        sqrshrun2       v23.16b, v25.8h, #6
+        st1             {v22.16b, v23.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Width 48: reads 56 source bytes per row, stores 48.
+// Widened pixels live in v4-v7, v19, v20 and v27; all of them are
+// caller-saved, so nothing has to be spilled.
+function ff_hevc_put_hevc_epel_uni_h48_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      ld1             {v1.16b, v2.16b, v3.16b}, [x2]
+        add             x7, x2, #48
+        ld1             {v26.8b}, [x7]          // source bytes 48-55
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        uxtl            v4.8h, v1.8b
+        uxtl2           v5.8h, v1.16b
+        uxtl            v6.8h, v2.8b
+        uxtl2           v7.8h, v2.16b
+        uxtl            v19.8h, v3.8b
+        uxtl2           v20.8h, v3.16b
+        uxtl            v27.8h, v26.8b
+        // First 8 pixels
+        EPEL_UNI_H_FILTER8 v22, v4, v5
+        // Second 8 pixels
+        EPEL_UNI_H_FILTER8 v23, v5, v6
+        // Third 8 pixels
+        EPEL_UNI_H_FILTER8 v24, v6, v7
+        // Fourth 8 pixels
+        EPEL_UNI_H_FILTER8 v25, v7, v19
+        // Fifth 8 pixels
+        EPEL_UNI_H_FILTER8 v28, v19, v20
+        // Sixth 8 pixels
+        EPEL_UNI_H_FILTER8 v29, v20, v27
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v22.16b, v23.8h, #6
+        sqrshrun        v23.8b, v24.8h, #6
+        sqrshrun2       v23.16b, v25.8h, #6
+        sqrshrun        v24.8b, v28.8h, #6
+        sqrshrun2       v24.16b, v29.8h, #6
+        st1             {v22.16b, v23.16b, v24.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Width 64: reads 80 source bytes per row, stores 64.
+// Widened pixels live in v4-v7, v19-v21, v29 and v28; all of them are
+// caller-saved, so nothing has to be spilled.
+function ff_hevc_put_hevc_epel_uni_h64_8_neon, export=1
+        EPEL_UNI_H_HEADER
+1:      add             x7, x2, #48
+        ld1             {v1.16b, v2.16b, v3.16b}, [x2]
+        ld1             {v26.16b, v27.16b}, [x7] // source bytes 48-79
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        uxtl            v4.8h, v1.8b
+        uxtl2           v5.8h, v1.16b
+        uxtl            v6.8h, v2.8b
+        uxtl2           v7.8h, v2.16b
+        uxtl            v19.8h, v3.8b
+        uxtl2           v20.8h, v3.16b
+        uxtl            v21.8h, v26.8b
+        uxtl2           v29.8h, v26.16b
+        uxtl            v28.8h, v27.8b
+        // First 8 pixels
+        EPEL_UNI_H_FILTER8 v22, v4, v5
+        // Second 8 pixels
+        EPEL_UNI_H_FILTER8 v23, v5, v6
+        // Third 8 pixels
+        EPEL_UNI_H_FILTER8 v24, v6, v7
+        // Fourth 8 pixels
+        EPEL_UNI_H_FILTER8 v25, v7, v19
+        // Narrow the first 32 outputs now so v24/v25 can be reused below
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v22.16b, v23.8h, #6
+        sqrshrun        v23.8b, v24.8h, #6
+        sqrshrun2       v23.16b, v25.8h, #6
+        // Fifth 8 pixels
+        EPEL_UNI_H_FILTER8 v24, v19, v20
+        // Sixth 8 pixels
+        EPEL_UNI_H_FILTER8 v25, v20, v21
+        // Seventh 8 pixels
+        EPEL_UNI_H_FILTER8 v26, v21, v29
+        // Eighth 8 pixels
+        EPEL_UNI_H_FILTER8 v27, v29, v28
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        sqrshrun        v25.8b, v26.8h, #6
+        sqrshrun2       v25.16b, v27.8h, #6
+        st1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc

 .macro EPEL_H_HEADER
        movrel          x5, epel_filters
@@ -2824,6 +3220,7 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2832,6 +3229,7 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h4_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv4_8_end_neon
 endfunc

@@ -2839,6 +3237,7 @@ function ff_vvc_put_epel_hv4_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #8
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2847,6 +3246,7 @@ function ff_vvc_put_epel_hv4_8_\suffix, export=1
        bl              X(ff_vvc_put_epel_h4_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               vvc_put_epel_hv4_8_end_neon
 endfunc

@@ -2854,6 +3254,7 @@ function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2862,6 +3263,7 @@ function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h6_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv6_8_end_neon
 endfunc

@@ -2869,6 +3271,7 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2877,6 +3280,7 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h8_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv8_8_end_neon
 endfunc

@@ -2884,6 +3288,7 @@ function ff_vvc_put_epel_hv8_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #8
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2892,6 +3297,7 @@ function ff_vvc_put_epel_hv8_8_\suffix, export=1
        bl              X(ff_vvc_put_epel_h8_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               vvc_put_epel_hv8_8_end_neon
 endfunc

@@ -2899,6 +3305,7 @@ function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2907,6 +3314,7 @@ function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h12_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv12_8_end_neon
 endfunc

@@ -2914,6 +3322,7 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2922,6 +3331,7 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h16_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv16_8_end_neon
 endfunc

@@ -2929,6 +3339,7 @@ function ff_vvc_put_epel_hv16_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #8
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2937,6 +3348,7 @@ function ff_vvc_put_epel_hv16_8_\suffix, export=1
        bl              X(ff_vvc_put_epel_h16_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               vvc_put_epel_hv16_8_end_neon
 endfunc

@@ -2944,6 +3356,7 @@ function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
        add             w10, w3, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
@@ -2952,10 +3365,12 @@ function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_h24_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_hv24_8_end_neon
 endfunc

 function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -2970,10 +3385,12 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
        mov             x6, #16
        bl              X(ff_hevc_put_hevc_epel_hv16_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_vvc_put_epel_hv32_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -2988,10 +3405,12 @@ function ff_vvc_put_epel_hv32_8_\suffix, export=1
        mov             x6, #16
        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3006,10 +3425,12 @@ function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
        mov             x6, #24
        bl              X(ff_hevc_put_hevc_epel_hv24_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3038,10 +3459,12 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
        mov             x6, #16
        bl              X(ff_hevc_put_hevc_epel_hv16_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_vvc_put_epel_hv64_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3056,10 +3479,12 @@ function ff_vvc_put_epel_hv64_8_\suffix, export=1
        mov             x6, #32
        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_vvc_put_epel_hv128_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3074,6 +3499,7 @@ function ff_vvc_put_epel_hv128_8_\suffix, export=1
        mov             x6, #64
        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

@@ -3214,6 +3640,7 @@ function ff_hevc_put_hevc_epel_uni_hv4_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3226,6 +3653,7 @@ function ff_hevc_put_hevc_epel_uni_hv4_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv4_8_end_neon
 endfunc

@@ -3233,6 +3661,7 @@ function ff_hevc_put_hevc_epel_uni_hv6_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3245,6 +3674,7 @@ function ff_hevc_put_hevc_epel_uni_hv6_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv6_8_end_neon
 endfunc

@@ -3252,6 +3682,7 @@ function ff_hevc_put_hevc_epel_uni_hv8_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3264,6 +3695,7 @@ function ff_hevc_put_hevc_epel_uni_hv8_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv8_8_end_neon
 endfunc

@@ -3271,6 +3703,7 @@ function ff_hevc_put_hevc_epel_uni_hv12_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3283,6 +3716,7 @@ function ff_hevc_put_hevc_epel_uni_hv12_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv12_8_end_neon
 endfunc

@@ -3290,6 +3724,7 @@ function ff_hevc_put_hevc_epel_uni_hv16_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3302,6 +3737,7 @@ function ff_hevc_put_hevc_epel_uni_hv16_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv16_8_end_neon
 endfunc

@@ -3309,6 +3745,7 @@ function ff_hevc_put_hevc_epel_uni_hv24_8_\suffix, export=1
        add             w10, w4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -3321,10 +3758,12 @@ function ff_hevc_put_hevc_epel_uni_hv24_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_hv24_8_end_neon
 endfunc

 function ff_hevc_put_hevc_epel_uni_hv32_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x6, [sp, #-64]!
        stp             x3, x4, [sp, #16]
        stp             x1, x2, [sp, #32]
@@ -3341,10 +3780,12 @@ function ff_hevc_put_hevc_epel_uni_hv32_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_\suffix)
        ldr             x30, [sp, #56]
        add             sp, sp, #64
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_uni_hv48_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x6, [sp, #-64]!
        stp             x3, x4, [sp, #16]
        stp             x1, x2, [sp, #32]
@@ -3361,10 +3802,12 @@ function ff_hevc_put_hevc_epel_uni_hv48_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_\suffix)
        ldr             x30, [sp, #56]
        add             sp, sp, #64
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_uni_hv64_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x5, x6, [sp, #-64]!
        stp             x3, x4, [sp, #16]
        stp             x1, x2, [sp, #32]
@@ -3397,6 +3840,7 @@ function ff_hevc_put_hevc_epel_uni_hv64_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_\suffix)
        ldr             x30, [sp, #56]
        add             sp, sp, #64
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc
 .endm
@@ -4202,6 +4646,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv4_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4214,6 +4659,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv4_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv4_8_end_neon
 endfunc

@@ -4224,6 +4670,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv6_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4236,6 +4683,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv6_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv6_8_end_neon
 endfunc

@@ -4246,6 +4694,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv8_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4258,6 +4707,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv8_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv8_8_end_neon
 endfunc

@@ -4268,6 +4718,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv12_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4280,6 +4731,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv12_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv12_8_end_neon
 endfunc

@@ -4290,6 +4742,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4302,6 +4755,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv16_8_end_neon
 endfunc

@@ -4312,6 +4766,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix, export=1
        add             x10, x4, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10     // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4324,10 +4779,12 @@ function ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix, export=1
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_uni_w_hv24_8_end_neon
 endfunc

 function ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        ldp             x15, x16, [sp]
        mov             x17, #16
        stp             x15, x16, [sp, #-96]!
@@ -4352,10 +4809,12 @@ function ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix)
        ldp             x17, x30, [sp, #16]
        ldp             x15, x16, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_uni_w_hv48_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        ldp             x15, x16, [sp]
        mov             x17, #24
        stp             x15, x16, [sp, #-96]!
@@ -4379,10 +4838,12 @@ function ff_hevc_put_hevc_epel_uni_w_hv48_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix)
        ldp             x17, x30, [sp, #16]
        ldp             x15, x16, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_uni_w_hv64_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        ldp             x15, x16, [sp]
        mov             x17, #32
        stp             x15, x16, [sp, #-96]!
@@ -4407,6 +4868,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_\suffix, export=1
        bl              X(ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix)
        ldp             x17, x30, [sp, #16]
        ldp             x15, x16, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc
 .endm
@@ -4597,6 +5059,7 @@ function ff_hevc_put_hevc_epel_bi_hv4_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4610,6 +5073,7 @@ function ff_hevc_put_hevc_epel_bi_hv4_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv4_8_end_neon
 endfunc

@@ -4617,6 +5081,7 @@ function ff_hevc_put_hevc_epel_bi_hv6_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4630,6 +5095,7 @@ function ff_hevc_put_hevc_epel_bi_hv6_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv6_8_end_neon
 endfunc

@@ -4637,6 +5103,7 @@ function ff_hevc_put_hevc_epel_bi_hv8_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4650,6 +5117,7 @@ function ff_hevc_put_hevc_epel_bi_hv8_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv8_8_end_neon
 endfunc

@@ -4657,6 +5125,7 @@ function ff_hevc_put_hevc_epel_bi_hv12_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4670,6 +5139,7 @@ function ff_hevc_put_hevc_epel_bi_hv12_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv12_8_end_neon
 endfunc

@@ -4677,6 +5147,7 @@ function ff_hevc_put_hevc_epel_bi_hv16_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4690,6 +5161,7 @@ function ff_hevc_put_hevc_epel_bi_hv16_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv16_8_end_neon
 endfunc

@@ -4697,6 +5169,7 @@ function ff_hevc_put_hevc_epel_bi_hv24_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4710,6 +5183,7 @@ function ff_hevc_put_hevc_epel_bi_hv24_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv24_8_end_neon
 endfunc

@@ -4718,6 +5192,7 @@ function ff_hevc_put_hevc_epel_bi_hv32_8_\suffix, export=1
        add             w10, w5, #3
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
+        AARCH64_SIGN_LINK_REGISTER
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
@@ -4732,10 +5207,12 @@ function ff_hevc_put_hevc_epel_bi_hv32_8_\suffix, export=1
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
+        AARCH64_VALIDATE_LINK_REGISTER
        b               hevc_put_hevc_epel_bi_hv32_8_end_neon
 endfunc

 function ff_hevc_put_hevc_epel_bi_hv48_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x6, x7, [sp, #-80]!
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
@@ -4751,10 +5228,12 @@ function ff_hevc_put_hevc_epel_bi_hv48_8_\suffix, export=1
        add             x4, x4, #48
        bl              X(ff_hevc_put_hevc_epel_bi_hv24_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

 function ff_hevc_put_hevc_epel_bi_hv64_8_\suffix, export=1
+        AARCH64_SIGN_LINK_REGISTER
        stp             x6, x7, [sp, #-80]!
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
@@ -4770,6 +5249,7 @@ function ff_hevc_put_hevc_epel_bi_hv64_8_\suffix, export=1
        add             x4, x4, #64
        bl              X(ff_hevc_put_hevc_epel_bi_hv32_8_\suffix)
        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc
 .endm
@@ -511,8 +511,11 @@ function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
        sqxtun          v6.8b, v6.8h
        sqxtun          v7.8b, v7.8h
 .endif
+        // Status for the caller in x15: 1 = pixels were filtered and must be stored, 0 = nothing changed, skip the stores.
+        mov             x15, #1
+        ret
+3:      mov             x15, #0
        ret
-3:      ret             x6
 endfunc
 .endm

@@ -562,6 +565,7 @@ function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1
 .endif
 .endif
        bl              hevc_loop_filter_luma_body_\bitdepth\()_neon
+        cbz             x15, 9f
 .if \bitdepth > 8
 .ifc \dir, v
        transpose_8x8H  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
@@ -587,6 +591,7 @@ function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1
        st1             {v6.8b}, [x10], x1
        st1             {v7.8b}, [x10]
 .endif
+9:
        ret             x6
 endfunc
 .endm
@@ -194,6 +194,24 @@ static void hevc_dequant_12_neon(int16_t *coeffs, int16_t log2_size)
        member[8][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
        member[9][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext;

+/*
+ * Assign the plain NEON (non-i8mm) horizontal qpel functions to the
+ * member[idx][0][1] slots (v = 0, h = 1); `fn` is pasted into the symbol.
+ * There are no dedicated w24/w48/w64 functions: w12 and w24 share h12
+ * (loop x2), and w32/w48/w64 share h32 (loop).
+ * Index-to-width: [1]=4 [2]=6 [3]=8 [4]=12 [5]=16 [6]=24 [7]=32 [8]=48 [9]=64
+ */
+#define NEON8_FNASSIGN_QPEL_H(member, fn)                        \
+        member[1][0][1] = ff_hevc_put_hevc_##fn##_h4_8_neon;     \
+        member[2][0][1] = ff_hevc_put_hevc_##fn##_h6_8_neon;     \
+        member[3][0][1] = ff_hevc_put_hevc_##fn##_h8_8_neon;     \
+        member[4][0][1] =                                        \
+        member[6][0][1] = ff_hevc_put_hevc_##fn##_h12_8_neon;    \
+        member[5][0][1] = ff_hevc_put_hevc_##fn##_h16_8_neon;    \
+        member[7][0][1] =                                        \
+        member[8][0][1] =                                        \
+        member[9][0][1] = ff_hevc_put_hevc_##fn##_h32_8_neon;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -228,82 +246,77 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
        c->sao_edge_filter[2]          =
        c->sao_edge_filter[3]          =
        c->sao_edge_filter[4]          = ff_hevc_sao_edge_filter_16x16_8_neon;
-        c->put_hevc_qpel[1][0][1]      = ff_hevc_put_hevc_qpel_h4_8_neon;
-        c->put_hevc_qpel[2][0][1]      = ff_hevc_put_hevc_qpel_h6_8_neon;
-        c->put_hevc_qpel[3][0][1]      = ff_hevc_put_hevc_qpel_h8_8_neon;
-        c->put_hevc_qpel[4][0][1]      =
-        c->put_hevc_qpel[6][0][1]      = ff_hevc_put_hevc_qpel_h12_8_neon;
-        c->put_hevc_qpel[5][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
-        c->put_hevc_qpel[7][0][1]      =
-        c->put_hevc_qpel[8][0][1]      =
-        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h32_8_neon;
-        c->put_hevc_qpel_uni[1][0][1]  = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
-        c->put_hevc_qpel_uni[2][0][1]  = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
-        c->put_hevc_qpel_uni[3][0][1]  = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
-        c->put_hevc_qpel_uni[4][0][1]  =
-        c->put_hevc_qpel_uni[6][0][1]  = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
-        c->put_hevc_qpel_uni[5][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
-        c->put_hevc_qpel_uni[7][0][1]  =
-        c->put_hevc_qpel_uni[8][0][1]  =
-        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h32_8_neon;
-        c->put_hevc_qpel_bi[1][0][1]   = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
-        c->put_hevc_qpel_bi[2][0][1]   = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
-        c->put_hevc_qpel_bi[3][0][1]   = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
-        c->put_hevc_qpel_bi[4][0][1]   =
-        c->put_hevc_qpel_bi[6][0][1]   = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
-        c->put_hevc_qpel_bi[5][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
-        c->put_hevc_qpel_bi[7][0][1]   =
-        c->put_hevc_qpel_bi[8][0][1]   =
-        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h32_8_neon;

-        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
+        /* ============ qpel ============ */
        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN_QPEL_H(c->put_hevc_qpel, qpel);
        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv,);
+
+        /* qpel_uni: pixels, h, v, hv */
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN_QPEL_H(c->put_hevc_qpel_uni, qpel_uni);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,);
+
+        /* qpel_bi: pixels, h, v, hv */
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels,);
+        NEON8_FNASSIGN_QPEL_H(c->put_hevc_qpel_bi, qpel_bi);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv,);
+
+        /* qpel_uni_w: pixels, h, v, hv */
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_SHARED_32(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,);
+
+        /* qpel_bi_w: pixels only */
+        NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_qpel_bi_w, 0, 0, pel_bi_w_pixels,);
+
+        /* ============ epel ============ */
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel, 0, 1, epel_h,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv,);
+
+        /* epel_uni: pixels, h, v, hv */
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 1, epel_uni_h,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv,);
+
+        /* epel_bi: pixels, h, v, hv */
        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels,);
        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h,);
        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v,);
-        NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_qpel_bi_w, 0, 0, pel_bi_w_pixels,);
-        NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_epel_bi_w, 0, 0, pel_bi_w_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
-        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
-
-        NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel, 0, 1, epel_h,);
-        NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h,);
-
-        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv,);
-        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv,);
        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv,);

-        NEON8_FNASSIGN_SHARED_32(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,);
+        /* epel_uni_w: pixels, h, v, hv */
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv,);

-        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,);
-        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,);
-        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv,);
+        /* epel_bi_w: pixels only */
+        NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_epel_bi_w, 0, 0, pel_bi_w_pixels,);

        if (have_i8mm(cpu_flags)) {
-            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
-            NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
-            NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
-            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
-            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
-            NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv, _i8mm);
+            /* i8mm overrides: qpel */
            NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
            NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
            NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
-            NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
            NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv, _i8mm);
+
+            /* i8mm overrides: epel */
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv, _i8mm);
        }

    }
@@ -0,0 +1,111 @@
+/*
+ * HEVC Intra Prediction NEON initialization
+ *
+ * Copyright (c) 2026 Jun Zhao <barryjzhao@tencent.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevc/pred.h"
+
+// DC prediction: one assembly routine per block size, selected by pred_dc_neon() below
+void ff_hevc_pred_dc_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                               const uint8_t *left, ptrdiff_t stride,
+                               int c_idx);
+void ff_hevc_pred_dc_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                               const uint8_t *left, ptrdiff_t stride,
+                               int c_idx);
+void ff_hevc_pred_dc_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                const uint8_t *left, ptrdiff_t stride,
+                                int c_idx);
+void ff_hevc_pred_dc_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                const uint8_t *left, ptrdiff_t stride,
+                                int c_idx);
+
+// Planar prediction: installed directly into pred_planar[0..3] (4x4, 8x8, 16x16, 32x32)
+void ff_hevc_pred_planar_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                   const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_pred_planar_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                   const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_pred_planar_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                    const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_pred_planar_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                    const uint8_t *left, ptrdiff_t stride);
+
+// 3-tap reference sample filter: installed into ref_filter_3tap[0..2] (8x8, 16x16, 32x32; no 4x4 variant)
+void ff_hevc_ref_filter_3tap_8x8_8_neon(uint8_t *filtered_left,
+                                        uint8_t *filtered_top,
+                                        const uint8_t *left,
+                                        const uint8_t *top, int size);
+void ff_hevc_ref_filter_3tap_16x16_8_neon(uint8_t *filtered_left,
+                                          uint8_t *filtered_top,
+                                          const uint8_t *left,
+                                          const uint8_t *top, int size);
+void ff_hevc_ref_filter_3tap_32x32_8_neon(uint8_t *filtered_left,
+                                          uint8_t *filtered_top,
+                                          const uint8_t *left,
+                                          const uint8_t *top, int size);
+
+// Strong intra smoothing: single routine, installed as ref_filter_strong
+void ff_hevc_ref_filter_strong_8_neon(uint8_t *filtered_top, uint8_t *left,
+                                      const uint8_t *top);
+
+static void pred_dc_neon(uint8_t *src, const uint8_t *top,
+                         const uint8_t *left, ptrdiff_t stride,
+                         int log2_size, int c_idx)
+{   // Size dispatcher: forwards all arguments unchanged to the fixed-size NEON DC routine.
+    switch (log2_size) {   // log2 of the block edge: 2 -> 4x4, 3 -> 8x8, 4 -> 16x16, 5 -> 32x32
+    case 2:
+        ff_hevc_pred_dc_4x4_8_neon(src, top, left, stride, c_idx);
+        break;
+    case 3:
+        ff_hevc_pred_dc_8x8_8_neon(src, top, left, stride, c_idx);
+        break;
+    case 4:
+        ff_hevc_pred_dc_16x16_8_neon(src, top, left, stride, c_idx);
+        break;
+    case 5:
+        ff_hevc_pred_dc_32x32_8_neon(src, top, left, stride, c_idx);
+        break;
+    default:   // no routine exists for any other size; callers must never pass one
+        av_unreachable("log2_size must be 2, 3, 4 or 5");
+    }
+}
+
+av_cold void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth)
+{   // Installs the NEON intra-prediction routines into hpc when NEON is available.
+    int cpu_flags = av_get_cpu_flags();
+
+    if (!have_neon(cpu_flags))
+        return;   // without NEON, hpc is left exactly as the caller set it up
+
+    if (bit_depth == 8) {   // only this depth is handled; for any other value nothing is overridden
+        hpc->pred_dc        = pred_dc_neon;   // C wrapper that switches on log2_size
+        hpc->pred_planar[0] = ff_hevc_pred_planar_4x4_8_neon;   // index 0..3 -> 4x4, 8x8, 16x16, 32x32
+        hpc->pred_planar[1] = ff_hevc_pred_planar_8x8_8_neon;
+        hpc->pred_planar[2] = ff_hevc_pred_planar_16x16_8_neon;
+        hpc->pred_planar[3] = ff_hevc_pred_planar_32x32_8_neon;
+
+        hpc->ref_filter_3tap[0] = ff_hevc_ref_filter_3tap_8x8_8_neon;   // index 0..2 -> 8x8, 16x16, 32x32
+        hpc->ref_filter_3tap[1] = ff_hevc_ref_filter_3tap_16x16_8_neon;
+        hpc->ref_filter_3tap[2] = ff_hevc_ref_filter_3tap_32x32_8_neon;
+        hpc->ref_filter_strong  = ff_hevc_ref_filter_strong_8_neon;
+    }
+}
@@ -1169,9 +1169,11 @@ function nsse16_neon, export=1
        str             x0, [sp, #-0x40]!
        stp             x1, x2, [sp, #0x10]
        stp             x3, x4, [sp, #0x20]
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #0x30]
        bl              X(sse16_neon)
        ldr             x30, [sp, #0x30]
+        AARCH64_VALIDATE_LINK_REGISTER
        mov             w9, w0                                  // here we store score1
        ldp             x1, x2, [sp, #0x10]
        ldp             x3, x4, [sp, #0x20]
@@ -1290,9 +1292,11 @@ function nsse8_neon, export=1
        str             x0, [sp, #-0x40]!
        stp             x1, x2, [sp, #0x10]
        stp             x3, x4, [sp, #0x20]
+        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #0x30]
        bl              X(sse8_neon)
        ldr             x30, [sp, #0x30]
+        AARCH64_VALIDATE_LINK_REGISTER
        mov             w9, w0                                  // here we store score1
        ldp             x1, x2, [sp, #0x10]
        ldp             x3, x4, [sp, #0x20]
@@ -43,6 +43,36 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
 void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                   const int height, const int8_t *hf, const int8_t *vf, const int width);

+void ff_vvc_put_chroma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+
+void ff_vvc_put_chroma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+
 void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                const int height, const int8_t *hf, const int8_t *vf, const int width);
 void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
@@ -73,6 +103,19 @@ void ff_vvc_put_luma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdi
 void ff_vvc_put_luma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                    const int height, const int8_t *hf, const int8_t *vf, const int width);

+void ff_vvc_put_chroma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                      const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                      const int height, const int8_t *hf, const int8_t *vf, const int width);
+
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);

 #define BIT_DEPTH 8
@@ -290,12 +333,26 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
        c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
+
+        c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_10_neon;
+        c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_10_neon;
+        c->inter.put[1][4][0][1] =
+        c->inter.put[1][5][0][1] =
+        c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_10_neon;
+
        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
        c->inter.put[0][4][0][1] =
        c->inter.put[0][5][0][1] =
        c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;

+        c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_10_neon;
+        c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_10_neon;
+        c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_10_neon;
+        c->inter.put[1][4][1][0] =
+        c->inter.put[1][5][1][0] =
+        c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_10_neon;
+
        c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon;
        c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon;
        c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon;
@@ -309,6 +366,12 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.put[0][5][1][1] =
        c->inter.put[0][6][1][1] = ff_vvc_put_luma_hv_x16_10_neon;

+        c->inter.put[1][2][1][1] = ff_vvc_put_chroma_hv8_10_neon;
+        c->inter.put[1][3][1][1] = ff_vvc_put_chroma_hv16_10_neon;
+        c->inter.put[1][4][1][1] =
+        c->inter.put[1][5][1][1] =
+        c->inter.put[1][6][1][1] = ff_vvc_put_chroma_hv_x16_10_neon;
+
        c->alf.filter[LUMA] = alf_filter_luma_10_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
        c->alf.classify = alf_classify_10_neon;
@@ -322,6 +385,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
        c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
+
+        c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_12_neon;
+        c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_12_neon;
+        c->inter.put[1][4][0][1] =
+        c->inter.put[1][5][0][1] =
+        c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_12_neon;
+
        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
        c->inter.put[0][4][0][1] =
@@ -341,6 +411,19 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.put[0][5][1][0] =
        c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon;

+        c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_12_neon;
+        c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_12_neon;
+        c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_12_neon;
+        c->inter.put[1][4][1][0] =
+        c->inter.put[1][5][1][0] =
+        c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_12_neon;
+
+        c->inter.put[1][2][1][1] = ff_vvc_put_chroma_hv8_12_neon;
+        c->inter.put[1][3][1][1] = ff_vvc_put_chroma_hv16_12_neon;
+        c->inter.put[1][4][1][1] =
+        c->inter.put[1][5][1][1] =
+        c->inter.put[1][6][1][1] = ff_vvc_put_chroma_hv_x16_12_neon;
+
        c->alf.filter[LUMA] = alf_filter_luma_12_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
        c->alf.classify = alf_classify_12_neon;
@@ -1611,6 +1611,7 @@ endfunc
 function ff_vvc_apply_bdof_8_neon, export=1
        mov             w6, #8
 0:
+        AARCH64_SIGN_LINK_REGISTER
        stp             x19, x20, [sp, #-0x40]!
        stp             x21, x22, [sp, #0x10]
        stp             x23, x24, [sp, #0x20]
@@ -1703,6 +1704,7 @@ function ff_vvc_apply_bdof_8_neon, export=1
        ldp             x23, x24, [sp, #0x20]
        ldp             x21, x22, [sp, #0x10]
        ldp             x19, x20, [sp], #0x40
+        AARCH64_VALIDATE_LINK_REGISTER
        ret
 endfunc

@@ -1833,6 +1835,137 @@ function ff_vvc_put_luma_h_x16_12_neon, export=1
        put_luma_h_x16_xx_neon 4
 endfunc

+.macro put_chroma_h_x8_horizontal_filter shift
+        // 4-tap horizontal filter: 8 outputs from the 16-bit samples held in v20 followed by v21.
+        // In:  v0.h[0..3] = taps, v20.8h = samples 0..7, v21 = the samples after them (lanes 0..2 are used).
+        // Out: v24.4h = outputs 0..3, v25.4h = outputs 4..7. Clobbers v1-v3.
+        ext             v1.16b, v20.16b, v21.16b, #2   // samples 1..8
+        ext             v2.16b, v20.16b, v21.16b, #4   // samples 2..9
+        ext             v3.16b, v20.16b, v21.16b, #6   // samples 3..10
+        smull           v24.4s, v20.4h, v0.h[0]        // tap 0, outputs 0..3 (32-bit accumulators)
+        smull2          v25.4s, v20.8h, v0.h[0]        // tap 0, outputs 4..7
+        smlal           v24.4s, v1.4h, v0.h[1]         // + tap 1
+        smlal2          v25.4s, v1.8h, v0.h[1]
+        smlal           v24.4s, v2.4h, v0.h[2]         // + tap 2
+        smlal2          v25.4s, v2.8h, v0.h[2]
+        smlal           v24.4s, v3.4h, v0.h[3]         // + tap 3
+        smlal2          v25.4s, v3.8h, v0.h[3]
+        sqshrn          v24.4h, v24.4s, #(\shift)      // shift right by the macro argument, saturate to 16 bit
+        sqshrn          v25.4h, v25.4s, #(\shift)      // written to the low half of v25, not merged into v24
+.endm
+
+.macro put_chroma_h8_xx_neon shift
+        // x0 = dst; one row of 8 outputs is stored every VVC_MAX_PB_SIZE * 2 bytes
+        // x1 = _src; rewound by 2 bytes (one 16-bit sample) before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; one row per loop pass
+        // x4 = hf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x5 = vf; not read by this macro
+        // x6 = width; not read by this macro
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        ldr             s0, [x4]                       // 4 taps, one byte each
+        sub             x1, x1, #2                     // tap 0 applies to the sample before _src
+        sub             x2, x2, #16                    // the first ld1 of each row already advances x1 by 16
+        sxtl            v0.8h, v0.8b
+1:
+        ld1             {v20.8h}, [x1], #16            // samples 0..7
+        ld1             {v21.4h}, [x1], x2             // 4 more samples, then step to the next row
+        put_chroma_h_x8_horizontal_filter \shift
+        subs            w3, w3, #1
+        st1             {v24.4h, v25.4h}, [x0], x9     // store 8 outputs, advance one dst row
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+.macro put_chroma_h16_xx_neon shift
+        // x0 = dst; one row of 16 outputs is stored every VVC_MAX_PB_SIZE * 2 bytes
+        // x1 = _src; rewound by 2 bytes (one 16-bit sample) before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; one row per loop pass
+        // x4 = hf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x5 = vf; not read by this macro
+        // x6 = width; not read by this macro
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)
+        ldr             s0, [x4]
+        sub             x9, x9, #16                    // the first st1 of each row already advances x0 by 16
+        sub             x1, x1, #2                     // tap 0 applies to the sample before _src
+        sub             x2, x2, #32                    // the first ld1 of each row already advances x1 by 32
+        sxtl            v0.8h, v0.8b
+1:
+        ld1             {v20.8h, v21.8h}, [x1], #32    // samples 0..15
+        ld1             {v22.4h}, [x1], x2             // 4 more samples, then step to the next row
+        put_chroma_h_x8_horizontal_filter \shift
+        mov             v20.16b, v21.16b               // slide the v20:v21 window forward by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16    // outputs 0..7
+        put_chroma_h_x8_horizontal_filter \shift
+        subs            w3, w3, #1
+        st1             {v24.4h, v25.4h}, [x0], x9     // outputs 8..15, then on to the next dst row
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+.macro put_chroma_h_x16_xx_neon shift
+        // x0 = dst; rows are VVC_MAX_PB_SIZE * 2 bytes apart
+        // x1 = _src; rewound by 2 bytes (one 16-bit sample) before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; one row per outer loop pass
+        // x4 = hf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x5 = vf; not read by this macro
+        // w6 = width; 16 outputs per inner pass (assumes a multiple of 16 - TODO confirm against callers)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)
+        ldr             s0, [x4]
+        sub             x9, x9, w6, uxtw #1            // dst pitch minus the width * 2 bytes stored per row
+        sub             x2, x2, w6, uxtw #1            // src stride minus the bytes loaded by the inner loop
+        sxtl            v0.8h, v0.8b
+        sub             x1, x1, #2                     // tap 0 applies to the sample before _src
+        sub             x2, x2, #16                    // ... and minus the 16 bytes of the per-row preload
+1:
+        ld1             {v20.8h}, [x1], #16            // preload samples 0..7 of the row
+        mov             w8, w6                         // w8 = outputs still to produce in this row
+2:
+        ld1             {v21.8h, v22.8h}, [x1], #32    // next 16 samples
+        put_chroma_h_x8_horizontal_filter \shift
+        mov             v20.16b, v21.16b               // slide the v20:v21 window forward by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16
+        put_chroma_h_x8_horizontal_filter \shift
+        mov             v20.16b, v21.16b               // last 8 loaded samples start the next inner pass
+        subs            w8, w8, #16
+        st1             {v24.4h, v25.4h}, [x0], #16
+        b.gt            2b                             // tests the flags set by the subs on w8
+        subs            w3, w3, #1
+        add             x0, x0, x9                     // to the start of the next dst row
+        add             x1, x1, x2                     // to the start of the next src row
+        b.gt            1b                             // tests the flags set by the subs on w3
+        ret
+.endm
+
+
+function ff_vvc_put_chroma_h8_10_neon, export=1
+        put_chroma_h8_xx_neon 2                        // 8 outputs per row, shift 2 (presumably bit depth - 8; confirm)
+endfunc
+
+function ff_vvc_put_chroma_h8_12_neon, export=1
+        put_chroma_h8_xx_neon 4                        // 8 outputs per row, shift 4
+endfunc
+
+function ff_vvc_put_chroma_h16_10_neon, export=1
+        put_chroma_h16_xx_neon 2                       // 16 outputs per row, shift 2
+endfunc
+
+function ff_vvc_put_chroma_h16_12_neon, export=1
+        put_chroma_h16_xx_neon 4                       // 16 outputs per row, shift 4
+endfunc
+
+function ff_vvc_put_chroma_h_x16_10_neon, export=1
+        put_chroma_h_x16_xx_neon 2                     // width taken from w6, 16 outputs per inner pass, shift 2
+endfunc
+
+function ff_vvc_put_chroma_h_x16_12_neon, export=1
+        put_chroma_h_x16_xx_neon 4                     // width taken from w6, 16 outputs per inner pass, shift 4
+endfunc
+
 .macro put_luma_v4_xx_neon shift
        mov             x9, #(VVC_MAX_PB_SIZE * 2)
        sub             x1, x1, x2, lsl #1
@@ -2225,6 +2358,229 @@ function ff_vvc_put_luma_v_x16_12_neon, export=1
        put_luma_v_x16_xx_neon 4
 endfunc

+.macro put_chroma_v4_xx_neon shift
+        // x0 = dst; one row of 4 outputs is stored every VVC_MAX_PB_SIZE * 2 bytes
+        // x1 = _src; moved back by one stride before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; two output rows per loop pass (assumes an even height - TODO confirm)
+        // x4 = hf; not read by this macro
+        // x5 = vf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x6 = width; not read by this macro
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        ldr             s0, [x5]                       // 4 taps, one byte each
+        sub             x1, x1, x2                     // tap 0 applies to the row before _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v20.4h}, [x1], x2             // prime the window: rows y-1, y, y+1
+        ld1             {v21.4h}, [x1], x2
+        ld1             {v22.4h}, [x1], x2
+1:
+        ld1             {v23.4h}, [x1], x2             // row y+2
+
+        smull           v1.4s, v20.4h, v0.h[0]         // output row y, split into two partial sums:
+        smull           v2.4s, v21.4h, v0.h[1]         //   v1 = taps 0 and 2, v2 = taps 1 and 3
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal           v2.4s, v23.4h, v0.h[3]
+
+        ld1             {v24.4h}, [x1], x2             // row y+3
+
+        smull           v3.4s, v21.4h, v0.h[0]         // output row y+1, same split into v3 / v4
+        smull           v4.4s, v22.4h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal           v4.4s, v24.4h, v0.h[3]
+
+        add             v1.4s, v1.4s, v2.4s            // combine the partial sums
+        add             v3.4s, v3.4s, v4.4s
+        sqshrn          v1.4h, v1.4s, #(\shift)        // shift right by the macro argument, saturate to 16 bit
+        sqshrn          v3.4h, v3.4s, #(\shift)
+
+        st1             {v1.4h}, [x0], x9              // output row y
+        mov             v20.16b, v22.16b               // slide the three-row window down by two rows
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2
+        st1             {v3.4h}, [x0], x9              // output row y+1
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+function ff_vvc_put_chroma_v4_10_neon, export=1
+        put_chroma_v4_xx_neon 2                        // 4 outputs per row, shift 2 (presumably bit depth - 8; confirm)
+endfunc
+
+function ff_vvc_put_chroma_v4_12_neon, export=1
+        put_chroma_v4_xx_neon 4                        // 4 outputs per row, shift 4
+endfunc
+
+.macro put_chroma_v8_xx_neon shift
+        // x0 = dst; one row of 8 outputs is stored every VVC_MAX_PB_SIZE * 2 bytes
+        // x1 = _src; moved back by one stride before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; two output rows per loop pass (assumes an even height - TODO confirm)
+        // x4 = hf; not read by this macro
+        // x5 = vf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x6 = width; not read by this macro
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        ldr             s0, [x5]                       // 4 taps, one byte each
+        sub             x1, x1, x2                     // tap 0 applies to the row before _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v20.8h}, [x1], x2             // prime the window: rows y-1, y, y+1
+        ld1             {v21.8h}, [x1], x2
+        ld1             {v22.8h}, [x1], x2
+1:
+        ld1             {v23.8h}, [x1], x2             // row y+2
+
+        smull           v1.4s, v20.4h, v0.h[0]         // output row y: v1 = columns 0..3, v2 = columns 4..7
+        smull2          v2.4s, v20.8h, v0.h[0]
+        smlal           v1.4s, v21.4h, v0.h[1]
+        smlal2          v2.4s, v21.8h, v0.h[1]
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal2          v2.4s, v22.8h, v0.h[2]
+        smlal           v1.4s, v23.4h, v0.h[3]
+        smlal2          v2.4s, v23.8h, v0.h[3]
+        sqshrn          v1.4h, v1.4s, #(\shift)        // shift right by the macro argument, saturate to 16 bit
+        sqshrn          v2.4h, v2.4s, #(\shift)
+
+        ld1             {v24.8h}, [x1], x2             // row y+3
+        st1             {v1.4h-v2.4h}, [x0], x9        // output row y
+
+        smull           v3.4s, v21.4h, v0.h[0]         // output row y+1: v3 = columns 0..3, v4 = columns 4..7
+        smull2          v4.4s, v21.8h, v0.h[0]
+        smlal           v3.4s, v22.4h, v0.h[1]
+        smlal2          v4.4s, v22.8h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal2          v4.4s, v23.8h, v0.h[2]
+        smlal           v3.4s, v24.4h, v0.h[3]
+        smlal2          v4.4s, v24.8h, v0.h[3]
+        sqshrn          v3.4h, v3.4s, #(\shift)
+        sqshrn          v4.4h, v4.4s, #(\shift)
+
+        mov             v20.16b, v22.16b               // slide the three-row window down by two rows
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2
+        st1             {v3.4h-v4.4h}, [x0], x9        // output row y+1
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+function ff_vvc_put_chroma_v8_10_neon, export=1
+        put_chroma_v8_xx_neon 2                        // 8 outputs per row, shift 2 (presumably bit depth - 8; confirm)
+endfunc
+
+function ff_vvc_put_chroma_v8_12_neon, export=1
+        put_chroma_v8_xx_neon 4                        // 8 outputs per row, shift 4
+endfunc
+
+.macro put_chroma_v_x16_horizontal_filter shift, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           v2.4s, \src0\().4h, v0.h[0]    // despite the name this combines four rows: callers pass
+        smull2          v3.4s, \src0\().8h, v0.h[0]    // (src0,src1) .. (src6,src7) as four successive 16-wide rows
+        smlal           v2.4s, \src2\().4h, v0.h[1]    // v2/v3 accumulate columns 0..3 / 4..7 from the even args
+        smlal2          v3.4s, \src2\().8h, v0.h[1]
+        smlal           v2.4s, \src4\().4h, v0.h[2]
+        smlal2          v3.4s, \src4\().8h, v0.h[2]
+        smlal           v2.4s, \src6\().4h, v0.h[3]
+        smlal2          v3.4s, \src6\().8h, v0.h[3]
+
+        smull           v4.4s, \src1\().4h, v0.h[0]    // v4/v5 accumulate columns 8..11 / 12..15 from the odd args
+        smull2          v5.4s, \src1\().8h, v0.h[0]
+        smlal           v4.4s, \src3\().4h, v0.h[1]
+        smlal2          v5.4s, \src3\().8h, v0.h[1]
+        smlal           v4.4s, \src5\().4h, v0.h[2]
+        smlal2          v5.4s, \src5\().8h, v0.h[2]
+        smlal           v4.4s, \src7\().4h, v0.h[3]
+        smlal2          v5.4s, \src7\().8h, v0.h[3]
+
+        sqshrn          v6.4h, v2.4s, #(\shift)        // result: v6.8h = columns 0..7, v7.8h = columns 8..15
+        sqshrn          v7.4h, v4.4s, #(\shift)        // (shifted right by the macro argument, saturated to 16 bit)
+        sqshrn2         v6.8h, v3.4s, #(\shift)        // taps are read from v0.h[0..3]; v2-v7 are clobbered
+        sqshrn2         v7.8h, v5.4s, #(\shift)
+.endm
+
+.macro put_chroma_v16_xx_neon shift
+        // x0 = dst; one row of 16 outputs is stored every VVC_MAX_PB_SIZE * 2 bytes
+        // x1 = _src; moved back by one stride before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; two output rows per loop pass (assumes an even height - TODO confirm)
+        // x4 = hf; not read by this macro
+        // x5 = vf; four 8-bit taps, sign-extended to 16 bit in v0
+        // x6 = width; not read by this macro
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        ldr             s0, [x5]                       // 4 taps, one byte each
+        sub             x1, x1, x2                     // tap 0 applies to the row before _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v16.8h-v17.8h}, [x1], x2      // prime the window: rows y-1, y, y+1 (two registers each)
+        ld1             {v18.8h-v19.8h}, [x1], x2
+        ld1             {v20.8h-v21.8h}, [x1], x2
+1:
+        ld1             {v22.8h-v23.8h}, [x1], x2      // row y+2
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1             {v24.8h-v25.8h}, [x1], x2      // row y+3
+        st1             {v6.8h-v7.8h}, [x0], x9        // output row y
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        subs            w3, w3, #2
+        st1             {v6.8h-v7.8h}, [x0], x9        // output row y+1
+
+        mov             v16.16b, v20.16b               // slide the three-row window down by two rows
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+function ff_vvc_put_chroma_v16_10_neon, export=1
+        put_chroma_v16_xx_neon 2                       // 16 outputs per row, shift 2 (presumably bit depth - 8; confirm)
+endfunc
+
+function ff_vvc_put_chroma_v16_12_neon, export=1
+        put_chroma_v16_xx_neon 4                       // 16 outputs per row, shift 4
+endfunc
+
+.macro put_chroma_v_x16_xx_neon shift
+        // x0 = dst; rows are VVC_MAX_PB_SIZE * 2 bytes apart
+        // x1 = _src; moved back by one stride before the first load
+        // x2 = _src_stride, used as a byte offset
+        // w3 = height; two output rows per outer pass (assumes an even height - TODO confirm)
+        // x4 = hf; not read by this macro
+        // x5 = vf; four 8-bit taps, sign-extended to 16 bit in v0
+        // w6 = width; walked in steps of 16 columns by the inner loop
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        ldr             s0, [x5]                       // 4 taps, one byte each
+        sub             x1, x1, x2                     // tap 0 applies to the row before _src
+        sxtl            v0.8h, v0.8b
+1:
+        mov             w8, #0                         // w8 = current column, in samples
+2:
+        add             x11, x1, x8, lsl #1            // x11 = src pointer for this 16-column strip
+        add             x10, x0, x8, lsl #1            // x10 = dst pointer for this 16-column strip
+        ld1             {v16.8h-v17.8h}, [x11], x2     // all five rows are reloaded for every strip
+        add             x8, x8, #16
+        ld1             {v18.8h-v19.8h}, [x11], x2
+        cmp             w8, w6                         // flags are consumed by the b.lt at the end of the strip
+        ld1             {v20.8h-v21.8h}, [x11], x2
+        ld1             {v22.8h-v23.8h}, [x11], x2
+        ld1             {v24.8h-v25.8h}, [x11], x2
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        st1             {v6.8h-v7.8h}, [x10], x9       // output row y of this strip
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        st1             {v6.8h-v7.8h}, [x10], x9       // output row y+1 of this strip
+        b.lt            2b                             // more columns left in this row pair
+        add             x0, x0, x9, lsl #1             // dst down by two rows
+        subs            w3, w3, #2
+        add             x1, x1, x2, lsl #1             // src down by two rows
+        b.gt            1b                             // tests the flags set by the subs above
+        ret
+.endm
+
+function ff_vvc_put_chroma_v_x16_10_neon, export=1
+        put_chroma_v_x16_xx_neon 2                     // width taken from w6, 16-column strips, shift 2
+endfunc
+
+function ff_vvc_put_chroma_v_x16_12_neon, export=1
+        put_chroma_v_x16_xx_neon 4                     // width taken from w6, 16-column strips, shift 4
+endfunc

 .macro put_luma_hv_x8_horizontal_filter shift, dst, src0, src1
        ext             v2.16b, \src0\().16b, \src1\().16b, #2
@@ -2575,3 +2931,199 @@ endfunc
 function ff_vvc_put_luma_hv_x16_12_neon, export=1
        put_luma_hv_x16_xx_neon 4
 endfunc
+
+.macro put_chroma_hv_x8_horizontal_filter shift, dst, src0, src1
+        ext             v2.16b, \src0\().16b, \src1\().16b, #2  // 4-tap horizontal FIR, 8 outputs; v2 = samples 1..8 of src0:src1
+        ext             v3.16b, \src0\().16b, \src1\().16b, #4  // v3 = samples 2..9
+        ext             v4.16b, \src0\().16b, \src1\().16b, #6  // v4 = samples 3..10
+        smull           v6.4s, \src0\().4h, v0.h[0]             // v6 = 32-bit sums for outputs 0..3, v7 for outputs 4..7
+        smull2          v7.4s, \src0\().8h, v0.h[0]             // last read of src0, so dst may name the same register
+        smlal           v6.4s, v2.4h, v0.h[1]                   // taps are taken from v0.h[0..3]
+        smlal2          v7.4s, v2.8h, v0.h[1]
+        smlal           v6.4s, v3.4h, v0.h[2]
+        smlal2          v7.4s, v3.8h, v0.h[2]
+        smlal           v6.4s, v4.4h, v0.h[3]
+        smlal2          v7.4s, v4.8h, v0.h[3]
+        sqshrn          \dst\().4h, v6.4s, #(\shift)            // shift right and saturate to signed 16 bit, low half of dst
+        sqshrn2         \dst\().8h, v7.4s, #(\shift)            // high half of dst; v2-v4, v6 and v7 are overwritten as scratch
+.endm
+
+.macro put_chroma_hv_x8_vertical_filter dst0, dst1, src0, src1, src2, src3
+        smull           \dst0\().4s, \src0\().4h, v1.h[0]       // 4-tap vertical FIR over four row registers; dst0 accumulates columns 0..3
+        smull2          \dst1\().4s, \src0\().8h, v1.h[0]       // dst1 accumulates columns 4..7
+        smlal           \dst0\().4s, \src1\().4h, v1.h[1]       // taps are taken from v1.h[0..3]
+        smlal2          \dst1\().4s, \src1\().8h, v1.h[1]
+        smlal           \dst0\().4s, \src2\().4h, v1.h[2]
+        smlal2          \dst1\().4s, \src2\().8h, v1.h[2]
+        smlal           \dst0\().4s, \src3\().4h, v1.h[3]
+        smlal2          \dst1\().4s, \src3\().8h, v1.h[3]
+        sqshrn          \dst0\().4h, \dst0\().4s, #6            // narrow in place with a fixed >> 6 and signed saturation
+        sqshrn          \dst1\().4h, \dst1\().4s, #6            // result is two separate 4h halves, not one packed 8h vector
+.endm
+
+.macro put_chroma_hv8_xx_neon shift
+        // dst         .req x0   (16-bit output; rows are VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1   (16-bit input samples)
+        // _src_stride .req x2   (added to x1 as a byte offset per row)
+        // height      .req x3   (two output rows per loop pass)
+        // hf          .req x4   (four signed 8-bit horizontal taps)
+        // vf          .req x5   (four signed 8-bit vertical taps)
+        // width       .req x6   (not read here; each row is 8 outputs wide)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // x9 = dst row stride in bytes
+        sub             x1, x1, #2                      // step back one 16-bit sample
+        ldr             s0, [x4]                        // horizontal taps (4 bytes)
+        ldr             s1, [x5]                        // vertical taps (4 bytes)
+        sxtl            v0.8h, v0.8b                    // widen to v0.h[0..3]
+        sub             x1, x1, x2                      // and step back one row
+        sxtl            v1.8h, v1.8b                    // widen to v1.h[0..3]
+        ld1             {v16.8h, v17.8h}, [x1], x2      // preload three rows, 16 samples each
+        ld1             {v18.8h, v19.8h}, [x1], x2
+        ld1             {v20.8h, v21.8h}, [x1], x2
+        put_chroma_hv_x8_horizontal_filter \shift, v16, v16, v17
+        put_chroma_hv_x8_horizontal_filter \shift, v18, v18, v19
+        put_chroma_hv_x8_horizontal_filter \shift, v20, v20, v21
+1:                                                      // v16, v18, v20 hold the three newest horizontally filtered rows
+        ld1             {v22.8h, v23.8h}, [x1], x2      // fourth row
+        put_chroma_hv_x8_horizontal_filter \shift, v22, v22, v23
+        put_chroma_hv_x8_vertical_filter v2, v3, v16, v18, v20, v22
+        ld1             {v24.8h, v25.8h}, [x1], x2      // fifth row, loaded before the store of the first result
+        st1             {v2.4h-v3.4h}, [x0], x9         // first output row: 8 x 16 bit from the two 4h halves
+        put_chroma_hv_x8_horizontal_filter \shift, v24, v24, v25
+        put_chroma_hv_x8_vertical_filter v2, v3, v18, v20, v22, v24
+        st1             {v2.4h-v3.4h}, [x0], x9         // second output row
+
+        mov             v16.16b, v20.16b                // slide the row window down by two rows
+        mov             v18.16b, v22.16b
+        subs            w3, w3, #2                      // height -= 2; flags survive the mov below
+        mov             v20.16b, v24.16b
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_hv8_10_neon, export=1
+        put_chroma_hv8_xx_neon 2        // horizontal pass shifts right by 2 (name suggests the 10-bit variant - TODO confirm)
+endfunc
+
+function ff_vvc_put_chroma_hv8_12_neon, export=1
+        put_chroma_hv8_xx_neon 4        // horizontal pass shifts right by 4 (name suggests the 12-bit variant - TODO confirm)
+endfunc
+
+.macro put_chroma_hv16_xx_neon shift
+        // dst         .req x0   (16-bit output; rows are VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1   (16-bit input samples)
+        // _src_stride .req x2   (added to x1 as a byte offset per row)
+        // height      .req x3   (two output rows per loop pass)
+        // hf          .req x4   (four signed 8-bit horizontal taps)
+        // vf          .req x5   (four signed 8-bit vertical taps)
+        // width       .req x6   (not read here; each row is 16 outputs wide)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // x9 = dst row stride in bytes
+        sub             x1, x1, #2                      // step back one 16-bit sample
+        ldr             s0, [x4]                        // horizontal taps (4 bytes)
+        ldr             s1, [x5]                        // vertical taps (4 bytes)
+        sxtl            v0.8h, v0.8b                    // widen to v0.h[0..3]
+        sub             x1, x1, x2                      // and step back one row
+        sxtl            v1.8h, v1.8b                    // widen to v1.h[0..3]
+        ld1             {v16.8h-v18.8h}, [x1], x2       // preload three rows, 24 samples each
+        ld1             {v19.8h-v21.8h}, [x1], x2
+        ld1             {v22.8h-v24.8h}, [x1], x2
+        put_chroma_hv_x8_horizontal_filter \shift, v16, v16, v17
+        put_chroma_hv_x8_horizontal_filter \shift, v17, v17, v18
+        put_chroma_hv_x8_horizontal_filter \shift, v19, v19, v20
+        put_chroma_hv_x8_horizontal_filter \shift, v20, v20, v21
+        put_chroma_hv_x8_horizontal_filter \shift, v22, v22, v23
+        put_chroma_hv_x8_horizontal_filter \shift, v23, v23, v24
+1:                                                      // filtered rows live in (v16,v17), (v19,v20), (v22,v23): left and right 8 columns
+        ld1             {v25.8h-v27.8h}, [x1], x2       // fourth row
+        put_chroma_hv_x8_horizontal_filter \shift, v25, v25, v26
+        put_chroma_hv_x8_horizontal_filter \shift, v26, v26, v27
+        put_chroma_hv_x8_vertical_filter v2, v3, v16, v19, v22, v25
+        put_chroma_hv_x8_vertical_filter v4, v5, v17, v20, v23, v26
+        ld1             {v28.8h-v30.8h}, [x1], x2       // fifth row, loaded before the store of the first result
+        st1             {v2.4h-v5.4h}, [x0], x9         // first output row: 16 x 16 bit from four 4h halves
+        put_chroma_hv_x8_horizontal_filter \shift, v28, v28, v29
+        put_chroma_hv_x8_horizontal_filter \shift, v29, v29, v30
+        put_chroma_hv_x8_vertical_filter v2, v3, v19, v22, v25, v28
+        put_chroma_hv_x8_vertical_filter v4, v5, v20, v23, v26, v29
+        st1             {v2.4h-v5.4h}, [x0], x9         // second output row
+        mov             v16.16b, v22.16b                // slide the row window down by two rows
+        mov             v17.16b, v23.16b
+        mov             v19.16b, v25.16b
+        mov             v20.16b, v26.16b
+        subs            w3, w3, #2                      // height -= 2; flags survive the movs below
+        mov             v22.16b, v28.16b
+        mov             v23.16b, v29.16b
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_hv16_10_neon, export=1
+        put_chroma_hv16_xx_neon 2       // horizontal pass shifts right by 2 (name suggests the 10-bit variant - TODO confirm)
+endfunc
+
+function ff_vvc_put_chroma_hv16_12_neon, export=1
+        put_chroma_hv16_xx_neon 4       // horizontal pass shifts right by 4 (name suggests the 12-bit variant - TODO confirm)
+endfunc
+
+.macro put_chroma_hv_x16_xx_neon shift
+        // dst         .req x0   (16-bit output; rows are VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1   (16-bit input samples)
+        // _src_stride .req x2   (added to the row pointer as a byte offset per row)
+        // height      .req x3   (kept intact; copied to w13 for each column strip)
+        // hf          .req x4   (four signed 8-bit horizontal taps)
+        // vf          .req x5   (four signed 8-bit vertical taps)
+        // width       .req x6   (consumed 16 columns per pass of loop 1)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // x9 = dst row stride in bytes
+        sub             x1, x1, #2                      // step back one 16-bit sample
+        ldr             s0, [x4]                        // horizontal taps (4 bytes)
+        ldr             s1, [x5]                        // vertical taps (4 bytes)
+        sxtl            v0.8h, v0.8b                    // widen to v0.h[0..3]
+        sub             x1, x1, x2                      // and step back one row
+        sxtl            v1.8h, v1.8b                    // widen to v1.h[0..3]
+1:                                                      // loop 1: one 16-column strip, full height
+        mov             w13, w3                         // w13 = rows left in this strip
+        mov             x11, x1                         // x11 = running source row pointer
+        mov             x10, x0                         // x10 = running destination row pointer
+        ld1             {v16.8h-v18.8h}, [x11], x2      // preload three rows, 24 samples each
+        ld1             {v19.8h-v21.8h}, [x11], x2
+        ld1             {v22.8h-v24.8h}, [x11], x2
+        put_chroma_hv_x8_horizontal_filter \shift, v16, v16, v17
+        put_chroma_hv_x8_horizontal_filter \shift, v17, v17, v18
+        put_chroma_hv_x8_horizontal_filter \shift, v19, v19, v20
+        put_chroma_hv_x8_horizontal_filter \shift, v20, v20, v21
+        put_chroma_hv_x8_horizontal_filter \shift, v22, v22, v23
+        put_chroma_hv_x8_horizontal_filter \shift, v23, v23, v24
+2:                                                      // loop 2: two output rows of the current strip
+        ld1             {v25.8h-v27.8h}, [x11], x2      // fourth row
+        put_chroma_hv_x8_horizontal_filter \shift, v25, v25, v26
+        put_chroma_hv_x8_horizontal_filter \shift, v26, v26, v27
+        put_chroma_hv_x8_vertical_filter v2, v3, v16, v19, v22, v25
+        put_chroma_hv_x8_vertical_filter v4, v5, v17, v20, v23, v26
+        ld1             {v28.8h-v30.8h}, [x11], x2      // fifth row, loaded before the store of the first result
+        st1             {v2.4h-v5.4h}, [x10], x9        // first output row: 16 x 16 bit from four 4h halves
+        put_chroma_hv_x8_horizontal_filter \shift, v28, v28, v29
+        put_chroma_hv_x8_horizontal_filter \shift, v29, v29, v30
+        put_chroma_hv_x8_vertical_filter v2, v3, v19, v22, v25, v28
+        put_chroma_hv_x8_vertical_filter v4, v5, v20, v23, v26, v29
+        st1             {v2.4h-v5.4h}, [x10], x9        // second output row
+        mov             v16.16b, v22.16b                // slide the row window down by two rows
+        mov             v17.16b, v23.16b
+        mov             v19.16b, v25.16b
+        mov             v20.16b, v26.16b
+        subs            w13, w13, #2                    // rows left -= 2; flags survive the movs below
+        mov             v22.16b, v28.16b
+        mov             v23.16b, v29.16b
+        b.gt            2b
+        subs            w6, w6, #16                     // width -= 16; flags survive the two adds below
+        add             x0, x0, #32                     // next strip: 16 samples * 2 bytes
+        add             x1, x1, #32
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_hv_x16_10_neon, export=1
+        put_chroma_hv_x16_xx_neon 2     // horizontal pass shifts right by 2 (name suggests the 10-bit variant - TODO confirm)
+endfunc
+
+function ff_vvc_put_chroma_hv_x16_12_neon, export=1
+        put_chroma_hv_x16_xx_neon 4     // horizontal pass shifts right by 4 (name suggests the 12-bit variant - TODO confirm)
+endfunc
@@ -122,13 +122,11 @@ function ff_vvc_alf_filter_luma_8_sme2, export=1
        // clip          .req x5
        // vb            .req x6
        sme_entry
-        stp             x29, x30, [sp, #-96]!
-        mov             x29, sp
-        stp             x19, x20, [sp, #16]
-        stp             x21, x22, [sp, #32]
-        stp             x23, x24, [sp, #48]
-        stp             x25, x26, [sp, #64]
-        stp             x27, x28, [sp, #80]
+        stp             x19, x20, [sp, #-80]!
+        stp             x21, x22, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        stp             x25, x26, [sp, #48]
+        stp             x27, x28, [sp, #64]

        lsr             x7, x3, #32
        cnth            x11
@@ -356,12 +354,11 @@ function ff_vvc_alf_filter_luma_8_sme2, export=1
        add             x0, x0, x2, lsl #2
        b.gt            1b

-        ldp             x19, x20, [sp, #16]
-        ldp             x21, x22, [sp, #32]
-        ldp             x23, x24, [sp, #48]
-        ldp             x25, x26, [sp, #64]
-        ldp             x27, x28, [sp, #80]
-        ldp             x29, x30, [sp], #96
+        ldp             x21, x22, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldp             x25, x26, [sp, #48]
+        ldp             x27, x28, [sp, #64]
+        ldp             x19, x20, [sp], #80
        sme_exit
        ret
 endfunc
@@ -410,13 +407,11 @@ function ff_vvc_alf_filter_luma_10_sme2, export=1
        mov             w12, #1023
 0:
        sme_entry
-        stp             x29, x30, [sp, #-96]!
-        mov             x29, sp
-        stp             x19, x20, [sp, #16]
-        stp             x21, x22, [sp, #32]
-        stp             x23, x24, [sp, #48]
-        stp             x25, x26, [sp, #64]
-        stp             x27, x28, [sp, #80]
+        stp             x19, x20, [sp, #-80]!
+        stp             x21, x22, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        stp             x25, x26, [sp, #48]
+        stp             x27, x28, [sp, #64]

        lsr             x7, x3, #32
        cnth            x11
@@ -644,12 +639,11 @@ function ff_vvc_alf_filter_luma_10_sme2, export=1
        add             x0, x0, x2, lsl #3
        b.gt            1b

-        ldp             x19, x20, [sp, #16]
-        ldp             x21, x22, [sp, #32]
-        ldp             x23, x24, [sp, #48]
-        ldp             x25, x26, [sp, #64]
-        ldp             x27, x28, [sp, #80]
-        ldp             x29, x30, [sp], #96
+        ldp             x21, x22, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldp             x25, x26, [sp, #48]
+        ldp             x27, x28, [sp, #64]
+        ldp             x19, x20, [sp], #80
        sme_exit
        ret
 endfunc
@@ -31,6 +31,7 @@
 #include <math.h>
 #include <string.h>

+#include "libavutil/attributes.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/downmix_info.h"
@@ -338,7 +339,9 @@ static int decode_exponents(AC3DecodeContext *s,
        switch (group_size) {
        case 4: dexps[j++] = prevexp;
                dexps[j++] = prevexp;
+                av_fallthrough;
        case 2: dexps[j++] = prevexp;
+                av_fallthrough;
        case 1: dexps[j++] = prevexp;
        }
    }
@@ -614,13 +617,16 @@ static void ac3_upmix_delay(AC3DecodeContext *s)
        break;
    case AC3_CHMODE_2F2R:
        memset(s->delay[3], 0, channel_data_size);
+        av_fallthrough;
    case AC3_CHMODE_2F1R:
        memset(s->delay[2], 0, channel_data_size);
        break;
    case AC3_CHMODE_3F2R:
        memset(s->delay[4], 0, channel_data_size);
+        av_fallthrough;
    case AC3_CHMODE_3F1R:
        memset(s->delay[3], 0, channel_data_size);
+        av_fallthrough;
    case AC3_CHMODE_3F:
        memcpy(s->delay[2], s->delay[1], channel_data_size);
        memset(s->delay[1], 0, channel_data_size);
@@ -2704,7 +2704,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, AVFrame *frame,

                    for (int k = i-1; k > -1; k--) {
                        for (int o = 1; o < order; o++)
-                            delta += sf_codes[(i-1) - k] * coefs[(o*8) + k];
+                            delta += sf_codes[(i-1) - k] * (unsigned)coefs[(o*8) + k];
                    }

                    sample = sf_codes[i] * 2048;
@@ -252,6 +252,7 @@ extern const FFCodec ff_pbm_encoder;
 extern const FFCodec ff_pbm_decoder;
 extern const FFCodec ff_pcx_encoder;
 extern const FFCodec ff_pcx_decoder;
+extern const FFCodec ff_pdv_encoder;
 extern const FFCodec ff_pdv_decoder;
 extern const FFCodec ff_pfm_encoder;
 extern const FFCodec ff_pfm_decoder;
@@ -1548,8 +1548,12 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                    return AVERROR_INVALIDDATA;
                }

+                j = 0;
                for (i = 0; i < frame_length; ++i) {
-                    ctx->raw_mantissa[c][i] = AV_RB32(larray);
+                    if (ctx->raw_samples[c][i] == 0) {
+                        ctx->raw_mantissa[c][i] = AV_RB32(larray + j);
+                        j += 4;
+                    }
                }
            }
        }
@@ -1560,7 +1564,10 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                if (ctx->raw_samples[c][i] != 0) {
                    //The following logic is taken from Table 14.45 and 14.46 from the ISO spec
                    if (av_cmp_sf_ieee754(acf[c], FLOAT_1)) {
-                        nbits[i] = 23 - av_log2(abs(ctx->raw_samples[c][i]));
+                        int nbit = av_log2(FFABSU(ctx->raw_samples[c][i]));
+                        if (nbit > 23)
+                            return AVERROR_INVALIDDATA;
+                        nbits[i] = 23 - nbit;
                    } else {
                        nbits[i] = 23;
                    }
@@ -1634,7 +1641,7 @@ static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
                tmp_32 = (sign << 31) | ((e + EXP_BIAS) << 23) | (mantissa);
                ctx->raw_samples[c][i] = tmp_32;
            } else {
-                ctx->raw_samples[c][i] = raw_mantissa[c][i] & 0x007fffffUL;
+                ctx->raw_samples[c][i] = raw_mantissa[c][i];
            }
        }
        align_get_bits(gb);
@@ -1790,7 +1797,9 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
    }

    if (sconf->floating) {
-        read_diff_float_data(ctx, ra_frame);
+        ret = read_diff_float_data(ctx, ra_frame);
+        if (ret < 0)
+            return ret;
    }

    if (get_bits_left(gb) < 0) {
@@ -21,8 +21,6 @@
 #include "amfdec.h"
 #include "codec_internal.h"
 #include "hwconfig.h"
-#include "libavutil/imgutils.h"
-#include "libavutil/mem.h"
 #include "libavutil/time.h"
 #include "decode.h"
 #include "decode_bsf.h"
@@ -125,31 +123,7 @@ static int amf_init_decoder(AVCodecContext *avctx)
    } else if (avctx->color_range != AVCOL_RANGE_UNSPECIFIED) {
        AMF_ASSIGN_PROPERTY_BOOL(res, ctx->decoder, AMF_VIDEO_DECODER_FULL_RANGE_COLOR, 0);
    }
-    color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_UNKNOWN;
-    switch (avctx->colorspace) {
-    case AVCOL_SPC_SMPTE170M:
-        if (avctx->color_range == AVCOL_RANGE_JPEG) {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_601;
-        } else {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_601;
-        }
-        break;
-    case AVCOL_SPC_BT709:
-        if (avctx->color_range == AVCOL_RANGE_JPEG) {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_709;
-        } else {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_709;
-        }
-        break;
-    case AVCOL_SPC_BT2020_NCL:
-    case AVCOL_SPC_BT2020_CL:
-        if (avctx->color_range == AVCOL_RANGE_JPEG) {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_2020;
-        } else {
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_2020;
-        }
-        break;
-    }
+    color_profile = av_amf_get_color_profile(avctx->color_range, avctx->colorspace);
    if (color_profile != AMF_VIDEO_CONVERTER_COLOR_PROFILE_UNKNOWN)
        AMF_ASSIGN_PROPERTY_INT64(res, ctx->decoder, AMF_VIDEO_DECODER_COLOR_PROFILE, color_profile);
    if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED)
@@ -269,12 +243,13 @@ static int amf_init_frames_context(AVCodecContext *avctx, int sw_format, int new
 static int amf_decode_init(AVCodecContext *avctx)
 {
    AMFDecoderContext *ctx = avctx->priv_data;
+    ctx->dimensions_initialized = 0;
    int ret;
    ctx->in_pkt = av_packet_alloc();
    if (!ctx->in_pkt)
        return AVERROR(ENOMEM);

-    if  (avctx->hw_device_ctx) {
+    if (avctx->hw_device_ctx) {
        AVHWDeviceContext   *hwdev_ctx;
        hwdev_ctx = (AVHWDeviceContext*)avctx->hw_device_ctx->data;
        if (hwdev_ctx->type == AV_HWDEVICE_TYPE_AMF)
@@ -297,7 +272,7 @@ static int amf_decode_init(AVCodecContext *avctx)
        AVAMFDeviceContext    *amf_device_ctx = (AVAMFDeviceContext*)hw_device_ctx->hwctx;
        enum AVPixelFormat    surf_pix_fmt = AV_PIX_FMT_NONE;

-        if(amf_legacy_driver_no_bitness_detect(amf_device_ctx)){
+        if (amf_legacy_driver_no_bitness_detect(amf_device_ctx)) {
            // if bitness detection is not supported in legacy driver use format from container
            switch (avctx->pix_fmt) {
            case AV_PIX_FMT_YUV420P:
@@ -306,7 +281,7 @@ static int amf_decode_init(AVCodecContext *avctx)
            case AV_PIX_FMT_YUV420P10:
                surf_pix_fmt = AV_PIX_FMT_P010; break;
            }
-        }else{
+        } else {
            AMFVariantStruct format_var = {0};

            ret = ctx->decoder->pVtbl->GetProperty(ctx->decoder, AMF_VIDEO_DECODER_OUTPUT_FORMAT, &format_var);
@@ -314,17 +289,26 @@ static int amf_decode_init(AVCodecContext *avctx)

            surf_pix_fmt = av_amf_to_av_format(format_var.int64Value);
        }
-        if(avctx->hw_frames_ctx)
+        if (avctx->hw_frames_ctx)
        {
            // this values should be set for avcodec_open2
            // will be updated after header decoded if not true.
-            if(surf_pix_fmt == AV_PIX_FMT_NONE)
+            if (surf_pix_fmt == AV_PIX_FMT_NONE)
                surf_pix_fmt = AV_PIX_FMT_NV12; // for older drivers
-            if (!avctx->coded_width)
-                avctx->coded_width = 1280;
-            if (!avctx->coded_height)
-                avctx->coded_height = 720;
-            ret = amf_init_frames_context(avctx, surf_pix_fmt, avctx->coded_width, avctx->coded_height);
+            int frames_w = 0;
+            int frames_h = 0;
+
+            if (avctx->coded_width > 0 && avctx->coded_height > 0) {
+                frames_w = avctx->coded_width;
+                frames_h = avctx->coded_height;
+            } else if (avctx->width > 0 && avctx->height > 0) {
+                frames_w = avctx->width;
+                frames_h = avctx->height;
+            } else {
+                frames_w = 1280;
+                frames_h = 720;
+            }
+            ret = amf_init_frames_context(avctx, surf_pix_fmt, frames_w, frames_h);
            AMF_GOTO_FAIL_IF_FALSE(avctx, ret == 0, ret, "Failed to init frames context (AMF) : %s\n", av_err2str(ret));
        }
        else
@@ -375,7 +359,7 @@ static int amf_amfsurface_to_avframe(AVCodecContext *avctx, AMFSurface* surface,

        avctx->sw_pix_fmt = avctx->pix_fmt;

-        ret = ff_attach_decode_data(frame);
+        ret = ff_attach_decode_data(avctx, frame);
        if (ret < 0)
            return ret;
        frame->width  = avctx->width;
@@ -435,41 +419,10 @@ static int amf_amfsurface_to_avframe(AVCodecContext *avctx, AMFSurface* surface,
            AMFHDRMetadata * hdrmeta = (AMFHDRMetadata*)hdrmeta_buffer->pVtbl->GetNative(hdrmeta_buffer);
            if (ret != AMF_OK)
                return ret;
-            if (hdrmeta != NULL) {
-                AVMasteringDisplayMetadata *mastering = av_mastering_display_metadata_create_side_data(frame);
-                const int chroma_den = 50000;
-                const int luma_den = 10000;

-                if (!mastering)
-                    return AVERROR(ENOMEM);
-
-                mastering->display_primaries[0][0] = av_make_q(hdrmeta->redPrimary[0], chroma_den);
-                mastering->display_primaries[0][1] = av_make_q(hdrmeta->redPrimary[1], chroma_den);
-
-                mastering->display_primaries[1][0] = av_make_q(hdrmeta->greenPrimary[0], chroma_den);
-                mastering->display_primaries[1][1] = av_make_q(hdrmeta->greenPrimary[1], chroma_den);
-
-                mastering->display_primaries[2][0] = av_make_q(hdrmeta->bluePrimary[0], chroma_den);
-                mastering->display_primaries[2][1] = av_make_q(hdrmeta->bluePrimary[1], chroma_den);
-
-                mastering->white_point[0] = av_make_q(hdrmeta->whitePoint[0], chroma_den);
-                mastering->white_point[1] = av_make_q(hdrmeta->whitePoint[1], chroma_den);
-
-                mastering->max_luminance = av_make_q(hdrmeta->maxMasteringLuminance, luma_den);
-                mastering->min_luminance = av_make_q(hdrmeta->maxMasteringLuminance, luma_den);
-
-                mastering->has_luminance = 1;
-                mastering->has_primaries = 1;
-                if (hdrmeta->maxContentLightLevel) {
-                   AVContentLightMetadata *light = av_content_light_metadata_create_side_data(frame);
-
-                    if (!light)
-                        return AVERROR(ENOMEM);
-
-                    light->MaxCLL  = hdrmeta->maxContentLightLevel;
-                    light->MaxFALL = hdrmeta->maxFrameAverageLightLevel;
-                }
-            }
+            ret = av_amf_attach_hdr_metadata(frame, hdrmeta);
+            if (ret < 0)
+                return ret;
        }
    }
    return 0;
@@ -552,6 +505,25 @@ static AMF_RESULT amf_buffer_from_packet(AVCodecContext *avctx, const AVPacket*
    return amf_update_buffer_properties(avctx, buf, pkt);
 }

+static void amf_init_dimensions(AVCodecContext *avctx)
+{
+    AMFDecoderContext *ctx = avctx->priv_data;
+    AMFVariantStruct   dec_size = {0};
+    AMF_RESULT         status;
+
+    /* Ask the decoder which frame size it is currently producing. */
+    status = ctx->decoder->pVtbl->GetProperty(ctx->decoder, AMF_VIDEO_DECODER_CURRENT_SIZE, &dec_size);
+    if (status != AMF_OK || dec_size.sizeValue.width <= 0 || dec_size.sizeValue.height <= 0)
+        return; /* nothing usable reported: leave avctx and the flag untouched */
+
+    /* Mirror the reported size into both the display and the coded dimensions. */
+    avctx->width  = avctx->coded_width  = dec_size.sizeValue.width;
+    avctx->height = avctx->coded_height = dec_size.sizeValue.height;
+    ctx->dimensions_initialized = 1;
+
+    av_log(avctx, AV_LOG_DEBUG, "AMF: detected initial decoder size %dx%d\n", avctx->width, avctx->height);
+}
+
 static int amf_decode_frame(AVCodecContext *avctx, struct AVFrame *frame)
 {
    AMFDecoderContext *ctx = avctx->priv_data;
@@ -613,9 +585,11 @@ static int amf_decode_frame(AVCodecContext *avctx, struct AVFrame *frame)
    }

    res = amf_receive_frame(avctx, frame);
-    if (res == AMF_OK)
+    if (res == AMF_OK) {
        got_frame = 1;
-    else if (res == AMF_REPEAT)
+        if (!ctx->dimensions_initialized)
+            amf_init_dimensions(avctx);
+    } else if (res == AMF_REPEAT)
        // decoder has no output yet
        res = AMF_OK;
    else if (res == AMF_EOF) {
@@ -55,6 +55,7 @@ typedef struct AMFDecoderContext {
    int                 drain;
    int                 resolution_changed;
    int                 copy_output;
+    int                 dimensions_initialized;
    AVPacket*           in_pkt;
    enum AMF_SURFACE_FORMAT output_format;

@@ -17,13 +17,11 @@
 */

 #include "config.h"
-#include "config_components.h"

 #include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_amf.h"
-#include "libavutil/hwcontext_amf_internal.h"
 #if CONFIG_D3D11VA
 #include "libavutil/hwcontext_d3d11va.h"
 #endif
@@ -37,62 +35,10 @@

 #include "amfenc.h"
 #include "encode.h"
-#include "internal.h"
-#include "libavutil/mastering_display_metadata.h"

 #define AMF_AV_FRAME_REF    L"av_frame_ref"
 #define PTS_PROP            L"PtsProp"

-static int amf_save_hdr_metadata(AVCodecContext *avctx, const AVFrame *frame, AMFHDRMetadata *hdrmeta)
-{
-    AVFrameSideData            *sd_display;
-    AVFrameSideData            *sd_light;
-    AVMasteringDisplayMetadata *display_meta;
-    AVContentLightMetadata     *light_meta;
-
-    sd_display = av_frame_get_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
-    if (sd_display) {
-        display_meta = (AVMasteringDisplayMetadata *)sd_display->data;
-        if (display_meta->has_luminance) {
-            const unsigned int luma_den = 10000;
-            hdrmeta->maxMasteringLuminance =
-                (amf_uint32)(luma_den * av_q2d(display_meta->max_luminance));
-            hdrmeta->minMasteringLuminance =
-                FFMIN((amf_uint32)(luma_den * av_q2d(display_meta->min_luminance)), hdrmeta->maxMasteringLuminance);
-        }
-        if (display_meta->has_primaries) {
-            const unsigned int chroma_den = 50000;
-            hdrmeta->redPrimary[0] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[0][0])), chroma_den);
-            hdrmeta->redPrimary[1] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[0][1])), chroma_den);
-            hdrmeta->greenPrimary[0] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[1][0])), chroma_den);
-            hdrmeta->greenPrimary[1] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[1][1])), chroma_den);
-            hdrmeta->bluePrimary[0] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[2][0])), chroma_den);
-            hdrmeta->bluePrimary[1] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->display_primaries[2][1])), chroma_den);
-            hdrmeta->whitePoint[0] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->white_point[0])), chroma_den);
-            hdrmeta->whitePoint[1] =
-                FFMIN((amf_uint16)(chroma_den * av_q2d(display_meta->white_point[1])), chroma_den);
-        }
-
-        sd_light = av_frame_get_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
-        if (sd_light) {
-            light_meta = (AVContentLightMetadata *)sd_light->data;
-            if (light_meta) {
-                hdrmeta->maxContentLightLevel = (amf_uint16)light_meta->MaxCLL;
-                hdrmeta->maxFrameAverageLightLevel = (amf_uint16)light_meta->MaxFALL;
-            }
-        }
-        return 0;
-    }
-    return 1;
-}
-
 #if CONFIG_D3D11VA
 #include <d3d11.h>
 #endif
@@ -251,6 +197,8 @@ static int amf_copy_buffer(AVCodecContext *avctx, AVPacket *pkt, AMFBuffer *buff
    AMFVariantStruct var = {0};
    int64_t          timestamp = AV_NOPTS_VALUE;
    int64_t          size = buffer->pVtbl->GetSize(buffer);
+    enum AVPictureType pict_type = 0;
+    int              average_qp = -1;

    if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) {
        return ret;
@@ -258,25 +206,52 @@ static int amf_copy_buffer(AVCodecContext *avctx, AVPacket *pkt, AMFBuffer *buff
    memcpy(pkt->data, buffer->pVtbl->GetNative(buffer), size);

    switch (avctx->codec->id) {
-        case AV_CODEC_ID_H264:
-            buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE, &var);
-            if(var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_IDR) {
-                pkt->flags = AV_PKT_FLAG_KEY;
+    case AV_CODEC_ID_H264:
+        buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE, &var);
+        pkt->flags |= AV_PKT_FLAG_KEY * (var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_IDR);
+        pict_type = var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_IDR ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_I ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_P ? AV_PICTURE_TYPE_P :
+                    var.int64Value == AMF_VIDEO_ENCODER_OUTPUT_DATA_TYPE_B ? AV_PICTURE_TYPE_B : 0;
+
+        var.int64Value = -1;
+        if ((buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_STATISTIC_AVERAGE_QP, &var)) == AMF_OK) {
+            average_qp = FFMAX((int)var.int64Value, -1);
+        }
+        break;
+    case AV_CODEC_ID_HEVC:
+        buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE, &var);
+        pkt->flags |= AV_PKT_FLAG_KEY * (var.int64Value == AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE_IDR);
+        pict_type = var.int64Value == AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE_IDR ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE_I ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE_P ? AV_PICTURE_TYPE_P : 0;
+
+        var.int64Value = -1;
+        if ((buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_HEVC_STATISTIC_AVERAGE_QP, &var)) == AMF_OK) {
+            average_qp = FFMAX((int)var.int64Value, -1);
+        }
+        break;
+    case AV_CODEC_ID_AV1:
+        buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE, &var);
+        pkt->flags |= AV_PKT_FLAG_KEY * (var.int64Value == AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE_KEY);
+        pict_type = var.int64Value == AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE_KEY ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE_INTRA_ONLY ? AV_PICTURE_TYPE_I :
+                    var.int64Value == AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE_INTER ? AV_PICTURE_TYPE_P : 0;
+
+        var.int64Value = -1;
+        if ((buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_AV1_STATISTIC_AVERAGE_Q_INDEX, &var)) == AMF_OK) {
+            average_qp = FFMAX((int)var.int64Value, -1); // av1 qindex
+            if (average_qp >= 0) {
+                average_qp = (average_qp > 244) ? (average_qp <= 249 ? 62 : 63) : (average_qp + 3) >> 2; // av1 quantizer
            }
-            break;
-        case AV_CODEC_ID_HEVC:
-            buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE, &var);
-            if (var.int64Value == AMF_VIDEO_ENCODER_HEVC_OUTPUT_DATA_TYPE_IDR) {
-                pkt->flags = AV_PKT_FLAG_KEY;
-            }
-            break;
-        case AV_CODEC_ID_AV1:
-            buffer->pVtbl->GetProperty(buffer, AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE, &var);
-            if (var.int64Value == AMF_VIDEO_ENCODER_AV1_OUTPUT_FRAME_TYPE_KEY) {
-                pkt->flags = AV_PKT_FLAG_KEY;
-            }
-        default:
-            break;
+        }
+        break;
+    default:
+        break;
+    }
+
+    if (average_qp >= 0) {
+        ff_encode_add_stats_side_data(pkt, average_qp * FF_QP2LAMBDA, NULL, 0, pict_type);
    }

    buffer->pVtbl->GetProperty(buffer, ctx->pts_property_name, &var);
@@ -479,7 +454,7 @@ static int amf_submit_frame(AVCodecContext *avctx, AVFrame    *frame, AMFSurface
        res = amf_device_ctx->context->pVtbl->AllocBuffer(amf_device_ctx->context, AMF_MEMORY_HOST, sizeof(AMFHDRMetadata), &hdrmeta_buffer);
        if (res == AMF_OK) {
            AMFHDRMetadata * hdrmeta = (AMFHDRMetadata*)hdrmeta_buffer->pVtbl->GetNative(hdrmeta_buffer);
-            if (amf_save_hdr_metadata(avctx, frame, hdrmeta) == 0) {
+            if (av_amf_extract_hdr_metadata(frame, hdrmeta) == 0) {
                switch (avctx->codec->id) {
                case AV_CODEC_ID_H264:
                    AMF_ASSIGN_PROPERTY_INTERFACE(res, ctx->encoder, AMF_VIDEO_ENCODER_INPUT_HDR_METADATA, hdrmeta_buffer); break;
@@ -500,6 +475,7 @@ static int amf_submit_frame(AVCodecContext *avctx, AVFrame    *frame, AMFSurface

    switch (avctx->codec->id) {
    case AV_CODEC_ID_H264:
+        AMF_ASSIGN_PROPERTY_BOOL(res, surface, AMF_VIDEO_ENCODER_STATISTICS_FEEDBACK, 1);
        AMF_ASSIGN_PROPERTY_INT64(res, surface, AMF_VIDEO_ENCODER_INSERT_AUD, !!ctx->aud);
        switch (frame->pict_type) {
        case AV_PICTURE_TYPE_I:
@@ -520,6 +496,7 @@ static int amf_submit_frame(AVCodecContext *avctx, AVFrame    *frame, AMFSurface
        }
        break;
    case AV_CODEC_ID_HEVC:
+        AMF_ASSIGN_PROPERTY_BOOL(res, surface, AMF_VIDEO_ENCODER_HEVC_STATISTICS_FEEDBACK, 1);
        AMF_ASSIGN_PROPERTY_INT64(res, surface, AMF_VIDEO_ENCODER_HEVC_INSERT_AUD, !!ctx->aud);
        switch (frame->pict_type) {
        case AV_PICTURE_TYPE_I:
@@ -536,6 +513,7 @@ static int amf_submit_frame(AVCodecContext *avctx, AVFrame    *frame, AMFSurface
        }
        break;
    case AV_CODEC_ID_AV1:
+        AMF_ASSIGN_PROPERTY_BOOL(res, surface, AMF_VIDEO_ENCODER_AV1_STATISTICS_FEEDBACK, 1);
        if (frame->pict_type == AV_PICTURE_TYPE_I) {
            if (ctx->forced_idr) {
                AMF_ASSIGN_PROPERTY_INT64(res, surface, AMF_VIDEO_ENCODER_AV1_FORCE_INSERT_SEQUENCE_HEADER, 1);
@@ -733,41 +711,6 @@ int ff_amf_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
    return ret;
 }

-int ff_amf_get_color_profile(AVCodecContext *avctx)
-{
-    amf_int64 color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_UNKNOWN;
-    if (avctx->color_range == AVCOL_RANGE_JPEG) {
-        /// Color Space for Full (JPEG) Range
-        switch (avctx->colorspace) {
-        case AVCOL_SPC_SMPTE170M:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_601;
-            break;
-        case AVCOL_SPC_BT709:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_709;
-            break;
-        case AVCOL_SPC_BT2020_NCL:
-        case AVCOL_SPC_BT2020_CL:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_FULL_2020;
-            break;
-        }
-    } else {
-        /// Color Space for Limited (MPEG) range
-        switch (avctx->colorspace) {
-        case AVCOL_SPC_SMPTE170M:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_601;
-            break;
-        case AVCOL_SPC_BT709:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_709;
-            break;
-        case AVCOL_SPC_BT2020_NCL:
-        case AVCOL_SPC_BT2020_CL:
-            color_profile = AMF_VIDEO_CONVERTER_COLOR_PROFILE_2020;
-            break;
-        }
-    }
-    return color_profile;
-}
-
 const AVCodecHWConfigInternal *const ff_amfenc_hw_configs[] = {
 #if CONFIG_D3D11VA
    HW_CONFIG_ENCODER_FRAMES(D3D11, D3D11VA),
@@ -163,8 +163,6 @@ int ff_amf_receive_packet(AVCodecContext *avctx, AVPacket *avpkt);
 */
 extern const enum AVPixelFormat ff_amf_pix_fmts[];

-int ff_amf_get_color_profile(AVCodecContext *avctx);
-
 /**
 * Error handling helper
 */
@@ -16,16 +16,16 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

+#include "libavutil/avassert.h"
+#include "libavutil/hwcontext_amf.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "amfenc.h"
 #include "codec_internal.h"

-#define AMF_VIDEO_ENCODER_AV1_CAP_WIDTH_ALIGNMENT_FACTOR_LOCAL                L"Av1WidthAlignmentFactor"          // amf_int64; default = 1
-#define AMF_VIDEO_ENCODER_AV1_CAP_HEIGHT_ALIGNMENT_FACTOR_LOCAL               L"Av1HeightAlignmentFactor"         // amf_int64; default = 1
-
 #define OFFSET(x) offsetof(AMFEncoderContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
@@ -205,6 +205,7 @@ static av_cold int amf_encode_init_av1(AVCodecContext* avctx)
    amf_int64           bit_depth;
    amf_int64           color_profile;
    enum                AVPixelFormat pix_fmt;
+    const AVPixFmtDescriptor *pix_desc;

    //for av1 alignment and crop
    uint32_t            crop_right  = 0;
@@ -250,35 +251,40 @@ static av_cold int amf_encode_init_av1(AVCodecContext* avctx)

    // Color bit depth
    pix_fmt = avctx->hw_frames_ctx ? ((AVHWFramesContext*)avctx->hw_frames_ctx->data)->sw_format
-                                : avctx->pix_fmt;
+                                   : avctx->pix_fmt;
+    pix_desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(pix_desc);
    bit_depth = ctx->bit_depth;
-    if(bit_depth == AMF_COLOR_BIT_DEPTH_UNDEFINED){
-        bit_depth = pix_fmt == AV_PIX_FMT_P010 ? AMF_COLOR_BIT_DEPTH_10 : AMF_COLOR_BIT_DEPTH_8;
+    if (bit_depth == AMF_COLOR_BIT_DEPTH_UNDEFINED) {
+        bit_depth = pix_desc->comp[0].depth >= 10 ? AMF_COLOR_BIT_DEPTH_10 : AMF_COLOR_BIT_DEPTH_8;
    }
    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_COLOR_BIT_DEPTH, bit_depth);

    // Color profile
-    color_profile = ff_amf_get_color_profile(avctx);
+    color_profile = av_amf_get_color_profile(avctx->color_range, avctx->colorspace);
    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_COLOR_PROFILE, color_profile);

    // Color Range
-    // TODO
+    AMF_ASSIGN_PROPERTY_BOOL(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_FULL_RANGE_COLOR, (avctx->color_range == AVCOL_RANGE_JPEG));

-    // Color Transfer Characteristics (AMF matches ISO/IEC)
-    if(avctx->color_primaries != AVCOL_PRI_UNSPECIFIED && (pix_fmt == AV_PIX_FMT_NV12 || pix_fmt == AV_PIX_FMT_P010)){
-        // if input is YUV, color_primaries are for VUI only
-        // AMF VCN color conversion supports only specific output primaries BT2020 for 10-bit and BT709 for 8-bit
-        // vpp_amf supports more
-        AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_TRANSFER_CHARACTERISTIC, avctx->color_trc);
+    if (!(pix_desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+        // Color Transfer Characteristics (AMF matches ISO/IEC)
+        if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED) {
+            // if input is YUV, color_trc is for VUI only - any value
+            // AMF VCN color conversion supports only specific output transfer characteristic SMPTE2084 for 10-bit and BT709 for 8-bit
+            // vpp_amf supports more
+            AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_TRANSFER_CHARACTERISTIC, avctx->color_trc);
+        }
+
+        // Color Primaries (AMF matches ISO/IEC)
+        if (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED) {
+            // if input is YUV, color_primaries are for VUI only
+            // AMF VCN color conversion supports only specific primaries BT2020 for 10-bit and BT709 for 8-bit
+            // vpp_amf supports more
+            AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_COLOR_PRIMARIES, avctx->color_primaries);
+        }
    }

-    // Color Primaries (AMF matches ISO/IEC)
-    if(avctx->color_primaries != AVCOL_PRI_UNSPECIFIED || pix_fmt == AV_PIX_FMT_NV12 || pix_fmt == AV_PIX_FMT_P010 )
-    {
-        // AMF VCN color conversion supports only specific primaries BT2020 for 10-bit and BT709 for 8-bit
-        // vpp_amf supports more
-        AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_AV1_OUTPUT_COLOR_PRIMARIES, avctx->color_primaries);
-    }
    profile_level = avctx->level;
    if (profile_level == AV_LEVEL_UNKNOWN) {
        profile_level = ctx->level;
@@ -658,13 +664,13 @@ static av_cold int amf_encode_init_av1(AVCodecContext* avctx)
    var.pInterface->pVtbl->Release(var.pInterface);

    //processing crop information according to alignment
-    if (ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_AV1_CAP_WIDTH_ALIGNMENT_FACTOR_LOCAL, &var) != AMF_OK)
+    if (ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_AV1_CAP_WIDTH_ALIGNMENT_FACTOR, &var) != AMF_OK)
        // assume older driver and Navi3x
        width_alignment_factor = 64;
    else
        width_alignment_factor = (int)var.int64Value;

-    if (ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_AV1_CAP_HEIGHT_ALIGNMENT_FACTOR_LOCAL, &var) != AMF_OK)
+    if (ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_AV1_CAP_HEIGHT_ALIGNMENT_FACTOR, &var) != AMF_OK)
        // assume older driver and Navi3x
        height_alignment_factor = 16;
    else
@@ -746,7 +752,7 @@ const FFCodec ff_av1_amf_encoder = {
                      AV_CODEC_CAP_DR1,
    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
    CODEC_PIXFMTS_ARRAY(ff_amf_pix_fmts),
-    .color_ranges   = AVCOL_RANGE_MPEG, /* FIXME: implement tagging */
+    .color_ranges   = AVCOL_RANGE_MPEG | AVCOL_RANGE_JPEG,
    .p.wrapper_name   = "amf",
    .hw_configs     = ff_amfenc_hw_configs,
 };
@@ -16,10 +16,12 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

-
+#include "libavutil/avassert.h"
+#include "libavutil/hwcontext_amf.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "amfenc.h"
 #include "codec_internal.h"
 #include <AMF/components/PreAnalysis.h>
@@ -206,6 +208,7 @@ static av_cold int amf_encode_init_h264(AVCodecContext *avctx)
    int                              deblocking_filter = (avctx->flags & AV_CODEC_FLAG_LOOP_FILTER) ? 1 : 0;
    amf_int64                        color_profile;
    enum                             AVPixelFormat pix_fmt;
+    const AVPixFmtDescriptor        *pix_desc;

    if (avctx->framerate.num > 0 && avctx->framerate.den > 0) {
        framerate = AMFConstructRate(avctx->framerate.num, avctx->framerate.den);
@@ -270,18 +273,20 @@ static av_cold int amf_encode_init_h264(AVCodecContext *avctx)
        AMF_ASSIGN_PROPERTY_RATIO(res, ctx->encoder, AMF_VIDEO_ENCODER_ASPECT_RATIO, ratio);
    }

-    color_profile = ff_amf_get_color_profile(avctx);
+    color_profile = av_amf_get_color_profile(avctx->color_range, avctx->colorspace);
    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_OUTPUT_COLOR_PROFILE, color_profile);

    /// Color Range (Support for older Drivers)
-    AMF_ASSIGN_PROPERTY_BOOL(res, ctx->encoder, AMF_VIDEO_ENCODER_FULL_RANGE_COLOR, !!(avctx->color_range == AVCOL_RANGE_JPEG));
+    AMF_ASSIGN_PROPERTY_BOOL(res, ctx->encoder, AMF_VIDEO_ENCODER_OUTPUT_FULL_RANGE_COLOR, (avctx->color_range == AVCOL_RANGE_JPEG));

    /// Color Depth
    pix_fmt = avctx->hw_frames_ctx ? ((AVHWFramesContext*)avctx->hw_frames_ctx->data)->sw_format
-                                : avctx->pix_fmt;
+                                   : avctx->pix_fmt;
+    pix_desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(pix_desc);

    // 10 bit input video is not supported by AMF H264 encoder
-    AMF_RETURN_IF_FALSE(ctx, pix_fmt != AV_PIX_FMT_P010, AVERROR_INVALIDDATA, "10-bit input video is not supported by AMF H264 encoder\n");
+    AMF_RETURN_IF_FALSE(ctx, pix_desc->comp[0].depth == 8, AVERROR_INVALIDDATA, "10-bit input video is not supported by AMF H264 encoder\n");

    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_COLOR_BIT_DEPTH, AMF_COLOR_BIT_DEPTH_8);
    /// Color Transfer Characteristics (AMF matches ISO/IEC)
@@ -16,9 +16,12 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

+#include "libavutil/avassert.h"
+#include "libavutil/hwcontext_amf.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "amfenc.h"
 #include "codec_internal.h"
 #include <AMF/components/PreAnalysis.h>
@@ -166,6 +169,7 @@ static av_cold int amf_encode_init_hevc(AVCodecContext *avctx)
    AMFEncoderContext  *ctx = avctx->priv_data;
    AMFVariantStruct    var = {0};
    amf_int64           profile = 0;
+    amf_int64           profile_from_bitdepth = 0;
    amf_int64           profile_level = 0;
    AMFBuffer          *buffer;
    AMFGuid             guid;
@@ -175,6 +179,7 @@ static av_cold int amf_encode_init_hevc(AVCodecContext *avctx)
    amf_int64           bit_depth;
    amf_int64           color_profile;
    enum                AVPixelFormat pix_fmt;
+    const AVPixFmtDescriptor *pix_desc;

    if (avctx->framerate.num > 0 && avctx->framerate.den > 0) {
        framerate = AMFConstructRate(avctx->framerate.num, avctx->framerate.den);
@@ -243,35 +248,49 @@ static av_cold int amf_encode_init_hevc(AVCodecContext *avctx)

    // Color bit depth
    pix_fmt = avctx->hw_frames_ctx ? ((AVHWFramesContext*)avctx->hw_frames_ctx->data)->sw_format
-                                    : avctx->pix_fmt;
-
+                                   : avctx->pix_fmt;
+    pix_desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(pix_desc);
    bit_depth = ctx->bit_depth;
-    if(bit_depth == AMF_COLOR_BIT_DEPTH_UNDEFINED){
-        bit_depth = pix_fmt == AV_PIX_FMT_P010 ? AMF_COLOR_BIT_DEPTH_10 : AMF_COLOR_BIT_DEPTH_8;
+    if (bit_depth == AMF_COLOR_BIT_DEPTH_UNDEFINED) {
+        bit_depth = pix_desc->comp[0].depth >= 10 ? AMF_COLOR_BIT_DEPTH_10 : AMF_COLOR_BIT_DEPTH_8;
    }
    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_COLOR_BIT_DEPTH, bit_depth);

+    // HEVC profile follows target bit depth
+    profile_from_bitdepth = bit_depth == AMF_COLOR_BIT_DEPTH_10 ? AMF_VIDEO_ENCODER_HEVC_PROFILE_MAIN_10
+                                                                : AMF_VIDEO_ENCODER_HEVC_PROFILE_MAIN;
+    if (profile != profile_from_bitdepth) {
+        AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_PROFILE, profile_from_bitdepth);
+        if (profile != 0) {
+            av_log(avctx, AV_LOG_WARNING, "The video profile and bit depth did not match, but this has been corrected\n");
+        }
+    }
+    avctx->profile = bit_depth == AMF_COLOR_BIT_DEPTH_10 ? AV_PROFILE_HEVC_MAIN_10 : AV_PROFILE_HEVC_MAIN;
+
    // Color profile
-    color_profile = ff_amf_get_color_profile(avctx);
+    color_profile = av_amf_get_color_profile(avctx->color_range, avctx->colorspace);
    AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_COLOR_PROFILE, color_profile);

    // Color Range (Support for older Drivers)
-    AMF_ASSIGN_PROPERTY_BOOL(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_NOMINAL_RANGE, !!(avctx->color_range == AVCOL_RANGE_JPEG));
+    AMF_ASSIGN_PROPERTY_BOOL(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_FULL_RANGE_COLOR, (avctx->color_range == AVCOL_RANGE_JPEG));

-    // Color Transfer Characteristics (AMF matches ISO/IEC)
-    if(avctx->color_trc != AVCOL_TRC_UNSPECIFIED && (pix_fmt == AV_PIX_FMT_NV12 || pix_fmt == AV_PIX_FMT_P010)){
-        // if input is YUV, color_trc is for VUI only - any value
-        // AMF VCN color conversion supports only specific output transfer characteristic SMPTE2084 for 10-bit and BT709 for 8-bit
-        // vpp_amf supports more
-        AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_TRANSFER_CHARACTERISTIC, avctx->color_trc);
-    }
+    if (!(pix_desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+        // Color Transfer Characteristics (AMF matches ISO/IEC)
+        if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED) {
+            // if input is YUV, color_trc is for VUI only - any value
+            // AMF VCN color conversion supports only specific output transfer characteristic SMPTE2084 for 10-bit and BT709 for 8-bit
+            // vpp_amf supports more
+            AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_TRANSFER_CHARACTERISTIC, avctx->color_trc);
+        }

-    // Color Primaries (AMF matches ISO/IEC)
-    if(avctx->color_primaries != AVCOL_PRI_UNSPECIFIED && (pix_fmt == AV_PIX_FMT_NV12 || pix_fmt == AV_PIX_FMT_P010)){
-        // if input is YUV, color_primaries are for VUI only
-        // AMF VCN color conversion supports only specific output primaries BT2020 for 10-bit and BT709 for 8-bit
-        // vpp_amf supports more
-        AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_COLOR_PRIMARIES, avctx->color_primaries);
+        // Color Primaries (AMF matches ISO/IEC)
+        if (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED) {
+            // if input is YUV, color_primaries are for VUI only
+            // AMF VCN color conversion supports only specific output primaries BT2020 for 10-bit and BT709 for 8-bit
+            // vpp_amf supports more
+            AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_HEVC_OUTPUT_COLOR_PRIMARIES, avctx->color_primaries);
+        }
    }

    // Picture control properties
@@ -24,6 +24,7 @@
 * ASCII/ANSI art decoder
 */

+#include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/xga_font_data.h"
@@ -396,6 +397,7 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *rframe,
                break;
            case 0x0A: //LF
                hscroll(avctx);
+                av_fallthrough;
            case 0x0D: //CR
                s->x = 0;
                break;
@@ -152,8 +152,9 @@ int ff_aom_parse_film_grain_sets(AVFilmGrainAFGS1Params *s,
        payload_4byte = get_bits1(gb);
        payload_size = get_bits(gb, payload_4byte ? 2 : 8);
        set_idx = get_bits(gb, 3);
+
        fgp = av_film_grain_params_alloc(&fgp_size);
-        if (!fgp)
+        if (!fgp || s->sets[set_idx])
            goto error;
        aom = &fgp->codec.aom;

@@ -212,7 +213,7 @@ int ff_aom_parse_film_grain_sets(AVFilmGrainAFGS1Params *s,
        }

        predict_scaling = get_bits1(gb);
-        if (predict_scaling && (!ref || ref == fgp))
+        if (predict_scaling && !ref)
            goto error; // prediction must be from valid, different set

        predict_y_scaling = predict_scaling ? get_bits1(gb) : 0;
@@ -19,6 +19,7 @@
 #include <stdatomic.h>

 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mastering_display_metadata.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/pixdesc.h"
@@ -63,12 +64,12 @@ typedef struct APVDecodeContext {
    uint8_t warned_unknown_pbu_types;
 } APVDecodeContext;

-static const enum AVPixelFormat apv_format_table[5][5] = {
-    { AV_PIX_FMT_GRAY8,    AV_PIX_FMT_GRAY10,     AV_PIX_FMT_GRAY12,     AV_PIX_FMT_GRAY14, AV_PIX_FMT_GRAY16 },
-    { 0 }, // 4:2:0 is not valid.
-    { AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV422P10,  AV_PIX_FMT_YUV422P12,  AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV422P16 },
-    { AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV444P10,  AV_PIX_FMT_YUV444P12,  AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV444P16 },
-    { AV_PIX_FMT_YUVA444P, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, 0                   ,AV_PIX_FMT_YUVA444P16 },
+static const enum AVPixelFormat apv_format_table[5][4] = { // indexed [chroma_format_idc][(bit_depth - 10) >> 1]; columns are 10/12/14/16-bit
+    { AV_PIX_FMT_GRAY10,     AV_PIX_FMT_GRAY12,     AV_PIX_FMT_GRAY14,    AV_PIX_FMT_GRAY16     }, // grayscale formats
+    { AV_PIX_FMT_NONE,       AV_PIX_FMT_NONE,       AV_PIX_FMT_NONE,      AV_PIX_FMT_NONE       }, // 4:2:0 is not valid.
+    { AV_PIX_FMT_YUV422P10,  AV_PIX_FMT_YUV422P12,  AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV422P16  }, // planar 4:2:2
+    { AV_PIX_FMT_YUV444P10,  AV_PIX_FMT_YUV444P12,  AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV444P16  }, // planar 4:4:4
+    { AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_NONE,      AV_PIX_FMT_YUVA444P16 }, // 4:4:4 with alpha; the 14-bit slot is AV_PIX_FMT_NONE, which the lookup reports as an unsupported sample
 };

 static APVVLCLUT decode_lut;
@@ -82,14 +83,15 @@ static int apv_decode_check_format(AVCodecContext *avctx,
    avctx->level   = header->frame_info.level_idc;

    bit_depth = header->frame_info.bit_depth_minus8 + 8;
-    if (bit_depth < 8 || bit_depth > 16 || bit_depth % 2) {
+    av_assert1(bit_depth >= 10 && bit_depth <= 16); // checked by CBS
+    if (bit_depth % 2) {
        avpriv_request_sample(avctx, "Bit depth %d", bit_depth);
        return AVERROR_PATCHWELCOME;
    }
    avctx->pix_fmt =
-        apv_format_table[header->frame_info.chroma_format_idc][bit_depth - 4 >> 2];
+        apv_format_table[header->frame_info.chroma_format_idc][(bit_depth - 10) >> 1];

-    if (!avctx->pix_fmt) {
+    if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
        avpriv_request_sample(avctx, "YUVA444P14");
        return AVERROR_PATCHWELCOME;
    }
@@ -20,6 +20,7 @@

 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"

 #include "apv.h"
@@ -100,33 +101,19 @@ static void apv_decode_transquant_c(void *output,
    }

    // Output.
-    if (bit_depth == 8) {
-        uint8_t *ptr = output;
-        int bd_shift = 20 - bit_depth;
+    av_assert2(bit_depth > 8 && bit_depth <= 16);
+    uint16_t *ptr = output;
+    int bd_shift = 20 - bit_depth;
+    pitch /= 2; // Pitch was in bytes, 2 bytes per sample.

-        for (int y = 0; y < 8; y++) {
-            for (int x = 0; x < 8; x++) {
-                int sample = ((recon_sample[y][x] +
-                               (1 << (bd_shift - 1))) >> bd_shift) +
-                    (1 << (bit_depth - 1));
-                ptr[x] = av_clip_uintp2(sample, bit_depth);
-            }
-            ptr += pitch;
-        }
-    } else {
-        uint16_t *ptr = output;
-        int bd_shift = 20 - bit_depth;
-        pitch /= 2; // Pitch was in bytes, 2 bytes per sample.
-
-        for (int y = 0; y < 8; y++) {
-            for (int x = 0; x < 8; x++) {
-                int sample = ((recon_sample[y][x] +
-                               (1 << (bd_shift - 1))) >> bd_shift) +
-                    (1 << (bit_depth - 1));
-                ptr[x] = av_clip_uintp2(sample, bit_depth);
-            }
-            ptr += pitch;
+    for (int y = 0; y < 8; y++) {
+        for (int x = 0; x < 8; x++) {
+            int sample = ((recon_sample[y][x] +
+                           (1 << (bd_shift - 1))) >> bd_shift) +
+                (1 << (bit_depth - 1));
+            ptr[x] = av_clip_uintp2(sample, bit_depth);
        }
+        ptr += pitch;
    }
 }

@@ -634,24 +634,28 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *rframe,
            ret = decode_avcf(avctx, frame);
            break;
        }
+        av_fallthrough;
    case MKBETAG('A', 'L', 'C', 'D'):
        if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
            s->key = 0;
            ret = decode_alcd(avctx, frame);
            break;
        }
+        av_fallthrough;
    case MKBETAG('R', 'L', 'E', 'F'):
        if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
            s->key = 1;
            ret = decode_rle(avctx, frame);
            break;
        }
+        av_fallthrough;
    case MKBETAG('R', 'L', 'E', 'D'):
        if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
            s->key = 0;
            ret = decode_rle(avctx, frame);
            break;
        }
+        av_fallthrough;
    default:
        av_log(avctx, AV_LOG_DEBUG, "unknown chunk 0x%X\n", chunk);
        break;
@@ -28,25 +28,25 @@
 */
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
 function ff_int32_to_float_fmul_array8_vfp, export=1
-        push    {lr}
-        ldr     a1, [sp, #4]
-        subs    lr, a1, #3*8
-        bcc     50f                        @ too short to pipeline
+        push            {lr}
+        ldr             a1, [sp, #4]
+        subs            lr, a1, #3*8
+        bcc             50f                        @ too short to pipeline
        @ Now need to find (len / 8) % 3. The approximation
        @ x / 24 = (x * 0xAB) >> 12
        @ is good for x < 4096, which is true for both AC3 and DCA.
-        mov     a1, #0xAB
-        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
-        mul     a1, lr, a1
-        vpush   {s16-s31}
-        mov     a1, a1, lsr #12
-        add     a1, a1, a1, lsl #1
-        rsb     a1, a1, lr, lsr #3
-        cmp     a1, #1
-        fmrx    a1, FPSCR
-        fmxr    FPSCR, ip
-        beq     11f
-        blo     10f
+        mov             a1, #0xAB
+        ldr             ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
+        mul             a1, lr, a1
+        vpush           {s16-s31}
+        mov             a1, a1, lsr #12
+        add             a1, a1, a1, lsl #1
+        rsb             a1, a1, lr, lsr #3
+        cmp             a1, #1
+        fmrx            a1, FPSCR
+        fmxr            FPSCR, ip
+        beq             11f
+        blo             10f
        @ Array is (2 + multiple of 3) x 8 floats long
        @ drop through...
        vldmia          a3!, {s16-s23}
@@ -122,9 +122,9 @@ function ff_int32_to_float_fmul_array8_vfp, export=1
        vstmia          a2!, {s24-s27}
        vstmia          a2!, {s28-s31}

-        fmxr    FPSCR, a1
-        vpop    {s16-s31}
-        pop     {pc}
+        fmxr            FPSCR, a1
+        vpop            {s16-s31}
+        pop             {pc}

 10:     @ Array is (multiple of 3) x 8 floats long
        vldmia          a3!, {s8-s15}
@@ -158,9 +158,9 @@ function ff_int32_to_float_fmul_array8_vfp, export=1
        b               2b

 50:
-        ldr     lr, =0x03070000         @ RunFast mode, short vectors of length 8, stride 1
-        fmrx    ip, FPSCR
-        fmxr    FPSCR, lr
+        ldr             lr, =0x03070000         @ RunFast mode, short vectors of length 8, stride 1
+        fmrx            ip, FPSCR
+        fmxr            FPSCR, lr
 51:
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s0}
@@ -178,8 +178,8 @@ function ff_int32_to_float_fmul_array8_vfp, export=1
        vstmia          a2!, {s12-s15}
        bne             51b

-        fmxr    FPSCR, ip
-        pop     {pc}
+        fmxr            FPSCR, ip
+        pop             {pc}
 endfunc

 /**
@@ -195,9 +195,9 @@ VFP     len     .req    a3
 NOVFP   tmp     .req    a3
 NOVFP   len     .req    a4
 NOVFP   vmov    s0, a3
-        ldr     tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
-        fmrx    ip, FPSCR
-        fmxr    FPSCR, tmp
+        ldr             tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
+        fmrx            ip, FPSCR
+        fmxr            FPSCR, tmp
 1:
        vldmia          a2!, {s8-s15}
        vcvt.f32.s32    s8, s8
@@ -214,8 +214,8 @@ NOVFP   vmov    s0, a3
        vstmia          a1!, {s12-s15}
        bne             1b

-        fmxr    FPSCR, ip
-        bx      lr
+        fmxr            FPSCR, ip
+        bx              lr
 endfunc
        .unreq  tmp
        .unreq  len
@@ -571,8 +571,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm

-        h264_qpel16_hv put
-        h264_qpel16_hv avg
+        h264_qpel16_hv  put
+        h264_qpel16_hv  avg

 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -760,8 +760,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel8 put
-        h264_qpel8 avg
+        h264_qpel8      put
+        h264_qpel8      avg

 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -942,5 +942,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel16 put
-        h264_qpel16 avg
+        h264_qpel16     put
+        h264_qpel16     avg
@@ -23,363 +23,363 @@
 #include "neon.S"

 .macro hevc_loop_filter_chroma_start
-        ldr      r12, [r2]
-        ldr      r3, [r2, #4]
-        add      r2, r3, r12
-        cmp      r2, #0
-        it       eq
-        bxeq     lr
+        ldr             r12, [r2]                  @ r12 = first 32-bit word at [r2] (presumably the tc pair of the edge - TODO confirm against the C caller)
+        ldr             r3, [r2, #4]               @ r3  = second 32-bit word at [r2 + 4]
+        add             r2, r3, r12                @ r2 = sum of both words; the pointer in r2 is no longer needed
+        cmp             r2, #0
+        it              eq                         @ IT block: the next instruction runs only on EQ
+        bxeq            lr                         @ sum is zero: return to the caller of the enclosing function
 .endm

 .macro hevc_loop_filter_chroma_body
-        vsubl.u8  q3, d4, d2
-        vsubl.u8  q11, d18, d19
-        vshl.i16  q3, #2
-        vadd.i16  q11, q3
-        vdup.16   d0, r12
-        vdup.16   d1, r3
-        vrshr.s16 q11, q11, #3
-        vneg.s16  q12, q0
-        vmovl.u8  q2, d4
-        vmin.s16  q11, q11, q0
-        vmax.s16  q11, q11, q12
-        vaddw.u8  q1, q11, d2
-        vsub.i16  q2, q11
-        vqmovun.s16 d2, q1
-        vqmovun.s16 d4, q2
+        vsubl.u8        q3, d4, d2                 @ q3  = d4 - d2, widened to 16 bits
+        vsubl.u8        q11, d18, d19              @ q11 = d18 - d19, widened to 16 bits
+        vshl.i16        q3, #2                     @ q3 *= 4
+        vadd.i16        q11, q3                    @ q11 = 4 * (d4 - d2) + (d18 - d19)
+        vdup.16         d0, r12                    @ low half of q0  = r12 in every 16-bit lane
+        vdup.16         d1, r3                     @ high half of q0 = r3 in every 16-bit lane
+        vrshr.s16       q11, q11, #3               @ delta = (q11 + 4) >> 3 (rounding shift)
+        vneg.s16        q12, q0                    @ q12 = -q0, the lower clamp bound
+        vmovl.u8        q2, d4                     @ widen d4 to 16 bits for the update below
+        vmin.s16        q11, q11, q0               @ delta = min(delta, q0)
+        vmax.s16        q11, q11, q12              @ delta = max(delta, -q0)
+        vaddw.u8        q1, q11, d2                @ q1 = d2 + delta
+        vsub.i16        q2, q11                    @ q2 = d4 - delta
+        vqmovun.s16     d2, q1                     @ saturate to unsigned 8 bits, result back in d2
+        vqmovun.s16     d4, q2                     @ saturate to unsigned 8 bits, result back in d4
 .endm

 .macro hevc_loop_filter_luma_start
-        ldr     r12, [r3]
-        ldr      r3, [r3, #4]
-        lsl      r3, #16
-        orr      r3, r12
-        cmp      r3, #0
-        it       eq
-        bxeq     lr
-        lsr      r3, #16
+        ldr             r12, [r3]                  @ r12 = first 32-bit word at [r3] (presumably the tc pair of the edge - TODO confirm against the C caller)
+        ldr             r3, [r3, #4]               @ r3  = second 32-bit word at [r3 + 4]
+        lsl             r3, #16                    @ move the second word into the upper half
+        orr             r3, r12                    @ r3 = (second << 16) | first
+        cmp             r3, #0
+        it              eq                         @ IT block: the next instruction runs only on EQ
+        bxeq            lr                         @ combined value is zero: return to the caller of the enclosing function
+        lsr             r3, #16                    @ r3 = combined >> 16; equals the second word when the first fits in 16 bits
 .endm

 .macro hevc_loop_filter_luma_body
-        vmovl.u8  q8, d16
-        vmovl.u8  q9, d18
-        vmovl.u8  q10, d20
-        vmovl.u8  q11, d22
-        vmovl.u8  q12, d24
-        vmovl.u8  q13, d26
-        vmovl.u8  q14, d28
-        vmovl.u8  q15, d30
+        vmovl.u8        q8, d16
+        vmovl.u8        q9, d18
+        vmovl.u8        q10, d20
+        vmovl.u8        q11, d22
+        vmovl.u8        q12, d24
+        vmovl.u8        q13, d26
+        vmovl.u8        q14, d28
+        vmovl.u8        q15, d30

-        vadd.i16   q7, q9, q11
-        vadd.i16   q6, q14, q12
-        vsub.i16   q7, q10
-        vsub.i16   q6, q13
-        vabd.s16   q7, q7, q10
-        vabd.s16   q6, q6, q13
+        vadd.i16        q7, q9, q11
+        vadd.i16        q6, q14, q12
+        vsub.i16        q7, q10
+        vsub.i16        q6, q13
+        vabd.s16        q7, q7, q10
+        vabd.s16        q6, q6, q13


-        vdup.16    q0, r2
-        vmov       q4, q7
-        vmov       q5, q6
-        vdup.16    d4, r12
-        vtrn.16    q7, q4
-        vtrn.16    q6, q5
+        vdup.16         q0, r2
+        vmov            q4, q7
+        vmov            q5, q6
+        vdup.16         d4, r12
+        vtrn.16         q7, q4
+        vtrn.16         q6, q5

-        vshl.u64   q7, #32
-        vshr.u64   q4, #32
-        vshl.u64   q6, #32
-        vshr.u64   q5, #32
-        vshr.u64   q7, #32
-        vshr.u64   q6, #32
-        vshl.u64   q5, #32
-        vshl.u64   q4, #32
-        vorr       q6, q5
-        vorr       q7, q4
-        vdup.16    d5, r3
-        vadd.i16   q5, q7, q6
+        vshl.u64        q7, #32
+        vshr.u64        q4, #32
+        vshl.u64        q6, #32
+        vshr.u64        q5, #32
+        vshr.u64        q7, #32
+        vshr.u64        q6, #32
+        vshl.u64        q5, #32
+        vshl.u64        q4, #32
+        vorr            q6, q5
+        vorr            q7, q4
+        vdup.16         d5, r3
+        vadd.i16        q5, q7, q6

-        vmov       q4, q5
-        vmov       q3, q5
-        vtrn.32    q3, q4
+        vmov            q4, q5
+        vmov            q3, q5
+        vtrn.32         q3, q4

-        vadd.i16   q4, q3
+        vadd.i16        q4, q3

-        vshl.s16   q5, q5, #1
-        vcgt.s16   q3, q0, q4
+        vshl.s16        q5, q5, #1
+        vcgt.s16        q3, q0, q4

-        vmovn.i16  d6, q3
-        vshr.s16   q1, q0, #2
-        vmovn.i16  d6, q3
-        vcgt.s16   q5, q1, q5
-        vmov       r7, s12
-        cmp        r7, #0
-        beq        bypasswrite
+        vmovn.i16       d6, q3
+        vshr.s16        q1, q0, #2
+        vmovn.i16       d6, q3
+        vcgt.s16        q5, q1, q5
+        vmov            r7, s12
+        cmp             r7, #0
+        beq             bypasswrite

-        vpadd.i32  d0, d14, d12
-        vpadd.i32  d1, d15, d13
-        vmov       q4, q2
-        vshl.s16   q2, #2
-        vshr.s16   q1, q1, #1
-        vrhadd.s16 q2, q4
+        vpadd.i32       d0, d14, d12
+        vpadd.i32       d1, d15, d13
+        vmov            q4, q2
+        vshl.s16        q2, #2
+        vshr.s16        q1, q1, #1
+        vrhadd.s16      q2, q4

-        vabd.s16   q7, q8, q11
-        vaba.s16   q7, q15, q12
+        vabd.s16        q7, q8, q11
+        vaba.s16        q7, q15, q12

-        vmovn.i32  d0, q0
-        vmov       r5, r6, s0, s1
-        vcgt.s16   q6, q1, q7
-        vand       q5, q5, q6
-        vabd.s16   q7, q11, q12
-        vcgt.s16   q6, q2, q7
-        vand       q5, q5, q6
+        vmovn.i32       d0, q0
+        vmov            r5, r6, s0, s1
+        vcgt.s16        q6, q1, q7
+        vand            q5, q5, q6
+        vabd.s16        q7, q11, q12
+        vcgt.s16        q6, q2, q7
+        vand            q5, q5, q6

-        vmov       q2, q5
-        vtrn.s16   q5, q2
-        vshr.u64   q2, #32
-        vshl.u64   q5, #32
-        vshl.u64   q2, #32
-        vshr.u64   q5, #32
-        vorr       q5, q2
+        vmov            q2, q5
+        vtrn.s16        q5, q2
+        vshr.u64        q2, #32
+        vshl.u64        q5, #32
+        vshl.u64        q2, #32
+        vshr.u64        q5, #32
+        vorr            q5, q2

-        vmov       q2, q5
-        vshl.i16   q7, q4, #1
-        vtrn.32    q2, q5
-        vand       q5, q2
-        vneg.s16   q6, q7
-        vmovn.i16  d4, q5
-        vmovn.i16  d4, q2
-        vmov       r8, s8
+        vmov            q2, q5
+        vshl.i16        q7, q4, #1
+        vtrn.32         q2, q5
+        vand            q5, q2
+        vneg.s16        q6, q7
+        vmovn.i16       d4, q5
+        vmovn.i16       d4, q2
+        vmov            r8, s8

-        and        r9, r8, r7
-        cmp        r9, #0
-        beq        1f
+        and             r9, r8, r7
+        cmp             r9, #0
+        beq             1f

-        vadd.i16  q2, q11, q12
-        vadd.i16  q4, q9, q8
-        vadd.i16  q1, q2, q10
-        vdup.16   d10, r9
-        vadd.i16  q0, q1, q9
-        vshl.i16  q4, #1
-        lsr        r9, #16
-        vadd.i16  q1, q0
-        vrshr.s16 q3, q0, #2
-        vadd.i16  q1, q13
-        vadd.i16  q4, q0
-        vsub.i16  q3, q10
-        vrshr.s16 q1, #3
-        vrshr.s16 q4, #3
-        vmax.s16  q3, q6
-        vsub.i16  q1, q11
-        vsub.i16  q4, q9
-        vmin.s16  q3, q7
-        vmax.s16  q4, q6
-        vmax.s16  q1, q6
-        vadd.i16  q3, q10
-        vmin.s16  q4, q7
-        vmin.s16  q1, q7
-        vdup.16   d11, r9
-        vadd.i16  q4, q9
-        vadd.i16  q1, q11
-        vbit      q9, q4, q5
-        vadd.i16  q4, q2, q13
-        vbit      q11, q1, q5
-        vadd.i16  q0, q4, q14
-        vadd.i16  q2, q15, q14
-        vadd.i16  q4, q0
+        vadd.i16        q2, q11, q12
+        vadd.i16        q4, q9, q8
+        vadd.i16        q1, q2, q10
+        vdup.16         d10, r9
+        vadd.i16        q0, q1, q9
+        vshl.i16        q4, #1
+        lsr             r9, #16
+        vadd.i16        q1, q0
+        vrshr.s16       q3, q0, #2
+        vadd.i16        q1, q13
+        vadd.i16        q4, q0
+        vsub.i16        q3, q10
+        vrshr.s16       q1, #3
+        vrshr.s16       q4, #3
+        vmax.s16        q3, q6
+        vsub.i16        q1, q11
+        vsub.i16        q4, q9
+        vmin.s16        q3, q7
+        vmax.s16        q4, q6
+        vmax.s16        q1, q6
+        vadd.i16        q3, q10
+        vmin.s16        q4, q7
+        vmin.s16        q1, q7
+        vdup.16         d11, r9
+        vadd.i16        q4, q9
+        vadd.i16        q1, q11
+        vbit            q9, q4, q5
+        vadd.i16        q4, q2, q13
+        vbit            q11, q1, q5
+        vadd.i16        q0, q4, q14
+        vadd.i16        q2, q15, q14
+        vadd.i16        q4, q0

-        vshl.i16  q2, #1
-        vadd.i16  q4, q10
-        vbit      q10, q3, q5
-        vrshr.s16 q4, #3
-        vadd.i16  q2, q0
-        vrshr.s16 q3, q0, #2
-        vsub.i16  q4, q12
-        vrshr.s16 q2, #3
-        vsub.i16  q3, q13
-        vmax.s16  q4, q6
-        vsub.i16  q2, q14
-        vmax.s16  q3, q6
-        vmin.s16  q4, q7
-        vmax.s16  q2, q6
-        vmin.s16  q3, q7
-        vadd.i16  q4, q12
-        vmin.s16  q2, q7
-        vadd.i16  q3, q13
-        vbit      q12, q4, q5
-        vadd.i16  q2, q14
-        vbit      q13, q3, q5
-        vbit      q14, q2, q5
+        vshl.i16        q2, #1
+        vadd.i16        q4, q10
+        vbit            q10, q3, q5
+        vrshr.s16       q4, #3
+        vadd.i16        q2, q0
+        vrshr.s16       q3, q0, #2
+        vsub.i16        q4, q12
+        vrshr.s16       q2, #3
+        vsub.i16        q3, q13
+        vmax.s16        q4, q6
+        vsub.i16        q2, q14
+        vmax.s16        q3, q6
+        vmin.s16        q4, q7
+        vmax.s16        q2, q6
+        vmin.s16        q3, q7
+        vadd.i16        q4, q12
+        vmin.s16        q2, q7
+        vadd.i16        q3, q13
+        vbit            q12, q4, q5
+        vadd.i16        q2, q14
+        vbit            q13, q3, q5
+        vbit            q14, q2, q5

 1:
-        mvn       r8, r8
-        and       r9, r8, r7
-        cmp       r9, #0
-        beq       2f
+        mvn             r8, r8
+        and             r9, r8, r7
+        cmp             r9, #0
+        beq             2f

-        vdup.16    q4, r2
+        vdup.16         q4, r2

-        vdup.16   d10, r9
-        lsr       r9, #16
-        vmov       q1, q4
-        vdup.16   d11, r9
-        vshr.s16   q1, #1
-        vsub.i16  q2, q12, q11
-        vadd.i16   q4, q1
-        vshl.s16  q0, q2, #3
-        vshr.s16   q4, #3
-        vadd.i16  q2, q0
-        vsub.i16  q0, q13, q10
-        vsub.i16  q2, q0
-        vshl.i16  q0, q0, #1
-        vsub.i16  q2, q0
-        vshl.s16  q1, q7, 2
-        vrshr.s16 q2, q2, #4
-        vadd.i16  q1, q7
-        vabs.s16  q3, q2
-        vshr.s16  q6, q6, #1
-        vcgt.s16  q1, q1, q3
-        vand      q5, q1
-        vshr.s16  q7, q7, #1
-        vmax.s16  q2, q2, q6
-        vmin.s16  q2, q2, q7
+        vdup.16         d10, r9
+        lsr             r9, #16
+        vmov            q1, q4
+        vdup.16         d11, r9
+        vshr.s16        q1, #1
+        vsub.i16        q2, q12, q11
+        vadd.i16        q4, q1
+        vshl.s16        q0, q2, #3
+        vshr.s16        q4, #3
+        vadd.i16        q2, q0
+        vsub.i16        q0, q13, q10
+        vsub.i16        q2, q0
+        vshl.i16        q0, q0, #1
+        vsub.i16        q2, q0
+        vshl.s16        q1, q7, 2
+        vrshr.s16       q2, q2, #4
+        vadd.i16        q1, q7
+        vabs.s16        q3, q2
+        vshr.s16        q6, q6, #1
+        vcgt.s16        q1, q1, q3
+        vand            q5, q1
+        vshr.s16        q7, q7, #1
+        vmax.s16        q2, q2, q6
+        vmin.s16        q2, q2, q7

-        vshr.s16  q7, q7, #1
-        vrhadd.s16 q3, q9, q11
-        vneg.s16  q6, q7
-        vsub.s16  q3, q10
-        vdup.16   d2, r5
-        vhadd.s16 q3, q2
-        vdup.16   d3, r6
-        vmax.s16  q3, q3, q6
-        vcgt.s16  q1, q4, q1
-        vmin.s16  q3, q3, q7
-        vand      q1, q5
-        vadd.i16  q3, q10
-        lsr       r5, #16
-        lsr       r6, #16
-        vbit      q10, q3, q1
+        vshr.s16        q7, q7, #1
+        vrhadd.s16      q3, q9, q11
+        vneg.s16        q6, q7
+        vsub.s16        q3, q10
+        vdup.16         d2, r5
+        vhadd.s16       q3, q2
+        vdup.16         d3, r6
+        vmax.s16        q3, q3, q6
+        vcgt.s16        q1, q4, q1
+        vmin.s16        q3, q3, q7
+        vand            q1, q5
+        vadd.i16        q3, q10
+        lsr             r5, #16
+        lsr             r6, #16
+        vbit            q10, q3, q1

-        vrhadd.s16 q3, q14, q12
-        vdup.16   d2, r5
-        vsub.s16  q3, q13
-        vdup.16   d3, r6
-        vhsub.s16 q3, q2
-        vcgt.s16  q1, q4, q1
-        vmax.s16  q3, q3, q6
-        vand      q1, q5
-        vmin.s16  q3, q3, q7
-        vadd.i16  q3, q13
-        vbit      q13, q3, q1
-        vadd.i16  q0, q11, q2
-        vsub.i16  q4, q12, q2
-        vbit      q11, q0, q5
-        vbit      q12, q4, q5
+        vrhadd.s16      q3, q14, q12
+        vdup.16         d2, r5
+        vsub.s16        q3, q13
+        vdup.16         d3, r6
+        vhsub.s16       q3, q2
+        vcgt.s16        q1, q4, q1
+        vmax.s16        q3, q3, q6
+        vand            q1, q5
+        vmin.s16        q3, q3, q7
+        vadd.i16        q3, q13
+        vbit            q13, q3, q1
+        vadd.i16        q0, q11, q2
+        vsub.i16        q4, q12, q2
+        vbit            q11, q0, q5
+        vbit            q12, q4, q5

 2:
-        vqmovun.s16 d16, q8
-        vqmovun.s16 d18, q9
-        vqmovun.s16 d20, q10
-        vqmovun.s16 d22, q11
-        vqmovun.s16 d24, q12
-        vqmovun.s16 d26, q13
-        vqmovun.s16 d28, q14
-        vqmovun.s16 d30, q15
+        vqmovun.s16     d16, q8
+        vqmovun.s16     d18, q9
+        vqmovun.s16     d20, q10
+        vqmovun.s16     d22, q11
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d26, q13
+        vqmovun.s16     d28, q14
+        vqmovun.s16     d30, q15
 .endm

 function ff_hevc_v_loop_filter_luma_neon, export=1
        hevc_loop_filter_luma_start
-        push     {r5-r11}
-        vpush    {d8-d15}
-        sub      r0, #4
-        vld1.8   {d16}, [r0], r1
-        vld1.8   {d18}, [r0], r1
-        vld1.8   {d20}, [r0], r1
-        vld1.8   {d22}, [r0], r1
-        vld1.8   {d24}, [r0], r1
-        vld1.8   {d26}, [r0], r1
-        vld1.8   {d28}, [r0], r1
-        vld1.8   {d30}, [r0], r1
-        sub      r0, r0, r1, lsl #3
-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        push            {r5-r11}
+        vpush           {d8-d15}
+        sub             r0, #4
+        vld1.8          {d16}, [r0], r1
+        vld1.8          {d18}, [r0], r1
+        vld1.8          {d20}, [r0], r1
+        vld1.8          {d22}, [r0], r1
+        vld1.8          {d24}, [r0], r1
+        vld1.8          {d26}, [r0], r1
+        vld1.8          {d28}, [r0], r1
+        vld1.8          {d30}, [r0], r1
+        sub             r0, r0, r1, lsl #3
+        transpose_8x8   d16, d18, d20, d22, d24, d26, d28, d30
        hevc_loop_filter_luma_body
-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
-        vst1.8   {d16}, [r0], r1
-        vst1.8   {d18}, [r0], r1
-        vst1.8   {d20}, [r0], r1
-        vst1.8   {d22}, [r0], r1
-        vst1.8   {d24}, [r0], r1
-        vst1.8   {d26}, [r0], r1
-        vst1.8   {d28}, [r0], r1
-        vst1.8   {d30}, [r0]
-        vpop     {d8-d15}
-        pop      {r5-r11}
-        bx lr
+        transpose_8x8   d16, d18, d20, d22, d24, d26, d28, d30
+        vst1.8          {d16}, [r0], r1
+        vst1.8          {d18}, [r0], r1
+        vst1.8          {d20}, [r0], r1
+        vst1.8          {d22}, [r0], r1
+        vst1.8          {d24}, [r0], r1
+        vst1.8          {d26}, [r0], r1
+        vst1.8          {d28}, [r0], r1
+        vst1.8          {d30}, [r0]
+        vpop            {d8-d15}
+        pop             {r5-r11}
+        bx              lr
 endfunc

 function ff_hevc_h_loop_filter_luma_neon, export=1
        hevc_loop_filter_luma_start
-        push     {r5-r11}
-        vpush    {d8-d15}
-        sub      r0, r0, r1, lsl #2
-        vld1.8  {d16}, [r0], r1
-        vld1.8  {d18}, [r0], r1
-        vld1.8  {d20}, [r0], r1
-        vld1.8  {d22}, [r0], r1
-        vld1.8  {d24}, [r0], r1
-        vld1.8  {d26}, [r0], r1
-        vld1.8  {d28}, [r0], r1
-        vld1.8  {d30}, [r0], r1
-        sub        r0, r0, r1, lsl #3
-        add        r0, r1
+        push            {r5-r11}
+        vpush           {d8-d15}
+        sub             r0, r0, r1, lsl #2
+        vld1.8          {d16}, [r0], r1
+        vld1.8          {d18}, [r0], r1
+        vld1.8          {d20}, [r0], r1
+        vld1.8          {d22}, [r0], r1
+        vld1.8          {d24}, [r0], r1
+        vld1.8          {d26}, [r0], r1
+        vld1.8          {d28}, [r0], r1
+        vld1.8          {d30}, [r0], r1
+        sub             r0, r0, r1, lsl #3
+        add             r0, r1
        hevc_loop_filter_luma_body
-        vst1.8   {d18}, [r0], r1
-        vst1.8   {d20}, [r0], r1
-        vst1.8   {d22}, [r0], r1
-        vst1.8   {d24}, [r0], r1
-        vst1.8   {d26}, [r0], r1
-        vst1.8   {d28}, [r0]
+        vst1.8          {d18}, [r0], r1
+        vst1.8          {d20}, [r0], r1
+        vst1.8          {d22}, [r0], r1
+        vst1.8          {d24}, [r0], r1
+        vst1.8          {d26}, [r0], r1
+        vst1.8          {d28}, [r0]
 bypasswrite:
-        vpop     {d8-d15}
-        pop      {r5-r11}
-        bx lr
+        vpop            {d8-d15}
+        pop             {r5-r11}
+        bx              lr
 endfunc

 function ff_hevc_v_loop_filter_chroma_neon, export=1
        hevc_loop_filter_chroma_start
-        sub      r0, #4
-        vld1.8   {d16}, [r0], r1
-        vld1.8   {d17}, [r0], r1
-        vld1.8   {d18}, [r0], r1
-        vld1.8   {d2},  [r0], r1
-        vld1.8   {d4},  [r0], r1
-        vld1.8   {d19}, [r0], r1
-        vld1.8   {d20}, [r0], r1
-        vld1.8   {d21}, [r0], r1
-        sub      r0, r0, r1, lsl #3
-        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        sub             r0, #4
+        vld1.8          {d16}, [r0], r1
+        vld1.8          {d17}, [r0], r1
+        vld1.8          {d18}, [r0], r1
+        vld1.8          {d2},  [r0], r1
+        vld1.8          {d4},  [r0], r1
+        vld1.8          {d19}, [r0], r1
+        vld1.8          {d20}, [r0], r1
+        vld1.8          {d21}, [r0], r1
+        sub             r0, r0, r1, lsl #3
+        transpose_8x8   d16, d17, d18, d2, d4, d19, d20, d21
        hevc_loop_filter_chroma_body
-        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
-        vst1.8   {d16}, [r0], r1
-        vst1.8   {d17}, [r0], r1
-        vst1.8   {d18}, [r0], r1
-        vst1.8   {d2},  [r0], r1
-        vst1.8   {d4},  [r0], r1
-        vst1.8   {d19}, [r0], r1
-        vst1.8   {d20}, [r0], r1
-        vst1.8   {d21}, [r0]
-        bx       lr
+        transpose_8x8   d16, d17, d18, d2, d4, d19, d20, d21
+        vst1.8          {d16}, [r0], r1
+        vst1.8          {d17}, [r0], r1
+        vst1.8          {d18}, [r0], r1
+        vst1.8          {d2},  [r0], r1
+        vst1.8          {d4},  [r0], r1
+        vst1.8          {d19}, [r0], r1
+        vst1.8          {d20}, [r0], r1
+        vst1.8          {d21}, [r0]
+        bx              lr
 endfunc

 function ff_hevc_h_loop_filter_chroma_neon, export=1
        hevc_loop_filter_chroma_start
-        sub      r0, r0, r1, lsl #1
-        vld1.8   {d18}, [r0], r1
-        vld1.8   {d2}, [r0], r1
-        vld1.8   {d4}, [r0], r1
-        vld1.8   {d19}, [r0]
-        sub      r0, r0, r1, lsl #1
+        sub             r0, r0, r1, lsl #1
+        vld1.8          {d18}, [r0], r1
+        vld1.8          {d2}, [r0], r1
+        vld1.8          {d4}, [r0], r1
+        vld1.8          {d19}, [r0]
+        sub             r0, r0, r1, lsl #1
        hevc_loop_filter_chroma_body
-        vst1.8   {d2}, [r0], r1
-        vst1.8   {d4}, [r0]
-        bx       lr
+        vst1.8          {d2}, [r0], r1
+        vst1.8          {d4}, [r0]
+        bx              lr
 endfunc
@@ -322,44 +322,44 @@ endfunc
 .endm

 .macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
-         vshll.s16      \tmp0, \in0, #6
-         vmull.s16      \tmp2, \in1, d4[1]
-         vmov           \tmp1, \tmp0
-         vmull.s16      \tmp3, \in1, d4[3]
-         vmlal.s16      \tmp0, \in2, d4[0] @e0
-         vmlsl.s16      \tmp1, \in2, d4[0] @e1
-         vmlal.s16      \tmp2, \in3, d4[3] @o0
-         vmlsl.s16      \tmp3, \in3, d4[1] @o1
+        vshll.s16       \tmp0, \in0, #6
+        vmull.s16       \tmp2, \in1, d4[1]
+        vmov            \tmp1, \tmp0
+        vmull.s16       \tmp3, \in1, d4[3]
+        vmlal.s16       \tmp0, \in2, d4[0] @e0
+        vmlsl.s16       \tmp1, \in2, d4[0] @e1
+        vmlal.s16       \tmp2, \in3, d4[3] @o0
+        vmlsl.s16       \tmp3, \in3, d4[1] @o1

-         vadd.s32       \tmp4, \tmp0, \tmp2
-         vsub.s32       \tmp0, \tmp0, \tmp2
-         vadd.s32       \tmp2, \tmp1, \tmp3
-         vsub.s32       \tmp1, \tmp1, \tmp3
-         vqrshrn.s32    \out0, \tmp4, #\shift
-         vqrshrn.s32    \out3, \tmp0, #\shift
-         vqrshrn.s32    \out1, \tmp2, #\shift
-         vqrshrn.s32    \out2, \tmp1, #\shift
+        vadd.s32        \tmp4, \tmp0, \tmp2
+        vsub.s32        \tmp0, \tmp0, \tmp2
+        vadd.s32        \tmp2, \tmp1, \tmp3
+        vsub.s32        \tmp1, \tmp1, \tmp3
+        vqrshrn.s32     \out0, \tmp4, #\shift
+        vqrshrn.s32     \out3, \tmp0, #\shift
+        vqrshrn.s32     \out1, \tmp2, #\shift
+        vqrshrn.s32     \out2, \tmp1, #\shift
 .endm

 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3
-         vshll.s16      \tmp0, \in0, #6
-         vld1.s16       {\in0}, [r1, :64]!
-         vmov           \tmp1, \tmp0
-         vmull.s16      \tmp2, \in1, \in0[1]
-         vmull.s16      \tmp3, \in1, \in0[3]
-         vmlal.s16      \tmp0, \in2, \in0[0] @e0
-         vmlsl.s16      \tmp1, \in2, \in0[0] @e1
-         vmlal.s16      \tmp2, \in3, \in0[3] @o0
-         vmlsl.s16      \tmp3, \in3, \in0[1] @o1
+        vshll.s16       \tmp0, \in0, #6
+        vld1.s16        {\in0}, [r1, :64]!
+        vmov            \tmp1, \tmp0
+        vmull.s16       \tmp2, \in1, \in0[1]
+        vmull.s16       \tmp3, \in1, \in0[3]
+        vmlal.s16       \tmp0, \in2, \in0[0] @e0
+        vmlsl.s16       \tmp1, \in2, \in0[0] @e1
+        vmlal.s16       \tmp2, \in3, \in0[3] @o0
+        vmlsl.s16       \tmp3, \in3, \in0[1] @o1

-         vld1.s16       {\in0}, [r1, :64]
+        vld1.s16        {\in0}, [r1, :64]

-         vadd.s32       \out0, \tmp0, \tmp2
-         vadd.s32       \out1, \tmp1, \tmp3
-         vsub.s32       \out2, \tmp1, \tmp3
-         vsub.s32       \out3, \tmp0, \tmp2
+        vadd.s32        \out0, \tmp0, \tmp2
+        vadd.s32        \out1, \tmp1, \tmp3
+        vsub.s32        \out2, \tmp1, \tmp3
+        vsub.s32        \out3, \tmp0, \tmp2

-         sub            r1,  r1,  #8
+        sub             r1,  r1,  #8
 .endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@@ -385,7 +385,7 @@ function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
        tr_4x4          d16, d17, d18, d19, d0, d1, d2, d3, 20 - \bitdepth, q10, q11, q12, q13, q0
        transpose_4x4   q0, q1, d0, d1, d2, d3
        vst1.s16        {d0-d3}, [r0, :128]
-        bx lr
+        bx              lr
 endfunc
 .endm

@@ -557,14 +557,14 @@ endfunc
 .endm

 .macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
-        sum_sub q5,     \in, \t0, \op0
-        sum_sub q6,     \in, \t1, \op1
-        sum_sub q7,     \in, \t2, \op2
-        sum_sub q8,     \in, \t3, \op3
-        sum_sub q9,     \in, \t4, \op4
-        sum_sub q10,    \in, \t5, \op5
-        sum_sub q11,    \in, \t6, \op6
-        sum_sub q12,    \in, \t7, \op7
+        sum_sub         q5,  \in, \t0, \op0
+        sum_sub         q6,  \in, \t1, \op1
+        sum_sub         q7,  \in, \t2, \op2
+        sum_sub         q8,  \in, \t3, \op3
+        sum_sub         q9,  \in, \t4, \op4
+        sum_sub         q10, \in, \t5, \op5
+        sum_sub         q11, \in, \t6, \op6
+        sum_sub         q12, \in, \t7, \op7
 .endm

 .macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -682,7 +682,7 @@ function func_tr_16x4_\name
        mov             r4, #-32
        store16         d26, d27, d28, d29, d30, d31, d8, d9, r4
    .else
-        store_to_stack (\offset + 64), (\offset + 176), q4, q9, q10, q11, q3, q2, q1, q0
+        store_to_stack  (\offset + 64), (\offset + 176), q4, q9, q10, q11, q3, q2, q1, q0
    .endif

        bx              lr
@@ -744,10 +744,10 @@ endfunc
 .endm

 .macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
-        sum_sub q10,     \in, \t0, \op0
-        sum_sub q11,     \in, \t1, \op1
-        sum_sub q12,     \in, \t2, \op2
-        sum_sub q13,     \in, \t3, \op3
+        sum_sub         q10, \in, \t0, \op0
+        sum_sub         q11, \in, \t1, \op1
+        sum_sub         q12, \in, \t2, \op2
+        sum_sub         q13, \in, \t3, \op3
 .endm

 .macro butterfly32 in0, in1, in2, in3
@@ -900,7 +900,7 @@ function func_tr_32x4_\name
        add             r3, r11, #(32 + 3 * 64)
        scale_store     \shift

-        bx               r10
+        bx              r10
 endfunc
 .endm

@@ -965,59 +965,59 @@ idct_32x32_dc 10
 /* uses registers q2 - q9 for temp values */
 /* TODO: reorder */
 .macro tr4_luma_shift r0, r1, r2, r3, shift
-        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
-        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
-        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
-        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
+        vaddl.s16       q5, \r0, \r2    // c0 = src0 + src2
+        vaddl.s16       q2, \r2, \r3    // c1 = src2 + src3
+        vsubl.s16       q4, \r0, \r3    // c2 = src0 - src3
+        vmull.s16       q6, \r1, d0[0]  // c3 = 74 * src1

-        vaddl.s16   q7, \r0, \r3    // src0 + src3
-        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
-        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
+        vaddl.s16       q7, \r0, \r3    // src0 + src3
+        vsubw.s16       q7, q7, \r2     // src0 - src2 + src3
+        vmul.s32        q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)

-        vmul.s32    q8, q5, d0[1]   // 29 * c0
-        vmul.s32    q9, q2, d1[0]   // 55 * c1
-        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
-        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
+        vmul.s32        q8, q5, d0[1]   // 29 * c0
+        vmul.s32        q9, q2, d1[0]   // 55 * c1
+        vadd.s32        q8, q9          // 29 * c0 + 55 * c1
+        vadd.s32        q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3

-        vmul.s32    q2, q2, d0[1]   // 29 * c1
-        vmul.s32    q9, q4, d1[0]   // 55 * c2
-        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
-        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
+        vmul.s32        q2, q2, d0[1]   // 29 * c1
+        vmul.s32        q9, q4, d1[0]   // 55 * c2
+        vsub.s32        q9, q2          // 55 * c2 - 29 * c1
+        vadd.s32        q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3

-        vmul.s32    q5, q5, d1[0]   // 55 * c0
-        vmul.s32    q4, q4, d0[1]   // 29 * c2
-        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
-        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
+        vmul.s32        q5, q5, d1[0]   // 55 * c0
+        vmul.s32        q4, q4, d0[1]   // 29 * c2
+        vadd.s32        q5, q4          // 55 * c0 + 29 * c2
+        vsub.s32        q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3

-        vqrshrn.s32   \r0, q8, \shift
-        vqrshrn.s32   \r1, q9, \shift
-        vqrshrn.s32   \r2, q7, \shift
-        vqrshrn.s32   \r3, q5, \shift
+        vqrshrn.s32     \r0, q8, \shift
+        vqrshrn.s32     \r1, q9, \shift
+        vqrshrn.s32     \r2, q7, \shift
+        vqrshrn.s32     \r3, q5, \shift
 .endm

 .ltorg
 function ff_hevc_transform_luma_4x4_neon_8, export=1
-        vpush       {d8-d15}
-        vld1.16     {q14, q15}, [r0]  // coeffs
-        ldr         r3, =0x4a  // 74
-        vmov.32     d0[0], r3
-        ldr         r3, =0x1d  // 29
-        vmov.32     d0[1], r3
-        ldr         r3, =0x37  // 55
-        vmov.32     d1[0], r3
+        vpush           {d8-d15}
+        vld1.16         {q14, q15}, [r0]  // coeffs
+        ldr             r3, =0x4a  // 74
+        vmov.32         d0[0], r3
+        ldr             r3, =0x1d  // 29
+        vmov.32         d0[1], r3
+        ldr             r3, =0x37  // 55
+        vmov.32         d1[0], r3

-        tr4_luma_shift d28, d29, d30, d31, #7
+        tr4_luma_shift  d28, d29, d30, d31, #7

-        vtrn.16     d28, d29
-        vtrn.16     d30, d31
-        vtrn.32     q14, q15
+        vtrn.16         d28, d29
+        vtrn.16         d30, d31
+        vtrn.32         q14, q15

-        tr4_luma_shift d28, d29, d30, d31, #12
+        tr4_luma_shift  d28, d29, d30, d31, #12

-        vtrn.16     d28, d29
-        vtrn.16     d30, d31
-        vtrn.32     q14, q15
-        vst1.16     {q14, q15}, [r0]
-        vpop        {d8-d15}
-        bx lr
+        vtrn.16         d28, d29
+        vtrn.16         d30, d31
+        vtrn.32         q14, q15
+        vst1.16         {q14, q15}, [r0]
+        vpop            {d8-d15}
+        bx              lr
 endfunc
@@ -23,155 +23,155 @@
 #include "neon.S"

 function ff_hevc_sao_band_filter_neon_8, export=1
-        push    {r4-r10}
-        ldr     r5,  [sp, #28]   // width
-        ldr     r4,  [sp, #32]   // height
-        ldr     r8,  [sp, #36]   // offset_table
-        vpush   {d8-d15}
-        mov     r12,  r4         // r12 = height
-        mov     r6,   r0         // r6 = r0 = dst
-        mov     r7,   r1         // r7 = r1 = src
-        vldm    r8,   {q0-q3}
-        vmov.u16    q15,  #1
-        vmov.u8     q14,  #32
-0:      pld      [r1]
-        cmp      r5,    #4
-        beq      4f
-8:      subs     r4,    #1
-        vld1.8   {d16},  [r1], r3
-        vshr.u8  d17,   d16,  #3   // index = [src>>3]
-        vshll.u8 q9,    d17,  #1   // lowIndex = 2*index
-        vadd.u16 q11,   q9,   q15  // highIndex = (2*index+1) << 8
-        vshl.u16 q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
-        vadd.u16 q10,   q9         // combine high and low index;
+        push            {r4-r10}
+        ldr             r5,  [sp, #28]   // width
+        ldr             r4,  [sp, #32]   // height
+        ldr             r8,  [sp, #36]   // offset_table
+        vpush           {d8-d15}
+        mov             r12,  r4         // r12 = height
+        mov             r6,   r0         // r6 = r0 = dst
+        mov             r7,   r1         // r7 = r1 = src
+        vldm            r8,   {q0-q3}
+        vmov.u16        q15,  #1
+        vmov.u8         q14,  #32
+0:      pld             [r1]
+        cmp             r5,    #4
+        beq             4f
+8:      subs            r4,    #1
+        vld1.8          {d16},  [r1], r3
+        vshr.u8         d17,   d16,  #3   // index = [src>>3]
+        vshll.u8        q9,    d17,  #1   // lowIndex = 2*index
+        vadd.u16        q11,   q9,   q15  // highIndex = (2*index+1) << 8
+        vshl.u16        q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
+        vadd.u16        q10,   q9         // combine high and low index;
        // Look-up Table Round 1; index range: 0-15
-        vtbx.8   d24,   {q0-q1},   d20
-        vtbx.8   d25,   {q0-q1},   d21
+        vtbx.8          d24,   {q0-q1},   d20
+        vtbx.8          d25,   {q0-q1},   d21
        // Look-up Table Round 2; index range: 16-31
-        vsub.u8  q10,   q14        // Look-up with 8bit
-        vtbx.8   d24,   {q2-q3},   d20
-        vtbx.8   d25,   {q2-q3},   d21
-        vaddw.u8 q13,   q12,       d16
-        vqmovun.s16      d8,         q13
-        vst1.8    d8,   [r0],      r2
-        bne      8b
-        subs     r5,    #8
-        beq      99f
-        mov      r4,    r12
-        add r6, #8
-        mov r0, r6
-        add r7, #8
-        mov r1, r7
-        b        0b
-4:      subs     r4,    #1
-        vld1.32   {d16[0]},  [r1],  r3
-        vshr.u8  d17,   d16,  #3  // src>>3
-        vshll.u8 q9,    d17,  #1   // lowIndex = 2*index
-        vadd.u16 q11,   q9,   q15  // highIndex = (2*index+1) << 8
-        vshl.u16 q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
-        vadd.u16 q10,   q9         // combine high and low index;
+        vsub.u8         q10,   q14        // Look-up with 8bit
+        vtbx.8          d24,   {q2-q3},   d20
+        vtbx.8          d25,   {q2-q3},   d21
+        vaddw.u8        q13,   q12,       d16
+        vqmovun.s16     d8,    q13
+        vst1.8          d8,    [r0],      r2
+        bne             8b
+        subs            r5,    #8
+        beq             99f
+        mov             r4,    r12
+        add             r6, #8
+        mov             r0, r6
+        add             r7, #8
+        mov             r1, r7
+        b               0b
+4:      subs            r4,    #1
+        vld1.32         {d16[0]},  [r1],  r3
+        vshr.u8         d17,   d16,  #3  // src>>3
+        vshll.u8        q9,    d17,  #1   // lowIndex = 2*index
+        vadd.u16        q11,   q9,   q15  // highIndex = (2*index+1) << 8
+        vshl.u16        q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
+        vadd.u16        q10,   q9         // combine high and low index;
        // Look-up Table Round 1; index range: 0-15
-        vtbx.8   d24,   {q0-q1},   d20
-        vtbx.8   d25,   {q0-q1},   d21
+        vtbx.8          d24,   {q0-q1},   d20
+        vtbx.8          d25,   {q0-q1},   d21
        // Look-up Table Round 2; index range: 16-32
-        vsub.u8  q10,   q14        // Look-up with 8bit
-        vtbx.8   d24,   {q2-q3},   d20
-        vtbx.8   d25,   {q2-q3},   d21
-        vaddw.u8 q13,   q12,       d16
+        vsub.u8         q10,   q14        // Look-up with 8bit
+        vtbx.8          d24,   {q2-q3},   d20
+        vtbx.8          d25,   {q2-q3},   d21
+        vaddw.u8        q13,   q12,       d16
        vqmovun.s16     d14,       q13
-        vst1.32   d14[0],    [r0],     r2
-        bne      4b
-        b        99f
+        vst1.32         d14[0],    [r0],     r2
+        bne             4b
+        b               99f
 99:
-        vpop {d8-d15}
-        pop  {r4-r10}
-        bx   lr
+        vpop            {d8-d15}
+        pop             {r4-r10}
+        bx              lr
 endfunc

 function ff_hevc_sao_edge_filter_neon_8, export=1
-        push    {r4-r11}
-        ldr     r5,  [sp, #32]   // width
-        ldr     r4,  [sp, #36]   // height
-        ldr     r8,  [sp, #40]   // a_stride
-        ldr     r9,  [sp, #44]   // b_stride
-        ldr     r10, [sp, #48]   // sao_offset_val
-        ldr     r11, [sp, #52]   // edge_idx
-        vpush   {d8-d15}
-        mov     r12,  r4         // r12 = height
-        mov     r6,   r0         // r6 = r0 = dst
-        mov     r7,   r1         // r7 = r1 = src
-        vld1.8  {d0}, [r11]      // edge_idx table load in d0 5x8bit
-        vld1.16 {q1}, [r10]      // sao_offset_val table load in q1, 5x16bit
-        vmov.u8  d1,  #2
-        vmov.u16 q2,  #1
-0:      mov      r10,    r1
-        add      r10,    r8           // src[x + a_stride]
-        mov      r11,    r1
-        add      r11,    r9           // src[x + b_stride]
-        pld      [r1]
-        cmp      r5,     #4
-        beq      4f
-8:      subs     r4,     #1
-        vld1.8   {d16},  [r1],  r3    // src[x]  8x8bit
-        vld1.8   {d17},  [r10], r3    // src[x + a_stride]
-        vld1.8   {d18},  [r11], r3    // src[x + b_stride]
-        vcgt.u8  d8,     d16,   d17
-        vshr.u8  d9,     d8,    #7
-        vclt.u8  d8,     d16,   d17
-        vadd.u8  d8,     d9           // diff0
-        vcgt.u8  d10,    d16,   d18
-        vshr.u8  d11,    d10,   #7
-        vclt.u8  d10,    d16,   d18
-        vadd.u8  d10,    d11          // diff1
-        vadd.s8  d8,     d10
-        vadd.s8  d8,     d1
-        vtbx.8   d9,     {d0},  d8    // offset_val
-        vshll.u8 q6,     d9,    #1    // lowIndex
-        vadd.u16 q7,     q6,    q2
-        vshl.u16 q10,    q7,    #8    // highIndex
-        vadd.u16 q10,    q6           // combine lowIndex and highIndex, offset_val
-        vtbx.8   d22,    {q1},  d20
-        vtbx.8   d23,    {q1},  d21
-        vaddw.u8 q12,    q11,   d16
-        vqmovun.s16      d26,   q12
-        vst1.8   d26,    [r0],  r2
-        bne      8b
-        subs     r5,     #8
-        beq      99f
-        mov      r4,     r12
-        add      r6,     #8
-        mov      r0,     r6
-        add      r7,     #8
-        mov      r1,     r7
-        b        0b
-4:      subs     r4,    #1
-        vld1.32   {d16[0]},  [r1],  r3
-        vld1.32   {d17[0]},  [r10], r3    // src[x + a_stride]
-        vld1.32   {d18[0]},  [r11], r3    // src[x + b_stride]
-        vcgt.u8  d8,     d16,   d17
-        vshr.u8  d9,     d8,    #7
-        vclt.u8  d8,     d16,   d17
-        vadd.u8  d8,     d9           // diff0
-        vcgt.u8  d10,    d16,   d18
-        vshr.u8  d11,    d10,   #7
-        vclt.u8  d10,    d16,   d18
-        vadd.u8  d10,    d11          // diff1
-        vadd.s8  d8,     d10
-        vadd.s8  d8,     d1
-        vtbx.8   d9,     {d0},  d8    // offset_val
-        vshll.u8 q6,     d9,    #1    // lowIndex
-        vadd.u16 q7,     q6,    q2
-        vshl.u16 q10,    q7,    #8    // highIndex
-        vadd.u16 q10,    q6           // combine lowIndex and highIndex, offset_val
-        vtbx.8   d22,    {q1},  d20
-        vtbx.8   d23,    {q1},  d21
-        vaddw.u8 q12,    q11,   d16
-        vqmovun.s16      d26,   q12
-        vst1.32  d26[0], [r0],  r2
-        bne      4b
-        b        99f
+        push            {r4-r11}
+        ldr             r5,  [sp, #32]   // width
+        ldr             r4,  [sp, #36]   // height
+        ldr             r8,  [sp, #40]   // a_stride
+        ldr             r9,  [sp, #44]   // b_stride
+        ldr             r10, [sp, #48]   // sao_offset_val
+        ldr             r11, [sp, #52]   // edge_idx
+        vpush           {d8-d15}
+        mov             r12,  r4         // r12 = height
+        mov             r6,   r0         // r6 = r0 = dst
+        mov             r7,   r1         // r7 = r1 = src
+        vld1.8          {d0}, [r11]      // edge_idx table load in d0 5x8bit
+        vld1.16         {q1}, [r10]      // sao_offset_val table load in q1, 5x16bit
+        vmov.u8         d1,  #2
+        vmov.u16        q2,  #1
+0:      mov             r10,    r1
+        add             r10,    r8           // src[x + a_stride]
+        mov             r11,    r1
+        add             r11,    r9           // src[x + b_stride]
+        pld             [r1]
+        cmp             r5,     #4
+        beq             4f
+8:      subs            r4,     #1
+        vld1.8          {d16},  [r1],  r3    // src[x]  8x8bit
+        vld1.8          {d17},  [r10], r3    // src[x + a_stride]
+        vld1.8          {d18},  [r11], r3    // src[x + b_stride]
+        vcgt.u8         d8,     d16,   d17
+        vshr.u8         d9,     d8,    #7
+        vclt.u8         d8,     d16,   d17
+        vadd.u8         d8,     d9           // diff0
+        vcgt.u8         d10,    d16,   d18
+        vshr.u8         d11,    d10,   #7
+        vclt.u8         d10,    d16,   d18
+        vadd.u8         d10,    d11          // diff1
+        vadd.s8         d8,     d10
+        vadd.s8         d8,     d1
+        vtbx.8          d9,     {d0},  d8    // offset_val
+        vshll.u8        q6,     d9,    #1    // lowIndex
+        vadd.u16        q7,     q6,    q2
+        vshl.u16        q10,    q7,    #8    // highIndex
+        vadd.u16        q10,    q6           // combine lowIndex and highIndex, offset_val
+        vtbx.8          d22,    {q1},  d20
+        vtbx.8          d23,    {q1},  d21
+        vaddw.u8        q12,    q11,   d16
+        vqmovun.s16     d26,   q12
+        vst1.8          d26,    [r0],  r2
+        bne             8b
+        subs            r5,     #8
+        beq             99f
+        mov             r4,     r12
+        add             r6,     #8
+        mov             r0,     r6
+        add             r7,     #8
+        mov             r1,     r7
+        b               0b
+4:      subs            r4,    #1
+        vld1.32         {d16[0]},  [r1],  r3
+        vld1.32         {d17[0]},  [r10], r3    // src[x + a_stride]
+        vld1.32         {d18[0]},  [r11], r3    // src[x + b_stride]
+        vcgt.u8         d8,     d16,   d17
+        vshr.u8         d9,     d8,    #7
+        vclt.u8         d8,     d16,   d17
+        vadd.u8         d8,     d9           // diff0
+        vcgt.u8         d10,    d16,   d18
+        vshr.u8         d11,    d10,   #7
+        vclt.u8         d10,    d16,   d18
+        vadd.u8         d10,    d11          // diff1
+        vadd.s8         d8,     d10
+        vadd.s8         d8,     d1
+        vtbx.8          d9,     {d0},  d8    // offset_val
+        vshll.u8        q6,     d9,    #1    // lowIndex
+        vadd.u16        q7,     q6,    q2
+        vshl.u16        q10,    q7,    #8    // highIndex
+        vadd.u16        q10,    q6           // combine lowIndex and highIndex, offset_val
+        vtbx.8          d22,    {q1},  d20
+        vtbx.8          d23,    {q1},  d21
+        vaddw.u8        q12,    q11,   d16
+        vqmovun.s16     d26,    q12
+        vst1.32         d26[0], [r0],  r2
+        bne             4b
+        b               99f
 99:
-        vpop {d8-d15}
-        pop  {r4-r11}
-        bx   lr
+        vpop            {d8-d15}
+        pop             {r4-r11}
+        bx              lr
 endfunc
@@ -85,7 +85,7 @@
        beq             2f
        subs            \tmp, \tmp, #1
        beq             3f
-        b    4f
+        b               4f
 .endm

@ ----------------------------------------------------------------
@@ -56,314 +56,314 @@
 #define FIX_0xFFFF_ID          48

 function ff_j_rev_dct_arm, export=1
-        push {r0, r4 - r11, lr}
+        push            {r0, r4 - r11, lr}

-        mov lr, r0                      @ lr = pointer to the current row
-        mov r12, #8                     @ r12 = row-counter
-        movrel r11, const_array         @ r11 = base pointer to the constants array
+        mov             lr, r0                    @ lr = pointer to the current row
+        mov             r12, #8                   @ r12 = row-counter
+        movrel          r11, const_array          @ r11 = base pointer to the constants array
 row_loop:
-        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
-        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
+        ldrsh           r0, [lr, # 0]             @ r0 = 'd0'
+        ldrsh           r2, [lr, # 2]             @ r2 = 'd2'

        @ Optimization for row that have all items except the first set to 0
        @ (this works as the int16_t are always 4-byte aligned)
-        ldr r5, [lr, # 0]
-        ldr r6, [lr, # 4]
-        ldr r3, [lr, # 8]
-        ldr r4, [lr, #12]
-        orr r3, r3, r4
-        orr r3, r3, r6
-        orrs r5, r3, r5
-        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
-        orrs r3, r3, r2
-        beq empty_row
+        ldr             r5, [lr, # 0]
+        ldr             r6, [lr, # 4]
+        ldr             r3, [lr, # 8]
+        ldr             r4, [lr, #12]
+        orr             r3, r3, r4
+        orr             r3, r3, r6
+        orrs            r5, r3, r5
+        beq             end_of_row_loop           @ nothing to be done as ALL of them are '0'
+        orrs            r3, r3, r2
+        beq             empty_row

-        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
-        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
-        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
+        ldrsh           r1, [lr, # 8]             @ r1 = 'd1'
+        ldrsh           r4, [lr, # 4]             @ r4 = 'd4'
+        ldrsh           r6, [lr, # 6]             @ r6 = 'd6'

-        ldr r3, [r11, #FIX_0_541196100_ID]
-        add r7, r2, r6
-        ldr r5, [r11, #FIX_M_1_847759065_ID]
-        mul r7, r3, r7                      @ r7 = z1
-        ldr r3, [r11, #FIX_0_765366865_ID]
-        mla r6, r5, r6, r7                  @ r6 = tmp2
-        add r5, r0, r4                      @ r5 = tmp0
-        mla r2, r3, r2, r7                  @ r2 = tmp3
-        sub r3, r0, r4                      @ r3 = tmp1
+        ldr             r3, [r11, #FIX_0_541196100_ID]
+        add             r7, r2, r6
+        ldr             r5, [r11, #FIX_M_1_847759065_ID]
+        mul             r7, r3, r7                @ r7 = z1
+        ldr             r3, [r11, #FIX_0_765366865_ID]
+        mla             r6, r5, r6, r7            @ r6 = tmp2
+        add             r5, r0, r4                @ r5 = tmp0
+        mla             r2, r3, r2, r7            @ r2 = tmp3
+        sub             r3, r0, r4                @ r3 = tmp1

-        add r0, r2, r5, lsl #13             @ r0 = tmp10
-        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
-        add r4, r6, r3, lsl #13             @ r4 = tmp11
-        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
+        add             r0, r2, r5, lsl #13       @ r0 = tmp10
+        rsb             r2, r2, r5, lsl #13       @ r2 = tmp13
+        add             r4, r6, r3, lsl #13       @ r4 = tmp11
+        rsb             r3, r6, r3, lsl #13       @ r3 = tmp12

-        push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
+        push            {r0, r2, r3, r4}          @ save on the stack tmp10, tmp13, tmp12, tmp11

-        ldrsh r3, [lr, #10]             @ r3 = 'd3'
-        ldrsh r5, [lr, #12]             @ r5 = 'd5'
-        ldrsh r7, [lr, #14]             @ r7 = 'd7'
+        ldrsh           r3, [lr, #10]             @ r3 = 'd3'
+        ldrsh           r5, [lr, #12]             @ r5 = 'd5'
+        ldrsh           r7, [lr, #14]             @ r7 = 'd7'

-        add r0, r3, r5                        @ r0 = 'z2'
-        add r2, r1, r7                  @ r2 = 'z1'
-        add r4, r3, r7                  @ r4 = 'z3'
-        add r6, r1, r5                  @ r6 = 'z4'
-        ldr r9, [r11, #FIX_1_175875602_ID]
-        add r8, r4, r6                  @ r8 = z3 + z4
-        ldr r10, [r11, #FIX_M_0_899976223_ID]
-        mul r8, r9, r8                  @ r8 = 'z5'
-        ldr r9, [r11, #FIX_M_2_562915447_ID]
-        mul r2, r10, r2                 @ r2 = 'z1'
-        ldr r10, [r11, #FIX_M_1_961570560_ID]
-        mul r0, r9, r0                  @ r0 = 'z2'
-        ldr r9, [r11, #FIX_M_0_390180644_ID]
-        mla r4, r10, r4, r8             @ r4 = 'z3'
-        ldr r10, [r11, #FIX_0_298631336_ID]
-        mla r6, r9, r6, r8              @ r6 = 'z4'
-        ldr r9, [r11, #FIX_2_053119869_ID]
-        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
-        ldr r10, [r11, #FIX_3_072711026_ID]
-        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
-        ldr r9, [r11, #FIX_1_501321110_ID]
-        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
-        add r7, r7, r4                  @ r7 = tmp0
-        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
-        add r5,        r5, r6                  @ r5 = tmp1
-        add r3, r3, r4                  @ r3 = tmp2
-        add r1, r1, r6                  @ r1 = tmp3
+        add             r0, r3, r5                @ r0 = 'z2'
+        add             r2, r1, r7                @ r2 = 'z1'
+        add             r4, r3, r7                @ r4 = 'z3'
+        add             r6, r1, r5                @ r6 = 'z4'
+        ldr             r9, [r11, #FIX_1_175875602_ID]
+        add             r8, r4, r6                @ r8 = z3 + z4
+        ldr             r10, [r11, #FIX_M_0_899976223_ID]
+        mul             r8, r9, r8                @ r8 = 'z5'
+        ldr             r9, [r11, #FIX_M_2_562915447_ID]
+        mul             r2, r10, r2               @ r2 = 'z1'
+        ldr             r10, [r11, #FIX_M_1_961570560_ID]
+        mul             r0, r9, r0                @ r0 = 'z2'
+        ldr             r9, [r11, #FIX_M_0_390180644_ID]
+        mla             r4, r10, r4, r8           @ r4 = 'z3'
+        ldr             r10, [r11, #FIX_0_298631336_ID]
+        mla             r6, r9, r6, r8            @ r6 = 'z4'
+        ldr             r9, [r11, #FIX_2_053119869_ID]
+        mla             r7, r10, r7, r2           @ r7 = tmp0 + z1
+        ldr             r10, [r11, #FIX_3_072711026_ID]
+        mla             r5, r9, r5, r0            @ r5 = tmp1 + z2
+        ldr             r9, [r11, #FIX_1_501321110_ID]
+        mla             r3, r10, r3, r0           @ r3 = tmp2 + z2
+        add             r7, r7, r4                @ r7 = tmp0
+        mla             r1, r9, r1, r2            @ r1 = tmp3 + z1
+        add             r5, r5, r6                @ r5 = tmp1
+        add             r3, r3, r4                @ r3 = tmp2
+        add             r1, r1, r6                @ r1 = tmp3

-        pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
-                             @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+        pop             {r0, r2, r4, r6}          @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+                                                  @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
-        add r8, r0, r1
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 0]
+        add             r8, r0, r1
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, # 0]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
-        sub r8, r0, r1
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #14]
+        sub             r8, r0, r1
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, #14]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
-        add r8, r6, r3
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 2]
+        add             r8, r6, r3
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, # 2]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
-        sub r8, r6, r3
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #12]
+        sub             r8, r6, r3
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, #12]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
-        add r8, r4, r5
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 4]
+        add             r8, r4, r5
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, # 4]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
-        sub r8, r4, r5
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, #10]
+        sub             r8, r4, r5
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, #10]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
-        add r8, r2, r7
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 6]
+        add             r8, r2, r7
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, # 6]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
-        sub r8, r2, r7
-        add r8, r8, #(1<<10)
-        mov r8, r8, asr #11
-        strh r8, [lr, # 8]
+        sub             r8, r2, r7
+        add             r8, r8, #(1<<10)
+        mov             r8, r8, asr #11
+        strh            r8, [lr, # 8]

        @ End of row loop
-        add lr, lr, #16
-        subs r12, r12, #1
-        bne row_loop
-        beq start_column_loop
+        add             lr, lr, #16
+        subs            r12, r12, #1
+        bne             row_loop
+        beq             start_column_loop

 empty_row:
-        ldr r1, [r11, #FIX_0xFFFF_ID]
-        mov r0, r0, lsl #2
-        and r0, r0, r1
-        add r0, r0, r0, lsl #16
-        str r0, [lr, # 0]
-        str r0, [lr, # 4]
-        str r0, [lr, # 8]
-        str r0, [lr, #12]
+        ldr             r1, [r11, #FIX_0xFFFF_ID]
+        mov             r0, r0, lsl #2
+        and             r0, r0, r1
+        add             r0, r0, r0, lsl #16
+        str             r0, [lr, # 0]
+        str             r0, [lr, # 4]
+        str             r0, [lr, # 8]
+        str             r0, [lr, #12]

 end_of_row_loop:
        @ End of loop
-        add lr, lr, #16
-        subs r12, r12, #1
-        bne row_loop
+        add             lr, lr, #16
+        subs            r12, r12, #1
+        bne             row_loop

 start_column_loop:
        @ Start of column loop
-        pop {lr}
-        mov r12, #8
+        pop             {lr}
+        mov             r12, #8
 column_loop:
-        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
-        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
-        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
-        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
+        ldrsh           r0, [lr, #( 0*8)]         @ r0 = 'd0'
+        ldrsh           r2, [lr, #( 4*8)]         @ r2 = 'd2'
+        ldrsh           r4, [lr, #( 8*8)]         @ r4 = 'd4'
+        ldrsh           r6, [lr, #(12*8)]         @ r6 = 'd6'

-        ldr r3, [r11, #FIX_0_541196100_ID]
-        add r1, r2, r6
-        ldr r5, [r11, #FIX_M_1_847759065_ID]
-        mul r1, r3, r1                      @ r1 = z1
-        ldr r3, [r11, #FIX_0_765366865_ID]
-        mla r6, r5, r6, r1                  @ r6 = tmp2
-        add r5, r0, r4                      @ r5 = tmp0
-        mla r2, r3, r2, r1                  @ r2 = tmp3
-        sub r3, r0, r4                      @ r3 = tmp1
+        ldr             r3, [r11, #FIX_0_541196100_ID]
+        add             r1, r2, r6
+        ldr             r5, [r11, #FIX_M_1_847759065_ID]
+        mul             r1, r3, r1                @ r1 = z1
+        ldr             r3, [r11, #FIX_0_765366865_ID]
+        mla             r6, r5, r6, r1            @ r6 = tmp2
+        add             r5, r0, r4                @ r5 = tmp0
+        mla             r2, r3, r2, r1            @ r2 = tmp3
+        sub             r3, r0, r4                @ r3 = tmp1

-        add r0, r2, r5, lsl #13             @ r0 = tmp10
-        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
-        add r4, r6, r3, lsl #13             @ r4 = tmp11
-        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
+        add             r0, r2, r5, lsl #13       @ r0 = tmp10
+        rsb             r2, r2, r5, lsl #13       @ r2 = tmp13
+        add             r4, r6, r3, lsl #13       @ r4 = tmp11
+        rsb             r6, r6, r3, lsl #13       @ r6 = tmp12

-        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
-        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
-        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
-        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
+        ldrsh           r1, [lr, #( 2*8)]         @ r1 = 'd1'
+        ldrsh           r3, [lr, #( 6*8)]         @ r3 = 'd3'
+        ldrsh           r5, [lr, #(10*8)]         @ r5 = 'd5'
+        ldrsh           r7, [lr, #(14*8)]         @ r7 = 'd7'

        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
-        orr r9, r1, r3
-        orr r10, r5, r7
-        orrs r10, r9, r10
-        beq empty_odd_column
+        orr             r9, r1, r3
+        orr             r10, r5, r7
+        orrs            r10, r9, r10
+        beq             empty_odd_column

-        push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
+        push            {r0, r2, r4, r6}          @ save on the stack tmp10, tmp13, tmp11, tmp12

-        add r0, r3, r5                  @ r0 = 'z2'
-        add r2, r1, r7                  @ r2 = 'z1'
-        add r4, r3, r7                  @ r4 = 'z3'
-        add r6, r1, r5                  @ r6 = 'z4'
-        ldr r9, [r11, #FIX_1_175875602_ID]
-        add r8, r4, r6
-        ldr r10, [r11, #FIX_M_0_899976223_ID]
-        mul r8, r9, r8                  @ r8 = 'z5'
-        ldr r9, [r11, #FIX_M_2_562915447_ID]
-        mul r2, r10, r2                 @ r2 = 'z1'
-        ldr r10, [r11, #FIX_M_1_961570560_ID]
-        mul r0, r9, r0                  @ r0 = 'z2'
-        ldr r9, [r11, #FIX_M_0_390180644_ID]
-        mla r4, r10, r4, r8             @ r4 = 'z3'
-        ldr r10, [r11, #FIX_0_298631336_ID]
-        mla r6, r9, r6, r8              @ r6 = 'z4'
-        ldr r9, [r11, #FIX_2_053119869_ID]
-        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
-        ldr r10, [r11, #FIX_3_072711026_ID]
-        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
-        ldr r9, [r11, #FIX_1_501321110_ID]
-        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
-        add r7, r7, r4                  @ r7 = tmp0
-        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
-        add r5,        r5, r6                  @ r5 = tmp1
-        add r3, r3, r4                  @ r3 = tmp2
-        add r1, r1, r6                  @ r1 = tmp3
+        add             r0, r3, r5                @ r0 = 'z2'
+        add             r2, r1, r7                @ r2 = 'z1'
+        add             r4, r3, r7                @ r4 = 'z3'
+        add             r6, r1, r5                @ r6 = 'z4'
+        ldr             r9, [r11, #FIX_1_175875602_ID]
+        add             r8, r4, r6
+        ldr             r10, [r11, #FIX_M_0_899976223_ID]
+        mul             r8, r9, r8                @ r8 = 'z5'
+        ldr             r9, [r11, #FIX_M_2_562915447_ID]
+        mul             r2, r10, r2               @ r2 = 'z1'
+        ldr             r10, [r11, #FIX_M_1_961570560_ID]
+        mul             r0, r9, r0                @ r0 = 'z2'
+        ldr             r9, [r11, #FIX_M_0_390180644_ID]
+        mla             r4, r10, r4, r8           @ r4 = 'z3'
+        ldr             r10, [r11, #FIX_0_298631336_ID]
+        mla             r6, r9, r6, r8            @ r6 = 'z4'
+        ldr             r9, [r11, #FIX_2_053119869_ID]
+        mla             r7, r10, r7, r2           @ r7 = tmp0 + z1
+        ldr             r10, [r11, #FIX_3_072711026_ID]
+        mla             r5, r9, r5, r0            @ r5 = tmp1 + z2
+        ldr             r9, [r11, #FIX_1_501321110_ID]
+        mla             r3, r10, r3, r0           @ r3 = tmp2 + z2
+        add             r7, r7, r4                @ r7 = tmp0
+        mla             r1, r9, r1, r2            @ r1 = tmp3 + z1
+        add             r5, r5, r6                @ r5 = tmp1
+        add             r3, r3, r4                @ r3 = tmp2
+        add             r1, r1, r6                @ r1 = tmp3

-        pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
-                             @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+        pop             {r0, r2, r4, r6}          @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+                                                  @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
-        add r8, r0, r1
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 0*8)]
+        add             r8, r0, r1
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #( 0*8)]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
-        sub r8, r0, r1
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(14*8)]
+        sub             r8, r0, r1
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
-        add r8, r4, r3
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 2*8)]
+        add             r8, r4, r3
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #( 2*8)]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
-        sub r8, r4, r3
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(12*8)]
+        sub             r8, r4, r3
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
-        add r8, r6, r5
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 4*8)]
+        add             r8, r6, r5
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #( 4*8)]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
-        sub r8, r6, r5
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #(10*8)]
+        sub             r8, r6, r5
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
-        add r8, r2, r7
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 6*8)]
+        add             r8, r2, r7
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #( 6*8)]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
-        sub r8, r2, r7
-        add r8, r8, #(1<<17)
-        mov r8, r8, asr #18
-        strh r8, [lr, #( 8*8)]
+        sub             r8, r2, r7
+        add             r8, r8, #(1<<17)
+        mov             r8, r8, asr #18
+        strh            r8, [lr, #( 8*8)]

        @ End of row loop
-        add lr, lr, #2
-        subs r12, r12, #1
-        bne column_loop
-        beq the_end
+        add             lr, lr, #2
+        subs            r12, r12, #1
+        bne             column_loop
+        beq             the_end

 empty_odd_column:
        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
-        add r0, r0, #(1<<17)
-        mov r0, r0, asr #18
-        strh r0, [lr, #( 0*8)]
-        strh r0, [lr, #(14*8)]
+        add             r0, r0, #(1<<17)
+        mov             r0, r0, asr #18
+        strh            r0, [lr, #( 0*8)]
+        strh            r0, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
-        add r4, r4, #(1<<17)
-        mov r4, r4, asr #18
-        strh r4, [lr, #( 2*8)]
-        strh r4, [lr, #(12*8)]
+        add             r4, r4, #(1<<17)
+        mov             r4, r4, asr #18
+        strh            r4, [lr, #( 2*8)]
+        strh            r4, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
-        add r6, r6, #(1<<17)
-        mov r6, r6, asr #18
-        strh r6, [lr, #( 4*8)]
-        strh r6, [lr, #(10*8)]
+        add             r6, r6, #(1<<17)
+        mov             r6, r6, asr #18
+        strh            r6, [lr, #( 4*8)]
+        strh            r6, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
-        add r2, r2, #(1<<17)
-        mov r2, r2, asr #18
-        strh r2, [lr, #( 6*8)]
-        strh r2, [lr, #( 8*8)]
+        add             r2, r2, #(1<<17)
+        mov             r2, r2, asr #18
+        strh            r2, [lr, #( 6*8)]
+        strh            r2, [lr, #( 8*8)]

        @ End of row loop
-        add lr, lr, #2
-        subs r12, r12, #1
-        bne column_loop
+        add             lr, lr, #2
+        subs            r12, r12, #1
+        bne             column_loop

 the_end:
        @ The end....
-        pop {r4 - r11, pc}
+        pop             {r4 - r11, pc}
 endfunc

 const const_array
@@ -70,7 +70,7 @@
 /* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */
 function ff_rv34_idct_add_neon, export=1
        mov             r3,  r0
-        rv34_inv_transform   r2
+        rv34_inv_transform r2
        vmov.i16        q12, #0
        vrshrn.s32      d16, q1,  #10   @ (z0 + z3) >> 10
        vrshrn.s32      d17, q2,  #10   @ (z1 + z2) >> 10
@@ -99,7 +99,7 @@ endfunc

 /* void rv34_inv_transform_noround_neon(int16_t *block); */
 function ff_rv34_inv_transform_noround_neon, export=1
-        rv34_inv_transform   r0
+        rv34_inv_transform r0
        vshl.s32        q11, q2,  #1
        vshl.s32        q10, q1,  #1
        vshl.s32        q12, q3,  #1
@@ -687,7 +687,7 @@ endfunc
 .endm

 /* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-                                    int w1, int w2, int stride) */
+ *                                  int w1, int w2, int stride) */
 function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
@@ -704,7 +704,7 @@ function ff_rv40_weight_func_16_neon, export=1
 endfunc

 /* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
-                                   int w1, int w2, int stride) */
+ *                                 int w1, int w2, int stride) */
 function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
--- a/Show More
+++ b/Show More