LCOV - code coverage report
Current view:  top level - ASM_AVX2 - warp_plane_avx2.c (source / functions)
                                              Hit      Total    Coverage
Test:  coverage.info             Lines:       766        777      98.6 %
Date:  2019-11-25 17:38:06       Functions:    24         24     100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2019, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : #include "aom_dsp_rtcd.h"
      14             : #include "EbWarpedMotion.h"
      15             : 
      16             :  /* This is a modified version of 'eb_warped_filter' from warped_motion.c:
      17             :     * Each coefficient is stored in 8 bits instead of 16 bits
      18             :     * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
      19             : 
      20             :       This is done in order to avoid overflow: Since the tap with the largest
      21             :       coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
      22             :       order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
      23             :       convolve functions.
      24             : 
      25             :       Instead, we use the summation order
      26             :       ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
      27             :       The rearrangement of coefficients in this table is so that we can get the
      28             :       coefficients into the correct order more quickly.
      29             :  */
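A minimal scalar sketch of the reordering and summation described above (the
example row is taken from the middle of the [0, 1) block of the table below;
the driver itself is illustrative and not part of the instrumented source):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Stored column order is 0, 2, 4, 6, 1, 3, 5, 7 (see comment above):
         * stored_to_tap[i] is the natural tap index held at stored index i. */
        static const int stored_to_tap[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
        /* Example row: the symmetric mid-phase filter from the [0, 1) block. */
        const int8_t row[8] = { -2, -21, 79, 8, 8, 79, -21, -2 };
        const uint8_t px[8] = { 100, 100, 100, 100, 100, 100, 100, 100 };

        int8_t tap[8];
        for (int i = 0; i < 8; i++) tap[stored_to_tap[i]] = row[i];

        /* Summation order ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)): each
         * maddubs pair (0,2), (4,6), (1,3), (5,7) contains at most one of
         * the potentially large taps 2..5, so every 16-bit partial sum
         * stays in range where adjacent pairing could saturate. */
        int even = (px[0] * tap[0] + px[2] * tap[2]) +
                   (px[4] * tap[4] + px[6] * tap[6]);
        int odd  = (px[1] * tap[1] + px[3] * tap[3]) +
                   (px[5] * tap[5] + px[7] * tap[7]);
        printf("filtered = %d (128 * px = %d)\n", even + odd, 128 * px[0]);
        return 0;
    }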
      30             :  /* clang-format off */
      31             : DECLARE_ALIGNED(8, const int8_t,
      32             : eb_av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
      33             : #if WARPEDPIXEL_PREC_BITS == 6
      34             :         // [-1, 0)
      35             :         { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
      36             :         { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
      37             :         { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
      38             :         { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
      39             :         { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
      40             :         { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
      41             :         { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
      42             :         { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
      43             :         { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
      44             :         { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
      45             :         { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
      46             :         { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
      47             :         { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
      48             :         { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
      49             :         { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
      50             :         { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
      51             :         { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
      52             :         { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
      53             :         { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
      54             :         { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
      55             :         { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
      56             :         { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
      57             :         { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
      58             :         { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
      59             :         { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
      60             :         { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
      61             :         { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
      62             :         { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
      63             :         { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
      64             :         { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
      65             :         { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
      66             :         { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
      67             :         // [0, 1)
      68             :         { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
      69             :         { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
      70             :         { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
      71             :         {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
      72             :         {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
      73             :         {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
      74             :         {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
      75             :         {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
      76             :         {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
      77             :         {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
      78             :         {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
      79             :         {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
      80             :         {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
      81             :         {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
      82             :         {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
      83             :         {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
      84             :         {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
      85             :         {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
      86             :         {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
      87             :         {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
      88             :         {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
      89             :         {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
      90             :         {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
      91             :         {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
      92             :         {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
      93             :         {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
      94             :         {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
      95             :         {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
      96             :         {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
      97             :         {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
      98             :         { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
      99             :         { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
     100             :         // [1, 2)
     101             :         { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
     102             :         { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
     103             :         { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
     104             :         { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
     105             :         { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
     106             :         { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
     107             :         { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
     108             :         { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
     109             :         { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
     110             :         { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
     111             :         { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
     112             :         { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
     113             :         { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
     114             :         { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
     115             :         { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
     116             :         { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
     117             :         { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
     118             :         { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
     119             :         { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
     120             :         { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
     121             :         { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
     122             :         { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
     123             :         { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
     124             :         { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
     125             :         { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
     126             :         { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
     127             :         { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
     128             :         { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
     129             :         { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
     130             :         { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
     131             :         { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
     132             :         { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
     133             :         // dummy (replicate row index 191)
     134             :         { 0, 0,   2,  -1, 0,   0, 127, 0},
     135             : 
     136             :       #else
     137             :         // [-1, 0)
     138             :         { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
     139             :         { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
     140             :         { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
     141             :         { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
     142             :         { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
     143             :         { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
     144             :         { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
     145             :         { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
     146             :         { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
     147             :         { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
     148             :         { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
     149             :         { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
     150             :         { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
     151             :         { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
     152             :         { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
     153             :         { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
     154             :         // [0, 1)
     155             :         { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
     156             :         { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
     157             :         {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
     158             :         {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
     159             :         {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
     160             :         {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
     161             :         {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
     162             :         {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
     163             :         {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
     164             :         {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
     165             :         {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
     166             :         {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
     167             :         {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
     168             :         {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
     169             :         {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
     170             :         { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
     171             :         // [1, 2)
     172             :         { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
     173             :         { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
     174             :         { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
     175             :         { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
     176             :         { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
     177             :         { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
     178             :         { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
     179             :         { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
     180             :         { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
     181             :         { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
     182             :         { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
     183             :         { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
     184             :         { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
     185             :         { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
     186             :         { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
     187             :         { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
     188             :         // dummy (replicate row index 95)
     189             :         { 0, 0,   4,  -3, 0,  -1, 127, 1},
     190             :       #endif  // WARPEDPIXEL_PREC_BITS == 6
     191             : };
     192             : /* clang-format on */
     193             : 
     194             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
     195             :   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
     196             :   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
     197             : };
     198             : 
     199             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
     200             :   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
     201             :   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
     202             : };
     203             : 
     204             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
     205             :   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
     206             :   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
     207             : };
     208             : 
     209             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
     210             :   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
     211             :   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
     212             : };
     213             : 
     214             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
     215             :   0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
     216             :   0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
     217             : };
     218             : 
     219             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
     220             :   4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
     221             :   4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
     222             : };
     223             : 
     224             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
     225             :   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
     226             :   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
     227             : };
     228             : 
     229             : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
     230             :   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
     231             :   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
     232             : };
     233             : 
     234             : DECLARE_ALIGNED(32, static const uint8_t,
     235             : shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
     236             :                       5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
     237             :                       6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
     238             : 
     239             : DECLARE_ALIGNED(32, static const uint8_t,
     240             : shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
     241             :                       9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
     242             :                       10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
     243             : 
     244             : DECLARE_ALIGNED(32, static const uint8_t,
     245             : shuffle_src2[32]) = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
     246             :                       6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
     247             :                       7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
     248             : 
     249             : DECLARE_ALIGNED(32, static const uint8_t,
     250             : shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
     251             :                       10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
     252             :                       11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
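To see what these masks do, here is a scalar model (arbitrary demo pixel
values; one 128-bit lane only) of _mm256_shuffle_epi8 with shuffle_src0: it
gathers the (j, j+2) source-byte pairs for even outputs j = 0, 2, 4, 6 and
then for odd outputs j = 1, 3, 5, 7, which is the pairing the maddubs steps
below consume:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
                                   1, 3, 3, 5, 5, 7, 7, 9 };
        const uint8_t src[16]  = { 10, 11, 12, 13, 14, 15, 16, 17,
                                   18, 19, 20, 21, 22, 23, 24, 25 };
        uint8_t out[16];
        /* Per-lane byte gather, as _mm256_shuffle_epi8 does. */
        for (int k = 0; k < 16; k++) out[k] = src[mask[k]];
        /* out = 10 12 12 14 14 16 16 18  11 13 13 15 15 17 17 19:
         * pairs (src[j], src[j+2]) for j = 0,2,4,6 then j = 1,3,5,7. */
        for (int k = 0; k < 16; k++) printf("%u ", out[k]);
        printf("\n");
        return 0;
    }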
     253             : 
     254  2559100000 : static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
     255             :     __m256i *coeff,
     256             :     const __m256i *shuffle_src,
     257             :     const __m256i *round_const,
     258             :     const __m128i *shift, int row) {
     259  2559100000 :     const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
     260  2559100000 :     const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
     261  2559100000 :     const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
     262  2559100000 :     const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
     263             : 
     264  2559100000 :     const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
     265  2559100000 :     const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
     266  2559100000 :     const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
     267  5118200000 :     const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
     268             : 
     269  2559100000 :     const __m256i res_even = _mm256_add_epi16(res_02, res_46);
     270  2559100000 :     const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
     271             :     const __m256i res =
     272  5118200000 :         _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
     273  2559100000 :     horz_out[row] = _mm256_srl_epi16(res, *shift);
     274  2559100000 : }
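Per output sample, the shuffles, maddubs, and adds above reduce to the scalar
model below (a sketch: round_const and shift are whatever the caller passes
in; note that _mm256_srl_epi16 is a logical shift, so the biased sum is read
back as an unsigned 16-bit lane even if it wrapped as int16):

    #include <stdint.h>

    static inline uint16_t horiz_sample_model(const uint8_t *src,
                                              const int8_t *tap, /* taps 0..7 */
                                              int32_t round_const, int shift) {
        int32_t sum = round_const;
        for (int t = 0; t < 8; t++)
            sum += src[t] * tap[t];
        /* Truncate to 16 bits, then shift logically, mirroring the int16
         * wrap + _mm256_srl_epi16 behaviour of the vector code. */
        return (uint16_t)((uint16_t)sum >> shift);
    }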
     275             : 
     276  1570640000 : static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
     277             :     int sx,
     278             :     __m256i *coeff) {
     279  1570640000 :     __m128i tmp_0 = _mm_loadl_epi64(
     280  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
     281  1570640000 :     __m128i tmp_1 = _mm_loadl_epi64(
     282  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
     283  1570640000 :     __m128i tmp_2 = _mm_loadl_epi64(
     284  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
     285  1570640000 :     __m128i tmp_3 = _mm_loadl_epi64(
     286  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
     287  1570640000 :     __m128i tmp_4 = _mm_loadl_epi64(
     288  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
     289  1570640000 :     __m128i tmp_5 = _mm_loadl_epi64(
     290  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
     291  1570640000 :     __m128i tmp_6 = _mm_loadl_epi64(
     292  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
     293  1570640000 :     __m128i tmp_7 = _mm_loadl_epi64(
     294  1570640000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
     295             : 
     296  1570640000 :     tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
     297  1570640000 :     tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
     298  1570640000 :     tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
     299  1570640000 :     tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
     300             : 
     301             :     __m128i tmp_8 =
     302  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 0 * alpha) >>
     303             :             WARPEDDIFF_PREC_BITS]);
     304             :     __m128i tmp_9 =
     305  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 1 * alpha) >>
     306             :             WARPEDDIFF_PREC_BITS]);
     307             :     __m128i tmp_10 =
     308  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 2 * alpha) >>
     309             :             WARPEDDIFF_PREC_BITS]);
     310             :     __m128i tmp_11 =
     311  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 3 * alpha) >>
     312             :             WARPEDDIFF_PREC_BITS]);
     313             :     tmp_2 =
     314  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 4 * alpha) >>
     315             :             WARPEDDIFF_PREC_BITS]);
     316             :     tmp_3 =
     317  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 5 * alpha) >>
     318             :             WARPEDDIFF_PREC_BITS]);
     319             :     tmp_6 =
     320  1570640000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 6 * alpha) >>
     321             :             WARPEDDIFF_PREC_BITS]);
     322             :     tmp_7 =
     323  3141290000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 7 * alpha) >>
     324             :             WARPEDDIFF_PREC_BITS]);
     325             : 
     326  1570640000 :     tmp_8 = _mm_unpacklo_epi16(tmp_8, tmp_10);
     327  1570640000 :     tmp_2 = _mm_unpacklo_epi16(tmp_2, tmp_6);
     328  1570640000 :     tmp_9 = _mm_unpacklo_epi16(tmp_9, tmp_11);
     329  1570640000 :     tmp_3 = _mm_unpacklo_epi16(tmp_3, tmp_7);
     330             : 
     331             :     const __m256i tmp_12 =
     332  1570640000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_8, 0x1);
     333             :     const __m256i tmp_13 =
     334  1570640000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_1), tmp_9, 0x1);
     335             :     const __m256i tmp_14 =
     336  1570640000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_4), tmp_2, 0x1);
     337             :     const __m256i tmp_15 =
     338  1570640000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_5), tmp_3, 0x1);
     339             : 
     340  1570640000 :     const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
     341  1570640000 :     const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
     342  1570640000 :     const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
     343  1570640000 :     const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
     344             : 
     345  1570640000 :     coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
     346  1570640000 :     coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
     347  1570640000 :     coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
     348  1570640000 :     coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
     349  1570640000 : }
     350             : 
     351      898176 : static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
     352             :     __m256i *coeff) {
     353      898176 :     __m128i tmp_0 = _mm_loadl_epi64(
     354      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
     355      898176 :     __m128i tmp_1 = _mm_loadl_epi64(
     356      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
     357      898176 :     __m128i tmp_2 = _mm_loadl_epi64(
     358      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
     359      898176 :     __m128i tmp_3 = _mm_loadl_epi64(
     360      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
     361      898176 :     __m128i tmp_4 = _mm_loadl_epi64(
     362      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
     363      898176 :     __m128i tmp_5 = _mm_loadl_epi64(
     364      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
     365      898176 :     __m128i tmp_6 = _mm_loadl_epi64(
     366      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
     367      898176 :     __m128i tmp_7 = _mm_loadl_epi64(
     368      898176 :         (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
     369             : 
     370      898176 :     tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
     371      898176 :     tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
     372      898176 :     tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
     373      898176 :     tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
     374             : 
     375      898176 :     const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
     376      898176 :     const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
     377      898176 :     const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
     378      898176 :     const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
     379             : 
     380      898176 :     const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
     381      898176 :     const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
     382      898176 :     const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
     383      898176 :     const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
     384             : 
     385      898176 :     coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
     386      898176 :     coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
     387      898176 :     coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
     388      898176 :     coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
     389      898176 : }
     390             : 
     391   798570000 : static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
     392             :     __m256i *coeff) {
     393             :     const __m128i tmp_0 =
     394   798570000 :         _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
     395   798570000 :     const __m128i tmp_1 = _mm_loadl_epi64(
     396   798570000 :         (__m128i *)&eb_av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
     397             : 
     398             :     const __m256i res_0 =
     399   798570000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
     400             : 
     401  1597140000 :     coeff[0] = _mm256_shuffle_epi8(
     402             :         res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
     403  1597140000 :     coeff[1] = _mm256_shuffle_epi8(
     404             :         res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
     405  1597140000 :     coeff[2] = _mm256_shuffle_epi8(
     406             :         res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
     407   798570000 :     coeff[3] = _mm256_shuffle_epi8(
     408             :         res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
     409   798570000 : }
     410             : 
     411  1569300000 : static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
     412             :     int sx, int alpha, int beta, int row,
     413             :     const __m256i *shuffle_src,
     414             :     const __m256i *round_const,
     415             :     const __m128i *shift) {
     416             :     __m256i coeff[4];
     417  1569300000 :     prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
     418  1617870000 :     filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
     419             :         row);
     420  1607110000 : }
     421   236804000 : static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
     422             :     __m256i *coeff) {
     423   236804000 :     const __m128i tmp_0 = _mm_loadl_epi64(
     424   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
     425   236804000 :     const __m128i tmp_1 = _mm_loadl_epi64(
     426   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
     427   236804000 :     const __m128i tmp_2 = _mm_loadl_epi64(
     428   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
     429   236804000 :     const __m128i tmp_3 = _mm_loadl_epi64(
     430   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
     431   236804000 :     const __m128i tmp_4 = _mm_loadl_epi64(
     432   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
     433   236804000 :     const __m128i tmp_5 = _mm_loadl_epi64(
     434   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
     435   236804000 :     const __m128i tmp_6 = _mm_loadl_epi64(
     436   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
     437   236804000 :     const __m128i tmp_7 = _mm_loadl_epi64(
     438   236804000 :         (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
     439             : 
     440   236804000 :     const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
     441   236804000 :     const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
     442   236804000 :     const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
     443   236804000 :     const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
     444             : 
     445   236804000 :     const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
     446   236804000 :     const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
     447   236804000 :     const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
     448   236804000 :     const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
     449             : 
     450   473607000 :     coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
     451   473607000 :     coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
     452   473607000 :     coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
     453   236804000 :     coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
     454   236804000 : }
     455             : 
     456   229262000 : static INLINE void warp_horizontal_filter_avx2(
     457             :     const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
     458             :     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
     459             :     const __m256i *round_const, const __m128i *shift,
     460             :     const __m256i *shuffle_src) {
     461   229262000 :     int k, iy, sx, row = 0;
     462             :     __m256i coeff[4];
     463  1785720000 :     for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
     464  1551920000 :         iy = iy4 + k;
     465  1551920000 :         iy = clamp(iy, 0, height - 1);
     466             :         const __m128i src_0 =
     467  1536200000 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     468  1536200000 :         iy = iy4 + k + 1;
     469  1536200000 :         iy = clamp(iy, 0, height - 1);
     470             :         const __m128i src_1 =
     471  3020130000 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     472             :         const __m256i src_01 =
     473  1510070000 :             _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
     474  1510070000 :         sx = sx4 + beta * (k + 4);
     475  1510070000 :         horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
     476             :             round_const, shift);
     477  1556460000 :         row += 1;
     478             :     }
     479   233803000 :     iy = iy4 + k;
     480   233803000 :     iy = clamp(iy, 0, height - 1);
     481   228961000 :     const __m256i src_01 = _mm256_castsi128_si256(
     482   228961000 :         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
     483   228961000 :     sx = sx4 + beta * (k + 4);
     484   228961000 :     prepare_horizontal_filter_coeff(alpha, sx, coeff);
     485   229219000 :     filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     486             :         shift, row);
     487   229051000 : }
     488             : 
     489   102409000 : static INLINE void warp_horizontal_filter_alpha0_avx2(
     490             :     const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
     491             :     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
     492             :     const __m256i *round_const, const __m128i *shift,
     493             :     const __m256i *shuffle_src) {
     494             :     (void)alpha;
     495   102409000 :     int k, iy, sx, row = 0;
     496             :     __m256i coeff[4];
     497   809625000 :     for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
     498   707262000 :         iy = iy4 + k;
     499   707262000 :         iy = clamp(iy, 0, height - 1);
     500             :         const __m128i src_0 =
     501   705226000 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     502   705226000 :         iy = iy4 + k + 1;
     503   705226000 :         iy = clamp(iy, 0, height - 1);
     504             :         const __m128i src_1 =
     505  1403130000 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     506             :         const __m256i src_01 =
     507   701564000 :             _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
     508   701564000 :         sx = sx4 + beta * (k + 4);
     509   701564000 :         prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
     510   701394000 :         filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     511             :             shift, row);
     512   707216000 :         row += 1;
     513             :     }
     514   102363000 :     iy = iy4 + k;
     515   102363000 :     iy = clamp(iy, 0, height - 1);
     516   102313000 :     const __m256i src_01 = _mm256_castsi128_si256(
     517   102313000 :         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
     518   102313000 :     sx = sx4 + beta * (k + 4);
     519   102313000 :     prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
     520   102297000 :     filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     521             :         shift, row);
     522   102372000 : }
     523             : 
     524      898179 : static INLINE void warp_horizontal_filter_beta0_avx2(
     525             :     const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
     526             :     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
     527             :     const __m256i *round_const, const __m128i *shift,
     528             :     const __m256i *shuffle_src) {
     529             :     (void)beta;
     530      898179 :     int k, iy, row = 0;
     531             :     __m256i coeff[4];
     532      898179 :     prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
     533     7184370 :     for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
     534     6286120 :         iy = iy4 + k;
     535     6286120 :         iy = clamp(iy, 0, height - 1);
     536             :         const __m128i src_0 =
     537     6285460 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     538     6285460 :         iy = iy4 + k + 1;
     539     6285460 :         iy = clamp(iy, 0, height - 1);
     540             :         const __m128i src_1 =
     541    12569300 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     542             :         const __m256i src_01 =
     543     6284670 :             _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
     544     6284670 :         filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     545             :             shift, row);
     546     6286180 :         row += 1;
     547             :     }
     548      898250 :     iy = iy4 + k;
     549      898250 :     iy = clamp(iy, 0, height - 1);
     550      898171 :     const __m256i src_01 = _mm256_castsi128_si256(
     551      898171 :         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
     552      898171 :     filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     553             :         shift, row);
     554      898178 : }
     555             : 
     556      274245 : static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
     557             :     const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
     558             :     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
     559             :     const __m256i *round_const, const __m128i *shift,
     560             :     const __m256i *shuffle_src) {
     561             :     (void)alpha;
     562      274245 :     int k, iy, row = 0;
     563             :     __m256i coeff[4];
     564      274245 :     prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
     565     2193920 :     for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
     566     1919680 :         iy = iy4 + k;
     567     1919680 :         iy = clamp(iy, 0, height - 1);
     568             :         const __m128i src0 =
     569     1919640 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     570     1919640 :         iy = iy4 + k + 1;
     571     1919640 :         iy = clamp(iy, 0, height - 1);
     572             :         const __m128i src1 =
     573     3839190 :             _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
     574             :         const __m256i src_01 =
     575     1919600 :             _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
     576     1919600 :         filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     577             :             shift, row);
     578     1919680 :         row += 1;
     579             :     }
     580      274249 :     iy = iy4 + k;
     581      274249 :     iy = clamp(iy, 0, height - 1);
     582      274246 :     const __m256i src_01 = _mm256_castsi128_si256(
     583      274246 :         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
     584      274246 :     filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
     585             :         shift, row);
     586      274246 : }
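All four horizontal variants share one signature, so the top-level warp
routine can pick a specialization once per block. A hypothetical dispatch
sketch (the real selection logic lives in the caller, which is not shown in
this excerpt):

    #include <immintrin.h>
    #include <stdint.h>

    typedef void (*WarpHorzFn)(const uint8_t *ref, __m256i *horz_out,
                               int stride, int32_t ix4, int32_t iy4,
                               int32_t sx4, int alpha, int beta, int p_height,
                               int height, int i, const __m256i *round_const,
                               const __m128i *shift,
                               const __m256i *shuffle_src);

    /* alpha: per-column step of the warp model; beta: per-row step. */
    static WarpHorzFn pick_warp_horizontal(int alpha, int beta) {
        if (alpha == 0 && beta == 0)
            return warp_horizontal_filter_alpha0_beta0_avx2;
        if (alpha == 0)
            return warp_horizontal_filter_alpha0_avx2;
        if (beta == 0)
            return warp_horizontal_filter_beta0_avx2;
        return warp_horizontal_filter_avx2;
    }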
     587             : 
     588    26608900 : static INLINE void unpack_weights_and_set_round_const_avx2(
     589             :     ConvolveParams *conv_params, const int round_bits, const int offset_bits,
     590             :     __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
     591    26608900 :     *res_sub_const =
     592    53217800 :         _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
     593    26608900 :         (1 << (offset_bits - conv_params->round_1 - 1)));
     594    26608900 :     *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
     595             : 
     596    26608900 :     const int w0 = conv_params->fwd_offset;
     597    26608900 :     const int w1 = conv_params->bck_offset;
     598    26608900 :     const __m256i wt0 = _mm256_set1_epi16(w0);
     599    53217800 :     const __m256i wt1 = _mm256_set1_epi16(w1);
     600    26608900 :     *wt = _mm256_unpacklo_epi16(wt0, wt1);
     601    26608900 : }
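In scalar terms, these constants feed a blend along the lines of the sketch
below (assumptions: p0/p1 are the two offset_bits-biased 16-bit intermediate
predictions, and the distance weights satisfy w0 + w1 == 16, i.e.
DIST_PRECISION_BITS == 4 as in AV1):

    #include <stdint.h>

    static inline uint8_t compound_blend_model(int32_t p0, int32_t p1,
                                               int w0, int w1, int offset_bits,
                                               int round_1, int round_bits) {
        int32_t res = (p0 * w0 + p1 * w1) >> 4;        /* weighted average  */
        res -= (1 << (offset_bits - round_1)) +        /* *res_sub_const    */
               (1 << (offset_bits - round_1 - 1));     /*   removes bias    */
        res = (res + ((1 << round_bits) >> 1))         /* *round_bits_const */
              >> round_bits;
        return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));
    }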
     602             : 
     603   933780000 : static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
     604             :     int sy,
     605             :     __m256i *coeffs) {
     606             :     __m128i filt_00 =
     607  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     608   933780000 :         ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
     609             :     __m128i filt_01 =
     610  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     611   933780000 :         ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
     612             :     __m128i filt_02 =
     613  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     614   933780000 :         ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
     615             :     __m128i filt_03 =
     616  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     617   933780000 :         ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
     618             : 
     619  1867560000 :     __m128i filt_10 = _mm_loadu_si128(
     620   933780000 :         (__m128i *)(eb_warped_filter +
     621   933780000 :         (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
     622  1867560000 :     __m128i filt_11 = _mm_loadu_si128(
     623   933780000 :         (__m128i *)(eb_warped_filter +
     624   933780000 :         (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
     625  1867560000 :     __m128i filt_12 = _mm_loadu_si128(
     626   933780000 :         (__m128i *)(eb_warped_filter +
     627   933780000 :         (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
     628   933780000 :     __m128i filt_13 = _mm_loadu_si128(
     629   933780000 :         (__m128i *)(eb_warped_filter +
     630   933780000 :         (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
     631             : 
     632             :     __m256i filt_0 =
     633   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
     634             :     __m256i filt_1 =
     635   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
     636             :     __m256i filt_2 =
     637   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
     638             :     __m256i filt_3 =
     639   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
     640             : 
     641   933780000 :     __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
     642   933780000 :     __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
     643   933780000 :     __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
     644   933780000 :     __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
     645             : 
     646   933780000 :     coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
     647   933780000 :     coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
     648   933780000 :     coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
     649   933780000 :     coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
     650             : 
     651             :     filt_00 =
     652  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     653   933780000 :         ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
     654             :     filt_01 =
     655  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     656   933780000 :         ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
     657             :     filt_02 =
     658  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     659   933780000 :         ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
     660             :     filt_03 =
     661  1867560000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     662   933780000 :         ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
     663             : 
     664  1867560000 :     filt_10 = _mm_loadu_si128(
     665   933780000 :         (__m128i *)(eb_warped_filter +
     666   933780000 :         (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
     667  1867560000 :     filt_11 = _mm_loadu_si128(
     668   933780000 :         (__m128i *)(eb_warped_filter +
     669   933780000 :         (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
     670  1867560000 :     filt_12 = _mm_loadu_si128(
     671   933780000 :         (__m128i *)(eb_warped_filter +
     672   933780000 :         (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
     673   933780000 :     filt_13 = _mm_loadu_si128(
     674   933780000 :         (__m128i *)(eb_warped_filter +
     675   933780000 :         (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
     676             : 
     677             :     filt_0 =
     678   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
     679             :     filt_1 =
     680   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
     681             :     filt_2 =
     682   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
     683             :     filt_3 =
     684   933780000 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
     685             : 
     686   933780000 :     res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
     687   933780000 :     res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
     688   933780000 :     res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
     689   933780000 :     res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
     690             : 
     691   933780000 :     coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
     692   933780000 :     coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
     693   933780000 :     coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
     694   933780000 :     coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
     695   933780000 : }
     696             : 
     697   105699000 : static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
     698             :     __m256i *coeffs) {
     699             :     __m128i filt_00 =
     700   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     701   105699000 :         ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
     702             :     __m128i filt_01 =
     703   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     704   211399000 :         ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
     705             :     __m128i filt_02 =
     706   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     707   105699000 :         ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
     708             :     __m128i filt_03 =
     709   105699000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     710   105699000 :         ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
     711             : 
     712   105699000 :     __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
     713   105699000 :     __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
     714   105699000 :     __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
     715   105699000 :     __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
     716             : 
     717   105699000 :     __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
     718   105699000 :     __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
     719   105699000 :     __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
     720   105699000 :     __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
     721             : 
     722   105699000 :     coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
     723   105699000 :     coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
     724   105699000 :     coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
     725   105699000 :     coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
     726             : 
     727             :     filt_00 =
     728   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     729   105699000 :         ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
     730             :     filt_01 =
     731   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     732   105699000 :         ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
     733             :     filt_02 =
     734   211399000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     735   105699000 :         ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
     736             :     filt_03 =
     737   105699000 :         _mm_loadu_si128((__m128i *)(eb_warped_filter +
     738   105699000 :         ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
     739             : 
     740   105699000 :     filt_0 = _mm256_broadcastsi128_si256(filt_00);
     741   105699000 :     filt_1 = _mm256_broadcastsi128_si256(filt_01);
     742   105699000 :     filt_2 = _mm256_broadcastsi128_si256(filt_02);
     743   105699000 :     filt_3 = _mm256_broadcastsi128_si256(filt_03);
     744             : 
     745   105699000 :     res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
     746   105699000 :     res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
     747   105699000 :     res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
     748   105699000 :     res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
     749             : 
     750   105699000 :     coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
     751   105699000 :     coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
     752   105699000 :     coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
     753   105699000 :     coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
     754   105699000 : }
     755             : 
     756     2838490 : static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
     757             :     __m256i *coeffs) {
     758     5676990 :     const __m128i filt_0 = _mm_loadu_si128(
     759     2838490 :         (__m128i *)(eb_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
     760     2838490 :     const __m128i filt_1 = _mm_loadu_si128(
     761     2838490 :         (__m128i *)(eb_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
     762             : 
     763             :     __m256i res_0 =
     764     2838490 :         _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
     765             : 
     766     5676990 :     coeffs[0] = _mm256_shuffle_epi8(
     767             :         res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
     768     5676990 :     coeffs[1] = _mm256_shuffle_epi8(
     769             :         res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
     770     5676990 :     coeffs[2] = _mm256_shuffle_epi8(
     771             :         res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
     772     2838490 :     coeffs[3] = _mm256_shuffle_epi8(
     773             :         res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
     774             : 
     775     2838490 :     coeffs[4] = coeffs[0];
     776     2838490 :     coeffs[5] = coeffs[1];
     777     2838490 :     coeffs[6] = coeffs[2];
     778     2838490 :     coeffs[7] = coeffs[3];
     779     2838490 : }
     780             : 
     781  1338280000 : static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
     782             :     __m256i *src,
     783             :     __m256i *coeffs,
     784             :     __m256i *res_lo,
     785             :     __m256i *res_hi, int row) {
     786  1338280000 :     const __m256i src_6 = horz_out[row + 3];
     787             :     const __m256i src_7 =
     788  1338280000 :         _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
     789             : 
     790  1338280000 :     src[6] = _mm256_unpacklo_epi16(src_6, src_7);
     791             : 
     792  1338280000 :     const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
     793  1338280000 :     const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
     794  1338280000 :     const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
     795  2676550000 :     const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
     796             : 
     797  2676550000 :     const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
     798             :         _mm256_add_epi32(res_4, res_6));
     799             : 
     800  1338280000 :     src[7] = _mm256_unpackhi_epi16(src_6, src_7);
     801             : 
     802  1338280000 :     const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
     803  1338280000 :     const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
     804  1338280000 :     const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
     805  2676550000 :     const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
     806             : 
     807  4014830000 :     const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
     808             :         _mm256_add_epi32(res_5, res_7));
     809             : 
     810             :     // Rearrange pixels back into the order 0 ... 7
     811  1338280000 :     *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
     812  1338280000 :     *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
     813  1338280000 : }
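
/* Editorial sketch, not part of warp_plane_avx2.c: each src[2p] register
   interleaves two consecutive source rows of the even-indexed pixel
   columns (src[2p + 1]: the odd columns, which is why they pair with the
   odd-gamma coefficients in coeffs[4..7]), so one _mm256_madd_epi16
   against a tap-pair register applies vertical taps 2p and 2p + 1 in a
   single step. Scalar equivalent of one 32-bit madd lane: */
#include <stdint.h>

static inline int32_t vert_tap_pair_sketch(int16_t px_row_a, int16_t px_row_b,
                                           int16_t tap_a, int16_t tap_b) {
    /* madd: adjacent int16 products summed into one int32 */
    return (int32_t)px_row_a * tap_a + (int32_t)px_row_b * tap_b;
}

/* Summing the four pair results (res_0 + res_2 + res_4 + res_6 above)
   completes the 8-tap vertical filter for the even columns; the odd
   columns come from the second madd group, and the closing unpack puts the
   columns back into 0..7 order, as the in-function comment notes. */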
     814             : 
     815  1344320000 : static INLINE void store_vertical_filter_output_avx2(
     816             :     const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
     817             :     const __m256i *wt, const __m256i *res_sub_const,
     818             :     const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
     819             :     int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
     820             :     const int round_bits) {
     821  1344320000 :     __m256i res_lo_1 = *res_lo;
     822  1344320000 :     __m256i res_hi_1 = *res_hi;
     823             : 
     824  1344320000 :     if (conv_params->is_compound) {
     825    99195600 :         __m128i *const p_0 =
     826    99195600 :             (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
     827    99195600 :         __m128i *const p_1 =
     828             :             (__m128i *)&conv_params
     829    99195600 :             ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
     830             : 
     831   297587000 :         res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
     832             :             reduce_bits_vert);
     833             : 
     834    99195600 :         const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
     835             :         __m256i res_lo_16;
     836    99195600 :         if (conv_params->do_average) {
     837    49651300 :             __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
     838    49651300 :             __m128i *const dst8_1 =
     839    49651300 :                 (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
     840    49651300 :             const __m128i p_16_0 = _mm_loadl_epi64(p_0);
     841    49651300 :             const __m128i p_16_1 = _mm_loadl_epi64(p_1);
     842             :             const __m256i p_16 =
     843    49651300 :                 _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
     844    49651300 :             if (conv_params->use_jnt_comp_avg) {
     845    23963700 :                 const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
     846    47927300 :                 const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
     847             :                 const __m256i shifted_32 =
     848    23963700 :                     _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
     849    23963700 :                 res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
     850             :             }
     851             :             else
     852    51375300 :                 res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
     853    49651300 :             res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
     854   148954000 :             res_lo_16 = _mm256_srai_epi16(
     855             :                 _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
     856    49651300 :             const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
     857    49651300 :             const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
     858    49651300 :             const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
     859    49651300 :             *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
     860    49651300 :             *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
     861             :         }
     862             :         else {
     863    49544300 :             const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
     864    49544300 :             const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
     865    49544300 :             _mm_storel_epi64(p_0, temp_lo_16_0);
     866    49544300 :             _mm_storel_epi64(p_1, temp_lo_16_1);
     867             :         }
     868    99195600 :         if (p_width > 4) {
     869    99209800 :             __m128i *const p4_0 =
     870             :                 (__m128i *)&conv_params
     871    99209800 :                 ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
     872    99209800 :             __m128i *const p4_1 =
     873             :                 (__m128i *)&conv_params
     874    99209800 :                 ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
     875   297629000 :             res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
     876             :                 reduce_bits_vert);
     877    99209800 :             const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
     878             :             __m256i res_hi_16;
     879    99209800 :             if (conv_params->do_average) {
     880    49649000 :                 __m128i *const dst8_4_0 =
     881    49649000 :                     (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
     882    49649000 :                 __m128i *const dst8_4_1 =
     883    49649000 :                     (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
     884    49649000 :                 const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
     885    49649000 :                 const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
     886    49649000 :                 const __m256i p4_16 = _mm256_inserti128_si256(
     887             :                     _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
     888    49649000 :                 if (conv_params->use_jnt_comp_avg) {
     889    23963600 :                     const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
     890    47927100 :                     const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
     891             :                     const __m256i shifted_32 =
     892    23963600 :                         _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
     893    23963600 :                     res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
     894             :                 }
     895             :                 else
     896    51370800 :                     res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
     897    49649000 :                 res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
     898   148947000 :                 res_hi_16 = _mm256_srai_epi16(
     899             :                     _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
     900    49649000 :                 __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
     901    49649000 :                 const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
     902    49649000 :                 const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
     903    49649000 :                 *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
     904    49649000 :                 *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
     905             :             }
     906             :             else {
     907    49560800 :                 const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
     908    49560800 :                 const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
     909    49560800 :                 _mm_storel_epi64(p4_0, temp_hi_16_0);
     910    49560800 :                 _mm_storel_epi64(p4_1, temp_hi_16_1);
     911             :             }
     912             :         }
     913             :     }
     914             :     else {
     915  2490240000 :         const __m256i res_lo_round = _mm256_srai_epi32(
     916             :             _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
     917  3735360000 :         const __m256i res_hi_round = _mm256_srai_epi32(
     918             :             _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
     919             : 
     920  1245120000 :         const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
     921  1245120000 :         const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
     922  1245120000 :         const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
     923  1245120000 :         const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
     924             : 
     925             :         // Store, blending with 'pred' if needed
     926  1245120000 :         __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
     927  1245120000 :         __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
     928             : 
     929  1245120000 :         if (p_width == 4) {
     930           0 :             *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
     931           0 :             *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
     932             :         }
     933             :         else {
     934  1245120000 :             _mm_storel_epi64(p, res_8bit0);
     935  1245120000 :             _mm_storel_epi64(p1, res_8bit1);
     936             :         }
     937             :     }
     938  1344320000 : }
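
/* Editorial sketch, not part of warp_plane_avx2.c: in the use_jnt_comp_avg
   branches above, the previously stored 16-bit prediction and the new
   result are interleaved with _mm256_unpacklo_epi16 and fed to
   _mm256_madd_epi16 against *wt, which holds the two distance weights
   interleaved (packed by unpack_weights_and_set_round_const_avx2, defined
   earlier in this file). Per pixel this is a weighted average; a scalar
   equivalent, assuming AV1's DIST_PRECISION_BITS == 4 and ignoring the
   unsigned saturation that _mm256_packus_epi32 applies afterwards: */
#include <stdint.h>

static inline int32_t jnt_comp_avg_sketch(int32_t stored, int32_t fresh,
                                          int32_t w_stored, int32_t w_fresh) {
    return (stored * w_stored + fresh * w_fresh) >> 4; /* DIST_PRECISION_BITS */
}

/* The non-weighted compound path is the plain average (stored + fresh) >> 1,
   followed in both cases by the res_sub_const / round_bits_const rounding
   back to the 8-bit output range. */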
     939             : 
     940   235245000 : static INLINE void warp_vertical_filter_avx2(
     941             :     uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
     942             :     int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
     943             :     int i, int j, int sy4, const int reduce_bits_vert,
     944             :     const __m256i *res_add_const, const int round_bits,
     945             :     const __m256i *res_sub_const, const __m256i *round_bits_const,
     946             :     const __m256i *wt) {
     947   235245000 :     int k, row = 0;
     948             :     __m256i src[8];
     949   235245000 :     const __m256i src_0 = horz_out[0];
     950             :     const __m256i src_1 =
     951   235245000 :         _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
     952   235245000 :     const __m256i src_2 = horz_out[1];
     953             :     const __m256i src_3 =
     954   235245000 :         _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
     955   235245000 :     const __m256i src_4 = horz_out[2];
     956             :     const __m256i src_5 =
     957   235245000 :         _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
     958             : 
     959   235245000 :     src[0] = _mm256_unpacklo_epi16(src_0, src_1);
     960   235245000 :     src[2] = _mm256_unpacklo_epi16(src_2, src_3);
     961   235245000 :     src[4] = _mm256_unpacklo_epi16(src_4, src_5);
     962             : 
     963   235245000 :     src[1] = _mm256_unpackhi_epi16(src_0, src_1);
     964   235245000 :     src[3] = _mm256_unpackhi_epi16(src_2, src_3);
     965   235245000 :     src[5] = _mm256_unpackhi_epi16(src_4, src_5);
     966             : 
     967  1170020000 :     for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
     968   934544000 :         int sy = sy4 + delta * (k + 4);
     969             :         __m256i coeffs[8];
     970   934544000 :         prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
     971             :         __m256i res_lo, res_hi;
     972   935932000 :         filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
     973             :             row);
     974   934838000 :         store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
     975             :             res_sub_const, round_bits_const, pred,
     976             :             conv_params, i, j, k, reduce_bits_vert,
     977             :             p_stride, p_width, round_bits);
     978   934780000 :         src[0] = src[2];
     979   934780000 :         src[2] = src[4];
     980   934780000 :         src[4] = src[6];
     981   934780000 :         src[1] = src[3];
     982   934780000 :         src[3] = src[5];
     983   934780000 :         src[5] = src[7];
     984             : 
     985   934780000 :         row += 1;
     986             :     }
     987   235480000 : }
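
/* Editorial sketch, not part of warp_plane_avx2.c: the loop above emits
   two output rows per iteration (one per 128-bit lane) and keeps a sliding
   window of interleaved row pairs in src[]; even indices hold the even
   pixel columns, odd indices the odd ones. Each step only the newest pair
   (src[6]/src[7], built inside filter_src_pixels_vertical_avx2) is formed
   from horz_out -- the rest just shift down: */
static void advance_row_window_sketch(int even_cols[4], int odd_cols[4]) {
    /* models src[0,2,4,6] and src[1,3,5,7] advancing by one row pair */
    for (int t = 0; t < 3; ++t) {
        even_cols[t] = even_cols[t + 1];
        odd_cols[t] = odd_cols[t + 1];
    }
    /* even_cols[3] / odd_cols[3] are refilled by the pixel filter */
}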
     988             : 
     989      653834 : static INLINE void warp_vertical_filter_gamma0_avx2(
     990             :     uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
     991             :     int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
     992             :     int i, int j, int sy4, const int reduce_bits_vert,
     993             :     const __m256i *res_add_const, const int round_bits,
     994             :     const __m256i *res_sub_const, const __m256i *round_bits_const,
     995             :     const __m256i *wt) {
     996             :     (void)gamma;
     997      653834 :     int k, row = 0;
     998             :     __m256i src[8];
     999      653834 :     const __m256i src_0 = horz_out[0];
    1000             :     const __m256i src_1 =
    1001      653834 :         _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
    1002      653834 :     const __m256i src_2 = horz_out[1];
    1003             :     const __m256i src_3 =
    1004      653834 :         _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
    1005      653834 :     const __m256i src_4 = horz_out[2];
    1006             :     const __m256i src_5 =
    1007      653834 :         _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
    1008             : 
    1009      653834 :     src[0] = _mm256_unpacklo_epi16(src_0, src_1);
    1010      653834 :     src[2] = _mm256_unpacklo_epi16(src_2, src_3);
    1011      653834 :     src[4] = _mm256_unpacklo_epi16(src_4, src_5);
    1012             : 
    1013      653834 :     src[1] = _mm256_unpackhi_epi16(src_0, src_1);
    1014      653834 :     src[3] = _mm256_unpackhi_epi16(src_2, src_3);
    1015      653834 :     src[5] = _mm256_unpackhi_epi16(src_4, src_5);
    1016             : 
    1017     3269010 :     for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    1018     2615180 :         int sy = sy4 + delta * (k + 4);
    1019             :         __m256i coeffs[8];
    1020     2615180 :         prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
    1021             :         __m256i res_lo, res_hi;
    1022     2615210 :         filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
    1023             :             row);
    1024     2615230 :         store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
    1025             :             res_sub_const, round_bits_const, pred,
    1026             :             conv_params, i, j, k, reduce_bits_vert,
    1027             :             p_stride, p_width, round_bits);
    1028     2615180 :         src[0] = src[2];
    1029     2615180 :         src[2] = src[4];
    1030     2615180 :         src[4] = src[6];
    1031     2615180 :         src[1] = src[3];
    1032     2615180 :         src[3] = src[5];
    1033     2615180 :         src[5] = src[7];
    1034     2615180 :         row += 1;
    1035             :     }
    1036      653831 : }
    1037             : 
    1038   105581000 : static INLINE void warp_vertical_filter_delta0_avx2(
    1039             :     uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    1040             :     int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    1041             :     int i, int j, int sy4, const int reduce_bits_vert,
    1042             :     const __m256i *res_add_const, const int round_bits,
    1043             :     const __m256i *res_sub_const, const __m256i *round_bits_const,
    1044             :     const __m256i *wt) {
    1045             :     (void)delta;
    1046   105581000 :     int k, row = 0;
    1047             :     __m256i src[8], coeffs[8];
    1048   105581000 :     const __m256i src_0 = horz_out[0];
    1049             :     const __m256i src_1 =
    1050   105581000 :         _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
    1051   105581000 :     const __m256i src_2 = horz_out[1];
    1052             :     const __m256i src_3 =
    1053   105581000 :         _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
    1054   105581000 :     const __m256i src_4 = horz_out[2];
    1055             :     const __m256i src_5 =
    1056   105581000 :         _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
    1057             : 
    1058   105581000 :     src[0] = _mm256_unpacklo_epi16(src_0, src_1);
    1059   105581000 :     src[2] = _mm256_unpacklo_epi16(src_2, src_3);
    1060   105581000 :     src[4] = _mm256_unpacklo_epi16(src_4, src_5);
    1061             : 
    1062   105581000 :     src[1] = _mm256_unpackhi_epi16(src_0, src_1);
    1063   105581000 :     src[3] = _mm256_unpackhi_epi16(src_2, src_3);
    1064   105581000 :     src[5] = _mm256_unpackhi_epi16(src_4, src_5);
    1065             : 
    1066   105581000 :     prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
    1067             : 
    1068   526373000 :     for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    1069             :         __m256i res_lo, res_hi;
    1070   420703000 :         filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
    1071             :             row);
    1072   420797000 :         store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
    1073             :             res_sub_const, round_bits_const, pred,
    1074             :             conv_params, i, j, k, reduce_bits_vert,
    1075             :             p_stride, p_width, round_bits);
    1076   420648000 :         src[0] = src[2];
    1077   420648000 :         src[2] = src[4];
    1078   420648000 :         src[4] = src[6];
    1079   420648000 :         src[1] = src[3];
    1080   420648000 :         src[3] = src[5];
    1081   420648000 :         src[5] = src[7];
    1082   420648000 :         row += 1;
    1083             :     }
    1084   105671000 : }
    1085             : 
    1086      223388 : static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
    1087             :     uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    1088             :     int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    1089             :     int i, int j, int sy4, const int reduce_bits_vert,
    1090             :     const __m256i *res_add_const, const int round_bits,
    1091             :     const __m256i *res_sub_const, const __m256i *round_bits_const,
    1092             :     const __m256i *wt) {
    1093             :     (void)gamma;
    1094      223388 :     int k, row = 0;
    1095             :     __m256i src[8], coeffs[8];
    1096      223388 :     const __m256i src_0 = horz_out[0];
    1097             :     const __m256i src_1 =
    1098      223388 :         _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
    1099      223388 :     const __m256i src_2 = horz_out[1];
    1100             :     const __m256i src_3 =
    1101      223388 :         _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
    1102      223388 :     const __m256i src_4 = horz_out[2];
    1103             :     const __m256i src_5 =
    1104      223388 :         _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
    1105             : 
    1106      223388 :     src[0] = _mm256_unpacklo_epi16(src_0, src_1);
    1107      223388 :     src[2] = _mm256_unpacklo_epi16(src_2, src_3);
    1108      223388 :     src[4] = _mm256_unpacklo_epi16(src_4, src_5);
    1109             : 
    1110      223388 :     src[1] = _mm256_unpackhi_epi16(src_0, src_1);
    1111      223388 :     src[3] = _mm256_unpackhi_epi16(src_2, src_3);
    1112      223388 :     src[5] = _mm256_unpackhi_epi16(src_4, src_5);
    1113             : 
    1114      223388 :     prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
    1115             : 
    1116     1116920 :     for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    1117             :         __m256i res_lo, res_hi;
    1118      893539 :         filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
    1119             :             row);
    1120      893537 :         store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
    1121             :             res_sub_const, round_bits_const, pred,
    1122             :             conv_params, i, j, k, reduce_bits_vert,
    1123             :             p_stride, p_width, round_bits);
    1124      893536 :         src[0] = src[2];
    1125      893536 :         src[2] = src[4];
    1126      893536 :         src[4] = src[6];
    1127      893536 :         src[1] = src[3];
    1128      893536 :         src[3] = src[5];
    1129      893536 :         src[5] = src[7];
    1130      893536 :         row += 1;
    1131             :     }
    1132      223386 : }
    1133             : 
    1134   339906000 : static INLINE void prepare_warp_vertical_filter_avx2(
    1135             :     uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    1136             :     int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    1137             :     int i, int j, int sy4, const int reduce_bits_vert,
    1138             :     const __m256i *res_add_const, const int round_bits,
    1139             :     const __m256i *res_sub_const, const __m256i *round_bits_const,
    1140             :     const __m256i *wt) {
    1141   339906000 :     if (gamma == 0 && delta == 0)
    1142      223388 :         warp_vertical_filter_gamma0_delta0_avx2(
    1143             :             pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
    1144             :             i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
    1145             :             round_bits_const, wt);
    1146   339683000 :     else if (gamma == 0 && delta != 0)
    1147      653834 :         warp_vertical_filter_gamma0_avx2(
    1148             :             pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
    1149             :             i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
    1150             :             round_bits_const, wt);
    1151   339029000 :     else if (gamma != 0 && delta == 0)
    1152   105591000 :         warp_vertical_filter_delta0_avx2(
    1153             :             pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
    1154             :             i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
    1155             :             round_bits_const, wt);
    1156             :     else
    1157   233438000 :         warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
    1158             :             p_height, p_stride, p_width, i, j, sy4,
    1159             :             reduce_bits_vert, res_add_const, round_bits,
    1160             :             res_sub_const, round_bits_const, wt);
    1161   342018000 : }
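
/* Editorial note: the dispatch above picks a cheaper coefficient setup
   when the warp is degenerate along an axis. The vertical filter index for
   output row k, pixel column c is
       (sy4 + delta * (k + 4) + c * gamma) >> WARPEDDIFF_PREC_BITS,
   so gamma == 0 makes it column-independent (two live filters per row
   pair, hence the shuffle-based gamma0 setup) and delta == 0 makes it
   row-independent (coefficients hoisted out of the k loop, as in
   warp_vertical_filter_delta0_avx2 above). */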
    1162             : 
    1163   332135000 : static INLINE void prepare_warp_horizontal_filter_avx2(
    1164             :     const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    1165             :     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    1166             :     const __m256i *round_const, const __m128i *shift,
    1167             :     const __m256i *shuffle_src) {
    1168   332135000 :     if (alpha == 0 && beta == 0)
    1169      274245 :         warp_horizontal_filter_alpha0_beta0_avx2(
    1170             :             ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
    1171             :             round_const, shift, shuffle_src);
    1172   331861000 :     else if (alpha == 0 && beta != 0)
    1173   102404000 :         warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
    1174             :             alpha, beta, p_height, height, i,
    1175             :             round_const, shift, shuffle_src);
    1176   229457000 :     else if (alpha != 0 && beta == 0)
    1177      898172 :         warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
    1178             :             alpha, beta, p_height, height, i,
    1179             :             round_const, shift, shuffle_src);
    1180             :     else
    1181   228558000 :         warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
    1182             :             beta, p_height, height, i, round_const, shift,
    1183             :             shuffle_src);
    1184   332617000 : }
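
/* Editorial note: the horizontal dispatch mirrors the vertical one, with
   alpha (per-column step of the horizontal filter phase sx) and beta
   (per-row step) playing the roles of gamma and delta. */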
    1185             : 
    1186    19023800 : int64_t eb_av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
    1187             :     const uint8_t *const dst, int p_width,
    1188             :     int p_height, int dst_stride) {
    1189    19023800 :     int64_t sum_error = 0;
    1190             :     int i, j;
    1191             :     __m256i row_error, col_error;
    1192    19023800 :     __m256i zero = _mm256_set1_epi16(0);
    1193    19023800 :     __m256i dup_255 = _mm256_set1_epi16(255);
    1194    19023800 :     col_error = zero;
    1195             : 
    1196   169151000 :     for (i = 0; i < (p_height / 4); i++) {
    1197   150127000 :         row_error = _mm256_set1_epi16(0);
    1198   449895000 :         for (j = 0; j < (p_width / 16); j++) {
    1199   299768000 :             __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1200   299768000 :                 (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
    1201   299768000 :             __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1202   299768000 :                 (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
    1203   299768000 :             __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1204   299768000 :                 (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
    1205   299768000 :             __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1206   299768000 :                 (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
    1207   299768000 :             __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1208   299768000 :                 (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
    1209   299768000 :             __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1210   299768000 :                 (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
    1211   299768000 :             __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1212   299768000 :                 (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
    1213   599535000 :             __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
    1214   299768000 :                 (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
    1215             : 
    1216             :             __m256i diff_1 =
    1217   599535000 :                 _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
    1218             :             __m256i diff_2 =
    1219   599535000 :                 _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
    1220             :             __m256i diff_3 =
    1221   599535000 :                 _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
    1222             :             __m256i diff_4 =
    1223   599535000 :                 _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
    1224             : 
    1225   299768000 :             __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
    1226   299768000 :             __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
    1227   299768000 :             __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
    1228   299768000 :             __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
    1229   299768000 :             __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
    1230   299768000 :             __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
    1231   299768000 :             __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
    1232   299768000 :             __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
    1233             : 
    1234   299768000 :             __m256i error_1_lo =
    1235   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
    1236   299768000 :             __m256i error_1_hi =
    1237   899303000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
    1238   299768000 :             __m256i error_2_lo =
    1239   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
    1240   299768000 :             __m256i error_2_hi =
    1241   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
    1242   299768000 :             __m256i error_3_lo =
    1243   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
    1244   299768000 :             __m256i error_3_hi =
    1245   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
    1246   299768000 :             __m256i error_4_lo =
    1247   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
    1248   299768000 :             __m256i error_4_hi =
    1249   599535000 :                 _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
    1250             : 
    1251   299768000 :             __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
    1252   299768000 :             __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
    1253   299768000 :             __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
    1254   299768000 :             __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
    1255             : 
    1256   299768000 :             __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
    1257   299768000 :             __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
    1258             : 
    1259   299768000 :             __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
    1260   299768000 :             row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
    1261             :         }
    1262   150127000 :         __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
    1263   150127000 :         __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
    1264   150127000 :         __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
    1265   150127000 :         col_error = _mm256_add_epi64(col_error, col_error_temp);
      1266             :         // Error summation for the remaining width, which is not a multiple of 16
    1267   150127000 :         if (p_width & 0xf) {
    1268           0 :             for (int k = 0; k < 4; ++k) {
    1269           0 :                 for (int l = j * 16; l < p_width; ++l)
    1270           0 :                     sum_error +=
    1271           0 :                     (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
    1272           0 :                         ref[l + ((i * 4) + k) * ref_stride]);
    1273             :             }
    1274             :         }
    1275             :     }
    1276    19023800 :     __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
    1277    19023800 :     __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
    1278    19023800 :     sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
    1279             :     int64_t sum_error_d_0, sum_error_d_1;
    1280    19023800 :     _mm_storel_epi64((__m128i *)&sum_error_d_0, sum_error_q_0);
    1281    19023800 :     _mm_storel_epi64((__m128i *)&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
    1282    19023800 :     sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
     1283             :     // Error summation for the remaining height, which is not a multiple of 4
    1284    19023800 :     if (p_height & 0x3) {
    1285           0 :         for (int k = i * 4; k < p_height; ++k) {
    1286           0 :             for (int l = 0; l < p_width; ++l)
    1287           0 :                 sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
    1288           0 :                     ref[l + k * ref_stride]);
    1289             :         }
    1290             :     }
    1291    19023800 :     return sum_error;
    1292             : }
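
/* Editorial sketch, not part of warp_plane_avx2.c: the vector loop above
   computes sum(error_measure(dst - ref)) over the block, four rows at a
   time. Each 16-bit diff is biased by 255 so it can index
   error_measure_lut (presumably 511 entries, one per diff in [-255, 255])
   with _mm256_i32gather_epi32. A scalar reference of the reduction: */
#include <stdint.h>

static int64_t frame_error_sketch(const uint8_t *ref, int ref_stride,
                                  const uint8_t *dst, int dst_stride,
                                  int p_width, int p_height,
                                  const int32_t lut[511]) {
    int64_t sum = 0;
    for (int r = 0; r < p_height; ++r)
        for (int c = 0; c < p_width; ++c)
            sum += lut[(dst[c + r * dst_stride] -
                        ref[c + r * ref_stride]) + 255];
    return sum;
}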
    1293             : 
    1294    26609200 : void eb_av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
    1295             :     int height, int stride, uint8_t *pred, int p_col,
    1296             :     int p_row, int p_width, int p_height, int p_stride,
    1297             :     int subsampling_x, int subsampling_y,
    1298             :     ConvolveParams *conv_params, int16_t alpha,
    1299             :     int16_t beta, int16_t gamma, int16_t delta) {
    1300             :     __m256i horz_out[8];
    1301             :     int i, j, k;
    1302    26609200 :     const int bd = 8;
    1303    26609200 :     const int reduce_bits_horiz = conv_params->round_0;
    1304    53218400 :     const int reduce_bits_vert = conv_params->is_compound
    1305             :         ? conv_params->round_1
    1306    26609200 :         : 2 * FILTER_BITS - reduce_bits_horiz;
    1307    26609200 :     const int offset_bits_horiz = bd + FILTER_BITS - 1;
    1308    26609200 :     assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
    1309             : 
    1310    26609200 :     const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
    1311             :     const __m256i reduce_bits_vert_const =
    1312    26609200 :         _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
    1313    26609200 :     const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
    1314    26609200 :     const int round_bits =
    1315    26609200 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    1316    26609200 :     const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    1317    26609200 :     assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
    1318             : 
    1319    26609200 :     const __m256i round_const = _mm256_set1_epi16(
    1320    26609200 :         (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
    1321    26609200 :     const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
    1322             : 
    1323             :     __m256i res_sub_const, round_bits_const, wt;
    1324    26609200 :     unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
    1325             :         &res_sub_const, &round_bits_const,
    1326             :         &wt);
    1327             : 
    1328             :     __m256i res_add_const_1;
    1329    26611200 :     if (conv_params->is_compound == 1)
    1330     4275700 :         res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
    1331             :     else
    1332    22335500 :         res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
    1333    22335500 :         ((1 << reduce_bits_vert) >> 1));
    1334    26611200 :     const int32_t const1 = alpha * (-4) + beta * (-4) +
    1335    26611200 :         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
    1336             :         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
    1337    26611200 :     const int32_t const2 = gamma * (-4) + delta * (-4) +
    1338    26611200 :         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
    1339             :         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
    1340    26611200 :     const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
    1341    26611200 :     const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
    1342    26611200 :     const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
    1343             : 
    1344             :     __m256i shuffle_src[4];
    1345    26611200 :     shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
    1346    26611200 :     shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
    1347    26611200 :     shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
    1348    26611200 :     shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
    1349             : 
    1350   118607000 :     for (i = 0; i < p_height; i += 8) {
    1351   433200000 :         for (j = 0; j < p_width; j += 8) {
    1352   341205000 :             const int32_t src_x = (p_col + j + 4) << subsampling_x;
    1353   341205000 :             const int32_t src_y = (p_row + i + 4) << subsampling_y;
    1354   341205000 :             const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
    1355   341205000 :             const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
    1356   341205000 :             const int32_t x4 = dst_x >> subsampling_x;
    1357   341205000 :             const int32_t y4 = dst_y >> subsampling_y;
    1358             : 
    1359   341205000 :             int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
    1360   341205000 :             int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    1361   341205000 :             int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
    1362   341205000 :             int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    1363             : 
    1364             :             // Add in all the constant terms, including rounding and offset
    1365   341205000 :             sx4 += const1;
    1366   341205000 :             sy4 += const2;
    1367             : 
    1368   341205000 :             sx4 &= ~const3;
    1369   341205000 :             sy4 &= ~const3;
    1370             : 
    1371             :             // Horizontal filter
    1372             :             // If the block is aligned such that, after clamping, every sample
    1373             :             // would be taken from the leftmost/rightmost column, then we can
    1374             :             // skip the expensive horizontal filter.
    1375             : 
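            /* Editorial note: three out-of-range layouts follow. When the
             * whole 16-sample support sits past an edge, every clamped
             * sample equals the edge pixel, so the filtered row collapses
             * to the constant const4 + pixel * const5 -- the horizontal
             * offset plus the pixel times the tap sum (an interpolation
             * filter's taps sum to 1 << FILTER_BITS). When the support
             * merely straddles an edge, the warp_pad_left / warp_pad_right
             * shuffle masks replicate the edge pixel into the out-of-range
             * byte positions before filtering normally. */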
    1376   341205000 :             if (ix4 <= -7) {
    1377     1048310 :                 int iy, row = 0;
    1378     8386170 :                 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    1379     7337880 :                     iy = iy4 + k;
    1380     7337880 :                     iy = clamp(iy, 0, height - 1);
    1381             :                     const __m256i temp_0 =
    1382     7337820 :                         _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
    1383     7337820 :                     iy = iy4 + k + 1;
    1384     7337820 :                     iy = clamp(iy, 0, height - 1);
    1385             :                     const __m256i temp_1 =
    1386     7337860 :                         _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
    1387     7337860 :                     horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
    1388     7337860 :                     row += 1;
    1389             :                 }
    1390     1048290 :                 iy = iy4 + k;
    1391     1048290 :                 iy = clamp(iy, 0, height - 1);
    1392     2096630 :                 horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
    1393             :             }
    1394   340157000 :             else if (ix4 >= width + 6) {
    1395      178405 :                 int iy, row = 0;
    1396     1427240 :                 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    1397     1248840 :                     iy = iy4 + k;
    1398     1248840 :                     iy = clamp(iy, 0, height - 1);
    1399     2497670 :                     const __m256i temp_0 = _mm256_set1_epi16(
    1400     1248840 :                         const4 + ref[iy * stride + (width - 1)] * const5);
    1401     1248840 :                     iy = iy4 + k + 1;
    1402     1248840 :                     iy = clamp(iy, 0, height - 1);
    1403     2497670 :                     const __m256i temp_1 = _mm256_set1_epi16(
    1404     1248840 :                         const4 + ref[iy * stride + (width - 1)] * const5);
    1405     1248840 :                     horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
    1406     1248840 :                     row += 1;
    1407             :                 }
    1408      178405 :                 iy = iy4 + k;
    1409      178405 :                 iy = clamp(iy, 0, height - 1);
    1410      178405 :                 horz_out[row] =
    1411      356810 :                     _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
    1412             :             }
    1413   347928000 :             else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
    1414     7853550 :                 const int out_of_boundary_left = -(ix4 - 6);
    1415     7853550 :                 const int out_of_boundary_right = (ix4 + 8) - width;
    1416     7853550 :                 int iy, sx, row = 0;
    1417    63456900 :                 for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    1418    55589700 :                     iy = iy4 + k;
    1419    55589700 :                     iy = clamp(iy, 0, height - 1);
    1420             :                     __m128i src0 =
    1421    55581300 :                         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    1422    55581300 :                     iy = iy4 + k + 1;
    1423    55581300 :                     iy = clamp(iy, 0, height - 1);
    1424             :                     __m128i src1 =
    1425    55538100 :                         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    1426             : 
    1427    55538100 :                     if (out_of_boundary_left >= 0) {
    1428             :                         const __m128i shuffle_reg_left =
    1429    57098900 :                             _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
    1430    28549500 :                         src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
    1431    28549500 :                         src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
    1432             :                     }
    1433    55538100 :                     if (out_of_boundary_right >= 0) {
    1434    27052500 :                         const __m128i shuffle_reg_right = _mm_loadu_si128(
    1435    27052500 :                             (__m128i *)warp_pad_right[out_of_boundary_right]);
    1436    27052500 :                         src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
    1437    27052500 :                         src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
    1438             :                     }
    1439    55538100 :                     sx = sx4 + beta * (k + 4);
    1440             :                     const __m256i src_01 =
    1441    55538100 :                         _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
    1442    55538100 :                     horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
    1443             :                         shuffle_src, &round_const, &shift);
    1444    55603300 :                     row += 1;
    1445             :                 }
    1446     7867200 :                 iy = iy4 + k;
    1447     7867200 :                 iy = clamp(iy, 0, height - 1);
    1448     7949600 :                 __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    1449     7949600 :                 if (out_of_boundary_left >= 0) {
    1450             :                     const __m128i shuffle_reg_left =
    1451     8163920 :                         _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
    1452     4081960 :                     src = _mm_shuffle_epi8(src, shuffle_reg_left);
    1453             :                 }
    1454     7949600 :                 if (out_of_boundary_right >= 0) {
    1455             :                     const __m128i shuffle_reg_right =
    1456     7736110 :                         _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
    1457     3868050 :                     src = _mm_shuffle_epi8(src, shuffle_reg_right);
    1458             :                 }
    1459     7949600 :                 sx = sx4 + beta * (k + 4);
    1460     7949600 :                 const __m256i src_01 = _mm256_castsi128_si256(src);
    1461             :                 __m256i coeff[4];
    1462     7949600 :                 prepare_horizontal_filter_coeff(alpha, sx, coeff);
    1463     7949850 :                 filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
    1464             :                     &round_const, &shift, row);
    1465             :             }
    1466             :             else
    1467   332125000 :                 prepare_warp_horizontal_filter_avx2(
    1468             :                     ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
    1469             :                     i, &round_const, &shift, shuffle_src);
    1470             : 
    1471             :             // Vertical filter
    1472   339806000 :             prepare_warp_vertical_filter_avx2(
    1473             :                 pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
    1474             :                 p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
    1475             :                 &res_sub_const, &round_bits_const, &wt);
    1476             :         }
    1477             :     }
    1478    26668900 : }
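
/* Editorial sketch, not part of warp_plane_avx2.c: per 8x8 output block,
   the loop above evaluates the affine model at the block centre and splits
   the result into an integer sample position and a subpel phase of
   WARPEDMODEL_PREC_BITS fractional bits: */
#include <stdint.h>

static void warp_block_origin_sketch(const int32_t *mat, int32_t src_x,
                                     int32_t src_y, int prec_bits,
                                     int32_t *ix, int32_t *sx,
                                     int32_t *iy, int32_t *sy) {
    const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
    const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
    *ix = dst_x >> prec_bits;             /* integer source column */
    *sx = dst_x & ((1 << prec_bits) - 1); /* subpel phase; const1/const2 are
                                             added and the low
                                             WARP_PARAM_REDUCE_BITS masked
                                             off afterwards */
    *iy = dst_y >> prec_bits;
    *sy = dst_y & ((1 << prec_bits) - 1);
}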

Generated by: LCOV version 1.14