LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbIntraPrediction_Intrinsic_AVX2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06

              Hit    Total   Coverage
Lines:       1371     2765     49.6 %
Functions:     91      151     60.3 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include <string.h>
       7             : 
       8             : #include "EbDefinitions.h"
       9             : #include "immintrin.h"
      10             : #include "EbIntrinMacros_SSE2.h"
      11             : #include "EbIntraPrediction_AVX2.h"
      12             : #include "lpf_common_sse2.h"
      13             : #include "txfm_common_avx2.h"
      14             : #include "aom_dsp_rtcd.h"
      15             : 
      16             : // Indexed by the sign, the integer part, and the fractional part (in 1/16ths)
      17             : // of the gradient ratio dx/dy
      17             : static const uint8_t gradient_to_angle_bin[2][7][16] = {
      18             :   {
      19             :       { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
      20             :       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
      21             :       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      22             :       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      23             :       { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      24             :       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
      25             :       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
      26             :   },
      27             :   {
      28             :       { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
      29             :       { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
      30             :       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
      31             :       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
      32             :       { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
      33             :       { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
      34             :       { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
      35             :   },
      36             : };
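
For orientation, a scalar sketch of how one pixel's gradient pair is binned through this table, modeled on the libaom C reference (the helper name gradient_angle_bin is ours; AOMMIN() is the macro referenced in the comments further down):

    #include <stdlib.h> /* abs() */

    /* Angle bin for one gradient pair (dx, dy); dy == 0 is binned
       separately, exactly as the AVX2 path below does via dy_mask. */
    static int gradient_angle_bin(int dx, int dy) {
        if (dy == 0)
            return 2;
        const int sn = (dx > 0) ^ (dy > 0);      /* sign of dx/dy */
        const int adx = abs(dx), ady = abs(dy);
        const int quot = adx / ady;              /* integer part */
        const int remd = (adx % ady) * 16 / ady; /* fraction in 1/16ths */
        return gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
    }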
      37             : 
      38     8323770 : static INLINE __m256i __m256i_div_epi32(const __m256i *a, const __m256i *b)
      39             : {
      40    24971300 :     __m256 d_f = _mm256_div_ps(_mm256_cvtepi32_ps(*a), _mm256_cvtepi32_ps(*b));
      41             :     // Integer divide, rounded down (floor)
      42    16647500 :     return _mm256_cvtps_epi32(_mm256_floor_ps(d_f));
      43             : }
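
A scalar model of this helper (div_floor_model is our name). The float round trip is exact here because both operands fit well inside a float's 24-bit mantissa -- the kernel divides (dx << 4) <= 4080 by dy <= 255 -- so the correctly rounded quotient cannot cross an integer boundary before the floor:

    #include <math.h>
    #include <stdint.h>

    /* Floor division via single-precision floating point. */
    static int32_t div_floor_model(int32_t a, int32_t b) {
        return (int32_t)floorf((float)a / (float)b);
    }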
      44             : 
      45     4163900 : static INLINE void get_gradient_hist_avx2_internal(const __m256i *src1,
      46             :     const __m256i *src2, const __m256i *src3, int16_t *dy_mask_array,
      47             :     int16_t *quot_array, int16_t *remd_array, int16_t * sn_array,
      48             :     int32_t *temp_array) {
      49             : 
      50     4163900 :     const __m256i zero = _mm256_setzero_si256();
      51     4163900 :     const __m256i val_15_i16 = _mm256_set1_epi16(15);
      52     4163900 :     const __m256i val_6_i16 = _mm256_set1_epi16(6);
      53             :     __m256i dx, dy;
      54             :     __m256i tmp1_32, tmp2_32;
      55             :     __m256i dx1_32, dx2_32;
      56             :     __m256i dy1_32, dy2_32;
      57             :     __m256i sn;
      58             :     __m256i remd;
      59             :     __m256i quot;
      60             :     __m256i dy_mask;
      61             : 
      62     4163900 :     dx = _mm256_sub_epi16(*src1, *src2);
      63     8327800 :     dy = _mm256_sub_epi16(*src1, *src3);
      64             : 
      65             :     //sn = (dx > 0) ^ (dy > 0);
      66             :     sn = _mm256_xor_si256(dx, dy);  // sign bit (bit 15) is set iff dx and dy have opposite signs
      67             :     sn = _mm256_srli_epi16(sn, 15);  // shift the sign bit down to yield 0 or 1 per lane
      68             : 
      69             :     // per-lane mask: 1 where dy == 0, else 0
      70     8327800 :     dy_mask = _mm256_srli_epi16(_mm256_cmpeq_epi16(dy, zero), 15);
      71             : 
      72             :     //dx = abs(dx); dy = abs(dy);
      73     4163900 :     dx = _mm256_abs_epi16(dx);
      74     4163900 :     dy = _mm256_abs_epi16(dy);
      75             : 
      76             :     dy = _mm256_add_epi16(dy, dy_mask);  // force dy to 1 in the masked lanes so the division below never divides by zero
      77             : 
      78             :     //  temp = dx * dx + dy * dy;
      79     8327800 :     dx1_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(dx)); //dx
      80     4163900 :     dy1_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(dy)); //dy
      81             : 
      82    12491700 :     tmp1_32 = _mm256_add_epi32(
      83             :         _mm256_mullo_epi32(dx1_32, dx1_32),
      84             :         _mm256_mullo_epi32(dy1_32, dy1_32));
      85             : 
      86     4163900 :     dx2_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dx, 1));
      87     4163900 :     dy2_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dy, 1));
      88             : 
      89    12491700 :     tmp2_32 = _mm256_add_epi32(
      90             :         _mm256_mullo_epi32(dx2_32, dx2_32),
      91             :         _mm256_mullo_epi32(dy2_32, dy2_32));
      92             : 
      93             :     /* Code:
      94             :      quot16 = (dx << 4) / dy;
      95             :      quot = quot16 >> 4;
      96             :      remd = quot16 & 15;
      97             :     Equivalent of:
      98             :      quot = dx / dy;
      99             :      remd = (dx % dy) * 16 / dy; */
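                     :     /* e.g. dx = 100, dy = 40: quot16 = 1600 / 40 = 40,
                     :        so quot = 40 >> 4 = 2 = 100 / 40 and
                     :        remd = 40 & 15 = 8 = (100 % 40) * 16 / 40. */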
     100             : 
     101             :      //quot16 = (dx << 4) / dy;
     102     4163900 :     dx1_32 = _mm256_slli_epi32(dx1_32, 4);
     103     4163900 :     dx2_32 = _mm256_slli_epi32(dx2_32, 4);
     104     4163900 :     const __m256i d1_i32 = __m256i_div_epi32(&dx1_32, &dy1_32);
     105     4173550 :     const __m256i d2_i32 = __m256i_div_epi32(&dx2_32, &dy2_32);
     106     4166010 :     __m256i quot16 = _mm256_permute4x64_epi64(
     107             :         _mm256_packs_epi32(d1_i32, d2_i32), 0xD8);
     108             : 
     109     4166010 :     quot = _mm256_srli_epi16(quot16, 4);
     110             : 
     111             :     //remd = (quot16 & (15));
     112     4166010 :     remd = _mm256_and_si256(quot16, val_15_i16);
     113             : 
     114             :     //AOMMIN(remdA, 15)
     115     4166010 :     remd = _mm256_min_epi16(remd, val_15_i16);
     116             :     //AOMMIN(quotA, 6)
     117     4166010 :     quot = _mm256_min_epi16(quot, val_6_i16);
     118             : 
     119             :     _mm256_store_si256((__m256i *)dy_mask_array, dy_mask);
     120             :     _mm256_store_si256((__m256i *)quot_array, quot);
     121             :     _mm256_store_si256((__m256i *)remd_array, remd);
     122             :     _mm256_store_si256((__m256i *)sn_array, sn);
     123             :     _mm256_store_si256((__m256i *)temp_array, tmp1_32);
     124     4166010 :     _mm256_store_si256((__m256i *)&temp_array[8], tmp2_32);
     125     4166010 : }
     126             : 
     127      159253 : void av1_get_gradient_hist_avx2(const uint8_t *src, int src_stride, int rows,
     128             :     int cols, uint64_t *hist) {
     129      159253 :     src += src_stride;
     130             : 
     131             :     __m128i tmp_src;
     132             :     __m256i src1; //src[c]
     133             :     __m256i src2; //src[c-1]
     134             :     __m256i src3; //src[c - src_stride]
     135             : 
     136             :     DECLARE_ALIGNED(64, int16_t, dy_mask_array[16]);
     137             :     DECLARE_ALIGNED(64, int16_t, quot_array[16]);
     138             :     DECLARE_ALIGNED(64, int16_t, remd_array[16]);
     139             :     DECLARE_ALIGNED(64, int16_t, sn_array[16]);
     140             :     DECLARE_ALIGNED(64, int32_t, temp_array[16]);
     141             : 
     142      159253 :     if (cols < 8) { // i.e. cols == 4
     143       19295 :         for (int r = 1; r < rows; r += 4) {
     144       15436 :             if ((r + 3) >= rows) {
     145        3859 :                 tmp_src = _mm_set_epi32(
     146             :                     0,
     147        3859 :                     *(uint32_t*)(src + 1),
     148        3859 :                     *(uint32_t*)(src + 1 + src_stride),
     149        3859 :                     *(uint32_t*)(src + 1 + 2 * src_stride));
     150        3859 :                 src1 = _mm256_cvtepu8_epi16(tmp_src);
     151             : 
     152        3859 :                 tmp_src = _mm_set_epi32(
     153             :                     0,
     154        3859 :                     *(uint32_t*)(src),
     155        3859 :                     *(uint32_t*)(src + src_stride),
     156        3859 :                     *(uint32_t*)(src + 2 * src_stride));
     157        3859 :                 src2 = _mm256_cvtepu8_epi16(tmp_src);
     158             : 
     159        3859 :                 tmp_src = _mm_set_epi32(
     160             :                     0,
     161        3859 :                     *(uint32_t*)(src + 1 - src_stride),
     162        3859 :                     *(uint32_t*)(src + 1),
     163        3859 :                     *(uint32_t*)(src + 1 + src_stride));
     164        3859 :                 src3 = _mm256_cvtepu8_epi16(tmp_src);
     165             :             }
     166             :             else {
     167       11577 :                 tmp_src = _mm_set_epi32(
     168       11577 :                     *(uint32_t*)(src + 1),
     169       11577 :                     *(uint32_t*)(src + 1 + src_stride),
     170       11577 :                     *(uint32_t*)(src + 1 + 2 * src_stride),
     171       11577 :                     *(uint32_t*)(src + 1 + 3 * src_stride));
     172       11577 :                 src1 = _mm256_cvtepu8_epi16(tmp_src);
     173             : 
     174       11577 :                 tmp_src = _mm_set_epi32(
     175       11577 :                     *(uint32_t*)(src),
     176       11577 :                     *(uint32_t*)(src + src_stride),
     177       11577 :                     *(uint32_t*)(src + 2 * src_stride),
     178       11577 :                     *(uint32_t*)(src + 3 * src_stride));
     179       11577 :                 src2 = _mm256_cvtepu8_epi16(tmp_src);
     180             : 
     181       11577 :                 tmp_src = _mm_set_epi32(
     182       11577 :                     *(uint32_t*)(src + 1 - src_stride),
     183       11577 :                     *(uint32_t*)(src + 1),
     184       11577 :                     *(uint32_t*)(src + 1 + src_stride),
     185       11577 :                     *(uint32_t*)(src + 1 + 2 * src_stride));
     186       11577 :                 src3 = _mm256_cvtepu8_epi16(tmp_src);
     187             :             }
     188             : 
     189       15436 :             get_gradient_hist_avx2_internal(&src1, &src2, &src3, dy_mask_array,
     190             :                 quot_array, remd_array, sn_array, temp_array);
     191             : 
     192       15436 :             if ((r + 3) >= rows) {
     193       46308 :                 for (int w = 0; w < 11; ++w) {
     194       42449 :                     if (w == 3 || w == 7)
     195        7718 :                         continue;
     196       34731 :                     if (dy_mask_array[w] != 1) {
     197       31739 :                         int index = gradient_to_angle_bin[sn_array[w]]
     198       31739 :                             [quot_array[w]][remd_array[w]];
     199       31739 :                         hist[index] += temp_array[w];
     200             :                     }
     201             :                     else {
     202        2992 :                         hist[2] += temp_array[w];
     203             :                     }
     204             :                 }
     205             :             }
     206             :             else {
     207      185232 :                 for (int w = 0; w < 15; ++w) {
     208      173655 :                     if (w == 3 || w == 7 || w == 11)
     209       34731 :                         continue;
     210      138924 :                     if (dy_mask_array[w] != 1) {
     211      128333 :                         int index = gradient_to_angle_bin[sn_array[w]]
     212      128333 :                             [quot_array[w]][remd_array[w]];
     213      128333 :                         hist[index] += temp_array[w];
     214             :                     }
     215             :                     else {
     216       10591 :                         hist[2] += temp_array[w];
     217             :                     }
     218             :                 }
     219             :             }
     220       15436 :             src += 4 * src_stride;
     221             :         }
     222             :     }
     223      155394 :     else if (cols < 16) { // i.e. cols == 8
     224      377884 :         for (int r = 1; r < rows; r += 2) {
     225      308013 :             if ((r + 1) >= rows) {
     226      139734 :                 tmp_src = _mm_set1_epi64x(*(uint64_t*)(src + 1));
     227       69867 :                 src1 = _mm256_cvtepu8_epi16(tmp_src);
     228             : 
     229      139734 :                 tmp_src = _mm_set1_epi64x(*(uint64_t*)(src));
     230       69867 :                 src2 = _mm256_cvtepu8_epi16(tmp_src);
     231             : 
     232      139734 :                 tmp_src = _mm_set1_epi64x(*(uint64_t*)(src + 1 - src_stride));
     233       69867 :                 src3 = _mm256_cvtepu8_epi16(tmp_src);
     234             :             }
     235             :             else {
     236      238146 :                 tmp_src = _mm_set_epi64x(*(uint64_t*)(src + 1 + src_stride),
     237      238146 :                     *(uint64_t*)(src + 1));
     238      238146 :                 src1 = _mm256_cvtepu8_epi16(tmp_src);
     239             : 
     240      238146 :                 tmp_src = _mm_set_epi64x(*(uint64_t*)(src + src_stride),
     241      238146 :                     *(uint64_t*)(src));
     242      238146 :                 src2 = _mm256_cvtepu8_epi16(tmp_src);
     243             : 
     244      238146 :                 tmp_src = _mm_set_epi64x(*(uint64_t*)(src + 1),
     245      238146 :                     *(uint64_t*)(src + 1 - src_stride));
     246      238146 :                 src3 = _mm256_cvtepu8_epi16(tmp_src);
     247             :             }
     248             : 
     249      308013 :             get_gradient_hist_avx2_internal(&src1, &src2, &src3, dy_mask_array,
     250             :                 quot_array, remd_array, sn_array, temp_array);
     251             : 
     252      308085 :             if ((r + 1) >= rows) {
     253      558939 :                 for (int w = 0; w < 7; ++w) {
     254      489062 :                     if (dy_mask_array[w] != 1) {
     255      421559 :                         int index = gradient_to_angle_bin[sn_array[w]]
     256      421559 :                             [quot_array[w]][remd_array[w]];
     257      421559 :                         hist[index] += temp_array[w];
     258             :                     }
     259             :                     else {
     260       67503 :                         hist[2] += temp_array[w];
     261             :                     }
     262             :                 }
     263             :             }
     264             :             else {
     265     3807580 :                 for (int w = 0; w < 15; ++w) {
     266     3569380 :                     if (w == 7)
     267      238266 :                         continue;
     268     3331110 :                     if (dy_mask_array[w] != 1) {
     269     2880700 :                         int index = gradient_to_angle_bin[sn_array[w]]
     270     2880700 :                             [quot_array[w]][remd_array[w]];
     271     2880700 :                         hist[index] += temp_array[w];
     272             :                     }
     273             :                     else {
     274      450407 :                         hist[2] += temp_array[w];
     275             :                     }
     276             :                 }
     277             :             }
     278      308085 :             src += 2 * src_stride;
     279             :         }
     280             :     }
     281             :     else {
     282     2031160 :         for (int r = 1; r < rows; ++r) {
     283     1928880 :             int c = 1;
     284     5786400 :             for (; cols - c >= 15; c += 16) {
     285             : 
     286             :                 // reads 16 pixels from src[c], though at most 15 (max = 15) are used in the last iteration
     287     3840840 :                 src1 = _mm256_cvtepu8_epi16(
     288     3840840 :                     _mm_loadu_si128((__m128i const*)&src[c]));
     289     3840840 :                 src2 = _mm256_cvtepu8_epi16(
     290     3840840 :                     _mm_loadu_si128((__m128i const*)&src[c - 1]));
     291     3840840 :                 src3 = _mm256_cvtepu8_epi16(
     292     3840840 :                     _mm_loadu_si128((__m128i const*)&src[c - src_stride]));
     293             : 
     294     3840840 :                 get_gradient_hist_avx2_internal(&src1, &src2, &src3,
     295             :                     dy_mask_array, quot_array, remd_array, sn_array, temp_array);
     296             : 
     297     3857520 :                 int max = 16;
     298     3857520 :                 if (c + 16 > cols) {
     299     1928680 :                     max = 15;
     300             :                 }
     301             : 
     302    62670700 :                 for (int w = 0; w < max; ++w) {
     303             : 
     304    58813100 :                     if (dy_mask_array[w] != 1) {
     305    40042400 :                         int index = gradient_to_angle_bin[sn_array[w]]
     306    40042400 :                             [quot_array[w]][remd_array[w]];
     307    40042400 :                         hist[index] += temp_array[w];
     308             :                     }
     309             :                     else {
     310    18770700 :                         hist[2] += temp_array[w];
     311             :                     }
     312             :                 }
     313             :             }
     314     1945560 :             src += src_stride;
     315             :         }
     316             :     }
     317      176004 : }
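
A minimal usage sketch (the wrapper name and the 32x32 block size are ours; the 8-bin histogram size follows the libaom reference this mirrors):

    #include <stdint.h>
    #include <string.h>

    /* Histogram the gradients of one 32x32 luma block. */
    static void block_gradient_hist(const uint8_t *blk, int stride,
                                    uint64_t hist[8]) {
        memset(hist, 0, 8 * sizeof(*hist));
        av1_get_gradient_hist_avx2(blk, stride, 32, 32, hist);
    }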
     318             : 
     319             : #ifndef _mm256_setr_m128i
     320             : #define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
     321             :     _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
     322             : #endif
     323             : 
     324             : #define MACRO_VERTICAL_LUMA_4(A, B, C) \
     325             :     *(uint32_t*)prediction_ptr = _mm_cvtsi128_si32(_mm_or_si128(_mm_and_si128(A, B), C)); \
     326             :     A = _mm_srli_si128(A, 1); \
     327             :     *(uint32_t*)(prediction_ptr + pStride) = _mm_cvtsi128_si32(_mm_or_si128(_mm_and_si128(A, B), C)); \
     328             :     A = _mm_srli_si128(A, 1);
     329             : 
     330             : #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
     331             :     _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
     332             : 
     333           0 : static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
     334             :     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
     335             : 
     336           0 :     r0 = _mm_unpacklo_epi16(x[0], x[1]);
     337           0 :     r1 = _mm_unpacklo_epi16(x[2], x[3]);
     338           0 :     r2 = _mm_unpacklo_epi16(x[4], x[5]);
     339           0 :     r3 = _mm_unpacklo_epi16(x[6], x[7]);
     340             : 
     341           0 :     r4 = _mm_unpacklo_epi16(x[8], x[9]);
     342           0 :     r5 = _mm_unpacklo_epi16(x[10], x[11]);
     343           0 :     r6 = _mm_unpacklo_epi16(x[12], x[13]);
     344           0 :     r7 = _mm_unpacklo_epi16(x[14], x[15]);
     345             : 
     346           0 :     r8 = _mm_unpacklo_epi32(r0, r1);
     347           0 :     r9 = _mm_unpackhi_epi32(r0, r1);
     348           0 :     r10 = _mm_unpacklo_epi32(r2, r3);
     349           0 :     r11 = _mm_unpackhi_epi32(r2, r3);
     350             : 
     351           0 :     r12 = _mm_unpacklo_epi32(r4, r5);
     352           0 :     r13 = _mm_unpackhi_epi32(r4, r5);
     353           0 :     r14 = _mm_unpacklo_epi32(r6, r7);
     354           0 :     r15 = _mm_unpackhi_epi32(r6, r7);
     355             : 
     356           0 :     r0 = _mm_unpacklo_epi64(r8, r9);
     357           0 :     r1 = _mm_unpackhi_epi64(r8, r9);
     358           0 :     r2 = _mm_unpacklo_epi64(r10, r11);
     359           0 :     r3 = _mm_unpackhi_epi64(r10, r11);
     360             : 
     361           0 :     r4 = _mm_unpacklo_epi64(r12, r13);
     362           0 :     r5 = _mm_unpackhi_epi64(r12, r13);
     363           0 :     r6 = _mm_unpacklo_epi64(r14, r15);
     364           0 :     r7 = _mm_unpackhi_epi64(r14, r15);
     365             : 
     366           0 :     d[0] = _mm_unpacklo_epi64(r0, r2);
     367           0 :     d[1] = _mm_unpacklo_epi64(r4, r6);
     368           0 :     d[2] = _mm_unpacklo_epi64(r1, r3);
     369           0 :     d[3] = _mm_unpacklo_epi64(r5, r7);
     370             : 
     371           0 :     d[4] = _mm_unpackhi_epi64(r0, r2);
     372           0 :     d[5] = _mm_unpackhi_epi64(r4, r6);
     373           0 :     d[6] = _mm_unpackhi_epi64(r1, r3);
     374           0 :     d[7] = _mm_unpackhi_epi64(r5, r7);
     375           0 : }
     376           0 : static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
     377             :     __m256i w0, w1, w2, w3, ww0, ww1;
     378             : 
     379           0 :     w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
     380           0 :     w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
     381           0 :     w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
     382           0 :     w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
     383             : 
     384           0 :     ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
     385           0 :     ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
     386             : 
     387           0 :     d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
     388           0 :     d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
     389             : 
     390           0 :     ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
     391           0 :     ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
     392             : 
     393           0 :     d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
     394           0 :     d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
     395           0 : }
     396             : 
     397           0 : static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
     398             :     __m256i w0, w1, w2, w3, ww0, ww1;
     399             : 
     400           0 :     w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
     401           0 :     w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
     402           0 :     w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
     403           0 :     w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
     404             : 
     405           0 :     ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
     406           0 :     ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
     407             : 
     408           0 :     d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
     409           0 :     d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
     410             : 
     411           0 :     ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
     412           0 :     ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
     413             : 
     414           0 :     d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
     415           0 :     d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
     416             : 
     417           0 :     w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
     418           0 :     w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
     419           0 :     w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
     420           0 :     w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
     421             : 
     422           0 :     ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
     423           0 :     ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
     424             : 
     425           0 :     d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
     426           0 :     d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
     427             : 
     428           0 :     ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
     429           0 :     ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
     430             : 
     431           0 :     d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
     432           0 :     d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
     433           0 : }
     434             : 
     435             : // TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
     436             : // Use a header file, intrapred_common_x86.h
     437      123034 : static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
     438      123034 :     __m128i x = _mm_load_si128((__m128i const *)ref);
     439      123034 :     const __m128i zero = _mm_setzero_si128();
     440      123034 :     x = _mm_sad_epu8(x, zero);
     441      123034 :     const __m128i high = _mm_unpackhi_epi64(x, x);
     442      123034 :     return _mm_add_epi16(x, high);
     443             : }
     444             : 
     445      114153 : static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
     446      114153 :     __m128i x0 = _mm_load_si128((__m128i const *)ref);
     447      228306 :     __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
     448      114153 :     const __m128i zero = _mm_setzero_si128();
     449      114153 :     x0 = _mm_sad_epu8(x0, zero);
     450      114153 :     x1 = _mm_sad_epu8(x1, zero);
     451      114153 :     x0 = _mm_add_epi16(x0, x1);
     452      114153 :     const __m128i high = _mm_unpackhi_epi64(x0, x0);
     453      114153 :     return _mm_add_epi16(x0, high);
     454             : }
     455             : 
     456      270523 : static INLINE __m256i dc_sum_32(const uint8_t *ref) {
     457      270523 :     const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
     458      270523 :     const __m256i zero = _mm256_setzero_si256();
     459      270523 :     __m256i y = _mm256_sad_epu8(x, zero);
     460      270523 :     __m256i u = _mm256_permute2x128_si256(y, y, 1);
     461      270523 :     y = _mm256_add_epi64(u, y);
     462      270523 :     u = _mm256_unpackhi_epi64(y, y);
     463      270523 :     return _mm256_add_epi16(y, u);
     464             : }
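
For reference, a scalar model of dc_sum_32 (the model name is ours): the SAD against zero plus the two reductions above simply sum 32 bytes into the low lane.

    #include <stdint.h>

    /* Plain sum of 32 neighbor pixels. */
    static int dc_sum_32_model(const uint8_t *ref) {
        int sum = 0;
        for (int i = 0; i < 32; ++i)
            sum += ref[i];
        return sum;
    }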
     465      642432 : static INLINE void row_store_32xh(const __m256i *r, int32_t height, uint8_t *dst,
     466             :     ptrdiff_t stride) {
     467    16513500 :     for (int32_t i = 0; i < height; ++i) {
     468    15871000 :         _mm256_storeu_si256((__m256i *)dst, *r);
     469    15871000 :         dst += stride;
     470             :     }
     471      642432 : }
     472             : 
     473       10710 : static INLINE void row_store_64xh(const __m256i *r, int32_t height, uint8_t *dst,
     474             :     ptrdiff_t stride) {
     475      521085 :     for (int32_t i = 0; i < height; ++i) {
     476      510375 :         _mm256_storeu_si256((__m256i *)dst, *r);
     477      510375 :         _mm256_storeu_si256((__m256i *)(dst + 32), *r);
     478      510375 :         dst += stride;
     479             :     }
     480       10710 : }
     481       17141 : static INLINE __m256i dc_sum_64(const uint8_t *ref) {
     482       17141 :     const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
     483       34282 :     const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
     484       17141 :     const __m256i zero = _mm256_setzero_si256();
     485       17141 :     __m256i y0 = _mm256_sad_epu8(x0, zero);
     486       17141 :     __m256i y1 = _mm256_sad_epu8(x1, zero);
     487       17141 :     y0 = _mm256_add_epi64(y0, y1);
     488       17141 :     __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
     489       17141 :     y0 = _mm256_add_epi64(u0, y0);
     490       17141 :     u0 = _mm256_unpackhi_epi64(y0, y0);
     491       17141 :     return _mm256_add_epi16(y0, u0);
     492             : }
     493        4789 : void eb_aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
     494             :     const uint8_t *above, const uint8_t *left) {
     495        4789 :     const __m256i sum_above = dc_sum_64(above);
     496        4790 :     __m256i sum_left = dc_sum_64(left);
     497        4792 :     sum_left = _mm256_add_epi16(sum_left, sum_above);
     498        4792 :     uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
     499        4792 :     sum += 64;
     500        4792 :     sum /= 128;
     501        4792 :     const __m256i row = _mm256_set1_epi8((uint8_t)sum);
     502        4792 :     row_store_64xh(&row, 64, dst, stride);
     503        4795 : }
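
The rounding pattern used throughout these DC predictors is round-to-nearest division by the neighbor count: for a WxH block, pred = (sum_above + sum_left + (W + H) / 2) / (W + H). For 64x64 that is (sum + 64) / 128, as above; for the rectangular cases it gives, e.g., (sum + 24) / 48 for 32x16 and (sum + 48) / 96 for 32x64.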
     504             : 
     505        1072 : void eb_aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
     506             :     const uint8_t *above,
     507             :     const uint8_t *left) {
     508        1072 :     __m256i sum = dc_sum_64(left);
     509             :     (void)above;
     510             : 
     511        1072 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     512        1072 :     sum = _mm256_add_epi16(sum, thirtytwo);
     513        1072 :     sum = _mm256_srai_epi16(sum, 6);
     514        1072 :     const __m256i zero = _mm256_setzero_si256();
     515        1072 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     516        1072 :     row_store_64xh(&row, 64, dst, stride);
     517        1072 : }
     518         530 : void eb_aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
     519             :     const uint8_t *above,
     520             :     const uint8_t *left) {
     521         530 :     __m256i sum = dc_sum_64(above);
     522             :     (void)left;
     523             : 
     524         530 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     525         530 :     sum = _mm256_add_epi16(sum, thirtytwo);
     526         530 :     sum = _mm256_srai_epi16(sum, 6);
     527         530 :     const __m256i zero = _mm256_setzero_si256();
     528         530 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     529         530 :     row_store_64xh(&row, 64, dst, stride);
     530         530 : }
     531       14623 : void eb_aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     532             :     const uint8_t *above,
     533             :     const uint8_t *left) {
     534       14623 :     __m256i sum = dc_sum_32(above);
     535             :     (void)left;
     536             : 
     537       14623 :     const __m256i sixteen = _mm256_set1_epi16(16);
     538       14623 :     sum = _mm256_add_epi16(sum, sixteen);
     539       14623 :     sum = _mm256_srai_epi16(sum, 5);
     540       14623 :     const __m256i zero = _mm256_setzero_si256();
     541       14623 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     542       14623 :     row_store_32xh(&row, 32, dst, stride);
     543       14623 : }
     544       12499 : void eb_aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     545             :     const uint8_t *above,
     546             :     const uint8_t *left) {
     547       12499 :     __m256i sum = dc_sum_32(left);
     548             :     (void)above;
     549             : 
     550       12499 :     const __m256i sixteen = _mm256_set1_epi16(16);
     551       12499 :     sum = _mm256_add_epi16(sum, sixteen);
     552       12499 :     sum = _mm256_srai_epi16(sum, 5);
     553       12499 :     const __m256i zero = _mm256_setzero_si256();
     554       12499 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     555       12499 :     row_store_32xh(&row, 32, dst, stride);
     556       12499 : }
     557         118 : void eb_aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
     558             :     const uint8_t *above,
     559             :     const uint8_t *left) {
     560             :     (void)above;
     561             :     (void)left;
     562         118 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     563         118 :     row_store_64xh(&row, 64, dst, stride);
     564         118 : }
     565         434 : void eb_aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     566             :     const uint8_t *above,
     567             :     const uint8_t *left) {
     568             :     (void)above;
     569             :     (void)left;
     570         434 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     571         434 :     row_store_32xh(&row, 32, dst, stride);
     572         434 : }
     573             : 
     574      114153 : void eb_aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
     575             :     const uint8_t *above, const uint8_t *left) {
     576      114153 :     const __m128i top_sum = dc_sum_32_sse2(above);
     577      114153 :     __m128i left_sum = dc_sum_16_sse2(left);
     578      114153 :     left_sum = _mm_add_epi16(top_sum, left_sum);
     579      114153 :     uint32_t sum = _mm_cvtsi128_si32(left_sum);
     580      114153 :     sum += 24;
     581      114153 :     sum /= 48;
     582      114153 :     const __m256i row = _mm256_set1_epi8((uint8_t)sum);
     583      114153 :     row_store_32xh(&row, 16, dst, stride);
     584      114153 : }
     585             : 
     586        1729 : void eb_aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
     587             :     const uint8_t *above, const uint8_t *left) {
     588        1729 :     const __m256i sum_above = dc_sum_32(above);
     589        1729 :     __m256i sum_left = dc_sum_64(left);
     590        1729 :     sum_left = _mm256_add_epi16(sum_left, sum_above);
     591        1729 :     uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
     592        1729 :     sum += 48;
     593        1729 :     sum /= 96;
     594        1729 :     const __m256i row = _mm256_set1_epi8((uint8_t)sum);
     595        1729 :     row_store_32xh(&row, 64, dst, stride);
     596        1729 : }
     597             : 
     598        1222 : void eb_aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
     599             :     const uint8_t *above, const uint8_t *left) {
     600        1222 :     const __m256i sum_above = dc_sum_64(above);
     601        1222 :     __m256i sum_left = dc_sum_32(left);
     602        1222 :     sum_left = _mm256_add_epi16(sum_left, sum_above);
     603        1222 :     uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
     604        1222 :     sum += 48;
     605        1222 :     sum /= 96;
     606        1222 :     const __m256i row = _mm256_set1_epi8((uint8_t)sum);
     607        1222 :     row_store_64xh(&row, 32, dst, stride);
     608        1222 : }
     609             : 
     610        1938 : void eb_aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
     611             :     const uint8_t *above, const uint8_t *left) {
     612        1938 :     const __m256i sum_above = dc_sum_64(above);
     613        3876 :     __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
     614        1938 :     sum_left = _mm256_add_epi16(sum_left, sum_above);
     615        1938 :     uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
     616        1938 :     sum += 40;
     617        1938 :     sum /= 80;
     618        1938 :     const __m256i row = _mm256_set1_epi8((uint8_t)sum);
     619        1938 :     row_store_64xh(&row, 16, dst, stride);
     620        1938 : }
     621             : 
     622        6854 : void eb_aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
     623             :     const uint8_t *above,
     624             :     const uint8_t *left) {
     625        6854 :     __m128i sum = dc_sum_16_sse2(left);
     626             :     (void)above;
     627             : 
     628        6854 :     const __m128i eight = _mm_set1_epi16(8);
     629        6854 :     sum = _mm_add_epi16(sum, eight);
     630        6854 :     sum = _mm_srai_epi16(sum, 4);
     631        6854 :     const __m128i zero = _mm_setzero_si128();
     632        6854 :     const __m128i r = _mm_shuffle_epi8(sum, zero);
     633        6854 :     const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
     634        6854 :     row_store_32xh(&row, 16, dst, stride);
     635        6854 : }
     636             : 
     637         258 : void eb_aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
     638             :     const uint8_t *above,
     639             :     const uint8_t *left) {
     640         258 :     __m256i sum = dc_sum_64(left);
     641             :     (void)above;
     642             : 
     643         258 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     644         258 :     sum = _mm256_add_epi16(sum, thirtytwo);
     645         258 :     sum = _mm256_srai_epi16(sum, 6);
     646         258 :     const __m256i zero = _mm256_setzero_si256();
     647         258 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     648         258 :     row_store_32xh(&row, 64, dst, stride);
     649         258 : }
     650             : 
     651          86 : void eb_aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
     652             :     const uint8_t *above,
     653             :     const uint8_t *left) {
     654          86 :     __m256i sum = dc_sum_32(left);
     655             :     (void)above;
     656             : 
     657          86 :     const __m256i sixteen = _mm256_set1_epi16(16);
     658          86 :     sum = _mm256_add_epi16(sum, sixteen);
     659          86 :     sum = _mm256_srai_epi16(sum, 5);
     660          86 :     const __m256i zero = _mm256_setzero_si256();
     661          86 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     662          86 :     row_store_64xh(&row, 32, dst, stride);
     663          86 : }
     664             : 
     665          89 : void eb_aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
     666             :     const uint8_t *above,
     667             :     const uint8_t *left) {
     668          89 :     __m128i sum = dc_sum_16_sse2(left);
     669             :     (void)above;
     670             : 
     671          89 :     const __m128i eight = _mm_set1_epi16(8);
     672          89 :     sum = _mm_add_epi16(sum, eight);
     673          89 :     sum = _mm_srai_epi16(sum, 4);
     674          89 :     const __m128i zero = _mm_setzero_si128();
     675          89 :     const __m128i r = _mm_shuffle_epi8(sum, zero);
     676          89 :     const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
     677          89 :     row_store_64xh(&row, 16, dst, stride);
     678          89 : }
     679             : 
     680       19822 : void eb_aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
     681             :     const uint8_t *above,
     682             :     const uint8_t *left) {
     683       19822 :     __m256i sum = dc_sum_32(above);
     684             :     (void)left;
     685             : 
     686       19822 :     const __m256i sixteen = _mm256_set1_epi16(16);
     687       19822 :     sum = _mm256_add_epi16(sum, sixteen);
     688       19822 :     sum = _mm256_srai_epi16(sum, 5);
     689       19822 :     const __m256i zero = _mm256_setzero_si256();
     690       19822 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     691       19822 :     row_store_32xh(&row, 16, dst, stride);
     692       19822 : }
     693             : 
     694         138 : void eb_aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
     695             :     const uint8_t *above,
     696             :     const uint8_t *left) {
     697         138 :     __m256i sum = dc_sum_32(above);
     698             :     (void)left;
     699             : 
     700         138 :     const __m256i sixteen = _mm256_set1_epi16(16);
     701         138 :     sum = _mm256_add_epi16(sum, sixteen);
     702         138 :     sum = _mm256_srai_epi16(sum, 5);
     703         138 :     const __m256i zero = _mm256_setzero_si256();
     704         138 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     705         138 :     row_store_32xh(&row, 64, dst, stride);
     706         138 : }
     707             : 
     708         347 : void eb_aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
     709             :     const uint8_t *above,
     710             :     const uint8_t *left) {
     711         347 :     __m256i sum = dc_sum_64(above);
     712             :     (void)left;
     713             : 
     714         347 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     715         347 :     sum = _mm256_add_epi16(sum, thirtytwo);
     716         347 :     sum = _mm256_srai_epi16(sum, 6);
     717         347 :     const __m256i zero = _mm256_setzero_si256();
     718         347 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     719         347 :     row_store_64xh(&row, 32, dst, stride);
     720         347 : }
     721             : 
     722         469 : void eb_aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
     723             :     const uint8_t *above,
     724             :     const uint8_t *left) {
     725         469 :     __m256i sum = dc_sum_64(above);
     726             :     (void)left;
     727             : 
     728         469 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     729         469 :     sum = _mm256_add_epi16(sum, thirtytwo);
     730         469 :     sum = _mm256_srai_epi16(sum, 6);
     731         469 :     const __m256i zero = _mm256_setzero_si256();
     732         469 :     __m256i row = _mm256_shuffle_epi8(sum, zero);
     733         469 :     row_store_64xh(&row, 16, dst, stride);
     734         469 : }
     735             : 
     736         449 : void eb_aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
     737             :     const uint8_t *above,
     738             :     const uint8_t *left) {
     739             :     (void)above;
     740             :     (void)left;
     741         449 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     742         449 :     row_store_32xh(&row, 16, dst, stride);
     743         449 : }
     744          23 : void eb_aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
     745             :     const uint8_t *above,
     746             :     const uint8_t *left) {
     747             :     (void)above;
     748             :     (void)left;
     749          23 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     750          23 :     row_store_32xh(&row, 64, dst, stride);
     751          23 : }
     752          23 : void eb_aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
     753             :     const uint8_t *above,
     754             :     const uint8_t *left) {
     755             :     (void)above;
     756             :     (void)left;
     757          23 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     758          23 :     row_store_64xh(&row, 16, dst, stride);
     759          23 : }
     760          23 : void eb_aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
     761             :     const uint8_t *above,
     762             :     const uint8_t *left) {
     763             :     (void)above;
     764             :     (void)left;
     765          23 :     const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
     766          23 :     row_store_64xh(&row, 32, dst, stride);
     767          23 : }
     768             : 
     769             : // There are 32 rows in total. This function writes lines
     770             : // 0,1,2,3 and 16,17,18,19; the next call writes
     771             : // 4,5,6,7 and 20,21,22,23, so four calls
     772             : // cover all 32 rows.
     773     1506500 : static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
     774             :     ptrdiff_t stride) {
     775             :     __m256i t[4];
     776     1506500 :     __m256i m = _mm256_setzero_si256();
     777     1506500 :     const __m256i inc = _mm256_set1_epi8(4);
     778             :     int32_t i;
     779             : 
     780     7532280 :     for (i = 0; i < 4; i++) {
     781     6025780 :         t[i] = _mm256_shuffle_epi8(*row, m);
     782     6025780 :         __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
     783     6025780 :         __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
     784             :         _mm256_storeu_si256((__m256i *)dst, r0);
     785     6025780 :         _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
     786     6025780 :         dst += stride;
     787     6025780 :         m = _mm256_add_epi8(m, inc);
     788             :     }
     789     1506500 : }
     790             : 
     791      376632 : void eb_aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     792             :     const uint8_t *above, const uint8_t *left) {
     793             :     (void)above;
     794      376632 :     const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
     795             : 
     796      376632 :     __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
     797             : 
     798      376632 :     __m256i v = _mm256_unpacklo_epi8(u, u);
     799      376632 :     h_predictor_32x8line(&v, dst, stride);
     800      376633 :     dst += stride << 2;
     801             : 
     802      376633 :     v = _mm256_unpackhi_epi8(u, u);
     803      376633 :     h_predictor_32x8line(&v, dst, stride);
     804      376634 :     dst += stride << 2;
     805             : 
     806      376634 :     u = _mm256_unpackhi_epi8(left_col, left_col);
     807             : 
     808      376634 :     v = _mm256_unpacklo_epi8(u, u);
     809      376634 :     h_predictor_32x8line(&v, dst, stride);
     810      376633 :     dst += stride << 2;
     811             : 
     812      376633 :     v = _mm256_unpackhi_epi8(u, u);
     813      376633 :     h_predictor_32x8line(&v, dst, stride);
     814      376634 : }
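
A scalar model of the predictor above (the model name is ours): every pixel of output row r is a copy of left[r], which the vector code achieves with _mm256_shuffle_epi8 broadcasts.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Scalar H predictor for a 32x32 block: fill row r with left[r]. */
    static void h_predictor_32x32_model(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left) {
        for (int r = 0; r < 32; ++r)
            memset(dst + r * stride, left[r], 32);
    }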
     815        4416 : static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
     816             :     int32_t height, uint8_t *dst,
     817             :     ptrdiff_t stride) {
     818      186128 :     for (int32_t i = 0; i < height; ++i) {
     819      181712 :         _mm256_storeu_si256((__m256i *)dst, *r0);
     820      181712 :         _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
     821      181712 :         dst += stride;
     822             :     }
     823        4416 : }
     824        2000 : void eb_aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
     825             :     const uint8_t *above, const uint8_t *left) {
     826        2000 :     const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
     827        2000 :     const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
     828             :     (void)left;
     829        2000 :     row_store_32x2xh(&row0, &row1, 64, dst, stride);
     830        2000 : }
     831      201131 : void eb_aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     832             :     const uint8_t *above, const uint8_t *left) {
     833      201131 :     const __m256i row = _mm256_loadu_si256((const __m256i *)above);
     834             :     (void)left;
     835      201131 :     row_store_32xh(&row, 32, dst, stride);
     836      201132 : }
     837             : 
     838      158700 : void eb_aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
     839             :     const uint8_t *above, const uint8_t *left) {
     840      158700 :     const __m256i row = _mm256_loadu_si256((const __m256i *)above);
     841             :     (void)left;
     842      158700 :     row_store_32xh(&row, 16, dst, stride);
     843      158700 : }
     844        1417 : void eb_aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
     845             :     const uint8_t *above, const uint8_t *left) {
     846        1417 :     const __m256i row = _mm256_loadu_si256((const __m256i *)above);
     847             :     (void)left;
     848        1417 :     row_store_32xh(&row, 64, dst, stride);
     849        1417 : }
     850        1475 : void eb_aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
     851             :     const uint8_t *above, const uint8_t *left) {
     852        1475 :     const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
     853        1475 :     const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
     854             :     (void)left;
     855        1475 :     row_store_32x2xh(&row0, &row1, 16, dst, stride);
     856        1475 : }
     857         941 : void eb_aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
     858             :     const uint8_t *above, const uint8_t *left) {
     859         941 :     const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
     860         941 :     const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
     861             :     (void)left;
     862         941 :     row_store_32x2xh(&row0, &row1, 32, dst, stride);
     863         941 : }
     864             : 
     865      110202 : void eb_aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     866             :     const uint8_t *above, const uint8_t *left) {
     867      110202 :     const __m256i sum_above = dc_sum_32(above);
     868      110205 :     __m256i sum_left = dc_sum_32(left);
     869      110212 :     sum_left = _mm256_add_epi16(sum_left, sum_above);
     870      110212 :     const __m256i thirtytwo = _mm256_set1_epi16(32);
     871      110212 :     sum_left = _mm256_add_epi16(sum_left, thirtytwo);
     872      110212 :     sum_left = _mm256_srai_epi16(sum_left, 6);
     873      110212 :     const __m256i zero = _mm256_setzero_si256();
     874      110212 :     __m256i row = _mm256_shuffle_epi8(sum_left, zero);
     875      110212 :     row_store_32xh(&row, 32, dst, stride);
     876      110212 : }
     877             : 
     878             : // only define these intrinsics if immintrin.h doesn't have them
     879             : #if defined(_MSC_VER) && _MSC_VER < 1910
     880             : static inline int32_t _mm256_extract_epi32(__m256i a, const int32_t i)
     881             : {
     882             :     return a.m256i_i32[i & 7];
     883             : }
     884             : 
     885             : static inline __m256i _mm256_insert_epi32(__m256i a, int32_t b, const int32_t i)
     886             : {
     887             :     __m256i c = a;
     888             :     c.m256i_i32[i & 7] = b;
     889             :     return c;
     890             : }
     891             : #endif
     892             : 
     893             : #define PERM4x64(c0, c1, c2, c3) ((c0) + ((c1) << 2) + ((c2) << 4) + ((c3) << 6))
     894             : #define PERM2x128(c0, c1) ((c0) + ((c1) << 4))
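                     : // e.g. PERM4x64(3, 2, 1, 0) yields the immediate 0x1B, which reverses
                     : // the four 64-bit lanes in _mm256_permute4x64_epi64().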
     895             : 
     896           0 : void transpose_16bit_TX_4X4(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride)
     897             : {
     898           0 :     assert(srcStride == 4);
     899             :     (void)srcStride;
     900             : 
     901           0 :     if (dstStride == 4)
     902             :     {
     903           0 :         __m128i s = _mm_loadu_si128((__m128i*)src);
     904           0 :         __m128i r1 = _mm_srli_si128(s, 8);
     905           0 :         __m128i r2 = _mm_loadu_si128((__m128i*)(src + 8));
     906           0 :         __m128i r3 = _mm_srli_si128(r2, 8);
     907             : 
     908           0 :         __m128i r0_Lo = _mm_unpacklo_epi16(s, r1);
     909           0 :         __m128i r2_Lo = _mm_unpacklo_epi16(r2, r3);
     910           0 :         __m128i r1_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
     911           0 :         r0_Lo = _mm_unpackhi_epi32(r0_Lo, r2_Lo);
     912             : 
     913             :         _mm_storeu_si128((__m128i*)(dst + 0 * dstStride), r1_Lo);
     914           0 :         _mm_storeu_si128((__m128i*)(dst + 2 * dstStride), r0_Lo);
     915             :     }
     916             :     else
     917             :     {
     918           0 :         __m128i s = _mm_loadu_si128((__m128i*)src);
     919           0 :         __m128i r1 = _mm_srli_si128(s, 8);
     920           0 :         __m128i r2 = _mm_loadu_si128((__m128i*)(src + 8));
     921           0 :         __m128i r3 = _mm_srli_si128(r2, 8);
     922             : 
     923           0 :         __m128i r0_Lo = _mm_unpacklo_epi16(s, r1);
     924           0 :         __m128i r2_Lo = _mm_unpacklo_epi16(r2, r3);
     925           0 :         __m128i r1_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
     926           0 :         r0_Lo = _mm_unpackhi_epi32(r0_Lo, r2_Lo);
     927             : 
     928           0 :         _mm_storel_epi64((__m128i*)(dst + 0 * dstStride), r1_Lo);
     929           0 :         _mm_storel_epi64((__m128i*)(dst + 1 * dstStride), _mm_srli_si128(r1_Lo, 8));
     930           0 :         _mm_storel_epi64((__m128i*)(dst + 2 * dstStride), r0_Lo);
     931           0 :         _mm_storel_epi64((__m128i*)(dst + 3 * dstStride), _mm_srli_si128(r0_Lo, 8));
     932             :     }
     933           0 : }
     934           0 : void transpose_16bit_TX_8X8(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride)
     935             : {
     936             :     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo, r5_Lo, r6_Lo;
     937           0 :     r0 = _mm_loadu_si128((__m128i*)(src + 0 * srcStride));   // 07,06,05,04,03,02,01,00
     938           0 :     r1 = _mm_loadu_si128((__m128i*)(src + 1 * srcStride));   // 17,16,15,14,13,12,11,10
     939           0 :     r2 = _mm_loadu_si128((__m128i*)(src + 2 * srcStride));   // 27,26,25,24,23,22,21,20
     940           0 :     r3 = _mm_loadu_si128((__m128i*)(src + 3 * srcStride));   // 37,36,35,34,33,32,31,30
     941           0 :     r4 = _mm_loadu_si128((__m128i*)(src + 4 * srcStride));   // 47,46,45,44,43,42,41,40
     942           0 :     r5 = _mm_loadu_si128((__m128i*)(src + 5 * srcStride));   // 57,56,55,54,53,52,51,50
     943           0 :     r6 = _mm_loadu_si128((__m128i*)(src + 6 * srcStride));   // 67,66,65,64,63,62,61,60
     944           0 :     r7 = _mm_loadu_si128((__m128i*)(src + 7 * srcStride));   // 77,76,75,74,73,72,71,70
     945             : 
     946           0 :     r0_Lo = _mm_unpacklo_epi16(r0, r1);
     947           0 :     r2_Lo = _mm_unpacklo_epi16(r2, r3);
     948           0 :     r4_Lo = _mm_unpacklo_epi16(r4, r5);
     949           0 :     r6_Lo = _mm_unpacklo_epi16(r6, r7);
     950             : 
     951           0 :     r1_Lo = r0_Lo;
     952           0 :     r0_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
     953           0 :     r1_Lo = _mm_unpackhi_epi32(r1_Lo, r2_Lo);
     954           0 :     r5_Lo = r4_Lo;
     955           0 :     r4_Lo = _mm_unpacklo_epi32(r4_Lo, r6_Lo);
     956           0 :     r5_Lo = _mm_unpackhi_epi32(r5_Lo, r6_Lo);
     957           0 :     r2_Lo = r0_Lo;
      958           0 :     r0_Lo = _mm_unpacklo_epi64(r0_Lo, r4_Lo); // combine 64-bit halves
     959           0 :     r2_Lo = _mm_unpackhi_epi64(r2_Lo, r4_Lo);
     960           0 :     r3_Lo = r1_Lo;
     961           0 :     r1_Lo = _mm_unpacklo_epi64(r1_Lo, r5_Lo);
     962           0 :     r3_Lo = _mm_unpackhi_epi64(r3_Lo, r5_Lo);
     963             : 
     964             :     _mm_storeu_si128((__m128i*)(dst + 0 * dstStride), r0_Lo);
     965           0 :     _mm_storeu_si128((__m128i*)(dst + 1 * dstStride), r2_Lo);
     966           0 :     _mm_storeu_si128((__m128i*)(dst + 2 * dstStride), r1_Lo);
     967           0 :     _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), r3_Lo);
     968             : 
     969           0 :     r0 = _mm_unpackhi_epi16(r0, r1);
     970           0 :     r2 = _mm_unpackhi_epi16(r2, r3);
     971           0 :     r4 = _mm_unpackhi_epi16(r4, r5);
     972           0 :     r6 = _mm_unpackhi_epi16(r6, r7);
     973             : 
     974           0 :     r1 = r0;
     975           0 :     r0 = _mm_unpacklo_epi32(r0, r2);
     976           0 :     r1 = _mm_unpackhi_epi32(r1, r2);
     977           0 :     r5 = r4;
     978           0 :     r4 = _mm_unpacklo_epi32(r4, r6);
     979           0 :     r5 = _mm_unpackhi_epi32(r5, r6);
     980           0 :     r2 = r0;
     981           0 :     r0 = _mm_unpacklo_epi64(r0, r4);
     982           0 :     r2 = _mm_unpackhi_epi64(r2, r4);
     983           0 :     r3 = r1;
     984           0 :     r1 = _mm_unpacklo_epi64(r1, r5);
     985           0 :     r3 = _mm_unpackhi_epi64(r3, r5);
     986             : 
     987           0 :     _mm_storeu_si128((__m128i*)(dst + 4 * dstStride), r0);
     988           0 :     _mm_storeu_si128((__m128i*)(dst + 5 * dstStride), r2);
     989           0 :     _mm_storeu_si128((__m128i*)(dst + 6 * dstStride), r1);
     990           0 :     _mm_storeu_si128((__m128i*)(dst + 7 * dstStride), r3);
     991           0 : }
     992           0 : void transpose_16bit(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride, int32_t width, int32_t height)
     993             : {
     994           0 :     for (int32_t j = 0; j < height; j += 8)
     995           0 :         for (int32_t i = 0; i < width; i += 8)
     996           0 :             transpose_16bit_TX_8X8(src + i * srcStride + j, srcStride, dst + j * dstStride + i, dstStride);
     997           0 : }
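
/*
 * Editorial sketch (not part of the original source): the scalar equivalent
 * of the 8x8 transpose kernel above. Every output element (c, r) is the
 * input element (r, c); the SIMD version reaches the same layout through
 * three unpacklo/unpackhi stages, and the 4x4 kernel's dstStride == 4 branch
 * additionally fuses two output rows into each 128-bit store.
 */
static INLINE void transpose_16bit_8x8_c(const uint16_t *src, uint32_t srcStride,
    uint16_t *dst, uint32_t dstStride)
{
    for (int32_t r = 0; r < 8; r++)
        for (int32_t c = 0; c < 8; c++)
            dst[c * dstStride + r] = src[r * srcStride + c];
}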
     998             : 
     999             : // Low bit depth functions
    1000             : static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
    1001             :     { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1002             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1003             :     { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1004             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1005             :     { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1006             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1007             :     { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1008             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1009             :     { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1010             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1011             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1012             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1013             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1014             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1015             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1016             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1017             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    1018             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1019             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    1020             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1021             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    1022             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1023             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1024             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1025             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1026             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1027             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1028             :     0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1029             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1030             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1031             :     0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1032             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1033             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1034             :     0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    1035             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1036             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1037             :     0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    1038             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1039             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1040             :     0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
    1041             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1042             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1043             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    1044             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1045             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1046             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    1047             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1048             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1049             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    1050             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1051             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1052             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    1053             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1054             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1055             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    1056             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1057             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1058             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1059             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1060             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1061             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1062             :     0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    1063             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1064             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1065             :     0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
    1066             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1067             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1068             :     0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
    1069             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1070             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1071             :     0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
    1072             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1073             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1074             :     0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
    1075             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1076             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1077             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
    1078             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1079             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1080             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
    1081             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1082             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1083             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
    1084             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1085             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1086             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
    1087             :     { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1088             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    1089             :     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    1090             : };
    1091             : 
    1092             : static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
    1093             :     {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    1094             :     {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
    1095             :     {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
    1096             :     {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
    1097             :     {0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
    1098             :     {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
    1099             :     {0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
    1100             :     {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8},
    1101             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7},
    1102             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6},
    1103             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5},
    1104             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4},
    1105             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3},
    1106             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2},
    1107             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1},
    1108             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    1109             : };
    1110             : 
    1111             : static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
    1112             :     {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15},
    1113             :     {0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14},
    1114             :     {0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13},
    1115             :     {0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12},
    1116             :     {0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11},
    1117             :     {0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10},
    1118             :     {0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9},
    1119             :     {0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8}};
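
/*
 * Editorial sketch (assumed usage, matching the call sites below): BaseMask[k]
 * holds k leading 0xff bytes, so a byte blend against it keeps the first k
 * predicted pixels of a row and pads the tail with the replicated
 * above[max_base_x] sample. LoadMaskx and EvenOddMaskx are pshufb controls:
 * the former realigns a load that starts short of the edge, the latter
 * de-interleaves an upsampled edge into its even and odd samples.
 */
static INLINE __m128i pad_row_tail_sketch(__m128i pred, __m128i mbase_x, int k)
{
    /* k in [0, 16]: first k bytes taken from pred, the rest from mbase_x. */
    return _mm_blendv_epi8(mbase_x, pred, *(__m128i *)BaseMask[k]);
}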
    1120             : 
    1121             : static AOM_FORCE_INLINE void
    1122             : dr_prediction_z1_HxW_internal_avx2(int H, int W, __m128i *dst,
    1123             :                                    const uint8_t *above, int upsample_above,
    1124             :                                    int dx) {
    1125     6495780 :     const int frac_bits = 6 - upsample_above;
    1126     6495780 :     const int max_base_x = ((W + H) - 1) << upsample_above;
    1127             : 
    1128           0 :     assert(dx > 0);
    1129             :     // pre-filter above pixels
    1130             :     // store in temp buffers:
    1131             :     //   above[x] * 32 + 16
    1132             :     //   above[x+1] - above[x]
     1133             :     // final pixels will be calculated as:
    1134             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1135             :     __m256i a0, a1, a32, a16;
    1136             :     __m256i diff, c3f;
    1137             :     __m128i a_mbase_x;
    1138             : 
    1139     6495780 :     a16 = _mm256_set1_epi16(16);
    1140    12991600 :     a_mbase_x = _mm_set1_epi8(above[max_base_x]);
    1141     6495780 :     c3f = _mm256_set1_epi16(0x3f);
    1142             : 
    1143     6495780 :     int x = dx;
    1144    69768800 :     for (int r = 0; r < W; r++) {
    1145             :         __m256i b, res, shift;
    1146             :         __m128i res1, a0_128, a1_128;
    1147             : 
    1148    63343900 :         int base = x >> frac_bits;
    1149    63343900 :         int base_max_diff = (max_base_x - base) >> upsample_above;
    1150    63343900 :         if (base_max_diff <= 0) {
    1151      207779 :             for (int i = r; i < W; ++i) {
    1152      136862 :                 dst[i] = a_mbase_x; // save 4 values
    1153             :             }
    1154       70917 :             return;
    1155             :         }
    1156    63273000 :         if (base_max_diff > H)
    1157    60098000 :             base_max_diff = H;
    1158    63273000 :         a0_128 = _mm_loadu_si128((__m128i *)(above + base));
    1159    63273000 :         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
    1160             : 
    1161    63273000 :         if (upsample_above) {
    1162    16667600 :             a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
    1163    16667600 :             a1_128 = _mm_srli_si128(a0_128, 8);
    1164             : 
    1165    83337800 :             shift = _mm256_srli_epi16(
    1166             :                 _mm256_and_si256(
    1167             :                     _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
    1168             :                 1);
    1169             :         }
    1170             :         else {
    1171   186422000 :             shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
    1172             :         }
    1173    63273000 :         a0 = _mm256_cvtepu8_epi16(a0_128);
    1174    63273000 :         a1 = _mm256_cvtepu8_epi16(a1_128);
    1175             : 
    1176    63273000 :         diff = _mm256_sub_epi16(a1, a0);  // a[x+1] - a[x]
    1177    63273000 :         a32 = _mm256_slli_epi16(a0, 5);   // a[x] * 32
    1178    63273000 :         a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
    1179             : 
    1180    63273000 :         b = _mm256_mullo_epi16(diff, shift);
    1181    63273000 :         res = _mm256_add_epi16(a32, b);
    1182    63273000 :         res = _mm256_srli_epi16(res, 5);
    1183             : 
    1184   126546000 :         res = _mm256_packus_epi16(
    1185             :             res,
     1186    63273000 :             _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); // narrow to 8 bit
    1187    63273000 :         res1 = _mm256_castsi256_si128(res); // 16 8bit values
    1188             : 
    1189   126546000 :         dst[r] =
    1190    63273000 :             _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
    1191    63273000 :         x += dx;
    1192             :     }
    1193             : }
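
/*
 * Editorial scalar sketch of the per-pixel interpolation vectorized above,
 * using the same 6-bit fixed-point position convention (when upsample_above
 * is set, `above` is the de-interleaved upsampled edge and positions double):
 */
static INLINE uint8_t dr_z1_pixel_sketch(const uint8_t *above, int x,
    int upsample_above)
{
    const int frac_bits = 6 - upsample_above;
    const int base = x >> frac_bits;
    /* 5-bit interpolation weight taken from the fractional position. */
    const int shift = ((x << upsample_above) & 0x3f) >> 1;
    return (uint8_t)((above[base] * 32 + 16 +
                      (above[base + 1] - above[base]) * shift) >> 5);
}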
    1194             : 
    1195     1527060 : static void dr_prediction_z1_4xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1196             :                                       const uint8_t *above,
    1197             :                                       int32_t upsample_above, int32_t dx) {
    1198             :   __m128i dstvec[16];
    1199             :   dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
    1200             : 
    1201    12616300 :   for (int32_t i = 0; i < N; i++) {
    1202    22178400 :     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
    1203             :   }
    1204     1527060 : }
    1205             : 
    1206     1418700 : static void dr_prediction_z1_8xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1207             :     const uint8_t *above, int32_t upsample_above,
    1208             :     int32_t dx) {
    1209             :     __m128i dstvec[32];
    1210             : 
    1211             :     dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
    1212    15590100 :     for (int32_t i = 0; i < N; i++) {
    1213    14171400 :         _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
    1214             :     }
    1215     1418700 : }
    1216             : 
    1217      955254 : static void dr_prediction_z1_16xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1218             :     const uint8_t *above, int32_t upsample_above,
    1219             :     int32_t dx) {
    1220             :     __m128i dstvec[64];
    1221             : 
    1222             :     dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
    1223             : 
    1224    13415700 :     for (int32_t i = 0; i < N; i++) {
    1225    12460400 :         _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    1226             :     }
    1227      955254 : }
    1228             : 
    1229             : static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
    1230             :     int32_t N, __m256i *dstvec, const uint8_t *above, int32_t upsample_above, int32_t dx) {
    1231             :     int32_t x;
    1232             :     // here upsample_above is 0 by design of av1_use_intra_edge_upsample
    1233             :     (void)upsample_above;
    1234      672650 :     const int32_t frac_bits = 6;
    1235      672650 :     const int32_t max_base_x = ((32 + N) - 1);
    1236             : 
    1237             :     // pre-filter above pixels
    1238             :     // store in temp buffers:
    1239             :     //   above[x] * 32 + 16
    1240             :     //   above[x+1] - above[x]
     1241             :     // final pixels will be calculated as:
    1242             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1243             :     __m256i a0, a0_1, a1, a1_1, a32, a16;
    1244             :     __m256i a_mbase_x, diff;
    1245             : 
    1246      672650 :     a16 = _mm256_set1_epi32(16);
    1247      672650 :     a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
    1248             : 
    1249      672650 :     x = dx;
    1250    15670500 :     for (int32_t r = 0; r < N; r++) {
    1251             :         __m256i b, res[2], res16[2];
    1252             : 
    1253    14997900 :         int32_t base = x >> frac_bits;
    1254    14997900 :         int32_t base_max_diff = (max_base_x - base);
    1255    14997900 :         if (base_max_diff <= 0) {
    1256           0 :             for (int32_t i = r; i < N; ++i) {
    1257           0 :                 dstvec[i] = a_mbase_x;  // save 32 values
    1258             :             }
    1259           0 :             return;
    1260             :         }
    1261    14997900 :         if (base_max_diff > 32) base_max_diff = 32;
    1262    44993600 :         __m256i shift = _mm256_srli_epi32(
    1263             :             _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1264             : 
    1265    44984000 :         for (int32_t j = 0, jj = 0; j < 32; j += 16, jj++) {
    1266    29986100 :             int32_t mdiff = base_max_diff - j;
    1267    29986100 :             if (mdiff <= 0) {
    1268        8350 :                 res16[jj] = a_mbase_x;
    1269             :             }
    1270             :             else {
    1271    29977800 :                 a0 = _mm256_cvtepu8_epi32(
    1272    29977800 :                     _mm_loadu_si128((__m128i *)(above + base + j)));
    1273    59955500 :                 a1 = _mm256_cvtepu8_epi32(
    1274    29977800 :                     _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
    1275             : 
    1276    29977800 :                 diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1277    29977800 :                 a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1278    29977800 :                 a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1279    29977800 :                 b = _mm256_mullo_epi32(diff, shift);
    1280             : 
    1281    29977800 :                 res[0] = _mm256_add_epi32(a32, b);
    1282    29977800 :                 res[0] = _mm256_srli_epi32(res[0], 5);
    1283    29977800 :                 res[0] = _mm256_packus_epi32(
    1284             :                     res[0],
    1285    29977800 :                     _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    1286             : 
     1287             :                 // narrow to 8 bit
    1288    29977800 :                 res[0] = _mm256_packus_epi16(res[0], res[0]);
    1289             : 
    1290    29977800 :                 if (mdiff > 8) {
    1291    29923800 :                     a0_1 = _mm256_cvtepu8_epi32(
    1292    29923800 :                         _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
    1293    59847500 :                     a1_1 = _mm256_cvtepu8_epi32(
    1294    29923800 :                         _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
    1295             : 
    1296    29923800 :                     diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
    1297    29923800 :                     a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
    1298    29923800 :                     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    1299    29923800 :                     b = _mm256_mullo_epi32(diff, shift);
    1300             : 
    1301    29923800 :                     res[1] = _mm256_add_epi32(a32, b);
    1302    29923800 :                     res[1] = _mm256_srli_epi32(res[1], 5);
    1303    29923800 :                     res[1] = _mm256_packus_epi32(
    1304             :                         res[1],
    1305    29923800 :                         _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
     1306    59847500 :                     res[1] = _mm256_packus_epi16(res[1], res[1]);  // narrow to 8 bit
     1307             : 
    1308             :                 }
    1309             :                 else {
    1310       54005 :                     res[1] = a_mbase_x;
    1311             :                 }
    1312    59955500 :                 res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]);  // 16 8bit values
    1313             :             }
    1314             :         }
    1315    14997900 :         res16[1] =
    1316    14997900 :             _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
    1317             :             1);  // 32 8bit values
    1318             : 
    1319    29995700 :         dstvec[r] = _mm256_blendv_epi8(
    1320             :             a_mbase_x, res16[1],
    1321    14997900 :             *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
    1322    14997900 :         x += dx;
    1323             :     }
    1324             : }
    1325             : 
    1326      382076 : static void dr_prediction_z1_32xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1327             :     const uint8_t *above, int32_t upsample_above,
    1328             :     int32_t dx) {
    1329             :     __m256i dstvec[64];
    1330             :     dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
    1331     9355190 :     for (int32_t i = 0; i < N; i++) {
    1332     8973110 :         _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
    1333             :     }
    1334      382076 : }
    1335             : 
    1336      139904 : static void dr_prediction_z1_64xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1337             :     const uint8_t *above, int32_t upsample_above,
    1338             :     int32_t dx) {
    1339             :     int32_t x;
    1340             : 
    1341             :     // here upsample_above is 0 by design of av1_use_intra_edge_upsample
    1342             :     (void)upsample_above;
    1343      139904 :     const int32_t frac_bits = 6;
    1344      139904 :     const int32_t max_base_x = ((64 + N) - 1);
    1345             : 
    1346             :     // pre-filter above pixels
    1347             :     // store in temp buffers:
    1348             :     //   above[x] * 32 + 16
    1349             :     //   above[x+1] - above[x]
     1350             :     // final pixels will be calculated as:
    1351             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1352             :     __m256i a0, a0_1, a1, a1_1, a32, a16;
    1353             :     __m256i a_mbase_x, diff;
    1354             :     __m128i max_base_x128, base_inc128, mask128;
    1355             : 
    1356      139904 :     a16 = _mm256_set1_epi32(16);
    1357      139904 :     a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
    1358      139904 :     max_base_x128 = _mm_set1_epi8(max_base_x);
    1359             : 
    1360      139904 :     x = dx;
    1361     5701190 :     for (int32_t r = 0; r < N; r++, dst += stride) {
    1362             :         __m256i b, res[2];
    1363             :         __m128i res1;
    1364             : 
    1365     5561290 :         int32_t base = x >> frac_bits;
    1366     5561290 :         if (base >= max_base_x) {
    1367           0 :             for (int32_t i = r; i < N; ++i) {
    1368             :                 _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
    1369           0 :                 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
    1370           0 :                 dst += stride;
    1371             :             }
    1372           0 :             return;
    1373             :         }
    1374             : 
    1375    16683900 :         __m256i shift = _mm256_srli_epi32(
    1376             :             _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1377             : 
    1378             :         __m128i a0_128, a0_1_128, a1_128, a1_1_128;
    1379    27779000 :         for (int32_t j = 0; j < 64; j += 16) {
    1380    22217700 :             int32_t mdif = max_base_x - (base + j);
    1381    22217700 :             if (mdif <= 0) {
    1382       15879 :                 _mm_storeu_si128((__m128i *)(dst + j),
    1383             :                     _mm256_castsi256_si128(a_mbase_x));
    1384             :             }
    1385             :             else {
    1386    22201800 :                 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
    1387    44403600 :                 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
    1388    22201800 :                 a0 = _mm256_cvtepu8_epi32(a0_128);
    1389    22201800 :                 a1 = _mm256_cvtepu8_epi32(a1_128);
    1390             : 
    1391    22201800 :                 diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1392    22201800 :                 a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1393    22201800 :                 a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1394    22201800 :                 b = _mm256_mullo_epi32(diff, shift);
    1395             : 
    1396    22201800 :                 res[0] = _mm256_add_epi32(a32, b);
    1397    22201800 :                 res[0] = _mm256_srli_epi32(res[0], 5);
    1398    22201800 :                 res[0] = _mm256_packus_epi32(
    1399             :                     res[0],
    1400    22201800 :                     _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
     1401             :                 // narrow to 8 bit
    1402    22201800 :                 res[0] = _mm256_packus_epi16(res[0], res[0]);
    1403             : 
    1404    22201800 :                 if (mdif > 8) {
    1405    22165400 :                     a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
    1406    44330700 :                     a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
    1407    22165400 :                     a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
    1408    22165400 :                     a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
    1409             : 
    1410    22165400 :                     diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
    1411    22165400 :                     a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
    1412    22165400 :                     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    1413    22165400 :                     b = _mm256_mullo_epi32(diff, shift);
    1414             : 
    1415    22165400 :                     res[1] = _mm256_add_epi32(a32, b);
    1416    22165400 :                     res[1] = _mm256_srli_epi32(res[1], 5);
    1417    22165400 :                     res[1] = _mm256_packus_epi32(
    1418             :                         res[1],
    1419    22165400 :                         _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
    1420    44330700 :                     res[1] = _mm256_packus_epi16(res[1], res[1]);
    1421             : 
    1422             :                 }
    1423             :                 else {
    1424       36437 :                     res[1] = a_mbase_x;
    1425             :                 }
    1426    66605400 :                 res1 = _mm_unpacklo_epi64(
    1427             :                     _mm256_castsi256_si128(res[0]),
    1428             :                     _mm256_castsi256_si128(res[1]));  // 16 8bit values
    1429             : 
    1430    22201800 :                 base_inc128 = _mm_setr_epi8(
    1431    22201800 :                     base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
    1432    22201800 :                     base + j + 5, base + j + 6, base + j + 7, base + j + 8,
    1433    22201800 :                     base + j + 9, base + j + 10, base + j + 11, base + j + 12,
    1434    22201800 :                     base + j + 13, base + j + 14, base + j + 15);
    1435             : 
    1436    66605400 :                 mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
    1437             :                     _mm_setzero_si128());
    1438             :                 res1 =
    1439    22201800 :                     _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
    1440    22201800 :                 _mm_storeu_si128((__m128i *)(dst + j), res1);
    1441             :             }
    1442             :         }
    1443     5561290 :         x += dx;
    1444             :     }
    1445             : }
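
/*
 * Editorial note on the tail handling above: for 64-wide rows, base + j can
 * run past the 32-byte span covered by BaseMask, so the mask is built at run
 * time instead. The unsigned saturating subtract clamps max_base_x - index
 * to zero before the signed byte compare, which keeps the test correct even
 * when base + j + 15 exceeds 127 and would wrap a direct signed comparison.
 * One mask lane, as a scalar sketch:
 */
static INLINE uint8_t z1_tail_mask_lane_sketch(int max_base_x, int index)
{
    /* 0xff while the source index lies left of max_base_x, else 0. */
    return (uint8_t)(index < max_base_x ? 0xff : 0x00);
}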
    1446             : 
    1447             : // Directional prediction, zone 1: 0 < angle < 90
    1448     4343370 : void eb_av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
    1449             :     const uint8_t *above, const uint8_t *left,
    1450             :     int32_t upsample_above, int32_t dx, int32_t dy) {
    1451             :     (void)left;
    1452             :     (void)dy;
    1453     4343370 :     switch (bw) {
    1454     1527050 :     case 4:
    1455     1527050 :         dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
    1456     1527030 :         break;
    1457     1418690 :     case 8:
    1458     1418690 :         dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
    1459     1418640 :         break;
    1460      955247 :     case 16:
    1461      955247 :         dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
    1462      955227 :         break;
    1463      371409 :     case 32:
    1464      371409 :         dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
    1465      371402 :         break;
    1466       71499 :     case 64:
    1467       71499 :         dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
    1468       71499 :         break;
    1469           0 :     default: break;
    1470             :     }
    1471     4343280 :     return;
    1472             : }
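
/*
 * Editorial note on the zone 1 geometry: row r samples the above row at the
 * 6-bit fixed-point position (r + 1) * dx, i.e. at pixel offset
 * (r + 1) * dx / 64, which is why `left` and `dy` go unused here. For
 * example, with dx == 64 (a 45-degree diagonal) the interpolation weight is
 * zero on every row, so row 0 copies above[1..] and row 1 copies above[2..].
 */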
    1473             : 
    1474             : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
    1475             :     int32_t N, __m128i *dst, const uint16_t *above, int32_t upsample_above, int32_t dx) {
    1476           0 :     const int32_t frac_bits = 6 - upsample_above;
    1477           0 :     const int32_t max_base_x = ((N + 4) - 1) << upsample_above;
    1478             :     int32_t x;
     1479             :     // assert(dx > 0);
    1480             :     // pre-filter above pixels
    1481             :     // store in temp buffers:
    1482             :     //   above[x] * 32 + 16
    1483             :     //   above[x+1] - above[x]
     1484             :     // final pixels will be calculated as:
    1485             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1486             :     __m256i a0, a1, a32, a16;
    1487             :     __m256i diff;
    1488             :     __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
    1489             : 
    1490           0 :     a16 = _mm256_set1_epi32(16);
    1491           0 :     a_mbase_x = _mm_set1_epi16(above[max_base_x]);
    1492           0 :     max_base_x128 = _mm_set1_epi32(max_base_x);
    1493             : 
    1494           0 :     x = dx;
    1495           0 :     for (int32_t r = 0; r < N; r++) {
    1496             :         __m256i b, res, shift;
    1497             :         __m128i res1;
    1498             : 
    1499           0 :         int32_t base = x >> frac_bits;
    1500           0 :         if (base >= max_base_x) {
    1501           0 :             for (int32_t i = r; i < N; ++i) {
    1502           0 :                 dst[i] = a_mbase_x;  // save 4 values
    1503             :             }
    1504           0 :             return;
    1505             :         }
    1506             : 
    1507           0 :         a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    1508           0 :         a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
    1509             : 
    1510           0 :         if (upsample_above) {
    1511           0 :             a0 = _mm256_permutevar8x32_epi32(
    1512             :                 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
    1513           0 :             a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
    1514           0 :             base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
    1515           0 :             shift = _mm256_srli_epi32(
    1516             :                 _mm256_and_si256(
    1517             :                 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
    1518             :                 _mm256_set1_epi32(0x3f)),
    1519             :                 1);
    1520             :         }
    1521             :         else {
    1522           0 :             base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
    1523           0 :             shift = _mm256_srli_epi32(
    1524             :                 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1525             :         }
    1526             : 
    1527           0 :         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1528           0 :         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1529           0 :         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1530             : 
    1531           0 :         b = _mm256_mullo_epi32(diff, shift);
    1532           0 :         res = _mm256_add_epi32(a32, b);
    1533           0 :         res = _mm256_srli_epi32(res, 5);
    1534             : 
    1535           0 :         res1 = _mm256_castsi256_si128(res);
    1536           0 :         res1 = _mm_packus_epi32(res1, res1);
    1537             : 
    1538           0 :         mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
     1539           0 :         mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
    1540           0 :         dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    1541           0 :         x += dx;
    1542             :     }
    1543             : }
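
/*
 * Editorial note: the high bit-depth paths compute in 32-bit lanes because
 * the intermediate above[x] * 32 + 16 + diff * shift no longer fits in 16
 * bits. Worked bound: with 12-bit samples, above[x] * 32 alone reaches
 * 4095 * 32 = 131040 > 65535, whereas the 8-bit path peaks at
 * 255 * 32 + 16 = 8176 plus a diff * shift term bounded by 255 * 31 = 7905,
 * so it stays within 16-bit lanes.
 */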
    1544             : 
    1545           0 : static void highbd_dr_prediction_z1_4xN_avx2(int32_t N, uint16_t *dst,
    1546             :     ptrdiff_t stride,
    1547             :     const uint16_t *above,
    1548             :     int32_t upsample_above, int32_t dx) {
    1549             :     __m128i dstvec[16];
    1550             : 
    1551             :     highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
    1552             :         dx);
    1553           0 :     for (int32_t i = 0; i < N; i++) {
    1554           0 :         _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
    1555             :     }
    1556           0 : }
    1557             : 
    1558             : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
    1559             :     int32_t N, __m128i *dst, const uint16_t *above, int32_t upsample_above, int32_t dx) {
    1560           0 :     const int32_t frac_bits = 6 - upsample_above;
    1561           0 :     const int32_t max_base_x = ((8 + N) - 1) << upsample_above;
    1562             : 
    1563             :     int32_t x;
     1564             :     // assert(dx > 0);
    1565             :     // pre-filter above pixels
    1566             :     // store in temp buffers:
    1567             :     //   above[x] * 32 + 16
    1568             :     //   above[x+1] - above[x]
     1569             :     // final pixels will be calculated as:
    1570             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1571             :     __m256i a0, a1, a0_1, a1_1, a32, a16;
    1572             :     __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
    1573             : 
    1574           0 :     a16 = _mm256_set1_epi32(16);
    1575           0 :     a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
    1576           0 :     max_base_x256 = _mm256_set1_epi32(max_base_x);
    1577             : 
    1578           0 :     x = dx;
    1579           0 :     for (int32_t r = 0; r < N; r++) {
    1580             :         __m256i b, res, res1, shift;
    1581             : 
    1582           0 :         int32_t base = x >> frac_bits;
    1583           0 :         if (base >= max_base_x) {
    1584           0 :             for (int32_t i = r; i < N; ++i) {
    1585           0 :                 dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
    1586             :             }
    1587           0 :             return;
    1588             :         }
    1589             : 
    1590           0 :         a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    1591           0 :         a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
    1592             : 
    1593           0 :         if (upsample_above) {
    1594           0 :             a0 = _mm256_permutevar8x32_epi32(
    1595             :                 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
    1596           0 :             a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
    1597             : 
    1598             :             a0_1 =
    1599           0 :                 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
    1600           0 :             a0_1 = _mm256_permutevar8x32_epi32(
    1601             :                 a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
    1602           0 :             a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
    1603             : 
    1604           0 :             a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
    1605           0 :             a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
    1606             :             base_inc256 =
    1607           0 :                 _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
    1608             :                 base + 10, base + 12, base + 14);
    1609           0 :             shift = _mm256_srli_epi32(
    1610             :                 _mm256_and_si256(
    1611             :                 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
    1612             :                 _mm256_set1_epi32(0x3f)),
    1613             :                 1);
    1614             :         }
    1615             :         else {
    1616           0 :             base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
    1617             :                 base + 4, base + 5, base + 6, base + 7);
    1618           0 :             shift = _mm256_srli_epi32(
    1619             :                 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1620             :         }
    1621             : 
    1622           0 :         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1623           0 :         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1624           0 :         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1625             : 
    1626           0 :         b = _mm256_mullo_epi32(diff, shift);
    1627           0 :         res = _mm256_add_epi32(a32, b);
    1628           0 :         res = _mm256_srli_epi32(res, 5);
    1629             : 
    1630           0 :         res1 = _mm256_packus_epi32(
    1631           0 :             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
    1632             : 
    1633           0 :         mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
    1634           0 :         mask256 = _mm256_packs_epi32(
    1635             :             mask256, _mm256_castsi128_si256(
     1636           0 :             _mm256_extracti128_si256(mask256, 1)));  // narrow to 16 bit
    1637           0 :         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    1638           0 :         dst[r] = _mm256_castsi256_si128(res1);
    1639           0 :         x += dx;
    1640             :     }
    1641             : }
    1642             : 
    1643           0 : static void highbd_dr_prediction_z1_8xN_avx2(int32_t N, uint16_t *dst,
    1644             :     ptrdiff_t stride,
    1645             :     const uint16_t *above,
    1646             :     int32_t upsample_above, int32_t dx) {
    1647             :     __m128i dstvec[32];
    1648             : 
    1649             :     highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
    1650             :         dx);
    1651           0 :     for (int32_t i = 0; i < N; i++) {
    1652           0 :         _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    1653             :     }
    1654           0 : }
    1655             : 
    1656             : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
    1657             :     int32_t N, __m256i *dstvec, const uint16_t *above, int32_t upsample_above, int32_t dx) {
    1658             :     int32_t x;
    1659             :     // here upsample_above is 0 by design of av1_use_intra_edge_upsample
    1660             :     (void)upsample_above;
    1661           0 :     const int32_t frac_bits = 6;
    1662           0 :     const int32_t max_base_x = ((16 + N) - 1);
    1663             : 
    1664             :     // pre-filter above pixels
    1665             :     // store in temp buffers:
    1666             :     //   above[x] * 32 + 16
    1667             :     //   above[x+1] - above[x]
     1668             :     // final pixels will be calculated as:
    1669             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1670             :     __m256i a0, a0_1, a1, a1_1, a32, a16;
    1671             :     __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
    1672             : 
    1673           0 :     a16 = _mm256_set1_epi32(16);
    1674           0 :     a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
    1675           0 :     max_base_x256 = _mm256_set1_epi16(max_base_x);
    1676             : 
    1677           0 :     x = dx;
    1678           0 :     for (int32_t r = 0; r < N; r++) {
    1679             :         __m256i b, res[2], res1;
    1680             : 
    1681           0 :         int32_t base = x >> frac_bits;
    1682           0 :         if (base >= max_base_x) {
    1683           0 :             for (int32_t i = r; i < N; ++i) {
    1684           0 :                 dstvec[i] = a_mbase_x;  // save 16 values
    1685             :             }
    1686           0 :             return;
    1687             :         }
    1688           0 :         __m256i shift = _mm256_srli_epi32(
    1689             :             _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1690             : 
    1691           0 :         a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    1692           0 :         a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
    1693             : 
    1694           0 :         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1695           0 :         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1696           0 :         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1697           0 :         b = _mm256_mullo_epi32(diff, shift);
    1698             : 
    1699           0 :         res[0] = _mm256_add_epi32(a32, b);
    1700           0 :         res[0] = _mm256_srli_epi32(res[0], 5);
    1701           0 :         res[0] = _mm256_packus_epi32(
    1702           0 :             res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    1703             : 
    1704           0 :         int32_t mdif = max_base_x - base;
    1705           0 :         if (mdif > 8) {
    1706             :             a0_1 =
    1707           0 :                 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
    1708             :             a1_1 =
    1709           0 :                 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
    1710             : 
    1711           0 :             diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
    1712           0 :             a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
    1713           0 :             a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    1714           0 :             b = _mm256_mullo_epi32(diff, shift);
    1715             : 
    1716           0 :             res[1] = _mm256_add_epi32(a32, b);
    1717           0 :             res[1] = _mm256_srli_epi32(res[1], 5);
    1718           0 :             res[1] = _mm256_packus_epi32(
    1719           0 :                 res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
    1720             :         }
    1721             :         else {
    1722           0 :             res[1] = a_mbase_x;
    1723             :         }
    1724           0 :         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
    1725             :             1);  // 16 16bit values
    1726             : 
    1727           0 :         base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
    1728           0 :             base + 4, base + 5, base + 6, base + 7,
    1729           0 :             base + 8, base + 9, base + 10, base + 11,
    1730           0 :             base + 12, base + 13, base + 14, base + 15);
    1731           0 :         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
    1732           0 :         dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    1733           0 :         x += dx;
    1734             :     }
    1735             : }
    1736             : 
    1737           0 : static void highbd_dr_prediction_z1_16xN_avx2(int32_t N, uint16_t *dst,
    1738             :     ptrdiff_t stride,
    1739             :     const uint16_t *above,
    1740             :     int32_t upsample_above, int32_t dx) {
    1741             :     __m256i dstvec[64];
    1742             :     highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
    1743             :         dx);
    1744           0 :     for (int32_t i = 0; i < N; i++) {
    1745           0 :         _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
    1746             :     }
    1747           0 : }
    1748             : 
    1749             : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
    1750             :     int32_t N, __m256i *dstvec, const uint16_t *above, int32_t upsample_above, int32_t dx) {
    1751             :     int32_t x;
    1752             :     // here upsample_above is 0 by design of av1_use_intra_edge_upsample
    1753             :     (void)upsample_above;
    1754           0 :     const int32_t frac_bits = 6;
    1755           0 :     const int32_t max_base_x = ((32 + N) - 1);
    1756             : 
    1757             :     // pre-filter above pixels
    1758             :     // store in temp buffers:
    1759             :     //   above[x] * 32 + 16
    1760             :     //   above[x+1] - above[x]
     1761             :     // final pixels will be calculated as:
    1762             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1763             :     __m256i a0, a0_1, a1, a1_1, a32, a16;
    1764             :     __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
    1765             : 
    1766           0 :     a16 = _mm256_set1_epi32(16);
    1767           0 :     a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
    1768           0 :     max_base_x256 = _mm256_set1_epi16(max_base_x);
    1769             : 
    1770           0 :     x = dx;
    1771           0 :     for (int32_t r = 0; r < N; r++) {
    1772             :         __m256i b, res[2], res1;
    1773             : 
    1774           0 :         int32_t base = x >> frac_bits;
    1775           0 :         if (base >= max_base_x) {
    1776           0 :             for (int32_t i = r; i < N; ++i) {
    1777           0 :                 dstvec[i] = a_mbase_x;  // save 32 values
    1778           0 :                 dstvec[i + N] = a_mbase_x;
    1779             :             }
    1780           0 :             return;
    1781             :         }
    1782             : 
    1783           0 :         __m256i shift = _mm256_srli_epi32(
    1784             :             _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1785             : 
    1786           0 :         for (int32_t j = 0; j < 32; j += 16) {
    1787           0 :             int32_t mdif = max_base_x - (base + j);
    1788           0 :             if (mdif <= 0) {
    1789           0 :                 res1 = a_mbase_x;
    1790             :             }
    1791             :             else {
    1792           0 :                 a0 = _mm256_cvtepu16_epi32(
    1793           0 :                     _mm_loadu_si128((__m128i *)(above + base + j)));
    1794           0 :                 a1 = _mm256_cvtepu16_epi32(
    1795           0 :                     _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
    1796             : 
    1797           0 :                 diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1798           0 :                 a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1799           0 :                 a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1800           0 :                 b = _mm256_mullo_epi32(diff, shift);
    1801             : 
    1802           0 :                 res[0] = _mm256_add_epi32(a32, b);
    1803           0 :                 res[0] = _mm256_srli_epi32(res[0], 5);
    1804           0 :                 res[0] = _mm256_packus_epi32(
    1805             :                     res[0],
    1806           0 :                     _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    1807           0 :                 if (mdif > 8) {
    1808           0 :                     a0_1 = _mm256_cvtepu16_epi32(
    1809           0 :                         _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
    1810           0 :                     a1_1 = _mm256_cvtepu16_epi32(
    1811           0 :                         _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
    1812             : 
    1813           0 :                     diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
    1814           0 :                     a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
    1815           0 :                     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    1816           0 :                     b = _mm256_mullo_epi32(diff, shift);
    1817             : 
    1818           0 :                     res[1] = _mm256_add_epi32(a32, b);
    1819           0 :                     res[1] = _mm256_srli_epi32(res[1], 5);
    1820           0 :                     res[1] = _mm256_packus_epi32(
    1821             :                         res[1],
    1822           0 :                         _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
    1823             :                 }
    1824             :                 else {
    1825           0 :                     res[1] = a_mbase_x;
    1826             :                 }
    1827           0 :                 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
    1828             :                     1);  // 16 16bit values
    1829           0 :                 base_inc256 = _mm256_setr_epi16(
    1830           0 :                     base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
    1831           0 :                     base + j + 5, base + j + 6, base + j + 7, base + j + 8,
    1832           0 :                     base + j + 9, base + j + 10, base + j + 11, base + j + 12,
    1833           0 :                     base + j + 13, base + j + 14, base + j + 15);
    1834             : 
    1835           0 :                 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
    1836           0 :                 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    1837             :             }
    1838           0 :             if (!j)
    1839           0 :                 dstvec[r] = res1;
    1840             :             else
    1841           0 :                 dstvec[r + N] = res1;
    1842             :         }
    1843           0 :         x += dx;
    1844             :     }
    1845             : }
    1846             : 
    1847           0 : static void highbd_dr_prediction_z1_32xN_avx2(int32_t N, uint16_t *dst,
    1848             :     ptrdiff_t stride,
    1849             :     const uint16_t *above,
    1850             :     int32_t upsample_above, int32_t dx) {
    1851             :     __m256i dstvec[128];
    1852             : 
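                     :     // dstvec[0..N-1] holds the left 16 pixels of each row and
                     :     // dstvec[N..2N-1] the right 16, as filled by the internal helper.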
    1853             :     highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
    1854             :         dx);
    1855           0 :     for (int32_t i = 0; i < N; i++) {
    1856           0 :         _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
    1857           0 :         _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
    1858             :     }
    1859           0 : }
    1860             : 
    1861           0 : static void highbd_dr_prediction_z1_64xN_avx2(int32_t N, uint16_t *dst,
    1862             :     ptrdiff_t stride,
    1863             :     const uint16_t *above,
    1864             :     int32_t upsample_above, int32_t dx) {
    1865             :     int32_t x;
    1866             : 
    1867             :     // here upsample_above is 0 by design of av1_use_intra_edge_upsample
    1868             :     (void)upsample_above;
    1869           0 :     const int32_t frac_bits = 6;
    1870           0 :     const int32_t max_base_x = ((64 + N) - 1);
    1871             : 
    1872             :     // interpolate above pixels in-loop,
    1873             :     // combining per pixel:
    1874             :     //   above[x] * 32 + 16
    1875             :     //   above[x+1] - above[x]
    1876             :     // final pixels will be calculated as (scalar sketch follows this function):
    1877             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    1878             :     __m256i a0, a0_1, a1, a1_1, a32, a16;
    1879             :     __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
    1880             : 
    1881           0 :     a16 = _mm256_set1_epi32(16);
    1882           0 :     a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
    1883           0 :     max_base_x256 = _mm256_set1_epi16(max_base_x);
    1884             : 
    1885           0 :     x = dx;
    1886           0 :     for (int32_t r = 0; r < N; r++, dst += stride) {
    1887             :         __m256i b, res[2], res1;
    1888             : 
    1889           0 :         int32_t base = x >> frac_bits;
    1890           0 :         if (base >= max_base_x) {
    1891           0 :             for (int32_t i = r; i < N; ++i) {
    1892             :                 _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
    1893           0 :                 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
    1894           0 :                 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
    1895           0 :                 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
    1896           0 :                 dst += stride;
    1897             :             }
    1898           0 :             return;
    1899             :         }
    1900             : 
    1901           0 :         __m256i shift = _mm256_srli_epi32(
    1902             :             _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    1903             : 
    1904             :         __m128i a0_128, a0_1_128, a1_128, a1_1_128;
    1905           0 :         for (int32_t j = 0; j < 64; j += 16) {
    1906           0 :             int32_t mdif = max_base_x - (base + j);
    1907           0 :             if (mdif <= 0) {
    1908           0 :                 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
    1909             :             }
    1910             :             else {
    1911           0 :                 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
    1912           0 :                 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
    1913           0 :                 a0 = _mm256_cvtepu16_epi32(a0_128);
    1914           0 :                 a1 = _mm256_cvtepu16_epi32(a1_128);
    1915             : 
    1916           0 :                 diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    1917           0 :                 a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    1918           0 :                 a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    1919           0 :                 b = _mm256_mullo_epi32(diff, shift);
    1920             : 
    1921           0 :                 res[0] = _mm256_add_epi32(a32, b);
    1922           0 :                 res[0] = _mm256_srli_epi32(res[0], 5);
    1923           0 :                 res[0] = _mm256_packus_epi32(
    1924             :                     res[0],
    1925           0 :                     _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    1926           0 :                 if (mdif > 8) {
    1927           0 :                     a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
    1928           0 :                     a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
    1929           0 :                     a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
    1930           0 :                     a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
    1931             : 
    1932           0 :                     diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
    1933           0 :                     a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
    1934           0 :                     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    1935           0 :                     b = _mm256_mullo_epi32(diff, shift);
    1936             : 
    1937           0 :                     res[1] = _mm256_add_epi32(a32, b);
    1938           0 :                     res[1] = _mm256_srli_epi32(res[1], 5);
    1939           0 :                     res[1] = _mm256_packus_epi32(
    1940             :                         res[1],
    1941           0 :                         _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
    1942             :                 }
    1943             :                 else {
    1944           0 :                     res[1] = a_mbase_x;
    1945             :                 }
    1946           0 :                 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
    1947             :                     1);  // 16 16-bit values
    1948           0 :                 base_inc256 = _mm256_setr_epi16(
    1949           0 :                     base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
    1950           0 :                     base + j + 5, base + j + 6, base + j + 7, base + j + 8,
    1951           0 :                     base + j + 9, base + j + 10, base + j + 11, base + j + 12,
    1952           0 :                     base + j + 13, base + j + 14, base + j + 15);
    1953             : 
    1954           0 :                 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
    1955           0 :                 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    1956           0 :                 _mm256_storeu_si256((__m256i *)(dst + j), res1);
    1957             :             }
    1958             :         }
    1959           0 :         x += dx;
    1960             :     }
    1961             : }
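                     : 
                     : // Editor's sketch (not part of the coverage run): a minimal scalar
                     : // reference for the zone-1 interpolation the kernels above vectorize,
                     : // assuming upsample_above == 0. dr_z1_highbd_scalar is an illustrative
                     : // name, not a library function.
                     : static void dr_z1_highbd_scalar(uint16_t *dst, ptrdiff_t stride, int32_t bw,
                     :     int32_t bh, const uint16_t *above, int32_t max_base_x, int32_t dx) {
                     :     int32_t x = dx;
                     :     for (int32_t r = 0; r < bh; r++, x += dx, dst += stride) {
                     :         const int32_t base = x >> 6;            // integer sample position
                     :         const int32_t shift = (x & 0x3f) >> 1;  // 5-bit fractional weight
                     :         for (int32_t c = 0; c < bw; c++) {
                     :             if (base + c >= max_base_x) {  // clamp to the last edge sample
                     :                 dst[c] = above[max_base_x];
                     :                 continue;
                     :             }
                     :             dst[c] = (uint16_t)((above[base + c] * 32 + 16 +
                     :                 (above[base + c + 1] - above[base + c]) * shift) >> 5);
                     :         }
                     :     }
                     : }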
    1962             : 
    1963             : // Directional prediction, zone 1: 0 < angle < 90
    1964           0 : void eb_av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
    1965             :     int32_t bh, const uint16_t *above,
    1966             :     const uint16_t *left, int32_t upsample_above,
    1967             :     int32_t dx, int32_t dy, int32_t bd) {
    1968             :     (void)left;
    1969             :     (void)dy;
    1970             :     (void)bd;
    1971             : 
    1972           0 :     switch (bw) {
    1973           0 :     case 4:
    1974           0 :         highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
    1975             :             dx);
    1976           0 :         break;
    1977           0 :     case 8:
    1978           0 :         highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
    1979             :             dx);
    1980           0 :         break;
    1981           0 :     case 16:
    1982           0 :         highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
    1983             :             dx);
    1984           0 :         break;
    1985           0 :     case 32:
    1986           0 :         highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
    1987             :             dx);
    1988           0 :         break;
    1989           0 :     case 64:
    1990           0 :         highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
    1991             :             dx);
    1992           0 :         break;
    1993           0 :     default: break;
    1994             :     }
    1995           0 :     return;
    1996             : }
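                     : 
                     : // Note: the dispatcher selects a kernel by block width only; bh is
                     : // forwarded as N, and left, dy and bd are unused in zone 1.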
    1997             : 
    1998     2524300 : static void dr_prediction_z2_Nx4_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    1999             :     const uint8_t *above, const uint8_t *left,
    2000             :     int32_t upsample_above, int32_t upsample_left,
    2001             :     int32_t dx, int32_t dy) {
    2002     2524300 :     const int32_t min_base_x = -(1 << upsample_above);
    2003     2524300 :     const int32_t min_base_y = -(1 << upsample_left);
    2004     2524300 :     const int32_t frac_bits_x = 6 - upsample_above;
    2005     2524300 :     const int32_t frac_bits_y = 6 - upsample_left;
    2006             : 
    2007     2524300 :   assert(dx > 0);
    2008             :     // interpolate above pixels in-loop,
    2009             :     // combining per pixel:
    2010             :     //   above[x] * 32 + 16
    2011             :     //   above[x+1] - above[x]
    2012             :     // final pixels will be calculated as:
    2013             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    2014             :   __m128i a0_x, a1_x, a32, a16, diff;
    2015             :   __m128i c3f, min_base_y128, c1234, dy128;
    2016             : 
    2017     2524300 :   a16 = _mm_set1_epi16(16);
    2018     2524300 :   c3f = _mm_set1_epi16(0x3f);
    2019     5048600 :   min_base_y128 = _mm_set1_epi16(min_base_y);
    2020     2524300 :   c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
    2021     2524300 :   dy128 = _mm_set1_epi16(dy);
    2022             : 
    2023    21333700 :   for (int r = 0; r < N; r++) {
    2024             :     __m128i b, res, shift, r6, ydx;
    2025             :     __m128i resx, resy, resxy;
    2026             :     __m128i a0_x128, a1_x128;
    2027    18809400 :     int y = r + 1;
    2028    18809400 :     int base_x = (-y * dx) >> frac_bits_x;
    2029    18809400 :     int base_shift = 0;
    2030    18809400 :     if (base_x < (min_base_x - 1)) {
    2031    13511300 :       base_shift = (min_base_x - base_x - 1) >> upsample_above;
    2032             :     }
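                     :     // base_shift: leading pixels whose projection falls left of above[]
                     :     // (the shuffle masks below realign the load); base_min_diff: pixels
                     :     // that must come from the left (y) path instead, selected at the end
                     :     // of the row via BaseMask.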
    2033    18809400 :     int base_min_diff =
    2034    18809400 :         (min_base_x - base_x + upsample_above) >> upsample_above;
    2035    18809400 :     if (base_min_diff > 4) {
    2036     8472380 :       base_min_diff = 4;
    2037             :     } else {
    2038    10337000 :       if (base_min_diff < 0)
    2039           0 :         base_min_diff = 0;
    2040             :     }
    2041             : 
    2042    18809400 :     if (base_shift > 3) {
    2043     8472300 :       a0_x = _mm_setzero_si128();
    2044     8472300 :       a1_x = _mm_setzero_si128();
    2045     8472300 :       shift = _mm_setzero_si128();
    2046             :     } else {
    2047    10337100 :       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
    2048    20674200 :       ydx = _mm_set1_epi16(y * dx);
    2049    10337100 :       r6 = _mm_slli_epi16(c1234, 6);
    2050             : 
    2051    10337100 :       if (upsample_above) {
    2052             :         a0_x128 =
    2053     4306800 :             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
    2054     4306800 :         a1_x128 = _mm_srli_si128(a0_x128, 8);
    2055             : 
    2056    17227200 :         shift = _mm_srli_epi16(
    2057             :             _mm_and_si128(
    2058             :                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
    2059             :             1);
    2060             :       } else {
    2061     6030320 :         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
    2062     6030320 :         a1_x128 = _mm_srli_si128(a0_x128, 1);
    2063             : 
    2064    18091000 :         shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
    2065             :       }
    2066    10337100 :       a0_x = _mm_cvtepu8_epi16(a0_x128);
    2067    10337100 :       a1_x = _mm_cvtepu8_epi16(a1_x128);
    2068             :     }
    2069             :     // y calc
    2070             :     __m128i a0_y, a1_y, shifty;
    2071    18809400 :     if (base_x < min_base_x) {
    2072             :       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
    2073             :       __m128i y_c128, base_y_c128, mask128, c1234_;
    2074    15534400 :       c1234_ = _mm_srli_si128(c1234, 2);
    2075    31068900 :       r6 = _mm_set1_epi16(r << 6);
    2076    31068900 :       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
    2077    15534400 :       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
    2078    15534400 :       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
    2079    15534400 :       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
    2080             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    2081             : 
    2082    15534400 :       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    2083    15534400 :                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
    2084    31068900 :       base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
    2085             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    2086    31068900 :       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    2087    15534400 :                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
    2088             : 
    2089    15534400 :       if (upsample_left) {
    2090    13769000 :         shifty = _mm_srli_epi16(
    2091             :             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
    2092             :       } else {
    2093    21889500 :         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
    2094             :       }
    2095    15534400 :       a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
    2096    15534400 :       a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
    2097    15534400 :       shift = _mm_unpacklo_epi64(shift, shifty);
    2098             :     }
    2099             : 
    2100    18809400 :     diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    2101    18809400 :     a32 = _mm_slli_epi16(a0_x, 5);    // a[x] * 32
    2102    18809400 :     a32 = _mm_add_epi16(a32, a16);    // a[x] * 32 + 16
    2103             : 
    2104    18809400 :     b = _mm_mullo_epi16(diff, shift);
    2105    18809400 :     res = _mm_add_epi16(a32, b);
    2106    18809400 :     res = _mm_srli_epi16(res, 5);
    2107             : 
    2108    18809400 :     resx = _mm_packus_epi16(res, res);
    2109    18809400 :     resy = _mm_srli_si128(resx, 4);
    2110             : 
    2111    37618800 :     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    2112    18809400 :     *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
    2113    18809400 :     dst += stride;
    2114             :   }
    2115     2524300 : }
    2116             : 
    2117     2423490 : static void dr_prediction_z2_Nx8_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
    2118             :     const uint8_t *above, const uint8_t *left,
    2119             :     int32_t upsample_above, int32_t upsample_left,
    2120             :     int32_t dx, int32_t dy) {
    2121     2423490 :     const int32_t min_base_x = -(1 << upsample_above);
    2122     2423490 :     const int32_t min_base_y = -(1 << upsample_left);
    2123     2423490 :     const int32_t frac_bits_x = 6 - upsample_above;
    2124     2423490 :     const int32_t frac_bits_y = 6 - upsample_left;
    2125             : 
    2126             :     // interpolate above pixels in-loop,
    2127             :     // combining per pixel:
    2128             :     //   above[x] * 32 + 16
    2129             :     //   above[x+1] - above[x]
    2130             :     // final pixels will be calculated as:
    2131             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    2132             :   __m256i diff, a32, a16;
    2133             :   __m256i a0_x, a1_x;
    2134             :   __m128i a0_x128, a1_x128, min_base_y128, c3f;
    2135             :   __m128i c1234, dy128;
    2136             : 
    2137     2423490 :   a16 = _mm256_set1_epi16(16);
    2138     2423490 :   c3f = _mm_set1_epi16(0x3f);
    2139     2423490 :   min_base_y128 = _mm_set1_epi16(min_base_y);
    2140     4846970 :   dy128 = _mm_set1_epi16(dy);
    2141     2423490 :   c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    2142             : 
    2143    27487200 :   for (int r = 0; r < N; r++) {
    2144             :     __m256i b, res, shift;
    2145             :     __m128i resx, resy, resxy, r6, ydx;
    2146             : 
    2147    25063700 :         int32_t y = r + 1;
    2148    25063700 :         int32_t base_x = (-y * dx) >> frac_bits_x;
    2149    25063700 :         int32_t base_shift = 0;
    2150    25063700 :     if (base_x < (min_base_x - 1)) {
    2151    19586900 :       base_shift = (min_base_x - base_x - 1) >> upsample_above;
    2152             :     }
    2153    25063700 :         int32_t base_min_diff =
    2154    25063700 :             (min_base_x - base_x + upsample_above) >> upsample_above;
    2155    25063700 :     if (base_min_diff > 8) {
    2156    10476500 :       base_min_diff = 8;
    2157             :     } else {
    2158    14587200 :       if (base_min_diff < 0)
    2159           0 :         base_min_diff = 0;
    2160             :     }
    2161             : 
    2162    25063700 :     if (base_shift > 7) {
    2163    10476400 :       a0_x = _mm256_setzero_si256();
    2164    10476400 :       a1_x = _mm256_setzero_si256();
    2165    10476400 :       shift = _mm256_setzero_si256();
    2166             :     } else {
    2167    14587200 :       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
    2168    14587200 :       ydx = _mm_set1_epi16(y * dx);
    2169    14587200 :       r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
    2170    14587200 :       if (upsample_above) {
    2171             :         a0_x128 =
    2172     4215520 :             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
    2173     4215520 :         a1_x128 = _mm_srli_si128(a0_x128, 8);
    2174             : 
    2175    21077600 :         shift = _mm256_castsi128_si256(_mm_srli_epi16(
    2176             :             _mm_and_si128(
    2177             :                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
    2178             :             1));
    2179             :       } else {
    2180    10371700 :         a1_x128 = _mm_srli_si128(a0_x128, 1);
    2181    10371700 :         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
    2182    20743400 :         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
    2183             : 
    2184    41486900 :         shift = _mm256_castsi128_si256(
    2185             :             _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
    2186             :       }
    2187    29174500 :       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
    2188    29174500 :       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
    2189             :     }
    2190             : 
    2191             :     // y calc
    2192             :     __m128i a0_y, a1_y, shifty;
    2193    25063700 :     if (base_x < min_base_x) {
    2194             :       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
    2195             :       __m128i y_c128, base_y_c128, mask128;
    2196    43509300 :       r6 = _mm_set1_epi16(r << 6);
    2197    43509300 :       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
    2198    21754600 :       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
    2199    21754600 :       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
    2200    21754600 :       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
    2201             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    2202             : 
    2203    21754600 :       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    2204    21754600 :                             left[base_y_c[2]], left[base_y_c[3]],
    2205    21754600 :                             left[base_y_c[4]], left[base_y_c[5]],
    2206    21754600 :                             left[base_y_c[6]], left[base_y_c[7]]);
    2207    65263900 :       base_y_c128 = _mm_add_epi16(
    2208             :           base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
    2209             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    2210             : 
    2211    43509300 :       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    2212    21754600 :                             left[base_y_c[2]], left[base_y_c[3]],
    2213    21754600 :                             left[base_y_c[4]], left[base_y_c[5]],
    2214    21754600 :                             left[base_y_c[6]], left[base_y_c[7]]);
    2215             : 
    2216    21754600 :       if (upsample_left) {
    2217    15280900 :         shifty = _mm_srli_epi16(
    2218             :             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
    2219             :       } else {
    2220    33322000 :         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
    2221             :       }
    2222             : 
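                     :       // Pack the x (above) path into the low 128 bits and the y (left)
                     :       // path into the high 128 bits, so one 16-lane interpolation below
                     :       // serves both.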
    2223    21754600 :       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
    2224    21754600 :       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
    2225    21754600 :       shift = _mm256_inserti128_si256(shift, shifty, 1);
    2226             :     }
    2227             : 
    2228    25063700 :     diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    2229    25063700 :     a32 = _mm256_slli_epi16(a0_x, 5);    // a[x] * 32
    2230    25063700 :     a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    2231             : 
    2232    25063700 :     b = _mm256_mullo_epi16(diff, shift);
    2233    25063700 :     res = _mm256_add_epi16(a32, b);
    2234    25063700 :     res = _mm256_srli_epi16(res, 5);
    2235             : 
    2236    50127300 :     resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
    2237             :                             _mm256_castsi256_si128(res));
    2238    25063700 :     resy = _mm256_extracti128_si256(res, 1);
    2239    25063700 :     resy = _mm_packus_epi16(resy, resy);
    2240             : 
    2241    50127300 :     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    2242    25063700 :     _mm_storel_epi64((__m128i *)(dst), resxy);
    2243    25063700 :     dst += stride;
    2244             :   }
    2245     2423490 : }
    2246             : 
    2247     2437990 : static void dr_prediction_z2_HxW_avx2(int32_t H, int32_t W, uint8_t *dst,
    2248             :     ptrdiff_t stride, const uint8_t *above,
    2249             :     const uint8_t *left, int32_t upsample_above,
    2250             :     int32_t upsample_left, int32_t dx, int32_t dy) {
    2251             :     // here upsample_above and upsample_left are 0 by design of
    2252             :     // av1_use_intra_edge_upsample
    2253     2437990 :     const int32_t min_base_x = -1;
    2254     2437990 :     const int32_t min_base_y = -1;
    2255             :     (void)upsample_above;
    2256             :     (void)upsample_left;
    2257     2437990 :     const int32_t frac_bits_x = 6;
    2258     2437990 :     const int32_t frac_bits_y = 6;
    2259             : 
    2260             :   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
    2261             :   __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
    2262             :   __m128i a0_x128, a1_x128;
    2263             : 
    2264             :   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
    2265     2437990 :   a16 = _mm256_set1_epi16(16);
    2266     2437990 :   c1 = _mm256_srli_epi16(a16, 4);
    2267     4875990 :   min_base_y256 = _mm256_set1_epi16(min_base_y);
    2268     2437990 :   c3f = _mm256_set1_epi16(0x3f);
    2269     4875990 :   dy256 = _mm256_set1_epi16(dy);
    2270             :   c0123 =
    2271     2437990 :       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    2272     2437990 :   c1234 = _mm256_add_epi16(c0123, c1);
    2273             : 
    2274    44162600 :   for (int r = 0; r < H; r++) {
    2275             :     __m256i b, res, shift, j256, r6, ydx;
    2276             :     __m128i resx, resy;
    2277             :     __m128i resxy;
    2278    41724600 :     int y = r + 1;
    2279    41724600 :     ydx = _mm256_set1_epi16(y * dx);
    2280             : 
    2281    41724600 :     int base_x = (-y * dx) >> frac_bits_x;
    2282   111368000 :     for (int j = 0; j < W; j += 16) {
    2283    69643300 :       j256 = _mm256_set1_epi16(j);
    2284    69643300 :       int base_shift = 0;
    2285    69643300 :       if ((base_x + j) < (min_base_x - 1)) {
    2286    46967300 :         base_shift = (min_base_x - (base_x + j) - 1);
    2287             :       }
    2288    69643300 :       int base_min_diff = (min_base_x - base_x - j);
    2289    69643300 :       if (base_min_diff > 16) {
    2290    25858400 :         base_min_diff = 16;
    2291             :       } else {
    2292    43784900 :         if (base_min_diff < 0)
    2293    15478600 :           base_min_diff = 0;
    2294             :       }
    2295             : 
    2296    69643300 :       if (base_shift < 16) {
    2297    43865600 :         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
    2298             :         a1_x128 =
    2299    43865600 :             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
    2300    43865600 :         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
    2301    87731300 :         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
    2302             : 
    2303    43865600 :         a0_x = _mm256_cvtepu8_epi16(a0_x128);
    2304    43865600 :         a1_x = _mm256_cvtepu8_epi16(a1_x128);
    2305             : 
    2306    87731300 :         r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
    2307   131597000 :         shift = _mm256_srli_epi16(
    2308             :             _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
    2309             : 
    2310    43865600 :         diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    2311    43865600 :         a32 = _mm256_slli_epi16(a0_x, 5);    // a[x] * 32
    2312    43865600 :         a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    2313             : 
    2314    43865600 :         b = _mm256_mullo_epi16(diff, shift);
    2315    43865600 :         res = _mm256_add_epi16(a32, b);
    2316    43865600 :         res = _mm256_srli_epi16(res, 5); // 16 16-bit values
    2317   131597000 :         resx = _mm256_castsi256_si128(_mm256_packus_epi16(
    2318    43865600 :             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    2319             :       } else {
    2320    25777700 :         resx = _mm_setzero_si128();
    2321             :       }
    2322             : 
    2323             :       // y calc
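                     :         // min_base_y256 is all ones (-1), so srli by 1 gives 0x7fff; the
                     :         // unsigned min clamps c * dy into int16 range before the subtract.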
    2324    69643300 :       if (base_x < min_base_x) {
    2325             :         __m256i c256, y_c256, base_y_c256, mask256, mul16;
    2326   128689000 :         r6 = _mm256_set1_epi16(r << 6);
    2327    64344300 :         c256 = _mm256_add_epi16(j256, c1234);
    2328   193033000 :         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
    2329             :                                  _mm256_srli_epi16(min_base_y256, 1));
    2330    64344300 :         y_c256 = _mm256_sub_epi16(r6, mul16);
    2331             : 
    2332    64344300 :         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
    2333    64344300 :         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
    2334    64344300 :         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
    2335             :         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
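                     :         // AVX2 has no 16-bit gather: spill the clamped indices to base_y_c
                     :         // and gather from left[] with scalar loads below.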
    2336             : 
    2337    64344300 :         a0_y = _mm256_setr_epi16(
    2338    64344300 :             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    2339    64344300 :             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    2340    64344300 :             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
    2341    64344300 :             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
    2342    64344300 :             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
    2343    64344300 :             left[base_y_c[15]]);
    2344    64344300 :         base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
    2345             :         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
    2346             : 
    2347    64344300 :         a1_y = _mm256_setr_epi16(
    2348    64344300 :             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    2349    64344300 :             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    2350    64344300 :             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
    2351    64344300 :             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
    2352    64344300 :             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
    2353    64344300 :             left[base_y_c[15]]);
    2354             : 
    2355   128689000 :         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
    2356             : 
    2357    64344300 :         diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
    2358    64344300 :         a32 = _mm256_slli_epi16(a0_y, 5);    // a[x] * 32
    2359    64344300 :         a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    2360             : 
    2361    64344300 :         b = _mm256_mullo_epi16(diff, shifty);
    2362    64344300 :         res = _mm256_add_epi16(a32, b);
    2363    64344300 :         res = _mm256_srli_epi16(res, 5); // 16 16-bit values
    2364   193033000 :         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
    2365    64344300 :             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    2366             :       } else {
    2367     5299010 :         resy = _mm_setzero_si128();
    2368             :       }
    2369    69643300 :       resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    2370    69643300 :       _mm_storeu_si128((__m128i *)(dst + j), resxy);
    2371             :     } // for j
    2372    41724600 :     dst += stride;
    2373             :   }
    2374     2437990 : }
    2375             : 
    2376             : // Directional prediction, zone 2: 90 < angle < 180 (scalar sketch below)
    2377     7384800 : void eb_av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
    2378             :     const uint8_t *above, const uint8_t *left,
    2379             :     int32_t upsample_above, int32_t upsample_left, int32_t dx,
    2380             :     int32_t dy) {
    2381     7384800 :     assert(dx > 0);
    2382     7384800 :     assert(dy > 0);
    2383     7384800 :     switch (bw) {
    2384     2524300 :     case 4:
    2385     2524300 :         dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
    2386             :             upsample_left, dx, dy);
    2387     2524330 :         break;
    2388     2423460 :     case 8:
    2389     2423460 :         dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
    2390             :             upsample_left, dx, dy);
    2391             : 
    2392     2423480 :         break;
    2393     2437030 :     default:
    2394     2437030 :         dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
    2395             :             upsample_above, upsample_left, dx, dy);
    2396     2437980 :         break;
    2397             :     }
    2398     7385800 :     return;
    2399             : }
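                     : 
                     : // Editor's sketch (not part of the coverage run): a scalar model of zone 2,
                     : // assuming upsample_above == upsample_left == 0 (min_base_x == -1) and the
                     : // AV1 edge layout in which index -1 is the top-left corner sample. The
                     : // BaseMask blends in the kernels above select, per chunk, between exactly
                     : // these two branches. dr_z2_scalar is an illustrative name only.
                     : static void dr_z2_scalar(uint8_t *dst, ptrdiff_t stride, int32_t bw,
                     :     int32_t bh, const uint8_t *above, const uint8_t *left,
                     :     int32_t dx, int32_t dy) {
                     :     for (int32_t r = 0; r < bh; r++, dst += stride) {
                     :         for (int32_t c = 0; c < bw; c++) {
                     :             const int32_t x = (c << 6) - (r + 1) * dx;  // 6-bit fixed point
                     :             if ((x >> 6) >= -1) {  // projects onto the above row
                     :                 const int32_t base = x >> 6;
                     :                 const int32_t shift = (x & 0x3f) >> 1;
                     :                 dst[c] = (uint8_t)((above[base] * 32 + 16 +
                     :                     (above[base + 1] - above[base]) * shift) >> 5);
                     :             } else {  // falls back to the left column
                     :                 const int32_t y = (r << 6) - (c + 1) * dy;
                     :                 const int32_t base = y >> 6;  // >= -1 for valid dx/dy pairs
                     :                 const int32_t shift = (y & 0x3f) >> 1;
                     :                 dst[c] = (uint8_t)((left[base] * 32 + 16 +
                     :                     (left[base + 1] - left[base]) * shift) >> 5);
                     :             }
                     :         }
                     :     }
                     : }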
    2400             : 
    2401             : // z3 functions
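                     : // Zone 3 (180 < angle < 270) reads only the left neighbors: each kernel
                     : // below runs a zone-1 prediction on left[] and transposes the result into
                     : // dst, in registers for small blocks or through a scratch buffer for the
                     : // large ones.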
    2402      185813 : static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
    2403             :     __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
    2404      185813 :     w0 = _mm_unpacklo_epi8(x[0], x[1]);
    2405      185813 :     w1 = _mm_unpacklo_epi8(x[2], x[3]);
    2406      185813 :     w2 = _mm_unpackhi_epi8(x[0], x[1]);
    2407      371626 :     w3 = _mm_unpackhi_epi8(x[2], x[3]);
    2408             : 
    2409      185813 :     ww0 = _mm_unpacklo_epi16(w0, w1);
    2410      185813 :     ww1 = _mm_unpacklo_epi16(w2, w3);
    2411      185813 :     ww2 = _mm_unpackhi_epi16(w0, w1);
    2412      185813 :     ww3 = _mm_unpackhi_epi16(w2, w3);
    2413             : 
    2414      185813 :     w0 = _mm_unpacklo_epi32(ww0, ww1);
    2415      185813 :     w2 = _mm_unpacklo_epi32(ww2, ww3);
    2416      185813 :     w1 = _mm_unpackhi_epi32(ww0, ww1);
    2417      185813 :     w3 = _mm_unpackhi_epi32(ww2, ww3);
    2418             : 
    2419      185813 :     d[0] = _mm_unpacklo_epi64(w0, w2);
    2420      185813 :     d[1] = _mm_unpackhi_epi64(w0, w2);
    2421      185813 :     d[2] = _mm_unpacklo_epi64(w1, w3);
    2422      185813 :     d[3] = _mm_unpackhi_epi64(w1, w3);
    2423             : 
    2424      185813 :     d[4] = _mm_srli_si128(d[0], 8);
    2425      185813 :     d[5] = _mm_srli_si128(d[1], 8);
    2426      185813 :     d[6] = _mm_srli_si128(d[2], 8);
    2427      185813 :     d[7] = _mm_srli_si128(d[3], 8);
    2428             : 
    2429      185813 :     d[8] = _mm_srli_si128(d[0], 4);
    2430      185813 :     d[9] = _mm_srli_si128(d[1], 4);
    2431      185813 :     d[10] = _mm_srli_si128(d[2], 4);
    2432      185813 :     d[11] = _mm_srli_si128(d[3], 4);
    2433             : 
    2434      185813 :     d[12] = _mm_srli_si128(d[0], 12);
    2435      185813 :     d[13] = _mm_srli_si128(d[1], 12);
    2436      185813 :     d[14] = _mm_srli_si128(d[2], 12);
    2437      185813 :     d[15] = _mm_srli_si128(d[3], 12);
    2438      185813 : }
    2439             : 
    2440      424346 : static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
    2441             :     __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
    2442             :     __m256i w10, w11, w12, w13, w14, w15;
    2443             : 
    2444      424346 :     w0 = _mm256_unpacklo_epi8(x[0], x[1]);
    2445      424346 :     w1 = _mm256_unpacklo_epi8(x[2], x[3]);
    2446      424346 :     w2 = _mm256_unpacklo_epi8(x[4], x[5]);
    2447      424346 :     w3 = _mm256_unpacklo_epi8(x[6], x[7]);
    2448             : 
    2449      424346 :     w8 = _mm256_unpacklo_epi8(x[8], x[9]);
    2450      424346 :     w9 = _mm256_unpacklo_epi8(x[10], x[11]);
    2451      424346 :     w10 = _mm256_unpacklo_epi8(x[12], x[13]);
    2452      848692 :     w11 = _mm256_unpacklo_epi8(x[14], x[15]);
    2453             : 
    2454      424346 :     w4 = _mm256_unpacklo_epi16(w0, w1);
    2455      424346 :     w5 = _mm256_unpacklo_epi16(w2, w3);
    2456      424346 :     w12 = _mm256_unpacklo_epi16(w8, w9);
    2457      424346 :     w13 = _mm256_unpacklo_epi16(w10, w11);
    2458             : 
    2459      424346 :     w6 = _mm256_unpacklo_epi32(w4, w5);
    2460      424346 :     w7 = _mm256_unpackhi_epi32(w4, w5);
    2461      424346 :     w14 = _mm256_unpacklo_epi32(w12, w13);
    2462      424346 :     w15 = _mm256_unpackhi_epi32(w12, w13);
    2463             : 
    2464             :     // Store first 4-line result
    2465      424346 :     d[0] = _mm256_unpacklo_epi64(w6, w14);
    2466      424346 :     d[1] = _mm256_unpackhi_epi64(w6, w14);
    2467      424346 :     d[2] = _mm256_unpacklo_epi64(w7, w15);
    2468      848692 :     d[3] = _mm256_unpackhi_epi64(w7, w15);
    2469             : 
    2470      424346 :     w4 = _mm256_unpackhi_epi16(w0, w1);
    2471      424346 :     w5 = _mm256_unpackhi_epi16(w2, w3);
    2472      424346 :     w12 = _mm256_unpackhi_epi16(w8, w9);
    2473      424346 :     w13 = _mm256_unpackhi_epi16(w10, w11);
    2474             : 
    2475      424346 :     w6 = _mm256_unpacklo_epi32(w4, w5);
    2476      424346 :     w7 = _mm256_unpackhi_epi32(w4, w5);
    2477      424346 :     w14 = _mm256_unpacklo_epi32(w12, w13);
    2478      424346 :     w15 = _mm256_unpackhi_epi32(w12, w13);
    2479             : 
    2480             :     // Store second 4-line result
    2481      424346 :     d[4] = _mm256_unpacklo_epi64(w6, w14);
    2482      424346 :     d[5] = _mm256_unpackhi_epi64(w6, w14);
    2483      424346 :     d[6] = _mm256_unpacklo_epi64(w7, w15);
    2484      424346 :     d[7] = _mm256_unpackhi_epi64(w7, w15);
    2485             : 
    2486             :     // upper half
    2487      424346 :     w0 = _mm256_unpackhi_epi8(x[0], x[1]);
    2488      424346 :     w1 = _mm256_unpackhi_epi8(x[2], x[3]);
    2489      424346 :     w2 = _mm256_unpackhi_epi8(x[4], x[5]);
    2490      424346 :     w3 = _mm256_unpackhi_epi8(x[6], x[7]);
    2491             : 
    2492      424346 :     w8 = _mm256_unpackhi_epi8(x[8], x[9]);
    2493      424346 :     w9 = _mm256_unpackhi_epi8(x[10], x[11]);
    2494      424346 :     w10 = _mm256_unpackhi_epi8(x[12], x[13]);
    2495      848692 :     w11 = _mm256_unpackhi_epi8(x[14], x[15]);
    2496             : 
    2497      424346 :     w4 = _mm256_unpacklo_epi16(w0, w1);
    2498      424346 :     w5 = _mm256_unpacklo_epi16(w2, w3);
    2499      424346 :     w12 = _mm256_unpacklo_epi16(w8, w9);
    2500      424346 :     w13 = _mm256_unpacklo_epi16(w10, w11);
    2501             : 
    2502      424346 :     w6 = _mm256_unpacklo_epi32(w4, w5);
    2503      424346 :     w7 = _mm256_unpackhi_epi32(w4, w5);
    2504      424346 :     w14 = _mm256_unpacklo_epi32(w12, w13);
    2505      424346 :     w15 = _mm256_unpackhi_epi32(w12, w13);
    2506             : 
    2507             :     // Store first 4-line result
    2508      424346 :     d[8] = _mm256_unpacklo_epi64(w6, w14);
    2509      424346 :     d[9] = _mm256_unpackhi_epi64(w6, w14);
    2510      424346 :     d[10] = _mm256_unpacklo_epi64(w7, w15);
    2511      848692 :     d[11] = _mm256_unpackhi_epi64(w7, w15);
    2512             : 
    2513      424346 :     w4 = _mm256_unpackhi_epi16(w0, w1);
    2514      424346 :     w5 = _mm256_unpackhi_epi16(w2, w3);
    2515      424346 :     w12 = _mm256_unpackhi_epi16(w8, w9);
    2516      424346 :     w13 = _mm256_unpackhi_epi16(w10, w11);
    2517             : 
    2518      424346 :     w6 = _mm256_unpacklo_epi32(w4, w5);
    2519      424346 :     w7 = _mm256_unpackhi_epi32(w4, w5);
    2520      424346 :     w14 = _mm256_unpacklo_epi32(w12, w13);
    2521      424346 :     w15 = _mm256_unpackhi_epi32(w12, w13);
    2522             : 
    2523             :     // Store second 4-line result
    2524      424346 :     d[12] = _mm256_unpacklo_epi64(w6, w14);
    2525      424346 :     d[13] = _mm256_unpackhi_epi64(w6, w14);
    2526      424346 :     d[14] = _mm256_unpacklo_epi64(w7, w15);
    2527      424346 :     d[15] = _mm256_unpackhi_epi64(w7, w15);
    2528      424346 : }
    2529             : 
    2530      751669 : static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
    2531             :                                uint8_t *dst, ptrdiff_t pitchDst) {
    2532             :   __m128i r[16];
    2533             :   __m128i d[16];
    2534    12777600 :   for (int j = 0; j < 16; j++) {
    2535    24051800 :     r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
    2536             :   }
    2537      751669 :   transpose16x16_sse2(r, d);
    2538    12777900 :   for (int j = 0; j < 16; j++) {
    2539    12026200 :     _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
    2540             :   }
    2541      751714 : }
    2542             : 
    2543       79074 : static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
    2544             :     ptrdiff_t pitchDst, int32_t width, int32_t height) {
    2545      374032 :   for (int j = 0; j < height; j += 16)
    2546     1046630 :     for (int i = 0; i < width; i += 16)
    2547      751669 :       transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
    2548      751669 :                          dst + j * pitchDst + i, pitchDst);
    2549       79075 : }
    2550             : 
    2551      508543 : static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
    2552             :     const uint8_t *left, int32_t upsample_left,
    2553             :     int32_t dy) {
    2554             :     __m128i dstvec[4], d[4];
    2555             : 
    2556             :   dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
    2557      508543 :   transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    2558             :                             &d[0], &d[1], &d[2], &d[3]);
    2559             : 
    2560      508537 :     *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
    2561      508537 :     *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
    2562      508537 :     *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
    2563      508537 :     *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
    2564      508537 :     return;
    2565             : }
    2566             : 
    2567      503028 : static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
    2568             :     const uint8_t *left, int32_t upsample_left,
    2569             :     int32_t dy) {
    2570             :     __m128i dstvec[8], d[8];
    2571             : 
    2572             :   dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
    2573      503028 :   transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
    2574             :                     &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
    2575             :                     &d[3]);
    2576             : 
    2577      503026 :     _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
    2578      503026 :     _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
    2579      503026 :     _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
    2580      503026 :     _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
    2581      503026 :     _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
    2582      503026 :     _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
    2583      503026 :     _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
    2584      503026 :     _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
    2585      503026 : }
    2586             : 
    2587      262039 : static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
    2588             :     const uint8_t *left, int32_t upsample_left,
    2589             :     int32_t dy) {
    2590             :     __m128i dstvec[4], d[8];
    2591             : 
    2592             :   dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
    2593             : 
    2594      262039 :     transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
    2595             :         &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
    2596     2358360 :   for (int32_t i = 0; i < 8; i++) {
    2597     4192630 :     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
    2598             :   }
    2599      262040 : }
    2600             : 
    2601      235052 : static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
    2602             :     const uint8_t *left, int32_t upsample_left,
    2603             :     int32_t dy) {
    2604             :     __m128i dstvec[8], d[4];
    2605             : 
    2606             :   dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
    2607      235052 :     transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    2608             :         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
    2609             :         &d[1], &d[2], &d[3]);
    2610      235052 :     _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
    2611      235052 :     _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
    2612      235052 :     _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
    2613      235052 :     _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
    2614      235052 : }
    2615             : 
    2616      143056 : static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
    2617             :     const uint8_t *left, int32_t upsample_left,
    2618             :     int32_t dy) {
    2619             :     __m128i dstvec[8], d[8];
    2620             : 
    2621             :   dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
    2622             : 
    2623      143056 :     transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
    2624             :         dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
    2625             :         d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
    2626     1287500 :     for (int32_t i = 0; i < 8; i++) {
    2627     1144440 :         _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    2628     1144440 :         _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
    2629     1144440 :             _mm_srli_si128(d[i], 8));
    2630             :     }
    2631      143056 : }
    2632             : 
    2633      136872 : static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
    2634             :     const uint8_t *left, int32_t upsample_left,
    2635             :     int32_t dy) {
    2636             :     __m128i dstvec[16], d[16];
    2637             : 
    2638             :   dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
    2639             : 
    2640      136872 :     transpose16x8_8x16_sse2(
    2641             :         &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
    2642             :         &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
    2643             :         &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
    2644             :         &d[3], &d[4], &d[5], &d[6], &d[7]);
    2645             : 
    2646     1231830 :   for (int32_t i = 0; i < 8; i++) {
    2647     1094960 :     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    2648             :   }
    2649      136871 : }
    2650             : 
    2651      185811 : static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
    2652             :     const uint8_t *left, int32_t upsample_left,
    2653             :     int32_t dy) {
    2654             :     __m128i dstvec[4], d[16];
    2655             : 
    2656             :   dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
    2657             : 
    2658      185811 :   transpose4x16_sse2(dstvec, d);
    2659     3158810 :   for (int32_t i = 0; i < 16; i++) {
    2660     5945990 :     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
    2661             :   }
    2662      185813 : }
    2663             : 
    2664      181559 : static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
    2665             :     const uint8_t *left, int32_t upsample_left,
    2666             :     int32_t dy) {
    2667             :     __m128i dstvec[16], d[8];
    2668             :   dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
    2669             : 
    2670      907799 :   for (int32_t i = 4; i < 8; i++) {
    2671      726240 :     d[i] = _mm_setzero_si128();
    2672             :   }
    2673      181559 :     transpose16x8_8x16_sse2(
    2674             :         &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
    2675             :         &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
    2676             :         &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
    2677             :         &d[3], &d[4], &d[5], &d[6], &d[7]);
    2678             : 
    2679      907800 :   for (int32_t i = 0; i < 4; i++) {
    2680      726240 :     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    2681             :   }
    2682      181560 : }
    2683             : 
    2684       95468 : static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
    2685             :     const uint8_t *left, int32_t upsample_left,
    2686             :     int32_t dy) {
    2687             :     __m256i dstvec[16], d[16];
    2688             : 
    2689             :   dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
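                     :   // Only 8 of the 16 transpose inputs are real rows; zero the rest so
                     :   // transpose16x32_avx2 reads defined data.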
    2690      859209 :   for (int32_t i = 8; i < 16; i++) {
    2691      763741 :     dstvec[i] = _mm256_setzero_si256();
    2692             :   }
    2693       95468 :   transpose16x32_avx2(dstvec, d);
    2694             : 
    2695     1622920 :     for (int32_t i = 0; i < 16; i++) {
    2696     3054900 :         _mm_storel_epi64((__m128i *)(dst + i * stride),
    2697             :             _mm256_castsi256_si128(d[i]));
    2698             :     }
    2699     1622910 :     for (int32_t i = 0; i < 16; i++) {
    2700     1527440 :         _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
    2701     1527440 :             _mm256_extracti128_si256(d[i], 1));
    2702             :     }
    2703       95468 : }
    2704             : 
    2705       69623 : static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
    2706             :     const uint8_t *left, int32_t upsample_left,
    2707             :     int32_t dy) {
    2708             :     __m128i dstvec[32], d[16];
    2709             : 
    2710             :   dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
    2711             : 
    2712       69623 :     transpose16x8_8x16_sse2(
    2713             :         &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
    2714             :         &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
    2715             :         &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
    2716             :         &d[3], &d[4], &d[5], &d[6], &d[7]);
    2717       69623 :     transpose16x8_8x16_sse2(
    2718             :         &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
    2719             :         &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
    2720             :         &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
    2721             :         &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
    2722             :         &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
    2723             :         &d[6 + 8], &d[7 + 8]);
    2724             : 
    2725      626607 :     for (int32_t i = 0; i < 8; i++) {
    2726      556984 :         _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    2727      556984 :         _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
    2728             :     }
    2729       69623 : }
    2730             : 
    2731      303524 : static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
    2732             :     const uint8_t *left, int32_t upsample_left,
    2733             :     int32_t dy) {
    2734             :     __m128i dstvec[16], d[16];
    2735             : 
    2736             :   dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
    2737      303524 :   transpose16x16_sse2(dstvec, d);
    2738             : 
    2739     5159860 :   for (int32_t i = 0; i < 16; i++) {
    2740     4856340 :     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    2741             :   }
    2742      303524 : }
    2743             : 
    2744      133774 : static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    2745             :     const uint8_t *left, int32_t upsample_left,
    2746             :     int32_t dy) {
    2747             :     __m256i dstvec[32], d[32];
    2748             : 
    2749             :     dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
    2750      133774 :     transpose16x32_avx2(dstvec, d);
    2751      133774 :     transpose16x32_avx2(dstvec + 16, d + 16);
    2752     2274130 :     for (int32_t j = 0; j < 16; j++) {
    2753     4280710 :         _mm_storeu_si128((__m128i *)(dst + j * stride),
    2754             :             _mm256_castsi256_si128(d[j]));
    2755     2140360 :         _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
    2756     2140360 :             _mm256_castsi256_si128(d[j + 16]));
    2757             :     }
    2758     2274120 :     for (int32_t j = 0; j < 16; j++) {
    2759     2140340 :         _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
    2760     2140340 :             _mm256_extracti128_si256(d[j], 1));
    2761     2140340 :         _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
    2762     2140340 :             _mm256_extracti128_si256(d[j + 16], 1));
    2763             :     }
    2764      133774 : }
    2765             : 
    2766       27692 : static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    2767             :     const uint8_t *left, int32_t upsample_left,
    2768             :     int32_t dy) {
    2769             :     DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
    2770       27692 :     dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
    2771       27693 :     transpose(dstT, 64, dst, stride, 64, 64);
    2772       27693 : }
    2773             : 
    2774       61332 : static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
    2775             :     const uint8_t *left, int32_t upsample_left,
    2776             :     int32_t dy) {
    2777             :     __m256i dstvec[16], d[16];
    2778             : 
    2779             :     dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
    2780       61332 :     transpose16x32_avx2(dstvec, d);
    2781             :     // store
    2782     1042660 :     for (int32_t j = 0; j < 16; j++) {
    2783     1962650 :         _mm_storeu_si128((__m128i *)(dst + j * stride),
    2784             :             _mm256_castsi256_si128(d[j]));
    2785      981325 :         _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
    2786      981325 :             _mm256_extracti128_si256(d[j], 1));
    2787             :     }
    2788       61333 : }
    2789             : 
    2790       48602 : static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    2791             :     const uint8_t *left, int32_t upsample_left,
    2792             :     int32_t dy) {
    2793             :     __m128i dstvec[32], d[16];
    2794             : 
    2795             :     dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
    2796             : 
    2797      145808 :     for (int32_t i = 0; i < 32; i += 16) {
    2798       97205 :         transpose16x16_sse2((dstvec + i), d);
    2799     1652460 :         for (int32_t j = 0; j < 16; j++) {
    2800     1555250 :             _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    2801             :         }
    2802             :     }
    2803       48603 : }
    2804             : 
    2805       15113 : static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    2806             :     const uint8_t *left, int32_t upsample_left,
    2807             :     int32_t dy) {
    2808             :     EB_ALIGN(32) uint8_t dstT[64 * 32];
    2809       15113 :     dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
    2810       15113 :     transpose(dstT, 64, dst, stride, 32, 64);
    2811       15113 : }
    2812             : 
    2813       10667 : static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    2814             :     const uint8_t *left, int32_t upsample_left,
    2815             :     int32_t dy) {
    2816             :     EB_ALIGN(32) uint8_t dstT[32 * 64];
    2817       10667 :     dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
    2818       10667 :     transpose(dstT, 32, dst, stride, 64, 32);
    2819       10667 :     return;
    2820             : }
    2821             : 
    2822       25601 : static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
    2823             :     const uint8_t *left, int32_t upsample_left,
    2824             :     int32_t dy) {
    2825             :     EB_ALIGN(32) uint8_t dstT[64 * 16];
    2826       25601 :     dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
    2827       25601 :     transpose(dstT, 64, dst, stride, 16, 64);
    2828       25601 : }
    2829             : 
    2830       17044 : static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    2831             :     const uint8_t *left, int32_t upsample_left,
    2832             :     int32_t dy) {
    2833             :     __m128i dstvec[64], d[16];
    2834             : 
    2835             :     dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
    2836             : 
    2837       85220 :     for (int32_t i = 0; i < 64; i += 16) {
    2838       68176 :         transpose16x16_sse2((dstvec + i), d);
    2839     1158990 :         for (int32_t j = 0; j < 16; j++) {
    2840     1090820 :             _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    2841             :         }
    2842             :     }
    2843       17044 : }
    2844             : 
    2845     2964200 : void eb_av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
    2846             :     const uint8_t *above, const uint8_t *left,
    2847             :     int32_t upsample_left, int32_t dx, int32_t dy) {
    2848             :     (void)above;
    2849             :     (void)dx;
    2850     2964200 :     assert(dx == 1);
    2851     2964200 :     assert(dy > 0);
    2852             : 
    2853     2964200 :     if (bw == bh) {
    2854     1476520 :         switch (bw) {
    2855      508542 :         case 4:
    2856      508542 :             dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
    2857      508535 :             break;
    2858      503027 :         case 8:
    2859      503027 :             dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
    2860      503026 :             break;
    2861      303525 :         case 16:
    2862      303525 :             dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
    2863      303524 :             break;
    2864      133774 :         case 32:
    2865      133774 :             dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
    2866      133773 :             break;
    2867       27693 :         case 64:
    2868       27693 :             dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
    2869       27693 :             break;
    2870             :         }
    2871     1476510 :     }
    2872             :     else {
    2873     1487680 :         if (bw < bh) {
    2874      788407 :             if (bw + bw == bh) {
    2875      481537 :                 switch (bw) {
    2876      262039 :                 case 4:
    2877      262039 :                     dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
    2878      262040 :                     break;
    2879      143056 :                 case 8:
    2880      143056 :                     dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
    2881      143056 :                     break;
    2882       61332 :                 case 16:
    2883       61332 :                     dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
    2884       61333 :                     break;
    2885       15113 :                 case 32:
    2886       15113 :                     dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
    2887       15113 :                     break;
    2888             :                 }
    2889      481539 :             }
    2890             :             else {
    2891      306870 :                 switch (bw) {
    2892      185811 :                 case 4:
    2893      185811 :                     dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
    2894      185813 :                     break;
    2895       95468 :                 case 8:
    2896       95468 :                     dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
    2897       95467 :                     break;
    2898       25601 :                 case 16:
    2899       25601 :                     dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
    2900       25601 :                     break;
    2901             :                 }
    2902      788410 :             }
    2903             :         }
    2904             :         else {
    2905      699273 :             if (bh + bh == bw) {
    2906      431194 :                 switch (bh) {
    2907      235054 :                 case 4:
    2908      235054 :                     dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
    2909      235052 :                     break;
    2910      136872 :                 case 8:
    2911      136872 :                     dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
    2912      136869 :                     break;
    2913       48603 :                 case 16:
    2914       48603 :                     dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
    2915       48603 :                     break;
    2916       10667 :                 case 32:
    2917       10667 :                     dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
    2918       10667 :                     break;
    2919             :                 }
    2920      431189 :             }
    2921             :             else {
    2922      268079 :                 switch (bh) {
    2923      181559 :                 case 4:
    2924      181559 :                     dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
    2925      181560 :                     break;
    2926       69623 :                 case 8:
    2927       69623 :                     dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
    2928       69623 :                     break;
    2929       17044 :                 case 16:
    2930       17044 :                     dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
    2931       17044 :                     break;
    2932             :                 }
    2933     2964190 :             }
    2934             :         }
    2935             :     }
    2936     2964190 :     return;
    2937             : }
    2938             : 
    2939             : static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
    2940             :     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    2941             :     { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
    2942             :     { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
    2943             :     { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
    2944             :     { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
    2945             :     { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
    2946             :     { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
    2947             :     { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
    2948             : };
    2949             : 
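// Editor's note: HighbdLoadMaskx[s] is a 16-bit-lane shuffle control: output
// word i takes input word 0 when i < s and input word i - s otherwise.
// Applied after an unaligned load from above + base_x + base_shift, it pins
// the lanes that would have read left of the edge to the first valid sample.
// Scalar equivalent of the load-plus-shuffle (illustration, not source):
static void highbd_load_clamped_sketch(uint16_t *out, const uint16_t *above,
                                       int32_t base_x, int32_t base_shift) {
    for (int32_t i = 0; i < 8; i++)  // word i reads index max(base_shift, i)
        out[i] = above[base_x + (i < base_shift ? base_shift : i)];
}
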
    2950             : static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
    2951             :     {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
    2952             :     {0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13},
    2953             :     {0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11},
    2954             :     {0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9}};
    2955             : 
    2956             : static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
    2957             :     {0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
    2958             :      2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
    2959             :     {0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
    2960             :      0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
    2961             :     {0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
    2962             :      0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27},
    2963             :     {0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
    2964             :      0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25},
    2965             :     {0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
    2966             :      0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23},
    2967             :     {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
    2968             :      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21},
    2969             :     {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
    2970             :      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19},
    2971             :     {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
    2972             :      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17}};
    2973             : 
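// Editor's note: with a 2x-upsampled edge, the two interpolation taps for
// output word i are the even sample above[2 * i] and the odd sample
// above[2 * i + 1], so these masks deinterleave a load into an even half
// (the a0 taps) and an odd half (the a1 taps); rows with s > 0 also pin the
// leading lanes, as HighbdLoadMaskx does. Scalar equivalent of the tap
// gathering without the edge pinning (illustration, not source):
static void highbd_even_odd_taps_sketch(uint16_t *a0, uint16_t *a1,
                                        const uint16_t *above, int32_t n) {
    for (int32_t i = 0; i < n; i++) {
        a0[i] = above[2 * i];      // even words: lower tap
        a1[i] = above[2 * i + 1];  // odd words: upper tap
    }
}
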
    2974             : static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
    2975             :     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    2976             :     { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    2977             :     { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    2978             :     { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    2979             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    2980             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    2981             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2982             :     0 },
    2983             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
    2984             :     0, 0 },
    2985             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
    2986             :     0, 0, 0, 0 },
    2987             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
    2988             :     0, 0, 0, 0, 0, 0 },
    2989             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    2990             :     0xffff, 0, 0, 0, 0, 0, 0 },
    2991             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    2992             :     0xffff, 0xffff, 0, 0, 0, 0, 0 },
    2993             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    2994             :     0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
    2995             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    2996             :     0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
    2997             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    2998             :     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
    2999             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    3000             :     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
    3001             :     { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    3002             :     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
    3003             : };
    3004             : 
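// Editor's note: HighbdBaseMask[n] is all-ones in its first n words, so
// _mm_blendv_epi8(resx, resy, HighbdBaseMask[base_min_diff]) takes the
// left-edge (y) result for the first base_min_diff columns and the
// above-edge (x) result for the remaining columns. Scalar equivalent
// (illustration, not source):
static void highbd_blend_xy_sketch(uint16_t *dst, const uint16_t *resx,
                                   const uint16_t *resy,
                                   int32_t base_min_diff, int32_t n) {
    for (int32_t i = 0; i < n; i++)
        dst[i] = (i < base_min_diff) ? resy[i] : resx[i];
}
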
    3005           0 : static void highbd_dr_prediction_z2_Nx4_avx2(
    3006             :     int32_t N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    3007             :     const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
    3008             :     int32_t dy) {
    3009           0 :     const int32_t min_base_x = -(1 << upsample_above);
    3010           0 :     const int32_t min_base_y = -(1 << upsample_left);
    3011           0 :     const int32_t frac_bits_x = 6 - upsample_above;
    3012           0 :     const int32_t frac_bits_y = 6 - upsample_left;
    3013             : 
    3014           0 :   assert(dx > 0);
    3015             :     // pre-filter above pixels
    3016             :     // store in temp buffers:
    3017             :     //   above[x] * 32 + 16
    3018             :     //   above[x+1] - above[x]
    3019             :     //   final pixels will be calculated as:
    3020             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
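    //   e.g. above[x] = 100, above[x+1] = 108, shift = 16 (halfway):
    //   (100 * 32 + 16 + 8 * 16) >> 5 = 3344 >> 5 = 104
    //   (editor's worked example; shift is the 6-bit fraction halved to 5 bits)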
    3021             :   __m256i a0_x, a1_x, a32, a16;
    3022             :     __m256i diff;
    3023             :     __m128i c3f, min_base_y128;
    3024             : 
    3025           0 :   a16 = _mm256_set1_epi16(16);
    3026           0 :   c3f = _mm_set1_epi16(0x3f);
    3027           0 :   min_base_y128 = _mm_set1_epi16(min_base_y);
    3028             : 
    3029           0 :     for (int32_t r = 0; r < N; r++) {
    3030             :         __m256i b, res, shift;
    3031             :         __m128i resx, resy, resxy;
    3032             :         __m128i a0_x128, a1_x128;
    3033           0 :         int32_t y = r + 1;
    3034           0 :         int32_t base_x = (-y * dx) >> frac_bits_x;
    3035           0 :         int32_t base_shift = 0;
    3036           0 :     if (base_x < (min_base_x - 1)) {
    3037           0 :       base_shift = (min_base_x - base_x - 1) >> upsample_above;
    3038             :     }
    3039           0 :         int32_t base_min_diff =
    3040           0 :         (min_base_x - base_x + upsample_above) >> upsample_above;
    3041           0 :     if (base_min_diff > 4) {
    3042           0 :       base_min_diff = 4;
    3043             :     } else {
    3044           0 :       if (base_min_diff < 0)
    3045           0 :         base_min_diff = 0;
    3046             :     }
    3047             : 
    3048           0 :     if (base_shift > 3) {
    3049           0 :       a0_x = _mm256_setzero_si256();
    3050           0 :       a1_x = _mm256_setzero_si256();
    3051           0 :       shift = _mm256_setzero_si256();
    3052             :     } else {
    3053           0 :       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
    3054           0 :       if (upsample_above) {
    3055           0 :         a0_x128 = _mm_shuffle_epi8(a0_x128,
    3056           0 :                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
    3057           0 :         a1_x128 = _mm_srli_si128(a0_x128, 8);
    3058             : 
    3059           0 :         shift = _mm256_castsi128_si256(_mm_srli_epi16(
    3060             :             _mm_and_si128(
    3061           0 :                 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
    3062           0 :                                               (2 << 6) - y * dx,
    3063           0 :                                               (3 << 6) - y * dx, 0, 0, 0, 0),
    3064             :                                upsample_above),
    3065             :                 c3f),
    3066             :             1));
    3067             :       } else {
    3068             :         a0_x128 =
    3069           0 :             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3070           0 :         a1_x128 = _mm_srli_si128(a0_x128, 2);
    3071             : 
    3072           0 :         shift = _mm256_castsi128_si256(_mm_srli_epi16(
    3073           0 :             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
    3074           0 :                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
    3075             :                                          0, 0, 0, 0),
    3076             :                           c3f),
    3077             :             1));
    3078             :       }
    3079           0 :       a0_x = _mm256_castsi128_si256(a0_x128);
    3080           0 :       a1_x = _mm256_castsi128_si256(a1_x128);
    3081             :     }
    3082             :     // y calc
    3083             :     __m128i a0_y, a1_y, shifty;
    3084           0 :     if (base_x < min_base_x) {
    3085             :       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
    3086             :       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
    3087           0 :       r6 = _mm_set1_epi16(r << 6);
    3088           0 :       dy128 = _mm_set1_epi16(dy);
    3089           0 :       c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
    3090           0 :       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
    3091           0 :       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
    3092           0 :       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
    3093           0 :       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
    3094             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    3095             : 
    3096           0 :       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    3097           0 :                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
    3098           0 :       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
    3099           0 :                             left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
    3100             :                             0, 0);
    3101             : 
    3102           0 :       if (upsample_left) {
    3103           0 :         shifty = _mm_srli_epi16(
    3104             :             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
    3105             :       } else {
    3106           0 :         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
    3107             :       }
    3108           0 :       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
    3109           0 :       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
    3110           0 :       shift = _mm256_inserti128_si256(shift, shifty, 1);
    3111             :     }
    3112             : 
    3113           0 :     diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    3114           0 :     a32 = _mm256_slli_epi16(a0_x, 5);    // a[x] * 32
    3115           0 :     a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    3116             : 
    3117           0 :     b = _mm256_mullo_epi16(diff, shift);
    3118           0 :     res = _mm256_add_epi16(a32, b);
    3119           0 :     res = _mm256_srli_epi16(res, 5);
    3120             : 
    3121           0 :     resx = _mm256_castsi256_si128(res);
    3122           0 :     resy = _mm256_extracti128_si256(res, 1);
    3123             :     resxy =
    3124           0 :         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    3125           0 :     _mm_storel_epi64((__m128i *)(dst), resxy);
    3126           0 :     dst += stride;
    3127             :   }
    3128           0 : }
    3129             : 
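// Editor's note: a scalar reference for the highbd z2 kernels above and
// below, assuming upsample_above == upsample_left == 0 (so frac_bits == 6
// and min_base_x == min_base_y == -1). Columns whose top-edge position falls
// left of min_base_x switch to the left edge; the AVX2 kernels compute both
// candidates per lane and blend them with HighbdBaseMask (illustration, not
// source):
static void highbd_dr_z2_scalar_sketch(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int32_t bw,
                                       int32_t bh, int32_t dx, int32_t dy) {
    for (int32_t r = 0; r < bh; r++, dst += stride) {
        for (int32_t c = 0; c < bw; c++) {
            const int32_t x = (c << 6) - (r + 1) * dx;  // position on `above`
            const int32_t base_x = x >> 6;
            int32_t val;
            if (base_x >= -1) {  // still on the top edge
                const int32_t shift = (x & 0x3f) >> 1;
                val = (above[base_x] * 32 + 16 +
                       (above[base_x + 1] - above[base_x]) * shift) >> 5;
            } else {             // fall back to the left edge
                const int32_t y = (r << 6) - (c + 1) * dy;
                const int32_t base_y = y >> 6;
                const int32_t shift = (y & 0x3f) >> 1;
                val = (left[base_y] * 32 + 16 +
                       (left[base_y + 1] - left[base_y]) * shift) >> 5;
            }
            dst[c] = (uint16_t)val;
        }
    }
}
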
    3130           0 : static void highbd_dr_prediction_z2_Nx8_avx2(int32_t N, uint16_t *dst, ptrdiff_t stride,
    3131             :                                  const uint16_t *above, const uint16_t *left,
    3132             :                                  int32_t upsample_above, int32_t upsample_left,
    3133             :                                  int32_t dx, int32_t dy) {
    3134           0 :   const int min_base_x = -(1 << upsample_above);
    3135           0 :   const int min_base_y = -(1 << upsample_left);
    3136           0 :   const int frac_bits_x = 6 - upsample_above;
    3137           0 :   const int frac_bits_y = 6 - upsample_left;
    3138             : 
    3139             :   // pre-filter above pixels
    3140             :   // store in temp buffers:
    3141             :   //   above[x] * 32 + 16
    3142             :   //   above[x+1] - above[x]
    3143             :   // final pixels will be calculated as:
    3144             :   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    3145             :   __m128i c3f, min_base_y128;
    3146             :   __m256i a0_x, a1_x, diff, a32, a16;
    3147             :   __m128i a0_x128, a1_x128;
    3148             : 
    3149           0 :   a16 = _mm256_set1_epi16(16);
    3150           0 :   c3f = _mm_set1_epi16(0x3f);
    3151           0 :   min_base_y128 = _mm_set1_epi16(min_base_y);
    3152             : 
    3153           0 :   for (int r = 0; r < N; r++) {
    3154             :     __m256i b, res, shift;
    3155             :     __m128i resx, resy, resxy;
    3156           0 :     int y = r + 1;
    3157           0 :     int base_x = (-y * dx) >> frac_bits_x;
    3158           0 :     int base_shift = 0;
    3159           0 :     if (base_x < (min_base_x - 1)) {
    3160           0 :       base_shift = (min_base_x - base_x - 1) >> upsample_above;
    3161             :     }
    3162           0 :     int base_min_diff =
    3163           0 :         (min_base_x - base_x + upsample_above) >> upsample_above;
    3164           0 :     if (base_min_diff > 8) {
    3165           0 :       base_min_diff = 8;
    3166             :     } else {
    3167           0 :       if (base_min_diff < 0)
    3168           0 :         base_min_diff = 0;
    3169             :     }
    3170             : 
    3171           0 :     if (base_shift > 7) {
    3172           0 :       a0_x = _mm256_setzero_si256();
    3173           0 :       a1_x = _mm256_setzero_si256();
    3174           0 :       shift = _mm256_setzero_si256();
    3175             :     } else {
    3176           0 :       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
    3177           0 :       if (upsample_above) {
    3178             :         __m128i mask, atmp0, atmp1, atmp2, atmp3;
    3179           0 :         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
    3180           0 :         atmp0 = _mm_shuffle_epi8(a0_x128,
    3181           0 :                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
    3182           0 :         atmp1 = _mm_shuffle_epi8(a1_x128,
    3183           0 :                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
    3184           0 :         atmp2 = _mm_shuffle_epi8(
    3185           0 :             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
    3186           0 :         atmp3 = _mm_shuffle_epi8(
    3187           0 :             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
    3188           0 :         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
    3189             :                               _mm_set1_epi8(15));
    3190           0 :         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
    3191           0 :         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
    3192             :                               _mm_set1_epi8(15));
    3193           0 :         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
    3194             : 
    3195           0 :         shift = _mm256_castsi128_si256(_mm_srli_epi16(
    3196             :             _mm_and_si128(
    3197             :                 _mm_slli_epi16(
    3198           0 :                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
    3199           0 :                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
    3200           0 :                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
    3201           0 :                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
    3202             :                     upsample_above),
    3203             :                 c3f),
    3204             :             1));
    3205             :       } else {
    3206           0 :         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
    3207             :         a0_x128 =
    3208           0 :             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3209             :         a1_x128 =
    3210           0 :             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3211             : 
    3212           0 :         shift = _mm256_castsi128_si256(_mm_srli_epi16(
    3213           0 :             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
    3214           0 :                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
    3215           0 :                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
    3216           0 :                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
    3217             :                           c3f),
    3218             :             1));
    3219             :       }
    3220           0 :       a0_x = _mm256_castsi128_si256(a0_x128);
    3221           0 :       a1_x = _mm256_castsi128_si256(a1_x128);
    3222             :     }
    3223             : 
    3224             :     // y calc
    3225             :     __m128i a0_y, a1_y, shifty;
    3226           0 :     if (base_x < min_base_x) {
    3227             :       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
    3228             :       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
    3229           0 :       r6 = _mm_set1_epi16(r << 6);
    3230           0 :       dy128 = _mm_set1_epi16(dy);
    3231           0 :       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    3232           0 :       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
    3233           0 :       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
    3234           0 :       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
    3235           0 :       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
    3236             :       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
    3237             : 
    3238           0 :       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
    3239           0 :                             left[base_y_c[2]], left[base_y_c[3]],
    3240           0 :                             left[base_y_c[4]], left[base_y_c[5]],
    3241           0 :                             left[base_y_c[6]], left[base_y_c[7]]);
    3242           0 :       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
    3243           0 :                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
    3244           0 :                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
    3245           0 :                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
    3246             : 
    3247           0 :       if (upsample_left) {
    3248           0 :         shifty = _mm_srli_epi16(
    3249             :             _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
    3250             :       } else {
    3251           0 :         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
    3252             :       }
    3253           0 :       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
    3254           0 :       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
    3255           0 :       shift = _mm256_inserti128_si256(shift, shifty, 1);
    3256             :     }
    3257             : 
    3258           0 :     diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    3259           0 :     a32 = _mm256_slli_epi16(a0_x, 5);    // a[x] * 32
    3260           0 :     a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    3261             : 
    3262           0 :     b = _mm256_mullo_epi16(diff, shift);
    3263           0 :     res = _mm256_add_epi16(a32, b);
    3264           0 :     res = _mm256_srli_epi16(res, 5);
    3265             : 
    3266           0 :     resx = _mm256_castsi256_si128(res);
    3267           0 :     resy = _mm256_extracti128_si256(res, 1);
    3268             : 
    3269             :     resxy =
    3270           0 :         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    3271             :     _mm_storeu_si128((__m128i *)(dst), resxy);
    3272           0 :     dst += stride;
    3273             :   }
    3274           0 : }
    3275             : 
    3276           0 : static void highbd_dr_prediction_z2_Nx8_32bit_avx2(
    3277             :     int32_t N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    3278             :     const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
    3279             :     int32_t dy) {
    3280           0 :     const int32_t min_base_x = -(1 << upsample_above);
    3281           0 :     const int32_t min_base_y = -(1 << upsample_left);
    3282           0 :     const int32_t frac_bits_x = 6 - upsample_above;
    3283           0 :     const int32_t frac_bits_y = 6 - upsample_left;
    3284             : 
    3285             :     // pre-filter above pixels
    3286             :     // store in temp buffers:
    3287             :     //   above[x] * 32 + 16
    3288             :     //   above[x+1] - above[x]
    3289             :     // final pixels will be calculated as:
    3290             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    3291             :     __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
    3292             :     __m256i diff;
    3293             :     __m128i a0_x128, a1_x128;
    3294             : 
    3295           0 :     a16 = _mm256_set1_epi32(16);
    3296           0 :     c3f = _mm256_set1_epi32(0x3f);
    3297           0 :     min_base_y256 = _mm256_set1_epi32(min_base_y);
    3298             : 
    3299           0 :     for (int32_t r = 0; r < N; r++) {
    3300             :         __m256i b, res, shift;
    3301             :         __m128i resx, resy, resxy;
    3302           0 :         int32_t y = r + 1;
    3303           0 :         int32_t base_x = (-y * dx) >> frac_bits_x;
    3304           0 :         int32_t base_shift = 0;
    3305           0 :         if (base_x < (min_base_x - 1)) {
    3306           0 :             base_shift = (min_base_x - base_x - 1) >> upsample_above;
    3307             :         }
    3308           0 :         int32_t base_min_diff =
    3309           0 :             (min_base_x - base_x + upsample_above) >> upsample_above;
    3310           0 :         if (base_min_diff > 8) {
    3311           0 :             base_min_diff = 8;
    3312             :         }
    3313             :         else {
    3314           0 :             if (base_min_diff < 0) base_min_diff = 0;
    3315             :         }
    3316             : 
    3317           0 :     if (base_shift > 7) {
    3318           0 :       resx = _mm_setzero_si128();
    3319             :     } else {
    3320           0 :       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
    3321           0 :       if (upsample_above) {
    3322             :         __m128i mask, atmp0, atmp1, atmp2, atmp3;
    3323           0 :         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
    3324           0 :         atmp0 = _mm_shuffle_epi8(a0_x128,
    3325           0 :                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
    3326           0 :         atmp1 = _mm_shuffle_epi8(a1_x128,
    3327           0 :                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
    3328           0 :         atmp2 = _mm_shuffle_epi8(
    3329           0 :             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
    3330           0 :         atmp3 = _mm_shuffle_epi8(
    3331           0 :             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
    3332           0 :         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
    3333             :                               _mm_set1_epi8(15));
    3334           0 :         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
    3335           0 :         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
    3336             :                               _mm_set1_epi8(15));
    3337           0 :         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
    3338           0 :         shift = _mm256_srli_epi32(
    3339             :             _mm256_and_si256(
    3340             :                 _mm256_slli_epi32(
    3341           0 :                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
    3342           0 :                     (2 << 6) - y * dx, (3 << 6) - y * dx,
    3343           0 :                     (4 << 6) - y * dx, (5 << 6) - y * dx,
    3344           0 :                     (6 << 6) - y * dx, (7 << 6) - y * dx),
    3345             :                     upsample_above),
    3346             :                     c3f),
    3347             :                     1);
    3348             :       } else {
    3349           0 :                 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
    3350             :                 a0_x128 =
    3351           0 :                     _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3352             :                 a1_x128 =
    3353           0 :                     _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3354             : 
    3355           0 :                 shift = _mm256_srli_epi32(
    3356             :                     _mm256_and_si256(
    3357           0 :                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
    3358           0 :                     (3 << 6) - y * dx, (4 << 6) - y * dx,
    3359           0 :                     (5 << 6) - y * dx, (6 << 6) - y * dx,
    3360           0 :                     (7 << 6) - y * dx),
    3361             :                     c3f),
    3362             :                     1);
    3363             :             }
    3364             : 
    3365           0 :             a0_x = _mm256_cvtepu16_epi32(a0_x128);
    3366           0 :             a1_x = _mm256_cvtepu16_epi32(a1_x128);
    3367             : 
    3368           0 :             diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
    3369           0 :             a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
    3370           0 :             a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    3371             : 
    3372           0 :             b = _mm256_mullo_epi32(diff, shift);
    3373           0 :             res = _mm256_add_epi32(a32, b);
    3374           0 :             res = _mm256_srli_epi32(res, 5);
    3375             : 
    3376           0 :             resx = _mm256_castsi256_si128(_mm256_packus_epi32(
    3377           0 :                 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    3378             :         }
    3379             :         // y calc
    3380           0 :         if (base_x < min_base_x) {
    3381             :             DECLARE_ALIGNED(32, int32_t, base_y_c[8]);
    3382             :             __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
    3383           0 :             r6 = _mm256_set1_epi32(r << 6);
    3384           0 :             dy256 = _mm256_set1_epi32(dy);
    3385           0 :             c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    3386           0 :             y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
    3387           0 :             base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
    3388           0 :             mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
    3389           0 :             base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
    3390             :             _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
    3391             : 
    3392           0 :             a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
    3393           0 :                 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    3394           0 :                 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    3395           0 :                 left[base_y_c[6]], left[base_y_c[7]]));
    3396           0 :             a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
    3397           0 :                 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
    3398           0 :                 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
    3399           0 :                 left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
    3400             : 
    3401           0 :             if (upsample_left) {
    3402           0 :                 shift = _mm256_srli_epi32(
    3403             :                     _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
    3404             :                     1);
    3405             :             }
    3406             :             else {
    3407           0 :                 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
    3408             :             }
    3409           0 :             diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
    3410           0 :             a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
    3411           0 :             a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    3412             : 
    3413           0 :             b = _mm256_mullo_epi32(diff, shift);
    3414           0 :             res = _mm256_add_epi32(a32, b);
    3415           0 :             res = _mm256_srli_epi32(res, 5);
    3416             : 
    3417           0 :             resy = _mm256_castsi256_si128(_mm256_packus_epi32(
    3418           0 :                 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    3419             :         }
    3420             :         else {
    3421           0 :             resy = resx;
    3422             :         }
    3423             :         resxy =
    3424           0 :             _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    3425             :         _mm_storeu_si128((__m128i *)(dst), resxy);
    3426           0 :         dst += stride;
    3427             :     }
    3428           0 : }
    3429             : 
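// Editor's note on the paired 16-bit / 32-bit variants: the interpolation
// holds a[x] * 32 + 16 + (a[x+1] - a[x]) * shift in a single lane before the
// >> 5. With 10-bit samples the worst case still fits in 16 bits:
//   1023 * 32 + 16 + 1023 * 31 = 64465 < 65536
// but with 12-bit samples a[x] * 32 alone already overflows:
//   4095 * 32 = 131040 > 65535
// hence the *_32bit_avx2 variants, which widen each lane to 32 bits (via
// _mm256_cvtepu16_epi32) before the multiply-add.
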
    3430           0 : static void highbd_dr_prediction_z2_HxW_avx2(
    3431             :     int32_t H, int32_t W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    3432             :     const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
    3433             :     int32_t dy) {
    3434             :   // upsample_above and upsample_left are both 0 here, by design of
    3435             :   // av1_use_intra_edge_upsample
    3436           0 :   const int min_base_x = -1;
    3437           0 :   const int min_base_y = -1;
    3438             :   (void)upsample_above;
    3439             :   (void)upsample_left;
    3440           0 :   const int frac_bits_x = 6;
    3441           0 :   const int frac_bits_y = 6;
    3442             : 
    3443             :   // pre-filter above pixels
    3444             :   // store in temp buffers:
    3445             :   //   above[x] * 32 + 16
    3446             :   //   above[x+1] - above[x]
    3447             :   // final pixels will be calculated as:
    3448             :   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    3449             :   __m256i a0_x, a1_x, a32, a16, c3f, c1;
    3450             :   __m256i diff, min_base_y256, dy256, c1234, c0123;
    3451             :   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
    3452             : 
    3453           0 :   a16 = _mm256_set1_epi16(16);
    3454           0 :   c1 = _mm256_srli_epi16(a16, 4);
    3455           0 :   min_base_y256 = _mm256_set1_epi16(min_base_y);
    3456           0 :   c3f = _mm256_set1_epi16(0x3f);
    3457           0 :   dy256 = _mm256_set1_epi16(dy);
    3458             :   c0123 =
    3459           0 :       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    3460           0 :   c1234 = _mm256_add_epi16(c0123, c1);
    3461             : 
    3462           0 :   for (int r = 0; r < H; r++) {
    3463             :     __m256i b, res, shift;
    3464             :     __m256i resx, resy, ydx;
    3465             :     __m256i resxy, j256, r6;
    3466             :     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
    3467           0 :     int y = r + 1;
    3468           0 :     ydx = _mm256_set1_epi16(y * dx);
    3469             : 
    3470           0 :     for (int j = 0; j < W; j += 16) {
    3471           0 :       j256 = _mm256_set1_epi16(j);
    3472           0 :       int base_x = (-y * dx) >> frac_bits_x;
    3473           0 :       int base_shift = 0;
    3474           0 :       if ((base_x + j) < (min_base_x - 1)) {
    3475           0 :         base_shift = (min_base_x - (base_x + j) - 1);
    3476             :       }
    3477           0 :       int base_min_diff = (min_base_x - base_x - j);
    3478           0 :       if (base_min_diff > 16) {
    3479           0 :         base_min_diff = 16;
    3480             :       } else {
    3481           0 :         if (base_min_diff < 0)
    3482           0 :           base_min_diff = 0;
    3483             :       }
    3484             : 
    3485           0 :       if (base_shift < 8) {
    3486           0 :         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
    3487             :         a1_x128 =
    3488           0 :             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
    3489             :         a0_x128 =
    3490           0 :             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3491             :         a1_x128 =
    3492           0 :             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3493             : 
    3494           0 :         a0_x = _mm256_castsi128_si256(a0_x128);
    3495           0 :         a1_x = _mm256_castsi128_si256(a1_x128);
    3496             :       } else {
    3497           0 :         a0_x = _mm256_setzero_si256();
    3498           0 :         a1_x = _mm256_setzero_si256();
    3499             :       }
    3500             : 
    3501           0 :       int base_shift1 = 0;
    3502           0 :       if (base_shift > 8) {
    3503           0 :         base_shift1 = base_shift - 8;
    3504             :       }
    3505           0 :       if (base_shift1 < 8) {
    3506             :         a0_1_x128 =
    3507           0 :             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8 + j));
    3508             :         a1_1_x128 =
    3509           0 :             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9 + j));
    3510           0 :         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
    3511           0 :                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
    3512           0 :         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
    3513           0 :                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
    3514             : 
    3515           0 :         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
    3516           0 :         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
    3517             :       }
    3518           0 :       r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
    3519           0 :       shift = _mm256_srli_epi16(
    3520             :           _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
    3521             : 
    3522           0 :       diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
    3523           0 :       a32 = _mm256_slli_epi16(a0_x, 5);    // a[x] * 32
    3524           0 :       a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    3525             : 
    3526           0 :       b = _mm256_mullo_epi16(diff, shift);
    3527           0 :       res = _mm256_add_epi16(a32, b);
    3528           0 :       resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
    3529             : 
    3530             :       // y calc
    3531           0 :       resy = _mm256_setzero_si256();
    3532             :       __m256i a0_y, a1_y, shifty;
    3533           0 :       if ((base_x < min_base_x)) {
    3534             :         __m256i c256, y_c256, base_y_c256, mask256, mul16;
    3535           0 :         r6 = _mm256_set1_epi16(r << 6);
    3536           0 :         c256 = _mm256_add_epi16(j256, c1234);
    3537           0 :         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
    3538             :                                  _mm256_srli_epi16(min_base_y256, 1));
    3539           0 :         y_c256 = _mm256_sub_epi16(r6, mul16);
    3540           0 :         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
    3541           0 :         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
    3542           0 :         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
    3543             :         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
    3544             : 
    3545           0 :         a0_y = _mm256_setr_epi16(
    3546           0 :             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    3547           0 :             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    3548           0 :             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
    3549           0 :             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
    3550           0 :             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
    3551           0 :             left[base_y_c[15]]);
    3552           0 :         base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
    3553             :         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
    3554             : 
    3555           0 :         a1_y = _mm256_setr_epi16(
    3556           0 :             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    3557           0 :             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    3558           0 :             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
    3559           0 :             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
    3560           0 :             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
    3561           0 :             left[base_y_c[15]]);
    3562             : 
    3563           0 :         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
    3564             : 
    3565           0 :         diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
    3566           0 :         a32 = _mm256_slli_epi16(a0_y, 5);    // a[x] * 32
    3567           0 :         a32 = _mm256_add_epi16(a32, a16);    // a[x] * 32 + 16
    3568             : 
    3569           0 :         b = _mm256_mullo_epi16(diff, shifty);
    3570           0 :         res = _mm256_add_epi16(a32, b);
    3571           0 :         resy = _mm256_srli_epi16(res, 5);
    3572             :       }
    3573             : 
    3574           0 :       resxy = _mm256_blendv_epi8(resx, resy,
    3575           0 :                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
    3576           0 :       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
    3577             :     } // for j
    3578           0 :     dst += stride;
    3579             :   }
    3580           0 : }
    3581             : 
    3582           0 : static void highbd_dr_prediction_z2_HxW_32bit_avx2(
    3583             :     int32_t H, int32_t W, uint16_t *dst, ptrdiff_t stride,
    3584             :     const uint16_t *above, const uint16_t *left, int32_t upsample_above,
    3585             :     int32_t upsample_left, int32_t dx, int32_t dy) {
    3586             :     // upsample_above and upsample_left are both 0 here, by design of
    3587             :     // av1_use_intra_edge_upsample
    3588           0 :     const int32_t min_base_x = -1;
    3589           0 :     const int32_t min_base_y = -1;
    3590             :     (void)upsample_above;
    3591             :     (void)upsample_left;
    3592           0 :     const int32_t frac_bits_x = 6;
    3593           0 :     const int32_t frac_bits_y = 6;
    3594             : 
    3595             :     // pre-filter above pixels
    3596             :     // store in temp buffers:
    3597             :     //   above[x] * 32 + 16
    3598             :     //   above[x+1] - above[x]
    3599             :     //   final pixels will be calculated as:
    3600             :     //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
    3601             :     __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
    3602             :     __m256i diff, min_base_y256, c3f;
    3603             :     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
    3604             : 
    3605           0 :     a16 = _mm256_set1_epi32(16);
    3606           0 :     min_base_y256 = _mm256_set1_epi32(min_base_y);
    3607           0 :     c3f = _mm256_set1_epi32(0x3f);
    3608           0 :     for (int32_t r = 0; r < H; r++) {
    3609             :         __m256i b, res, shift;
    3610             :         __m256i resx[2], resy[2];
    3611             :         __m256i resxy;
    3612           0 :         for (int32_t j = 0; j < W; j += 16) {
    3613           0 :             int32_t y = r + 1;
    3614           0 :             int32_t base_x = (-y * dx) >> frac_bits_x;
    3615           0 :             int32_t base_shift = 0;
    3616           0 :             if ((base_x + j) < (min_base_x - 1)) {
    3617           0 :                 base_shift = (min_base_x - (base_x + j) - 1);
    3618             :             }
    3619           0 :             int32_t base_min_diff = (min_base_x - base_x - j);
    3620           0 :             if (base_min_diff > 16) {
    3621           0 :                 base_min_diff = 16;
    3622             :             }
    3623             :             else {
    3624           0 :                 if (base_min_diff < 0) base_min_diff = 0;
    3625             :             }
    3626             : 
    3627           0 :             if (base_shift > 7) {
    3628           0 :                 resx[0] = _mm256_setzero_si256();
    3629             :             }
    3630             :             else {
    3631           0 :                 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
    3632             :                 a1_x128 =
    3633           0 :                     _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
    3634             :                 a0_x128 =
    3635           0 :                     _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3636             :                 a1_x128 =
    3637           0 :                     _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
    3638             : 
    3639           0 :                 a0_x = _mm256_cvtepu16_epi32(a0_x128);
    3640           0 :                 a1_x = _mm256_cvtepu16_epi32(a1_x128);
    3641             : 
    3642           0 :                 shift = _mm256_srli_epi32(
    3643             :                     _mm256_and_si256(
    3644             :                     _mm256_setr_epi32(
    3645           0 :                     ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
    3646           0 :                     ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
    3647           0 :                     ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
    3648           0 :                     ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
    3649             :                     _mm256_set1_epi32(0x3f)),
    3650             :                     1);
    3651             : 
    3652           0 :                 diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
    3653           0 :                 a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
    3654           0 :                 a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    3655             : 
    3656           0 :                 b = _mm256_mullo_epi32(diff, shift);
    3657           0 :                 res = _mm256_add_epi32(a32, b);
    3658           0 :                 res = _mm256_srli_epi32(res, 5);
    3659             : 
    3660           0 :                 resx[0] = _mm256_packus_epi32(
    3661           0 :                     res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
    3662             :             }
    3663           0 :             base_shift = 0;
    3664           0 :             if ((base_x + j + 8) < (min_base_x - 1)) {
    3665           0 :                 base_shift = (min_base_x - (base_x + j + 8) - 1);
    3666             :             }
    3667           0 :             if (base_shift > 7) {
    3668           0 :                 resx[1] = _mm256_setzero_si256();
    3669             :             }
    3670             :             else {
    3671             :                 a0_1_x128 =
    3672           0 :                     _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
    3673             :                 a1_1_x128 =
    3674           0 :                     _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
    3675           0 :                 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
    3676           0 :                     *(__m128i *)HighbdLoadMaskx[base_shift]);
    3677           0 :                 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
    3678           0 :                     *(__m128i *)HighbdLoadMaskx[base_shift]);
    3679             : 
    3680           0 :                 a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
    3681           0 :                 a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
    3682             : 
    3683           0 :                 shift = _mm256_srli_epi32(
    3684             :                     _mm256_and_si256(
    3685             :                     _mm256_setr_epi32(
    3686           0 :                     ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
    3687           0 :                     ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
    3688           0 :                     ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
    3689           0 :                     ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
    3690             :                     _mm256_set1_epi32(0x3f)),
    3691             :                     1);
    3692             : 
    3693           0 :                 diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
    3694           0 :                 a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
    3695           0 :                 a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
    3696           0 :                 b = _mm256_mullo_epi32(diff, shift);
    3697             : 
    3698           0 :                 resx[1] = _mm256_add_epi32(a32, b);
    3699           0 :                 resx[1] = _mm256_srli_epi32(resx[1], 5);
    3700           0 :                 resx[1] = _mm256_packus_epi32(
    3701             :                     resx[1],
    3702           0 :                     _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
    3703             :             }
    3704           0 :             resx[0] =
    3705           0 :                 _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
    3706             :                 1);  // 16 16-bit values
    3707             : 
    3708             :             // y calc: pixels projecting left of 'above' are interpolated from left[]
    3709           0 :             if (base_x < min_base_x) {
    3710             :                 DECLARE_ALIGNED(32, int32_t, base_y_c[16]);
    3711             :                 __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
    3712           0 :                 r6 = _mm256_set1_epi32(r << 6);
    3713           0 :                 dy256 = _mm256_set1_epi32(dy);
    3714           0 :                 c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
    3715             :                     7 + j, 8 + j);
    3716           0 :                 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
    3717           0 :                 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
    3718           0 :                 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
    3719           0 :                 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
    3720             :                 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
    3721           0 :                 c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
    3722             :                     15 + j, 16 + j);
    3723           0 :                 y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
    3724           0 :                 base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
    3725           0 :                 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
    3726           0 :                 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
    3727           0 :                 _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
    3728             : 
    3729           0 :                 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
    3730           0 :                     left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
    3731           0 :                     left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
    3732           0 :                     left[base_y_c[6]], left[base_y_c[7]]));
    3733           0 :                 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
    3734           0 :                     left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
    3735           0 :                     left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
    3736           0 :                     left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
    3737             : 
    3738           0 :                 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
    3739             : 
    3740           0 :                 diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
    3741           0 :                 a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
    3742           0 :                 a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    3743             : 
    3744           0 :                 b = _mm256_mullo_epi32(diff, shift);
    3745           0 :                 res = _mm256_add_epi32(a32, b);
    3746           0 :                 res = _mm256_srli_epi32(res, 5);
    3747             : 
    3748           0 :                 resy[0] = _mm256_packus_epi32(
    3749           0 :                     res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
    3750             : 
    3751           0 :                 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
    3752           0 :                     left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
    3753           0 :                     left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
    3754           0 :                     left[base_y_c[14]], left[base_y_c[15]]));
    3755           0 :                 a1_y = _mm256_cvtepu16_epi32(
    3756           0 :                     _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
    3757           0 :                     left[base_y_c[10] + 1], left[base_y_c[11] + 1],
    3758           0 :                     left[base_y_c[12] + 1], left[base_y_c[13] + 1],
    3759           0 :                     left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
    3760           0 :                 shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
    3761             : 
    3762           0 :                 diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
    3763           0 :                 a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
    3764           0 :                 a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
    3765             : 
    3766           0 :                 b = _mm256_mullo_epi32(diff, shift);
    3767           0 :                 res = _mm256_add_epi32(a32, b);
    3768           0 :                 res = _mm256_srli_epi32(res, 5);
    3769             : 
    3770           0 :                 resy[1] = _mm256_packus_epi32(
    3771           0 :                     res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
    3772             : 
    3773           0 :                 resy[0] =
    3774           0 :                     _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
    3775             :                     1);  // 16 16-bit values
    3776             :             }
    3777             :             else {
    3778           0 :                 resy[0] = resx[0];
    3779             :             }
    3780           0 :             resxy = _mm256_blendv_epi8(resx[0], resy[0],
    3781           0 :                 *(__m256i *)HighbdBaseMask[base_min_diff]);
    3782           0 :             _mm256_storeu_si256((__m256i *)(dst + j), resxy);
    3783             :         }  // for j
    3784           0 :         dst += stride;
    3785             :     }
    3786           0 : }
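                     : 
                     : /* Editorial sketch: each output lane above is the AV1 two-tap directional
                     :  * blend of adjacent reference pixels a0 and a1, weighted by the 6-bit
                     :  * fractional part of the projected position and rounded. A scalar model of
                     :  * the vector sequence (diff / a32 / b / res), for illustration only:
                     :  *
                     :  *     static inline uint16_t two_tap_blend(uint16_t a0, uint16_t a1,
                     :  *                                          int frac6) {
                     :  *         const int shift = (frac6 & 0x3f) >> 1;   // 5-bit weight, 0..31
                     :  *         return (uint16_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
                     :  *     }
                     :  *
                     :  * i.e. res = ((32 - shift) * a0 + shift * a1 + 16) >> 5, where frac6 is
                     :  * the low six bits of the projected position ((x << 6) - y * dx).
                     :  */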
    3787             : 
    3788             : // Directional prediction, zone 2: 90 < angle < 180
    3789           0 : void eb_av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
    3790             :     int32_t bh, const uint16_t *above, const uint16_t *left, int32_t upsample_above,
    3791             :     int32_t upsample_left, int32_t dx, int32_t dy, int32_t bd) {
    3792             :     // bd selects between the 16-bit and 32-bit kernels below
    3793           0 :     assert(dx > 0);
    3794           0 :     assert(dy > 0);
    3795           0 :     switch (bw) {
    3796           0 :     case 4:
    3797           0 :         highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
    3798             :             upsample_above, upsample_left, dx, dy);
    3799           0 :         break;
    3800           0 :     case 8:
    3801           0 :         if (bd < 12) {
    3802           0 :             highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
    3803             :                 upsample_above, upsample_left, dx, dy);
    3804             :         } else {
    3805           0 :             highbd_dr_prediction_z2_Nx8_32bit_avx2(bh, dst, stride, above,
    3806             :                 left, upsample_above, upsample_left, dx, dy);
    3807             :         }
    3808           0 :         break;
    3809           0 :     default:
    3810           0 :         if (bd < 12) {
    3811           0 :             highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
    3812             :                 upsample_above, upsample_left, dx, dy);
    3813             :         } else {
    3814           0 :             highbd_dr_prediction_z2_HxW_32bit_avx2(bh, bw, dst, stride,
    3815             :                 above, left, upsample_above, upsample_left,
    3816             :                 dx, dy);
    3817             :         }
    3818           0 :         break;
    3819             :     }
    3820           0 :     return;
    3821             : }
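                     : 
                     : /* Editorial note: the bd < 12 split selects the 16-bit kernels whenever
                     :  * the blend cannot overflow a 16-bit lane. Since the result before the
                     :  * final shift is (32 - shift) * a0 + shift * a1 + 16 <= 32 * max(a0, a1)
                     :  * + 16, a worked bound (sketch) is:
                     :  *
                     :  *     // 10-bit input: 1023 * 32 + 16 = 32752  <= 65535 -> 16-bit lanes OK
                     :  *     // 12-bit input: 4095 * 32 + 16 = 131056 >  65535 -> use *_32bit kernels
                     :  *
                     :  * (Assumes the 16-bit kernels mirror the arithmetic shown in the 32-bit
                     :  * path above.)
                     :  */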
    3822             : 
    3823           0 : static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
    3824             :                                       uint16_t *dst, ptrdiff_t pitchDst) {
    3825             :   __m256i r[16];
    3826             :   __m256i d[16];
    3827           0 :   for (int j = 0; j < 16; j++) {
    3828           0 :     r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
    3829             :   }
    3830           0 :   transpose_16bit_16x16_avx2(r, d);
    3831           0 :   for (int j = 0; j < 16; j++) {
    3832           0 :     _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
    3833             :   }
    3834           0 : }
    3835             : 
    3836           0 : static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
    3837             :     uint16_t *dst, ptrdiff_t pitchDst, int32_t width,
    3838             :     int32_t height) {
    3839           0 :   for (int j = 0; j < height; j += 16)
    3840           0 :     for (int i = 0; i < width; i += 16)
    3841           0 :       highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
    3842           0 :                                 dst + j * pitchDst + i, pitchDst);
    3843           0 : }
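                     : 
                     : /* Editorial sketch: scalar reference for highbd_transpose() above, using
                     :  * the same argument convention (width/height describe the destination):
                     :  *
                     :  *     static void highbd_transpose_c(const uint16_t *src, ptrdiff_t pitch_src,
                     :  *                                    uint16_t *dst, ptrdiff_t pitch_dst,
                     :  *                                    int32_t width, int32_t height) {
                     :  *         for (int32_t r = 0; r < height; ++r)
                     :  *             for (int32_t c = 0; c < width; ++c)
                     :  *                 dst[r * pitch_dst + c] = src[c * pitch_src + r];
                     :  *     }
                     :  *
                     :  * The AVX2 version computes the same mapping one 16x16 tile at a time:
                     :  * the tile read at src + i * pitchSrc + j lands at dst + j * pitchDst + i.
                     :  */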
    3844             : 
    3845           0 : static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
    3846             :     const uint16_t *left,
    3847             :     int32_t upsample_left, int32_t dy) {
    3848             :     __m128i dstvec[4], d[4];
    3849             : 
    3850             :     highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
    3851           0 :     highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
    3852             :         &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
    3853           0 :     _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
    3854           0 :     _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
    3855           0 :     _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
    3856           0 :     _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
    3857           0 :     return;
    3858             : }
    3859             : 
    3860           0 : static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
    3861             :     const uint16_t *left,
    3862             :     int32_t upsample_left, int32_t dy) {
    3863             :     __m128i dstvec[8], d[8];
    3864             : 
    3865             :     highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
    3866           0 :     highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    3867             :         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
    3868             :         &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
    3869             :         &d[7]);
    3870           0 :     for (int32_t i = 0; i < 8; i++) {
    3871           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    3872             :     }
    3873           0 : }
    3874             : 
    3875           0 : static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
    3876             :     const uint16_t *left,
    3877             :     int32_t upsample_left, int32_t dy) {
    3878             :     __m128i dstvec[4], d[8];
    3879             : 
    3880             :     highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
    3881           0 :     highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    3882             :         &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
    3883             :         &d[7]);
    3884           0 :     for (int32_t i = 0; i < 8; i++) {
    3885           0 :         _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    3886             :     }
    3887           0 : }
    3888             : 
    3889           0 : static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
    3890             :     const uint16_t *left,
    3891             :     int32_t upsample_left, int32_t dy) {
    3892             :     __m128i dstvec[8], d[4];
    3893             : 
    3894             :     highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
    3895           0 :     highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    3896             :         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
    3897             :         &d[0], &d[1], &d[2], &d[3]);
    3898           0 :     _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
    3899           0 :     _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
    3900           0 :     _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
    3901           0 :     _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
    3902           0 : }
    3903             : 
    3904           0 : static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
    3905             :     const uint16_t *left,
    3906             :     int32_t upsample_left, int32_t dy) {
    3907             :     __m256i dstvec[8], d[16];
    3908             : 
    3909             :     highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
    3910             :         dy);
    3911           0 :     highbd_transpose8x16_16x8_avx2(dstvec, d);
    3912           0 :     for (int32_t i = 0; i < 8; i++) {
    3913           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride),
    3914             :             _mm256_castsi256_si128(d[i]));
    3915             :     }
    3916           0 :     for (int32_t i = 8; i < 16; i++) {
    3917           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride),
    3918           0 :             _mm256_extracti128_si256(d[i - 8], 1));
    3919             :     }
    3920           0 : }
    3921             : 
    3922           0 : static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    3923             :     const uint16_t *left,
    3924             :     int32_t upsample_left, int32_t dy) {
    3925             :     __m128i dstvec[16], d[16];
    3926             : 
    3927             :     highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
    3928             :         dy);
    3929           0 :     for (int32_t i = 0; i < 16; i += 8) {
    3930           0 :         highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
    3931           0 :             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
    3932           0 :             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
    3933           0 :             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
    3934           0 :             &d[5 + i], &d[6 + i], &d[7 + i]);
    3935             :     }
    3936           0 :     for (int32_t i = 0; i < 8; i++) {
    3937           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    3938           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
    3939             :     }
    3940           0 : }
    3941             : 
    3942           0 : static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
    3943             :     const uint16_t *left,
    3944             :     int32_t upsample_left, int32_t dy) {
    3945             :     __m256i dstvec[4], d[4], d1;
    3946             : 
    3947             :     highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
    3948             :         dy);
    3949           0 :     highbd_transpose4x16_avx2(dstvec, d);
    3950           0 :     for (int32_t i = 0; i < 4; i++) {
    3951           0 :         _mm_storel_epi64((__m128i *)(dst + i * stride),
    3952             :             _mm256_castsi256_si128(d[i]));
    3953           0 :         d1 = _mm256_srli_si256(d[i], 8);
    3954           0 :         _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
    3955             :             _mm256_castsi256_si128(d1));
    3956           0 :         _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
    3957           0 :             _mm256_extracti128_si256(d[i], 1));
    3958           0 :         _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
    3959           0 :             _mm256_extracti128_si256(d1, 1));
    3960             :     }
    3961           0 : }
    3962             : 
    3963           0 : static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    3964             :     const uint16_t *left,
    3965             :     int32_t upsample_left, int32_t dy) {
    3966             :     __m128i dstvec[16], d[8];
    3967             : 
    3968             :     highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
    3969             :         dy);
    3970           0 :     highbd_transpose16x4_8x8_sse2(dstvec, d);
    3971             : 
    3972           0 :     _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
    3973           0 :     _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
    3974           0 :     _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
    3975           0 :     _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
    3976           0 :     _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
    3977           0 :     _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
    3978           0 :     _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
    3979           0 :     _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
    3980           0 : }
    3981             : 
    3982           0 : static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
    3983             :     const uint16_t *left,
    3984             :     int32_t upsample_left, int32_t dy) {
    3985             :     __m256i dstvec[16], d[16];
    3986             : 
    3987             :     highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
    3988             :         dy);
    3989           0 :     for (int32_t i = 0; i < 16; i += 8) {
    3990           0 :         highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
    3991             :     }
    3992             : 
    3993           0 :     for (int32_t i = 0; i < 8; i++) {
    3994           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride),
    3995             :             _mm256_castsi256_si128(d[i]));
    3996             :     }
    3997           0 :     for (int32_t i = 0; i < 8; i++) {
    3998           0 :         _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
    3999           0 :             _mm256_extracti128_si256(d[i], 1));
    4000             :     }
    4001           0 :     for (int32_t i = 8; i < 16; i++) {
    4002           0 :         _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
    4003             :             _mm256_castsi256_si128(d[i]));
    4004             :     }
    4005           0 :     for (int32_t i = 8; i < 16; i++) {
    4006           0 :         _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
    4007           0 :             _mm256_extracti128_si256(d[i], 1));
    4008             :     }
    4009           0 : }
    4010             : 
    4011           0 : static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    4012             :     const uint16_t *left,
    4013             :     int32_t upsample_left, int32_t dy) {
    4014             :     __m128i dstvec[32], d[32];
    4015             : 
    4016             :     highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
    4017             :         dy);
    4018           0 :     for (int32_t i = 0; i < 32; i += 8) {
    4019           0 :         highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
    4020           0 :             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
    4021           0 :             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
    4022           0 :             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
    4023           0 :             &d[5 + i], &d[6 + i], &d[7 + i]);
    4024             :     }
    4025           0 :     for (int32_t i = 0; i < 8; i++) {
    4026           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    4027           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
    4028           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
    4029           0 :         _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
    4030             :     }
    4031           0 : }
    4032             : 
    4033           0 : static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4034             :     const uint16_t *left,
    4035             :     int32_t upsample_left, int32_t dy) {
    4036             :     __m256i dstvec[16], d[16];
    4037             : 
    4038             :     highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
    4039             :         dy);
    4040           0 :     transpose_16bit_16x16_avx2(dstvec, d);
    4041             : 
    4042           0 :     for (int32_t i = 0; i < 16; i++) {
    4043           0 :         _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
    4044             :     }
    4045           0 : }
    4046             : 
    4047           0 : static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4048             :     const uint16_t *left,
    4049             :     int32_t upsample_left, int32_t dy) {
    4050             :     __m256i dstvec[64], d[16];
    4051             : 
    4052             :     highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
    4053             :         dy);
    4054             : 
    4055           0 :     transpose_16bit_16x16_avx2(dstvec, d);
    4056           0 :     for (int32_t j = 0; j < 16; j++) {
    4057           0 :         _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
    4058             :     }
    4059           0 :     transpose_16bit_16x16_avx2(dstvec + 16, d);
    4060           0 :     for (int32_t j = 0; j < 16; j++) {
    4061           0 :         _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
    4062             :     }
    4063           0 :     transpose_16bit_16x16_avx2(dstvec + 32, d);
    4064           0 :     for (int32_t j = 0; j < 16; j++) {
    4065           0 :         _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
    4066             :     }
    4067           0 :     transpose_16bit_16x16_avx2(dstvec + 48, d);
    4068           0 :     for (int32_t j = 0; j < 16; j++) {
    4069           0 :         _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
    4070             :     }
    4071           0 : }
    4072             : 
    4073           0 : static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4074             :     const uint16_t *left,
    4075             :     int32_t upsample_left, int32_t dy) {
    4076             :     DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
    4077           0 :     highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
    4078           0 :     highbd_transpose(dstT, 64, dst, stride, 64, 64);
    4079           0 : }
    4080             : 
    4081           0 : static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4082             :     const uint16_t *left,
    4083             :     int32_t upsample_left, int32_t dy) {
    4084             :     __m256i dstvec[32], d[32];
    4085             : 
    4086             :     highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
    4087             :         dy);
    4088           0 :     for (int32_t i = 0; i < 32; i += 8) {
    4089           0 :         highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
    4090             :     }
    4091             :     // store
    4092           0 :     for (int32_t j = 0; j < 32; j += 16) {
    4093           0 :         for (int32_t i = 0; i < 8; i++) {
    4094           0 :             _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
    4095           0 :                 _mm256_castsi256_si128(d[(i + j)]));
    4096             :         }
    4097           0 :         for (int32_t i = 0; i < 8; i++) {
    4098           0 :             _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
    4099           0 :                 _mm256_castsi256_si128(d[(i + j) + 8]));
    4100             :         }
    4101           0 :         for (int32_t i = 8; i < 16; i++) {
    4102             :             _mm256_storeu_si256(
    4103           0 :                 (__m256i *)(dst + (i + j) * stride),
    4104           0 :                 _mm256_inserti128_si256(
    4105             :                 d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
    4106             :         }
    4107             :     }
    4108           0 : }
    4109             : 
    4110           0 : static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4111             :     const uint16_t *left,
    4112             :     int32_t upsample_left, int32_t dy) {
    4113             :     __m256i dstvec[32], d[16];
    4114             : 
    4115             :     highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
    4116             :         dy);
    4117           0 :     for (int32_t i = 0; i < 32; i += 16) {
    4118           0 :         transpose_16bit_16x16_avx2((dstvec + i), d);
    4119           0 :         for (int32_t j = 0; j < 16; j++) {
    4120           0 :             _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
    4121             :         }
    4122             :     }
    4123           0 : }
    4124             : 
    4125           0 : static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4126             :     const uint16_t *left,
    4127             :     int32_t upsample_left, int32_t dy) {
    4128             :     uint16_t dstT[64 * 32];
    4129           0 :     highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
    4130           0 :     highbd_transpose(dstT, 64, dst, stride, 32, 64);
    4131           0 : }
    4132             : 
    4133           0 : static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4134             :     const uint16_t *left,
    4135             :     int32_t upsample_left, int32_t dy) {
    4136             :     DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
    4137           0 :     highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
    4138           0 :     highbd_transpose(dstT, 32, dst, stride, 64, 32);
    4139           0 :     return;
    4140             : }
    4141             : 
    4142           0 : static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4143             :     const uint16_t *left,
    4144             :     int32_t upsample_left, int32_t dy) {
    4145             :     DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
    4146           0 :     highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
    4147           0 :     highbd_transpose(dstT, 64, dst, stride, 16, 64);
    4148           0 : }
    4149             : 
    4150           0 : static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4151             :     const uint16_t *left,
    4152             :     int32_t upsample_left, int32_t dy) {
    4153             :     __m256i dstvec[64], d[16];
    4154             : 
    4155             :     highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
    4156             :         dy);
    4157           0 :     for (int32_t i = 0; i < 64; i += 16) {
    4158           0 :         transpose_16bit_16x16_avx2((dstvec + i), d);
    4159           0 :         for (int32_t j = 0; j < 16; j++) {
    4160           0 :             _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
    4161             :         }
    4162             :     }
    4163           0 : }
    4164             : 
    4165           0 : void eb_av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
    4166             :     int32_t bh, const uint16_t *above,
    4167             :     const uint16_t *left, int32_t upsample_left,
    4168             :     int32_t dx, int32_t dy, int32_t bd) {
    4169             :     (void)above;
    4170             :     (void)dx;
    4171             :     (void)bd;
    4172           0 :     assert(dx == 1);
    4173           0 :     assert(dy > 0);
    4174           0 :     if (bw == bh) {
    4175           0 :         switch (bw) {
    4176           0 :         case 4:
    4177           0 :             highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
    4178           0 :             break;
    4179           0 :         case 8:
    4180           0 :             highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
    4181           0 :             break;
    4182           0 :         case 16:
    4183           0 :             highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left,
    4184             :                 dy);
    4185           0 :             break;
    4186           0 :         case 32:
    4187           0 :             highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left,
    4188             :                 dy);
    4189           0 :             break;
    4190           0 :         case 64:
    4191           0 :             highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left,
    4192             :                 dy);
    4193           0 :             break;
    4194             :         }
    4195           0 :     }
    4196             :     else {
    4197           0 :         if (bw < bh) {
    4198           0 :             if (bw + bw == bh) {
    4199           0 :                 switch (bw) {
    4200           0 :                 case 4:
    4201           0 :                     highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
    4202             :                         dy);
    4203           0 :                     break;
    4204           0 :                 case 8:
    4205           0 :                     highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
    4206             :                         dy);
    4207           0 :                     break;
    4208           0 :                 case 16:
    4209           0 :                     highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
    4210             :                         dy);
    4211           0 :                     break;
    4212           0 :                 case 32:
    4213           0 :                     highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
    4214             :                         dy);
    4215           0 :                     break;
    4216             :                 }
    4217           0 :             }
    4218             :             else {
    4219           0 :                 switch (bw) {
    4220           0 :                 case 4:
    4221           0 :                     highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
    4222             :                         dy);
    4223           0 :                     break;
    4224           0 :                 case 8:
    4225           0 :                     highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
    4226             :                         dy);
    4227           0 :                     break;
    4228           0 :                 case 16:
    4229           0 :                     highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
    4230             :                         dy);
    4231           0 :                     break;
    4232             :                 }
    4233           0 :             }
    4234             :         }
    4235             :         else {
    4236           0 :             if (bh + bh == bw) {
    4237           0 :                 switch (bh) {
    4238           0 :                 case 4:
    4239           0 :                     highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
    4240             :                         dy);
    4241           0 :                     break;
    4242           0 :                 case 8:
    4243           0 :                     highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
    4244             :                         dy);
    4245           0 :                     break;
    4246           0 :                 case 16:
    4247           0 :                     highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
    4248             :                         dy);
    4249           0 :                     break;
    4250           0 :                 case 32:
    4251           0 :                     highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
    4252             :                         dy);
    4253           0 :                     break;
    4254             :                 }
    4255           0 :             }
    4256             :             else {
    4257           0 :                 switch (bh) {
    4258           0 :                 case 4:
    4259           0 :                     highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
    4260             :                         dy);
    4261           0 :                     break;
    4262           0 :                 case 8:
    4263           0 :                     highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
    4264             :                         dy);
    4265           0 :                     break;
    4266           0 :                 case 16:
    4267           0 :                     highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
    4268             :                         dy);
    4269           0 :                     break;
    4270             :                 }
    4271           0 :             }
    4272             :         }
    4273             :     }
    4274           0 :     return;
    4275             : }
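                     : 
                     : /* Editorial sketch: every zone-3 helper dispatched above follows the same
                     :  * two-step pattern -- zone 3 (angle > 180) reads only the left edge, so
                     :  * predicting along that edge with a zone-1 kernel and transposing gives
                     :  * the block. Sketched for a square size ('predict_z1' is a hypothetical
                     :  * stand-in for the matching highbd_dr_prediction_z1_* kernel;
                     :  * highbd_transpose_c is the scalar reference sketched earlier):
                     :  *
                     :  *     void dr_z3_sketch(uint16_t *dst, ptrdiff_t stride,
                     :  *                       const uint16_t *left, int32_t upsample_left,
                     :  *                       int32_t dy, int32_t n) {
                     :  *         uint16_t tmp[64 * 64];                          // scratch, n <= 64
                     :  *         predict_z1(n, tmp, n, left, upsample_left, dy); // hypothetical
                     :  *         highbd_transpose_c(tmp, n, dst, stride, n, n);
                     :  *     }
                     :  */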
    4276             : 
    4277     4223800 : static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
    4278             :                                  const __m256i *topleft) {
    4279     4223800 :   __m256i pl = _mm256_sub_epi16(*top, *topleft);    // p - left, with p = left + top - topleft
    4280     8447600 :   __m256i pt = _mm256_sub_epi16(*left, *topleft);   // p - top
    4281     8447600 :   __m256i ptl = _mm256_abs_epi16(_mm256_add_epi16(pl, pt));  // |p - topleft|
    4282     4223800 :   pl = _mm256_abs_epi16(pl);                        // |p - left|
    4283     4223800 :   pt = _mm256_abs_epi16(pt);                        // |p - top|
    4284             : 
    4285     4223800 :   __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
    4286     8447600 :   mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));  // left is not the closest
    4287     4223800 :   __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);                  // topleft beats top
    4288             : 
    4289     4223800 :   pl = _mm256_andnot_si256(mask1, *left);           // keep left where it wins
    4290             : 
    4291     4223800 :   ptl = _mm256_and_si256(mask2, *topleft);          // topleft where it beats top
    4292     8447600 :   pt = _mm256_andnot_si256(mask2, *top);            // top otherwise
    4293     4223800 :   pt = _mm256_or_si256(pt, ptl);
    4294     4223800 :   pt = _mm256_and_si256(mask1, pt);                 // only where left lost
    4295             : 
    4296     4223800 :   return _mm256_or_si256(pt, pl);
    4297             : }
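                     : 
                     : /* Editorial sketch: scalar reference for the masked selection above (the
                     :  * standard Paeth predictor) -- predict p = left + top - topleft, then
                     :  * return whichever neighbour is closest to p:
                     :  *
                     :  *     #include <stdlib.h>  // abs()
                     :  *
                     :  *     static inline uint16_t paeth_pred_c(uint16_t left, uint16_t top,
                     :  *                                         uint16_t topleft) {
                     :  *         const int p = (int)left + top - topleft;
                     :  *         const int pl = abs(p - left);      // vector pl after abs
                     :  *         const int pt = abs(p - top);       // vector pt after abs
                     :  *         const int ptl = abs(p - topleft);  // vector ptl
                     :  *         if (pl <= pt && pl <= ptl) return left;
                     :  *         return (pt <= ptl) ? top : topleft;
                     :  *     }
                     :  */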
    4298             : 
    4299             : // Return 16 8-bit pixels in one row (__m128i)
    4300     3904150 : static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
    4301             :                                       const __m256i *topleft) {
    4302     3904150 :   const __m256i p0 = paeth_pred(left, top, topleft);
    4303     3904180 :   const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
    4304     3904180 :   const __m256i p = _mm256_packus_epi16(p0, p1);
    4305     3904180 :   return _mm256_castsi256_si128(p);
    4306             : }
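                     : 
                     : /* Editorial note: _mm256_packus_epi16 packs within each 128-bit lane, so
                     :  * the high lane of p0 is first copied down with
                     :  * _mm256_permute4x64_epi64(p0, 0xe); after the pack, all 16 bytes of the
                     :  * row sit in the low lane, which _mm256_castsi256_si128 extracts. */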
    4307             : 
    4308      153841 : static INLINE __m256i get_top_vector(const uint8_t *above) {
    4309      153841 :   const __m128i x = _mm_load_si128((const __m128i *)above);
    4310      153841 :   const __m128i zero = _mm_setzero_si128();
    4311      153841 :   const __m128i t0 = _mm_unpacklo_epi8(x, zero);
    4312      153841 :   const __m128i t1 = _mm_unpackhi_epi8(x, zero);
    4313      153841 :   return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
    4314             : }
    4315             : 
    4316       16758 : void eb_aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
    4317             :                                    const uint8_t *above, const uint8_t *left) {
    4318       16758 :   __m128i x = _mm_loadl_epi64((const __m128i *)left);
    4319       16758 :   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
    4320       33516 :   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    4321       16758 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4322       16758 :   const __m256i one = _mm256_set1_epi16(1);
    4323       16758 :   const __m256i top = get_top_vector(above);
    4324             : 
    4325             :   int i;
    4326      150820 :   for (i = 0; i < 8; ++i) {
    4327      134062 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4328      134062 :     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    4329             : 
    4330             :     _mm_storeu_si128((__m128i *)dst, row);
    4331      134062 :     dst += stride;
    4332      134062 :     rep = _mm256_add_epi16(rep, one);
    4333             :   }
    4334       16758 : }
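                     : 
                     : /* Editorial note: the rep / _mm256_shuffle_epi8 pair above broadcasts one
                     :  * left pixel per row. vpshufb zeroes any byte whose control byte has its
                     :  * high bit set, so the 16-bit control 0x8000 + i is the byte pair
                     :  * {i, 0x80}: byte i of 'l' in the low byte, zero in the high byte, i.e.
                     :  * left[i] zero-extended to 16 bits in every lane. A scalar model of the
                     :  * per-byte rule (per 128-bit lane):
                     :  *
                     :  *     static inline uint8_t pshufb_byte(const uint8_t src[16], uint8_t ctrl) {
                     :  *         return (ctrl & 0x80) ? 0 : src[ctrl & 0x0f];
                     :  *     }
                     :  *
                     :  * Incrementing rep by one each row steps the low control byte to the next
                     :  * left pixel.
                     :  */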
    4335             : 
    4336      139790 : static INLINE __m256i get_left_vector(const uint8_t *left) {
    4337      139790 :   const __m128i x = _mm_load_si128((const __m128i *)left);
    4338      139790 :   return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
    4339             : }
    4340             : 
    4341       32909 : void eb_aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
    4342             :                                     const uint8_t *above, const uint8_t *left) {
    4343       32909 :   const __m256i l = get_left_vector(left);
    4344       65818 :   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    4345       32909 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4346       32909 :   const __m256i one = _mm256_set1_epi16(1);
    4347       32909 :   const __m256i top = get_top_vector(above);
    4348             : 
    4349             :   int i;
    4350      559437 :   for (i = 0; i < 16; ++i) {
    4351      526525 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4352      526525 :     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    4353             : 
    4354             :     _mm_storeu_si128((__m128i *)dst, row);
    4355      526529 :     dst += stride;
    4356      526529 :     rep = _mm256_add_epi16(rep, one);
    4357             :   }
    4358       32912 : }
    4359             : 
    4360        9193 : void eb_aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
    4361             :                                     const uint8_t *above, const uint8_t *left) {
    4362        9193 :   __m256i l = get_left_vector(left);
    4363       18386 :   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    4364        9193 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4365        9193 :   const __m256i one = _mm256_set1_epi16(1);
    4366        9193 :   const __m256i top = get_top_vector(above);
    4367             : 
    4368             :   int i;
    4369      156281 :   for (i = 0; i < 16; ++i) {
    4370      147088 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4371      147088 :     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    4372             : 
    4373             :     _mm_storeu_si128((__m128i *)dst, row);
    4374      147088 :     dst += stride;
    4375      147088 :     rep = _mm256_add_epi16(rep, one);
    4376             :   }
    4377             : 
    4378        9193 :   l = get_left_vector(left + 16);
    4379        9193 :   rep = _mm256_set1_epi16(0x8000);
    4380      156281 :   for (i = 0; i < 16; ++i) {
    4381      147088 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4382      147088 :     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    4383             : 
    4384             :     _mm_storeu_si128((__m128i *)dst, row);
    4385      147088 :     dst += stride;
    4386      147088 :     rep = _mm256_add_epi16(rep, one);
    4387             :   }
    4388        9193 : }
    4389             : 
    4390        3222 : void eb_aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
    4391             :                                     const uint8_t *above, const uint8_t *left) {
    4392        6444 :   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    4393        3222 :   const __m256i one = _mm256_set1_epi16(1);
    4394        3222 :   const __m256i top = get_top_vector(above);
    4395             : 
    4396       16110 :   for (int j = 0; j < 4; ++j) {
    4397       12888 :     const __m256i l = get_left_vector(left + j * 16);
    4398       12888 :     __m256i rep = _mm256_set1_epi16(0x8000);
    4399      219096 :     for (int i = 0; i < 16; ++i) {
    4400      206208 :       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4401      206208 :       const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    4402             : 
    4403             :       _mm_storeu_si128((__m128i *)dst, row);
    4404      206208 :       dst += stride;
    4405      206208 :       rep = _mm256_add_epi16(rep, one);
    4406             :     }
    4407             :   }
    4408        3222 : }
    4409             : 
    4410             : // Return 32 8-bit pixels in one row (__m256i)
    4411      159904 : static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
    4412             :                                       const __m256i *top1,
    4413             :                                       const __m256i *topleft) {
    4414      159904 :   __m256i p0 = paeth_pred(left, top0, topleft);
    4415      159904 :   __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
    4416      159904 :   const __m256i x0 = _mm256_packus_epi16(p0, p1);
    4417             : 
    4418      159904 :   p0 = paeth_pred(left, top1, topleft);
    4419      159904 :   p1 = _mm256_permute4x64_epi64(p0, 0xe);
    4420      159904 :   const __m256i x1 = _mm256_packus_epi16(p0, p1);
    4421             : 
    4422      159904 :   return _mm256_permute2x128_si256(x0, x1, 0x20);
    4423             : }
    4424             : 
    4425        9994 : void eb_aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    4426             :                                     const uint8_t *above, const uint8_t *left) {
    4427        9994 :   const __m256i l = get_left_vector(left);
    4428        9994 :   const __m256i t0 = get_top_vector(above);
    4429        9994 :   const __m256i t1 = get_top_vector(above + 16);
    4430       19988 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4431        9994 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4432        9994 :   const __m256i one = _mm256_set1_epi16(1);
    4433             : 
    4434             :   int i;
    4435      169898 :   for (i = 0; i < 16; ++i) {
    4436      159904 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4437             : 
    4438      159904 :     const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
    4439             : 
    4440             :     _mm256_storeu_si256((__m256i *)dst, r);
    4441             : 
    4442      159904 :     dst += stride;
    4443      159904 :     rep = _mm256_add_epi16(rep, one);
    4444             :   }
    4445        9994 : }
    4446             : 
    4447       18773 : void eb_aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    4448             :                                     const uint8_t *above, const uint8_t *left) {
    4449       18773 :   __m256i l = get_left_vector(left);
    4450       18773 :   const __m256i t0 = get_top_vector(above);
    4451       18773 :   const __m256i t1 = get_top_vector(above + 16);
    4452       37546 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4453       18773 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4454       18773 :   const __m256i one = _mm256_set1_epi16(1);
    4455             : 
    4456             :   int i;
    4457      319139 :   for (i = 0; i < 16; ++i) {
    4458      300366 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4459             : 
    4460      300366 :     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4461      300367 :     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4462             : 
    4463             :     _mm_storeu_si128((__m128i *)dst, r0);
    4464      300366 :     _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4465             : 
    4466      300366 :     dst += stride;
    4467      300366 :     rep = _mm256_add_epi16(rep, one);
    4468             :   }
    4469             : 
    4470       18773 :   l = get_left_vector(left + 16);
    4471       18773 :   rep = _mm256_set1_epi16(0x8000);
    4472      319139 :   for (i = 0; i < 16; ++i) {
    4473      300367 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4474             : 
    4475      300367 :     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4476      300367 :     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4477             : 
    4478             :     _mm_storeu_si128((__m128i *)dst, r0);
    4479      300366 :     _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4480             : 
    4481      300366 :     dst += stride;
    4482      300366 :     rep = _mm256_add_epi16(rep, one);
    4483             :   }
    4484       18772 : }
    4485             : 
    4486        1983 : void eb_aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    4487             :                                     const uint8_t *above, const uint8_t *left) {
    4488        1983 :   const __m256i t0 = get_top_vector(above);
    4489        1983 :   const __m256i t1 = get_top_vector(above + 16);
    4490        3966 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4491        1983 :   const __m256i one = _mm256_set1_epi16(1);
    4492             : 
    4493             :   int i, j;
    4494        9915 :   for (j = 0; j < 4; ++j) {
    4495        7932 :     const __m256i l = get_left_vector(left + j * 16);
    4496        7932 :     __m256i rep = _mm256_set1_epi16(0x8000);
    4497      134844 :     for (i = 0; i < 16; ++i) {
    4498      126912 :       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4499             : 
    4500      126912 :       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4501      126912 :       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4502             : 
    4503             :       _mm_storeu_si128((__m128i *)dst, r0);
    4504      126912 :       _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4505             : 
    4506      126912 :       dst += stride;
    4507      126912 :       rep = _mm256_add_epi16(rep, one);
    4508             :     }
    4509             :   }
    4510        1983 : }
    4511             : 
    4512        1513 : void eb_aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    4513             :                                     const uint8_t *above, const uint8_t *left) {
    4514        1513 :   const __m256i t0 = get_top_vector(above);
    4515        1513 :   const __m256i t1 = get_top_vector(above + 16);
    4516        1513 :   const __m256i t2 = get_top_vector(above + 32);
    4517        1513 :   const __m256i t3 = get_top_vector(above + 48);
    4518        3026 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4519        1513 :   const __m256i one = _mm256_set1_epi16(1);
    4520             : 
    4521             :   int i, j;
    4522        4539 :   for (j = 0; j < 2; ++j) {
    4523        3026 :     const __m256i l = get_left_vector(left + j * 16);
    4524        3026 :     __m256i rep = _mm256_set1_epi16(0x8000);
    4525       51442 :     for (i = 0; i < 16; ++i) {
    4526       48416 :       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4527             : 
    4528       48416 :       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4529       48416 :       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4530       48416 :       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    4531       48416 :       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    4532             : 
    4533             :       _mm_storeu_si128((__m128i *)dst, r0);
    4534       48416 :       _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4535       48416 :       _mm_storeu_si128((__m128i *)(dst + 32), r2);
    4536       48416 :       _mm_storeu_si128((__m128i *)(dst + 48), r3);
    4537             : 
    4538       48416 :       dst += stride;
    4539       48416 :       rep = _mm256_add_epi16(rep, one);
    4540             :     }
    4541             :   }
    4542        1513 : }
    4543             : 
    4544        3686 : void eb_aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    4545             :                                     const uint8_t *above, const uint8_t *left) {
    4546        3686 :   const __m256i t0 = get_top_vector(above);
    4547        3686 :   const __m256i t1 = get_top_vector(above + 16);
    4548        3686 :   const __m256i t2 = get_top_vector(above + 32);
    4549        3686 :   const __m256i t3 = get_top_vector(above + 48);
    4550        7372 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4551        3686 :   const __m256i one = _mm256_set1_epi16(1);
    4552             : 
    4553             :   int i, j;
    4554       18427 :   for (j = 0; j < 4; ++j) {
    4555       14744 :     const __m256i l = get_left_vector(left + j * 16);
    4556       14744 :     __m256i rep = _mm256_set1_epi16(0x8000);
    4557      250623 :     for (i = 0; i < 16; ++i) {
    4558      235882 :       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4559             : 
    4560      235882 :       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4561      235893 :       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4562      235883 :       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    4563      235881 :       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    4564             : 
    4565             :       _mm_storeu_si128((__m128i *)dst, r0);
    4566      235879 :       _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4567      235879 :       _mm_storeu_si128((__m128i *)(dst + 32), r2);
    4568      235879 :       _mm_storeu_si128((__m128i *)(dst + 48), r3);
    4569             : 
    4570      235879 :       dst += stride;
    4571      235879 :       rep = _mm256_add_epi16(rep, one);
    4572             :     }
    4573             :   }
    4574        3683 : }
    4575             : 
    4576        2366 : void eb_aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    4577             :                                     const uint8_t *above, const uint8_t *left) {
    4578        2366 :   const __m256i t0 = get_top_vector(above);
    4579        2366 :   const __m256i t1 = get_top_vector(above + 16);
    4580        2366 :   const __m256i t2 = get_top_vector(above + 32);
    4581        2366 :   const __m256i t3 = get_top_vector(above + 48);
    4582        4732 :   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    4583        2366 :   const __m256i one = _mm256_set1_epi16(1);
    4584             : 
    4585             :   int i;
    4586        2366 :   const __m256i l = get_left_vector(left);
    4587        2366 :   __m256i rep = _mm256_set1_epi16(0x8000);
    4588       40222 :   for (i = 0; i < 16; ++i) {
    4589       37856 :     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    4590             : 
    4591       37856 :     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    4592       37856 :     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    4593       37856 :     const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    4594       37856 :     const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    4595             : 
    4596             :     _mm_storeu_si128((__m128i *)dst, r0);
    4597       37856 :     _mm_storeu_si128((__m128i *)(dst + 16), r1);
    4598       37856 :     _mm_storeu_si128((__m128i *)(dst + 32), r2);
    4599       37856 :     _mm_storeu_si128((__m128i *)(dst + 48), r3);
    4600             : 
    4601       37856 :     dst += stride;
    4602       37856 :     rep = _mm256_add_epi16(rep, one);
    4603             :   }
    4604        2366 : }
    4605             : 
    4606           0 : void eb_aom_highbd_paeth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    4607             :     const uint16_t *above, const uint16_t *left, int bd) {
    4608             :     (void) bd;
    4609           0 :     const __m256i tl16 = _mm256_set1_epi16(above[-1]);
    4610           0 :     const __m256i top = _mm256_loadu_si256((const __m256i *)above);
    4611             :     __m256i l16, row;
    4612             :     int i;
    4613             : 
    4614           0 :     for (i = 0; i < 4; ++i) {
    4615           0 :         l16 = _mm256_set1_epi16(left[i]);
    4616           0 :         row = paeth_pred(&l16, &top, &tl16);
    4617             :         _mm256_storeu_si256((__m256i *)dst, row);
    4618           0 :         dst += stride;
    4619             :     }
    4620           0 : }
    4621             : 
    4622           0 : void eb_aom_highbd_paeth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    4623             :     const uint16_t *above, const uint16_t *left, int bd) {
    4624           0 :     const __m256i tl16 = _mm256_set1_epi16(above[-1]);
    4625           0 :     const __m256i top = _mm256_loadu_si256((const __m256i *)above);
    4626             :     __m256i l16, row;
    4627             :     int i;
    4628             :     (void) bd;
    4629             : 
    4630           0 :     for (i = 0; i < 8; ++i) {
    4631           0 :         l16 = _mm256_set1_epi16(left[i]);
    4632           0 :         row = paeth_pred(&l16, &top, &tl16);
    4633             :         _mm256_storeu_si256((__m256i *)dst, row);
    4634           0 :         dst += stride;
    4635             :     }
    4636           0 : }
    4637             : 
    4638           0 : void eb_aom_highbd_paeth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4639             :     const uint16_t *above, const uint16_t *left, int bd) {
    4640           0 :     const __m256i tl16 = _mm256_set1_epi16(above[-1]);
    4641           0 :     const __m256i top = _mm256_loadu_si256((const __m256i *)above);
    4642             :     __m256i l16, row;
    4643             :     int i;
    4644             :     (void) bd;
    4645             : 
    4646           0 :     for (i = 0; i < 16; ++i) {
    4647           0 :         l16 = _mm256_set1_epi16(left[i]);
    4648           0 :         row = paeth_pred(&l16, &top, &tl16);
    4649             :         _mm256_storeu_si256((__m256i *)dst, row);
    4650           0 :         dst += stride;
    4651             :     }
    4652           0 : }
    4653             : 
    4654           0 : void eb_aom_highbd_paeth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4655             :     const uint16_t *above, const uint16_t *left, int bd) {
    4656           0 :     const __m256i tl16 = _mm256_set1_epi16(above[-1]);
    4657           0 :     const __m256i top = _mm256_loadu_si256((const __m256i *)above);
    4658             :     __m256i l16, row;
    4659             :     int i;
    4660             :     (void) bd;
    4661             : 
    4662           0 :     for (i = 0; i < 32; ++i) {
    4663           0 :         l16 = _mm256_set1_epi16(left[i]);
    4664           0 :         row = paeth_pred(&l16, &top, &tl16);
    4665             :         _mm256_storeu_si256((__m256i *)dst, row);
    4666           0 :         dst += stride;
    4667             :     }
    4668           0 : }
    4669             : 
    4670           0 : void eb_aom_highbd_paeth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4671             :     const uint16_t *above, const uint16_t *left, int bd) {
    4672           0 :     const __m256i tl16 = _mm256_set1_epi16(above[-1]);
    4673           0 :     const __m256i top = _mm256_loadu_si256((const __m256i *)above);
    4674             :     __m256i l16, row;
    4675             :     int i;
    4676             :     (void) bd;
    4677             : 
    4678           0 :     for (i = 0; i < 64; ++i) {
    4679           0 :         l16 = _mm256_set1_epi16(left[i]);
    4680           0 :         row = paeth_pred(&l16, &top, &tl16);
    4681             :         _mm256_storeu_si256((__m256i *)dst, row);
    4682           0 :         dst += stride;
    4683             :     }
    4684           0 : }
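
The five high-bit-depth 16xN predictors above share their entire body; only the loop bound differs. A sketch of the common shape, with highbd_paeth_16xh as a hypothetical shared helper (not a function in this file) that each per-size entry point could call with its own height:

    /* Hypothetical common body for the highbd 16xN predictors above. */
    static INLINE void highbd_paeth_16xh(uint16_t *dst, ptrdiff_t stride,
        const uint16_t *above, const uint16_t *left, int height) {
        const __m256i tl16 = _mm256_set1_epi16(above[-1]);
        const __m256i top = _mm256_loadu_si256((const __m256i *)above);
        int i;
        for (i = 0; i < height; ++i) {
            const __m256i l16 = _mm256_set1_epi16(left[i]);
            const __m256i row = paeth_pred(&l16, &top, &tl16);
            _mm256_storeu_si256((__m256i *)dst, row);
            dst += stride;
        }
    }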
    4685             : 
    4686           0 : void eb_aom_highbd_paeth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    4687             :     const uint16_t *above, const uint16_t *left, int bd) {
    4688           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4689           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4690           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4691             :     __m256i l16, row;
    4692             :     int i;
    4693             :     (void) bd;
    4694             : 
    4695           0 :     for (i = 0; i < 8; ++i) {
    4696           0 :         l16 = _mm256_set1_epi16(left[i]);
    4697             : 
    4698           0 :         row = paeth_pred(&l16, &t0, &tl);
    4699             :         _mm256_storeu_si256((__m256i *)dst, row);
    4700             : 
    4701           0 :         row = paeth_pred(&l16, &t1, &tl);
    4702           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4703             : 
    4704           0 :         dst += stride;
    4705             :     }
    4706           0 : }
    4707             : 
    4708           0 : void eb_aom_highbd_paeth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4709             :     const uint16_t *above, const uint16_t *left, int bd) {
    4710           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4711           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4712           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4713             :     __m256i l16, row;
    4714             :     int i;
    4715             :     (void) bd;
    4716             : 
    4717           0 :     for (i = 0; i < 16; ++i) {
    4718           0 :         l16 = _mm256_set1_epi16(left[i]);
    4719             : 
    4720           0 :         row = paeth_pred(&l16, &t0, &tl);
    4721             :         _mm256_storeu_si256((__m256i *)dst, row);
    4722             : 
    4723           0 :         row = paeth_pred(&l16, &t1, &tl);
    4724           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4725             : 
    4726           0 :         dst += stride;
    4727             :     }
    4728           0 : }
    4729             : 
    4730           0 : void eb_aom_highbd_paeth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4731             :     const uint16_t *above, const uint16_t *left, int bd) {
    4732           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4733           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4734           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4735             :     __m256i l16, row;
    4736             :     int i;
    4737             :     (void) bd;
    4738             : 
    4739           0 :     for (i = 0; i < 32; ++i) {
    4740           0 :         l16 = _mm256_set1_epi16(left[i]);
    4741             : 
    4742           0 :         row = paeth_pred(&l16, &t0, &tl);
    4743             :         _mm256_storeu_si256((__m256i *)dst, row);
    4744             : 
    4745           0 :         row = paeth_pred(&l16, &t1, &tl);
    4746           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4747             : 
    4748           0 :         dst += stride;
    4749             :     }
    4750           0 : }
    4751             : 
    4752           0 : void eb_aom_highbd_paeth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4753             :     const uint16_t *above, const uint16_t *left, int bd) {
    4754           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4755           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4756           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4757             :     __m256i l16, row;
    4758             :     int i;
    4759             :     (void) bd;
    4760             : 
    4761           0 :     for (i = 0; i < 64; ++i) {
    4762           0 :         l16 = _mm256_set1_epi16(left[i]);
    4763             : 
    4764           0 :         row = paeth_pred(&l16, &t0, &tl);
    4765             :         _mm256_storeu_si256((__m256i *)dst, row);
    4766             : 
    4767           0 :         row = paeth_pred(&l16, &t1, &tl);
    4768           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4769             : 
    4770           0 :         dst += stride;
    4771             :     }
    4772           0 : }
    4773             : 
    4774           0 : void eb_aom_highbd_paeth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4775             :     const uint16_t *above, const uint16_t *left, int bd) {
    4776           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4777           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4778           0 :     const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
    4779           0 :     const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
    4780           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4781             :     __m256i l16, row;
    4782             :     int i;
    4783             :     (void) bd;
    4784             : 
    4785           0 :     for (i = 0; i < 16; ++i) {
    4786           0 :         l16 = _mm256_set1_epi16(left[i]);
    4787             : 
    4788           0 :         row = paeth_pred(&l16, &t0, &tl);
    4789             :         _mm256_storeu_si256((__m256i *)dst, row);
    4790             : 
    4791           0 :         row = paeth_pred(&l16, &t1, &tl);
    4792           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4793             : 
    4794           0 :         row = paeth_pred(&l16, &t2, &tl);
    4795           0 :         _mm256_storeu_si256((__m256i *)(dst + 32), row);
    4796             : 
    4797           0 :         row = paeth_pred(&l16, &t3, &tl);
    4798           0 :         _mm256_storeu_si256((__m256i *)(dst + 48), row);
    4799             : 
    4800           0 :         dst += stride;
    4801             :     }
    4802           0 : }
    4803             : 
    4804           0 : void eb_aom_highbd_paeth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4805             :     const uint16_t *above, const uint16_t *left, int bd) {
    4806           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4807           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4808           0 :     const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
    4809           0 :     const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
    4810           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4811             :     __m256i l16, row;
    4812             :     int i;
    4813             :     (void) bd;
    4814             : 
    4815           0 :     for (i = 0; i < 32; ++i) {
    4816           0 :         l16 = _mm256_set1_epi16(left[i]);
    4817             : 
    4818           0 :         row = paeth_pred(&l16, &t0, &tl);
    4819             :         _mm256_storeu_si256((__m256i *)dst, row);
    4820             : 
    4821           0 :         row = paeth_pred(&l16, &t1, &tl);
    4822           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4823             : 
    4824           0 :         row = paeth_pred(&l16, &t2, &tl);
    4825           0 :         _mm256_storeu_si256((__m256i *)(dst + 32), row);
    4826             : 
    4827           0 :         row = paeth_pred(&l16, &t3, &tl);
    4828           0 :         _mm256_storeu_si256((__m256i *)(dst + 48), row);
    4829             : 
    4830           0 :         dst += stride;
    4831             :     }
    4832           0 : }
    4833             : 
    4834           0 : void eb_aom_highbd_paeth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    4835             :     const uint16_t *above, const uint16_t *left, int bd) {
    4836           0 :     const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
    4837           0 :     const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
    4838           0 :     const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
    4839           0 :     const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
    4840           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4841             :     __m256i l16, row;
    4842             :     int i;
    4843             :     (void) bd;
    4844             : 
    4845           0 :     for (i = 0; i < 64; ++i) {
    4846           0 :         l16 = _mm256_set1_epi16(left[i]);
    4847             : 
    4848           0 :         row = paeth_pred(&l16, &t0, &tl);
    4849             :         _mm256_storeu_si256((__m256i *)dst, row);
    4850             : 
    4851           0 :         row = paeth_pred(&l16, &t1, &tl);
    4852           0 :         _mm256_storeu_si256((__m256i *)(dst + 16), row);
    4853             : 
    4854           0 :         row = paeth_pred(&l16, &t2, &tl);
    4855           0 :         _mm256_storeu_si256((__m256i *)(dst + 32), row);
    4856             : 
    4857           0 :         row = paeth_pred(&l16, &t3, &tl);
    4858           0 :         _mm256_storeu_si256((__m256i *)(dst + 48), row);
    4859             : 
    4860           0 :         dst += stride;
    4861             :     }
    4862           0 : }
    4863             : 
    4864           0 : void eb_aom_highbd_paeth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
    4865             :     const uint16_t *above, const uint16_t *left, int bd) {
    4866           0 :     const __m128i t = _mm_loadu_si128((const __m128i *)above);
    4867           0 :     const __m256i t0 = _mm256_setr_m128i(t, t);
    4868           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4869             :     __m256i l16, row;
    4870             :     int i;
    4871             :     (void) bd;
    4872             : 
    4873           0 :     for (i = 0; i < 4; i += 2) {
    4874           0 :         l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
    4875             :             _mm_set1_epi16(left[i + 1]));
    4876             : 
    4877           0 :         row = paeth_pred(&l16, &t0, &tl);
    4878           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
    4879           0 :         dst += stride;
    4880           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
    4881           0 :         dst += stride;
    4882             :     }
    4883           0 : }
    4884             : 
    4885           0 : void eb_aom_highbd_paeth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
    4886             :     const uint16_t *above, const uint16_t *left, int bd) {
    4887           0 :     const __m128i t = _mm_loadu_si128((const __m128i *)above);
    4888           0 :     const __m256i t0 = _mm256_setr_m128i(t, t);
    4889           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4890             :     __m256i l16, row;
    4891             :     int i;
    4892             :     (void) bd;
    4893             : 
    4894           0 :     for (i = 0; i < 8; i += 2) {
    4895           0 :         l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
    4896             :             _mm_set1_epi16(left[i + 1]));
    4897             : 
    4898           0 :         row = paeth_pred(&l16, &t0, &tl);
    4899           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
    4900           0 :         dst += stride;
    4901           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
    4902           0 :         dst += stride;
    4903             :     }
    4904           0 : }
    4905             : 
    4906           0 : void eb_aom_highbd_paeth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4907             :     const uint16_t *above, const uint16_t *left, int bd) {
    4908           0 :     const __m128i t = _mm_loadu_si128((const __m128i *)above);
    4909           0 :     const __m256i t0 = _mm256_setr_m128i(t, t);
    4910           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4911             :     __m256i l16, row;
    4912             :     int i;
    4913             :     (void) bd;
    4914             : 
    4915           0 :     for (i = 0; i < 16; i += 2) {
    4916           0 :         l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
    4917             :             _mm_set1_epi16(left[i + 1]));
    4918             : 
    4919           0 :         row = paeth_pred(&l16, &t0, &tl);
    4920           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
    4921           0 :         dst += stride;
    4922           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
    4923           0 :         dst += stride;
    4924             :     }
    4925           0 : }
    4926             : 
    4927           0 : void eb_aom_highbd_paeth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
    4928             :     const uint16_t *above, const uint16_t *left, int bd) {
    4929           0 :     const __m128i t = _mm_loadu_si128((const __m128i *)above);
    4930           0 :     const __m256i t0 = _mm256_setr_m128i(t, t);
    4931           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4932             :     __m256i l16, row;
    4933             :     int i;
    4934             :     (void) bd;
    4935             : 
    4936           0 :     for (i = 0; i < 32; i += 2) {
    4937           0 :         l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
    4938             :             _mm_set1_epi16(left[i + 1]));
    4939             : 
    4940           0 :         row = paeth_pred(&l16, &t0, &tl);
    4941           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
    4942           0 :         dst += stride;
    4943           0 :         _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
    4944           0 :         dst += stride;
    4945             :     }
    4946           0 : }
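
The 8xN variants above compute two rows per iteration: t0 holds the same eight above pixels in both 128-bit halves, and _mm256_setr_m128i pairs the left pixels of rows i and i + 1, one per half. One iteration, annotated:

    /* Low 128 bits predict row i, high 128 bits predict row i + 1;
     * a single paeth_pred call yields both rows. */
    l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),        /* lanes 0..7  */
                            _mm_set1_epi16(left[i + 1]));   /* lanes 8..15 */
    row = paeth_pred(&l16, &t0, &tl);
    _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0)); /* row i     */
    dst += stride;
    _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1)); /* row i + 1 */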
    4947             : 
    4948           0 : void eb_aom_highbd_paeth_predictor_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
    4949             :     const uint16_t *above, const uint16_t *left, int bd) {
    4950           0 :     const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
    4951           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4952             :     __m256i l16, row;
    4953             :     (void) bd;
    4954             : 
    4955             :     /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
    4956           0 :     __m256i t1 = _mm256_cvtepi16_epi64(
    4957             :         _mm_lddqu_si128((__m128i const*)left));
    4958           0 :     __m256i t1s = _mm256_slli_epi64(t1, 16);
    4959           0 :     t1 = _mm256_or_si256(t1s, t1);
    4960           0 :     t1s = _mm256_slli_epi64(t1, 32);
    4961           0 :     l16 = _mm256_or_si256(t1s, t1);
    4962             : 
    4963           0 :     row = paeth_pred(&l16, &t0, &tl);
    4964             : 
    4965           0 :     *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
    4966           0 :     *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
    4967           0 :     *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
    4968           0 :     *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
    4969           0 : }
    4970             : 
    4971           0 : void eb_aom_highbd_paeth_predictor_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
    4972             :     const uint16_t *above, const uint16_t *left, int bd) {
    4973           0 :     const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
    4974           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    4975             :     __m256i l16, row;
    4976             :     int i;
    4977             :     (void) bd;
    4978             : 
    4979           0 :     for (i = 0; i < 8; i += 4) {
    4980             :         /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
    4981           0 :         __m256i t1 = _mm256_cvtepi16_epi64(
    4982           0 :             _mm_lddqu_si128((__m128i const*)&left[i]));
    4983           0 :         __m256i t1s = _mm256_slli_epi64(t1, 16);
    4984           0 :         t1 = _mm256_or_si256(t1s, t1);
    4985           0 :         t1s = _mm256_slli_epi64(t1, 32);
    4986           0 :         l16 = _mm256_or_si256(t1s, t1);
    4987             : 
    4988           0 :         row = paeth_pred(&l16, &t0, &tl);
    4989             : 
    4990           0 :         *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
    4991           0 :         *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
    4992           0 :         *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
    4993           0 :         *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
    4994           0 :         dst += 4 * stride;
    4995             :     }
    4996           0 : }
    4997             : 
    4998           0 : void eb_aom_highbd_paeth_predictor_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
    4999             :     const uint16_t *above, const uint16_t *left, int bd) {
    5000           0 :     const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
    5001           0 :     const __m256i tl = _mm256_set1_epi16(above[-1]);
    5002             :     __m256i l16, row;
    5003             : 
    5004             :     (void) bd;
    5005             :     int i;
    5006           0 :     for (i = 0; i < 16; i += 4) {
    5007             :         /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
    5008           0 :         __m256i t1 = _mm256_cvtepi16_epi64(
    5009           0 :             _mm_lddqu_si128((__m128i const*)&left[i]));
    5010           0 :         __m256i t1s = _mm256_slli_epi64(t1, 16);
    5011           0 :         t1 = _mm256_or_si256(t1s, t1);
    5012           0 :         t1s = _mm256_slli_epi64(t1, 32);
    5013           0 :         l16 = _mm256_or_si256(t1s, t1);
    5014             : 
    5015           0 :         row = paeth_pred(&l16, &t0, &tl);
    5016             : 
    5017           0 :         *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
    5018           0 :         *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
    5019           0 :         *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
    5020           0 :         *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
    5021           0 :         dst += 4 * stride;
    5022             :     }
    5023           0 : }
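
In the 4xN variants above, above[0..3] is type-punned into one 64-bit word and broadcast to every lane, while four left pixels are widened to 64-bit lanes and each replicated across the four 16-bit words of its lane by the two shift/OR steps; each 64-bit lane then yields one four-pixel output row. A scalar sketch of the lane layout those steps produce (illustrative helper, not from this file):

    /* Equivalent of cvtepi16_epi64 + (x | x << 16) + (x | x << 32):
     * word w of 64-bit lane k ends up holding left[k]. */
    static void broadcast_left_4xN(const uint16_t *left, uint16_t l16[16]) {
        int k, w;
        for (k = 0; k < 4; ++k)        /* one lane per output row */
            for (w = 0; w < 4; ++w)    /* four words per lane     */
                l16[4 * k + w] = left[k];
    }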
    5024             : 
    5025           0 : void eb_aom_highbd_paeth_predictor_2x2_avx2(uint16_t *dst, ptrdiff_t stride,
    5026             :     const uint16_t *above, const uint16_t *left, int bd) {
    5027             :     (void) bd;
    5028           0 :     __m256i tl = _mm256_set1_epi16(above[-1]);
    5029           0 :     __m256i t0 = _mm256_set1_epi32(((uint32_t*)above)[0]);
    5030             : 
    5031             :     /* l16 words: left[0], left[0], left[1], left[1], remaining 12 words zero */
    5032           0 :     __m256i gg = _mm256_cvtepi16_epi32(_mm_cvtsi32_si128(*(uint32_t const*)(left)));
    5033           0 :     __m256i ss = _mm256_slli_epi64(gg, 16);
    5034           0 :     __m256i l16 = _mm256_or_si256(gg, ss);
    5035             : 
    5036           0 :     __m256i row = paeth_pred(&l16, &t0, &tl);
    5037             : 
    5038           0 :     *(uint32_t*)&dst[0] = _mm256_extract_epi32(row, 0);
    5039           0 :     *(uint32_t*)&dst[stride] = _mm256_extract_epi32(row, 1);
    5040           0 : }
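
For completeness, a minimal sketch of driving one of these kernels directly (all values illustrative): the 4xN paths issue a 128-bit load of left, so the left buffer is padded to eight entries, and above must be addressable at index -1 for the top-left pixel.

    #include <stdint.h>
    #include <stddef.h>

    void eb_aom_highbd_paeth_predictor_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
        const uint16_t *above, const uint16_t *left, int bd);

    int main(void) {
        /* border[0] is the top-left pixel; above points just past it. */
        uint16_t border[5] = { 512, 600, 610, 620, 630 };
        /* Padded to 8 entries: the kernel's 128-bit load touches all 8. */
        uint16_t left[8] = { 500, 505, 510, 515, 0, 0, 0, 0 };
        uint16_t dst[4 * 4];
        /* bd is ignored by these AVX2 paths ((void)bd above). */
        eb_aom_highbd_paeth_predictor_4x4_avx2(dst, 4, border + 1, left, 10);
        return 0;
    }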

Generated by: LCOV version 1.14