LCOV - code coverage report
Current view: top level - ASM_AVX2 - wiener_convolve_avx2.c (source / functions)
Test: coverage.info | Lines: 93 hit, 214 total, 43.5 % coverage
Date: 2019-11-25 17:38:06 | Functions: 1 hit, 2 total, 50.0 % coverage

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "EbDefinitions.h"
      13             : #include <immintrin.h>
      14             : #include <assert.h>
      15             : 
      16             : #include "aom_dsp_rtcd.h"
      17             : #include "convolve.h"
      18             : #include "synonyms.h"
      19             : #include "synonyms_avx2.h"
      20             : #include "convolve_avx2.h"
      21             : 
      22             : DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
      23             :   3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
      24             :   3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
      25             : };
      26             : 
      27             : // 128-bit xmmwords are written as [ ... ] with the MSB on the left.
      28             : // 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
      29             : // on the left.
      30             : // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
      31             : // loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
      32             : 
      33             : // Exploiting the range of wiener filter coefficients,
      34             : // horizontal filtering can be done in 16 bit intermediate precision.
      35             : // The details are as follows:
      36             : // Consider horizontal wiener filter coefficients of the following form:
      37             : //      [C0, C1, C2, 2^(FILTER_BITS) - 2 * (C0 + C1 + C2), C2, C1, C0]
      38             : // Subtracting 2^(FILTER_BITS) from the centre tap we get the following:
      39             : //      [C0, C1, C2,                  -2 * (C0 + C1 + C2), C2, C1, C0]
      40             : // The sum of products "C0 * p0 + C1 * p1 + C2 * p2 - 2 * (C0 + C1 + C2) * p3
      41             : // + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
      42             : // precision. Finally, after rounding the above result by round_0, we multiply
      43             : // the centre pixel p3 by 2^(FILTER_BITS - round_0) and add it to get the
      44             : // horizontal filter output.
      45             : 
      46      298043 : void eb_av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
      47             :     uint8_t *dst, ptrdiff_t dst_stride,
      48             :     const int16_t *filter_x, int32_t x_step_q4,
      49             :     const int16_t *filter_y, int32_t y_step_q4,
      50             :     int32_t w, int32_t h,
      51             :     const ConvolveParams *conv_params) {
      52      298043 :     const int32_t bd = 8;
      53      298043 :     assert(x_step_q4 == 16 && y_step_q4 == 16);
      54      298043 :     assert(!(w & 7));
      55             :     (void)x_step_q4;
      56             :     (void)y_step_q4;
      57             : 
      58             :     DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
      59      298043 :     int32_t im_h = h + SUBPEL_TAPS - 1;
      60      298043 :     int32_t im_stride = 8;
      61             :     int32_t i, j;
      62      298043 :     const int32_t center_tap = (SUBPEL_TAPS - 1) / 2;
      63      298043 :     const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
      64             : 
      65             :     __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
      66             : 
      67      298043 :     assert(conv_params->round_0 > 0);
      68             : 
      69      298043 :     filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
      70      298043 :     filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
      71      298043 :     filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
      72      298043 :     filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
      73             : 
      74      298043 :     filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
      75             : 
      76      298043 :     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
      77      298043 :     const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
      78             : 
      79             :     // coeffs 0 1 0 1 0 1 0 1 (only the low byte of each 16-bit tap is kept)
      80      298043 :     coeffs_h[0] =
      81      596086 :         _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
      82             :     // coeffs 2 3 2 3 2 3 2 3
      83      298043 :     coeffs_h[1] =
      84      596086 :         _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
      85             :     // coeffs 4 5 4 5 4 5 4 5
      86      298043 :     coeffs_h[2] =
      87      596086 :         _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
      88             :     // coeffs 6 7 6 7 6 7 6 7
      89      298043 :     coeffs_h[3] =
      90      298043 :         _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
      91             : 
      92             :     const __m256i round_const_h =
      93      298043 :         _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
      94             :     const __m256i round_const_horz =
      95      596086 :         _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
      96      298043 :     const __m256i clamp_low = _mm256_setzero_si256();
      97             :     const __m256i clamp_high =
      98      298043 :         _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
      99      596086 :     const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
     100             : 
     101             :     // Add 2^FILTER_BITS to the centre tap (index 3) for the "add_src" part.
     102      298043 :     const __m128i zero_128 = _mm_setzero_si128();
     103      298043 :     const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
     104      596086 :     const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
     105             : 
     106      298043 :     const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
     107             : 
     108             :     // coeffs 0 1 0 1 0 1 0 1
     109      298043 :     coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
     110             :     // coeffs 2 3 2 3 2 3 2 3
     111      298043 :     coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
     112             :     // coeffs 4 5 4 5 4 5 4 5
     113      298043 :     coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
     114             :     // coeffs 6 7 6 7 6 7 6 7
     115      298043 :     coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
     116             : 
     117             :     const __m256i round_const_v =
     118      596086 :         _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
     119      298043 :                           (1 << (bd + conv_params->round_1 - 1)));
     120      298043 :     const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
     121             : 
     122     1894680 :     for (j = 0; j < w; j += 8) {
     123    48730200 :         for (i = 0; i < im_h; i += 2) {
     124    47074100 :             __m256i data = _mm256_castsi128_si256(
     125    47074100 :                 _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
     126             : 
     127             :             // Load the next line (if any) into the upper 128-bit lane
     128    47074100 :             if (i + 1 < im_h)
     129    90774200 :                 data = _mm256_inserti128_si256(
     130             :                     data,
     131             :                     _mm_loadu_si128(
     132             :                     (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
     133             :                     1);
     134             : 
     135    47074100 :             __m256i res = x_convolve_8tap_avx2(data, coeffs_h, filt);
     136             : 
     137             :             res =
     138    94073100 :                 _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
     139             : 
     140    47036600 :             __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
     141             : 
     142             :             // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
     143             :             // the result
     144    94073100 :             data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
     145    47036600 :             res = _mm256_add_epi16(res, data_0);
     146    47036600 :             res = _mm256_add_epi16(res, round_const_horz);
     147    47036600 :             res = _mm256_max_epi16(res, clamp_low);
     148    47036600 :             const __m256i res_clamped = _mm256_min_epi16(res, clamp_high);
     149    47036600 :             _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
     150             :         }
     151             : 
     152             :         /* Vertical filter */
     153             :         {
     154     1656090 :             __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
     155     1656090 :             __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
     156     1656090 :             __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
     157     1656090 :             __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
     158     1656090 :             __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
     159     3312170 :             __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
     160             : 
     161             :             __m256i s[8];
     162     1656090 :             s[0] = _mm256_unpacklo_epi16(src_0, src_1);
     163     1656090 :             s[1] = _mm256_unpacklo_epi16(src_2, src_3);
     164     1656090 :             s[2] = _mm256_unpacklo_epi16(src_4, src_5);
     165             : 
     166     1656090 :             s[4] = _mm256_unpackhi_epi16(src_0, src_1);
     167     1656090 :             s[5] = _mm256_unpackhi_epi16(src_2, src_3);
     168     1656090 :             s[6] = _mm256_unpackhi_epi16(src_4, src_5);
     169             : 
     170    41895600 :             for (i = 0; i < h - 1; i += 2) {
     171    40298900 :                 const int16_t *data = &im_block[i * im_stride];
     172             : 
     173             :                 const __m256i s6 =
     174    40298900 :                     _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
     175             :                 const __m256i s7 =
     176    80597900 :                     _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
     177             : 
     178    40298900 :                 s[3] = _mm256_unpacklo_epi16(s6, s7);
     179    40298900 :                 s[7] = _mm256_unpackhi_epi16(s6, s7);
     180             : 
     181    40298900 :                 __m256i res_a = convolve16_8tap_avx2(s, coeffs_v);
     182    40273500 :                 __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_v);
     183             : 
     184    80479000 :                 const __m256i res_a_round = _mm256_sra_epi32(
     185             :                     _mm256_add_epi32(res_a, round_const_v), round_shift_v);
     186    80479000 :                 const __m256i res_b_round = _mm256_sra_epi32(
     187             :                     _mm256_add_epi32(res_b, round_const_v), round_shift_v);
     188             : 
     189             :                 /* rounding code */
     190             :                 // 16 bit conversion
     191    40239500 :                 const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
     192             :                 // 8 bit conversion and saturation to uint8
     193    40239500 :                 const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
     194             : 
     195    40239500 :                 const __m128i res_0 = _mm256_castsi256_si128(res_8b);
     196    40239500 :                 const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
     197             : 
     198             :                 // Store values into the destination buffer
     199    40239500 :                 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
     200    40239500 :                 __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
     201             : 
     202    40239500 :                 _mm_storel_epi64(p_0, res_0);
     203    40239500 :                 _mm_storel_epi64(p_1, res_1);
     204             : 
     205    40239500 :                 s[0] = s[1];
     206    40239500 :                 s[1] = s[2];
     207    40239500 :                 s[2] = s[3];
     208             : 
     209    40239500 :                 s[4] = s[5];
     210    40239500 :                 s[5] = s[6];
     211    40239500 :                 s[6] = s[7];
     212             :             }
     213     1596640 :             if (h - i) {
     214           0 :                 s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
     215           0 :                 s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
     216           0 :                 s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
     217             : 
     218           0 :                 const int16_t *data = &im_block[i * im_stride];
     219           0 :                 const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
     220           0 :                 const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
     221             : 
     222           0 :                 __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
     223           0 :                 __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
     224             : 
     225           0 :                 s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
     226           0 :                 __m256i convolveres = convolve16_8tap_avx2(s, coeffs_v);
     227             : 
     228           0 :                 const __m256i res_round = _mm256_sra_epi32(
     229             :                     _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
     230             : 
     231             :                 /* rounding code */
     232             :                 // 16 bit conversion
     233           0 :                 __m128i reslo = _mm256_castsi256_si128(res_round);
     234           0 :                 __m128i reshi = _mm256_extracti128_si256(res_round, 1);
     235           0 :                 const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
     236             : 
     237             :                 // 8 bit conversion and saturation to uint8
     238           0 :                 const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
     239           0 :                 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
     240           0 :                 _mm_storel_epi64(p_0, res_8b);
     241             :             }
     242             :         }
     243             :     }
     244      201031 : }
     245             : 
     246             : // 128-bit xmmwords are written as [ ... ] with the MSB on the left.
     247             : // 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
     248             : // on the left.
     249             : // A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
     250             : // loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
     251           0 : void eb_av1_highbd_wiener_convolve_add_src_avx2(
     252             :     const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
     253             :     ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4,
     254             :     const int16_t *filter_y, int32_t y_step_q4, int32_t w, int32_t h,
     255             :     const ConvolveParams *conv_params, int32_t bd) {
     256           0 :     assert(x_step_q4 == 16 && y_step_q4 == 16);
     257           0 :     assert(!(w & 7));
     258           0 :     assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
     259             :     (void)x_step_q4;
     260             :     (void)y_step_q4;
     261             : 
     262           0 :     const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
     263           0 :     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
     264             : 
     265             :     DECLARE_ALIGNED(32, uint16_t,
     266             :     temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
     267           0 :     int32_t intermediate_height = h + SUBPEL_TAPS - 1;
     268           0 :     const int32_t center_tap = ((SUBPEL_TAPS - 1) / 2);
     269           0 :     const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
     270             : 
     271           0 :     const __m128i zero_128 = _mm_setzero_si128();
     272           0 :     const __m256i zero_256 = _mm256_setzero_si256();
     273             : 
     274             :     // Add 2^FILTER_BITS to the centre tap (index 3) for the "add_src" part.
     275           0 :     const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
     276             : 
     277           0 :     const __m256i clamp_low = zero_256;
     278             : 
     279             :     /* Horizontal filter */
     280             :     {
     281             :         const __m256i clamp_high_ep =
     282           0 :             _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
     283             : 
     284             :         // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
     285           0 :         const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
     286             : 
     287             :         // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
     288           0 :         const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
     289             :         // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
     290           0 :         const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
     291             : 
     292             :         // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
     293           0 :         const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
     294             :         // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
     295           0 :         const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
     296             :         // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
     297           0 :         const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
     298             :         // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
     299           0 :         const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
     300             : 
     301             :         // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
     302           0 :         const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
     303             :         // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
     304           0 :         const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
     305             :         // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
     306           0 :         const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
     307             :         // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
     308           0 :         const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
     309             : 
     310           0 :         const __m256i round_const = _mm256_set1_epi32(
     311           0 :             (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
     312             : 
     313           0 :         for (int32_t i = 0; i < intermediate_height; ++i) {
     314           0 :             for (int32_t j = 0; j < w; j += 16) {
     315           0 :                 const uint16_t *src_ij = src_ptr + i * src_stride + j;
     316             : 
     317             :                 // Load 16-bit src data
     318           0 :                 const __m256i src_0 = yy_loadu_256(src_ij + 0);
     319           0 :                 const __m256i src_1 = yy_loadu_256(src_ij + 1);
     320           0 :                 const __m256i src_2 = yy_loadu_256(src_ij + 2);
     321           0 :                 const __m256i src_3 = yy_loadu_256(src_ij + 3);
     322           0 :                 const __m256i src_4 = yy_loadu_256(src_ij + 4);
     323           0 :                 const __m256i src_5 = yy_loadu_256(src_ij + 5);
     324           0 :                 const __m256i src_6 = yy_loadu_256(src_ij + 6);
     325           0 :                 const __m256i src_7 = yy_loadu_256(src_ij + 7);
     326             : 
     327             :                 // Multiply src data by filter coeffs and sum pairs
     328           0 :                 const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
     329           0 :                 const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
     330           0 :                 const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
     331           0 :                 const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
     332           0 :                 const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
     333           0 :                 const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
     334           0 :                 const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
     335           0 :                 const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
     336             : 
     337             :                 // Calculate scalar product for even- and odd-indices separately,
     338             :                 // increasing to 32-bit precision
     339           0 :                 const __m256i res_even_sum = _mm256_add_epi32(
     340             :                     _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
     341           0 :                 const __m256i res_even = _mm256_srai_epi32(
     342             :                     _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
     343             : 
     344           0 :                 const __m256i res_odd_sum = _mm256_add_epi32(
     345             :                     _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
     346           0 :                 const __m256i res_odd = _mm256_srai_epi32(
     347             :                     _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
     348             : 
     349             :                 // Reduce to 16-bit precision and pack even- and odd-index results
     350             :                 // back into one register. The _mm256_packs_epi32 intrinsic returns
     351             :                 // a register with the pixels ordered as follows:
     352             :                 // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
     353           0 :                 const __m256i res = _mm256_packs_epi32(res_even, res_odd);
     354             :                 const __m256i res_clamped =
     355           0 :                     _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
     356             : 
     357             :                 // Store in a temporary array
     358           0 :                 yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
     359             :             }
     360             :         }
     361             :     }
     362             : 
     363             :     /* Vertical filter */
     364             :     {
     365           0 :         const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
     366             : 
     367             :         // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
     368           0 :         const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
     369             : 
     370             :         // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
     371           0 :         const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
     372             :         // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
     373           0 :         const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
     374             : 
     375             :         // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
     376           0 :         const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
     377             :         // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
     378           0 :         const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
     379             :         // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
     380           0 :         const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
     381             :         // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
     382           0 :         const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
     383             : 
     384             :         // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
     385           0 :         const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
     386             :         // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
     387           0 :         const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
     388             :         // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
     389           0 :         const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
     390             :         // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
     391           0 :         const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
     392             : 
     393             :         const __m256i round_const =
     394           0 :             _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
     395           0 :             (1 << (bd + conv_params->round_1 - 1)));
     396             : 
     397           0 :         for (int32_t i = 0; i < h; ++i) {
     398           0 :             for (int32_t j = 0; j < w; j += 16) {
     399           0 :                 const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
     400             : 
     401             :                 // Load 16-bit data from the output of the horizontal filter in
     402             :                 // which the pixels are ordered as follows:
     403             :                 // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
     404           0 :                 const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
     405           0 :                 const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
     406           0 :                 const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
     407           0 :                 const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
     408           0 :                 const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
     409           0 :                 const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
     410           0 :                 const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
     411           0 :                 const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
     412             : 
     413             :                 // Filter the even-indices, increasing to 32-bit precision
     414           0 :                 const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
     415           0 :                 const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
     416           0 :                 const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
     417           0 :                 const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
     418             : 
     419           0 :                 const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
     420           0 :                 const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
     421           0 :                 const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
     422           0 :                 const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
     423             : 
     424           0 :                 const __m256i res_even = _mm256_add_epi32(
     425             :                     _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
     426             : 
     427             :                 // Filter the odd-indices, increasing to 32-bit precision
     428           0 :                 const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
     429           0 :                 const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
     430           0 :                 const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
     431           0 :                 const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
     432             : 
     433           0 :                 const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
     434           0 :                 const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
     435           0 :                 const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
     436           0 :                 const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
     437             : 
     438           0 :                 const __m256i res_odd = _mm256_add_epi32(
     439             :                     _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
     440             : 
     441             :                 // Pixels are currently in the following order:
     442             :                 // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
     443             :                 // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
     444             :                 //
     445             :                 // Rearrange the pixels into the following order:
     446             :                 // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
     447             :                 // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
     448           0 :                 const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
     449           0 :                 const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
     450             : 
     451           0 :                 const __m256i res_lo_round = _mm256_srai_epi32(
     452             :                     _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
     453           0 :                 const __m256i res_hi_round = _mm256_srai_epi32(
     454             :                     _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
     455             : 
     456             :                 // Reduce to 16-bit precision and pack into the correct order:
     457             :                 // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
     458             :                 const __m256i res_16bit =
     459           0 :                     _mm256_packs_epi32(res_lo_round, res_hi_round);
     460           0 :                 const __m256i res_16bit_clamped = _mm256_min_epi16(
     461             :                     _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
     462             : 
     463             :                 // Store in the dst array
     464           0 :                 yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
     465             :             }
     466             :         }
     467             :     }
     468           0 : }

Generated by: LCOV version 1.14