LCOV - code coverage report
Current view: top level - ASM_AVX2 - cfl_avx2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06

              Hit    Total    Coverage
Lines:         92      163      56.4 %
Functions:      7       13      53.8 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : #include <immintrin.h>
                      : #include <assert.h> /* for the assert() in eb_cfl_predict_hbd_avx2 */
      12             : 
      13             : #include "EbDefinitions.h"
      14             : #include "aom_dsp_rtcd.h"
      15             : 
      16           0 : static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
      17             :                                         __m256i alpha_sign, __m256i dc_q0) {
      18           0 :   __m256i ac_q3 = _mm256_loadu_si256(input);
      19           0 :   __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
      20             :   __m256i scaled_luma_q0 =
      21           0 :       _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
      22           0 :   scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
      23           0 :   return _mm256_add_epi16(scaled_luma_q0, dc_q0);
      24             : }
      25             : 
      26    72914300 : static INLINE __m128i predict_unclipped_ssse3(const __m128i *input, __m128i alpha_q12,
      27             :     __m128i alpha_sign, __m128i dc_q0) {
      28    72914300 :     __m128i ac_q3 = _mm_loadu_si128(input);
      29    72914300 :     __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
      30   145829000 :     __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
      31    72914300 :     scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
      32    72914300 :     return _mm_add_epi16(scaled_luma_q0, dc_q0);
      33             : }
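
For reference, here is a scalar sketch of what both predict_unclipped variants compute per 16-bit lane (my own reconstruction from the intrinsics, not part of the source); _mm_mulhrs_epi16(a, b) evaluates (a * b + (1 << 14)) >> 15, i.e. a rounded Q15 multiply:

    #include <stdlib.h> /* abs */

    /* Hypothetical scalar model: ac_q3 is the zero-mean subsampled luma (Q3),
     * alpha_q3 the CfL scaling factor (Q3), dc_q0 the chroma DC prediction. */
    static int16_t predict_unclipped_scalar(int16_t ac_q3, int16_t alpha_q3,
                                            int16_t dc_q0) {
        const int32_t alpha_q12 = abs(alpha_q3) << 9; /* Q3 -> Q12, sign stripped */
        /* Q3 * Q12 gives a Q15 product; round and shift back down to Q0. */
        int32_t scaled = (abs(ac_q3) * alpha_q12 + (1 << 14)) >> 15;
        /* The two _mm_sign_epi16 calls restore sign(alpha_q3) * sign(ac_q3). */
        if ((alpha_q3 < 0) != (ac_q3 < 0)) scaled = -scaled;
        return (int16_t)(scaled + dc_q0);
    }
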
      34             : 
       35             : // Store the first (lowest) 32-bit element of a into memory (one 4-pixel row of 8-bit output).
      36    38981700 : static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
      37    38981700 :     *((int32_t *)mem_addr) = _mm_cvtsi128_si32(a);
      38    38981700 : }
      39             : 
      40    10368400 : void eb_cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
      41             :         uint8_t *pred,
      42             :         int32_t pred_stride,
      43             :         uint8_t *dst,
      44             :         int32_t dst_stride,
      45             :         int32_t alpha_q3,
      46             :         int32_t bit_depth,
      47             :         int32_t width,
      48             :         int32_t height) {
      49             :     (void) bit_depth;
      50    10368400 :     if (width <= 16)
      51             :     {
      52    20737000 :         const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
      53    10368500 :         const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
      54    10368500 :         const __m128i dc_q0 = _mm_set1_epi16(*pred);
      55    10368500 :         __m128i *row = (__m128i *)pred_buf_q3;
      56    10368500 :         const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
      57             :         do {
      58    64437000 :             __m128i res = predict_unclipped_ssse3(row, alpha_q12, alpha_sign, dc_q0);
      59    64385400 :             if (width < 16) {
      60    55895100 :                 res = _mm_packus_epi16(res, res);
      61    55895100 :                 if (width == 4)
      62    38983000 :                     _mm_storeh_epi32((__m128i *)dst, res);
      63             :                 else
      64    16912100 :                     _mm_storel_epi64((__m128i *)dst, res);
      65             :             }
      66             :             else {
      67     8490290 :                 __m128i next = predict_unclipped_ssse3(row + 1, alpha_q12, alpha_sign, dc_q0);
      68     8544310 :                 res = _mm_packus_epi16(res, next);
      69             :                 _mm_storeu_si128((__m128i *)dst, res);
      70             :             }
      71    64433000 :             dst += dst_stride;
      72    64433000 :         } while ((row += CFL_BUF_LINE_I128) < row_end);
      73             :     }
      74             :     else
      75             :     {
      76           0 :         const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
      77           0 :         const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
      78           0 :         const __m256i dc_q0 = _mm256_set1_epi16(*pred);
      79           0 :         __m256i *row = (__m256i *)pred_buf_q3;
      80           0 :         const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
      81             : 
      82             :         do {
      83           0 :             __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
      84           0 :             __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
      85           0 :             res = _mm256_packus_epi16(res, next);
      86           0 :             res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
      87             :             _mm256_storeu_si256((__m256i *)dst, res);
      88           0 :             dst += dst_stride;
      89           0 :             pred += pred_stride;
      90           0 :         } while ((row += CFL_BUF_LINE_I256) < row_end);
      91             :     }
      92    10364400 : }
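
The row stepping above relies on the CfL buffer layout constants from the headers. For orientation, these are the values libaom defines; I am assuming EbDefinitions.h mirrors them, so treat the exact numbers as an assumption:

    #define CFL_BUF_LINE      (32)                /* int16_t samples per buffer row */
    #define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3) /* 4 __m128i loads per row */
    #define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4) /* 2 __m256i loads per row */

With these values, row += CFL_BUF_LINE_I128 advances exactly one buffer row whatever the block width, which is why narrow widths can simply ignore the tail of each line.
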
      93             : 
      94           0 : static __m256i highbd_max_epi16(int32_t bd) {
      95           0 :   const __m256i neg_one = _mm256_set1_epi16(-1);
       96             :   // (1 << bd) - 1 == -(-1 << bd) - 1 == -1 - (-1 << bd) == -1 ^ (-1 << bd)
                      :   // (the last step uses -1 - x == ~x == -1 ^ x in two's complement).
      97           0 :   return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
      98             : }
      99             : 
     100           0 : static INLINE __m128i highbd_max_epi16_ssse3(int32_t bd) {
     101           0 :     const __m128i neg_one = _mm_set1_epi16(-1);
      102             :     // (1 << bd) - 1 == -(-1 << bd) - 1 == -1 - (-1 << bd) == -1 ^ (-1 << bd)
                      :     // (the last step uses -1 - x == ~x == -1 ^ x in two's complement).
     103           0 :     return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
     104             : }
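
The bit trick in the comments above is easy to sanity-check at concrete bit depths; a minimal self-test (mine, not from the source), mirroring the shift of -1 that the vector code performs:

    #include <assert.h>
    #include <stdint.h>

    /* For bd = 10: (1 << 10) - 1 = 0x03FF, (-1 << 10) ends in FC00,
     * and -1 ^ ...FC00 = 0x03FF, so both expressions agree. */
    static void check_highbd_max_identity(void) {
        for (int bd = 8; bd <= 12; bd += 2) {
            const int16_t direct  = (int16_t)((1 << bd) - 1);
            const int16_t via_xor = (int16_t)(-1 ^ (-1 << bd));
            assert(direct == via_xor);
        }
    }
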
     105             : 
     106           0 : static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
     107           0 :   return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
     108             : }
     109             : 
     110           0 : static INLINE __m128i highbd_clamp_epi16_ssse3(__m128i u, __m128i zero, __m128i max) {
     111           0 :     return _mm_max_epi16(_mm_min_epi16(u, max), zero);
     112             : }
     113             : 
     114           0 : void eb_cfl_predict_hbd_avx2(
     115             :     const int16_t *pred_buf_q3,
      116             :     uint16_t *pred,
     117             :     int32_t pred_stride,
      118             :     uint16_t *dst,
     119             :     int32_t dst_stride,
     120             :     int32_t alpha_q3,
     121             :     int32_t bit_depth,
     122             :     int32_t width,
     123             :     int32_t height) {
     124             :   // Use SSSE3 version for smaller widths
     125           0 :     if (width < 16)
     126             :     {
     127           0 :         const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
     128           0 :         const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
     129           0 :         const __m128i dc_q0 = _mm_set1_epi16(*pred);
     130           0 :         const __m128i max = highbd_max_epi16_ssse3(bit_depth);
     131           0 :         const __m128i zeros = _mm_setzero_si128();
     132           0 :         __m128i *row = (__m128i *)pred_buf_q3;
     133           0 :         const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
     134             :         do {
     135           0 :             __m128i res = predict_unclipped_ssse3(row, alpha_q12, alpha_sign, dc_q0);
     136           0 :             res = highbd_clamp_epi16_ssse3(res, zeros, max);
     137           0 :             if (width == 4)
     138           0 :                 _mm_storel_epi64((__m128i *)dst, res);
     139             :             else
     140             :                 _mm_storeu_si128((__m128i *)dst, res);
     141           0 :             dst += dst_stride;
     142           0 :         } while ((row += CFL_BUF_LINE_I128) < row_end);
     143             :     }
     144             :     else
     145             :     {
     146             :         assert(width == 16 || width == 32);
     147           0 :         const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
     148           0 :         const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
     149           0 :         const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)pred);
     150           0 :         const __m256i max = highbd_max_epi16(bit_depth);
     151             : 
     152           0 :         __m256i *row = (__m256i *)pred_buf_q3;
     153           0 :         const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
     154             :         do {
     155           0 :             const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
     156           0 :             _mm256_storeu_si256((__m256i *)dst,
     157             :                 highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
     158           0 :             if (width == 32) {
     159             :                 const __m256i res_1 =
     160           0 :                     predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
     161           0 :                 _mm256_storeu_si256(
     162           0 :                     (__m256i *)(dst + 16),
     163             :                     highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
     164             :             }
     165           0 :             dst += dst_stride;
     166           0 :             pred += pred_stride;
     167           0 :         } while ((row += CFL_BUF_LINE_I256) < row_end);
     168             :     }
     169           0 : }
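
Combining the helpers, the high-bit-depth path reduces per output sample to the following sketch (reusing the hypothetical predict_unclipped_scalar given earlier):

    /* Sketch only: clamp the unclipped prediction into [0, (1 << bd) - 1]. */
    static uint16_t predict_hbd_scalar(int16_t ac_q3, int16_t alpha_q3,
                                       uint16_t dc_q0, int bd) {
        int32_t v = predict_unclipped_scalar(ac_q3, alpha_q3, (int16_t)dc_q0);
        const int32_t max = (1 << bd) - 1;
        if (v < 0)   v = 0;
        if (v > max) v = max;
        return (uint16_t)v;
    }
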
     170             : 
     171             : // Returns a vector where all the (32-bits) elements are the sum of all the
     172             : // lanes in a.
     173       53193 : static INLINE __m256i fill_sum_epi32(__m256i a) {
     174             :   // Given that a == [A, B, C, D, E, F, G, H]
     175       53193 :   a = _mm256_hadd_epi32(a, a);
     176             :   // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
     177             :   // a == [A', C', A', C', E', G', E', G']
     178       53193 :   a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
     179             :   // a == [A', C', E', G', A', C', E', G']
     180       53193 :   a = _mm256_hadd_epi32(a, a);
     181             :   // Given that A'' == A' + C' and E'' == E' + G'
     182             :   // a == [A'', E'', A'', E'', A'', E'', A'', E'']
     183       53193 :   return _mm256_hadd_epi32(a, a);
     184             :   // Given that A''' == A'' + E''
     185             :   // a == [A''', A''', A''', A''', A''', A''', A''', A''']
     186             : }
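                      : // SSE2 counterpart of fill_sum_epi32: broadcast the sum of the four
                      : // 32-bit lanes of l0 to all four lanes with two shuffle+add steps.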
     187      470331 : static INLINE __m128i fill_sum_epi32_sse2(__m128i l0) {
     188      470331 :     l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
     189      940662 :     return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
     190             : }
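                      : // Widen a's 16-bit lanes to 32 bits and fold: within each 128-bit half,
                      : // out[i] = a[i] + a[i + 4]. The buffer holds non-negative luma sums at
                      : // this point, so zero-extending via unpack-with-zero is safe.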
     191      215502 : static INLINE __m256i _mm256_addl_epi16(__m256i a) {
     192     1077510 :   return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
     193             :                           _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
     194             : }
     195             : 
      196      523504 : /*static INLINE*/ void eb_subtract_average_avx2(int16_t *pred_buf_q3, int32_t width,
     197             :     int32_t height, int32_t round_offset,
     198             :     int32_t num_pel_log2) {
     199             :     // Use SSE2 version for smaller widths
     200             : 
     201      523504 :     if ((width == 4) || (width == 8))
     202      470331 :     {
     203      470312 :         const __m128i zeros = _mm_setzero_si128();
     204      470312 :         const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
     205      470312 :         const __m128i *src = (__m128i *)pred_buf_q3;
     206      470312 :         const __m128i *const end = src + height * CFL_BUF_LINE_I128;
     207      470312 :         const int32_t step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
     208             : 
     209      470312 :         __m128i sum = zeros;
     210             :         do {
     211             :             __m128i l0;
     212      920262 :             if (width == 4) {
     213      983424 :                 l0 = _mm_add_epi16(_mm_loadl_epi64(src),
     214      491712 :                     _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
     215     1475140 :                 __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
     216      491712 :                     _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
     217     1966850 :                 sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
     218             :                     _mm_unpacklo_epi16(l1, zeros)));
     219             :             }
     220             :             else {
     221     1285650 :                 l0 = _mm_add_epi16(_mm_loadu_si128(src),
     222      428550 :                         _mm_loadu_si128(src + CFL_BUF_LINE_I128));
     223     1714200 :                 sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
     224             :                     _mm_unpackhi_epi16(l0, zeros)));
     225             :             }
     226      920262 :             src += step;
     227      920262 :         } while (src < end);
     228             : 
     229      470312 :         sum = fill_sum_epi32_sse2(sum);
     230             : 
     231             :         __m128i avg_epi16 =
     232      940662 :             _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
     233      470331 :         avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
     234             : 
     235      470331 :         src = (__m128i *)pred_buf_q3;
     236      470331 :         __m128i *dst = (__m128i *)pred_buf_q3;
     237             :         do {
     238     2823920 :             if (width == 4)
     239     3933640 :                 _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
     240             :             else {
     241     1714190 :                 _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
     242      857097 :                 if (width > 8) {
     243           0 :                     _mm_storeu_si128(dst + 1,
     244           0 :                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
     245           0 :                     if (width == 32) {
     246           0 :                         _mm_storeu_si128(dst + 2,
     247           0 :                             _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
     248           0 :                         _mm_storeu_si128(dst + 3,
     249           0 :                             _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
     250             :                     }
     251             :                 }
     252             :             }
     253     2823920 :             src += CFL_BUF_LINE_I128;
     254     2823920 :             dst += CFL_BUF_LINE_I128;
     255     2823920 :         } while (src < end);
     256             :     }
     257             :     else
     258             :     {
     259       53192 :         const __m256i *src = (__m256i *)pred_buf_q3;
     260       53192 :         const __m256i *const end = src + height * CFL_BUF_LINE_I256;
     261             :         // To maximize usage of the AVX2 registers, we sum two rows per loop
     262             :         // iteration
     263       53192 :         const int32_t step = 2 * CFL_BUF_LINE_I256;
     264             : 
     265       53192 :         __m256i sum = _mm256_setzero_si256();
     266             :         // For width 32, we use a second sum accumulator to reduce accumulator
     267             :         // dependencies in the loop.
     268             :         __m256i sum2;
     269       53192 :         if (width == 32) sum2 = _mm256_setzero_si256();
     270             : 
     271             :         do {
     272             :             // Add top row to the bottom row
     273      431004 :             __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
     274      215502 :                 _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
     275      215502 :             sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
      276      215502 :             if (width == 32) { /* Don't worry, this "if" gets optimized out. */
     277             :                 // Add the second part of the top row to the second part of the bottom row
     278             :                 __m256i l1 =
     279           0 :                     _mm256_add_epi16(_mm256_loadu_si256(src + 1),
     280           0 :                     _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
     281           0 :                 sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
     282             :             }
     283      215502 :             src += step;
     284      215502 :         } while (src < end);
     285             :         // Combine both sum accumulators
     286       53192 :         if (width == 32) sum = _mm256_add_epi32(sum, sum2);
     287             : 
     288       53192 :         __m256i fill = fill_sum_epi32(sum);
     289             : 
     290      159579 :         __m256i avg_epi16 = _mm256_srli_epi32(
     291             :             _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
     292       53193 :         avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
     293             : 
     294             :         // Store and subtract loop
     295       53193 :         src = (__m256i *)pred_buf_q3;
     296       53193 :         __m256i *dst = (__m256i *)pred_buf_q3;
     297             :         do {
     298      862014 :             _mm256_storeu_si256(dst,
     299             :                 _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
     300      431007 :             if (width == 32) {
     301           0 :                 _mm256_storeu_si256(
     302           0 :                     dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
     303             :             }
     304      431007 :             src += CFL_BUF_LINE_I256;
     305      431007 :             dst += CFL_BUF_LINE_I256;
     306      431007 :         } while (src < end);
     307             :     }
     308      523524 : }
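
All of the width-specialized SIMD above implements one simple operation. A scalar reference (a sketch, assuming the CFL_BUF_LINE row stride noted earlier) is:

    /* Subtract the rounded block average from every Q3 sample in place,
     * turning the non-negative luma buffer into a zero-mean AC contribution. */
    static void subtract_average_scalar(int16_t *pred_buf_q3, int width,
                                        int height, int round_offset,
                                        int num_pel_log2) {
        int32_t sum = 0;
        for (int j = 0; j < height; j++)
            for (int i = 0; i < width; i++)
                sum += pred_buf_q3[j * CFL_BUF_LINE + i];
        const int16_t avg = (int16_t)((sum + round_offset) >> num_pel_log2);
        for (int j = 0; j < height; j++)
            for (int i = 0; i < width; i++)
                pred_buf_q3[j * CFL_BUF_LINE + i] -= avg;
    }
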

Generated by: LCOV version 1.14