LCOV - code coverage report
Current view: top level - ASM_AVX2 - av1_quantize_avx2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 183 183 100.0 %
Date: 2019-11-25 17:38:06 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : 
      14             : #include "aom_dsp_rtcd.h"
      15             : 
      16   162404000 : static INLINE void read_coeff(const TranLow *coeff, __m256i *c) {
      17             :     if (sizeof(TranLow) == 4) {
      18   162404000 :         const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
      19   324807000 :         const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
      20   162404000 :         *c = _mm256_packs_epi32(x0, x1);
      21   162404000 :         *c = _mm256_permute4x64_epi64(*c, 0xD8);
      22             :     }
      23             :     else
      24             :         *c = _mm256_loadu_si256((const __m256i *)coeff);
      25   162404000 : }
      26             : 
      27   196743000 : static INLINE void write_zero(TranLow *qcoeff) {
      28   196743000 :     const __m256i zero = _mm256_setzero_si256();
      29             :     if (sizeof(TranLow) == 4) {
      30             :         _mm256_storeu_si256((__m256i *)qcoeff, zero);
      31   196743000 :         _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
      32             :     }
      33             :     else
      34             :         _mm256_storeu_si256((__m256i *)qcoeff, zero);
      35   196743000 : }
      36             : 
      37   123743000 : static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
      38   123743000 :     const __m128i ac = _mm_unpackhi_epi64(*p, *p);
      39   123743000 :     *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
      40   123743000 : }
      41             : 
      42    41320900 : static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
      43             :     const int16_t *dequant_ptr, int log_scale,
      44             :     __m256i *thr, __m256i *qp) {
      45    41320900 :     __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
      46    41320900 :     const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
      47    41320900 :     const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
      48             : 
      49    41320900 :     if (log_scale > 0) {
      50      665428 :         const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
      51      665428 :         round = _mm_add_epi16(round, rnd);
      52     1330860 :         round = _mm_srai_epi16(round, log_scale);
      53             :     }
      54             : 
      55    41320900 :     init_one_qp(&round, &qp[0]);
      56    41324700 :     init_one_qp(&quant, &qp[1]);
      57             : 
      58    41290800 :     if (log_scale == 1)
      59     1295800 :         qp[1] = _mm256_slli_epi16(qp[1], log_scale);
      60             : 
      61    41290800 :     init_one_qp(&dequant, &qp[2]);
      62    41289900 :     *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
      63    41289900 : }
      64             : 
      65    41327000 : static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
      66    41327000 :     qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
      67    41327000 :     qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
      68    41327000 :     qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
      69    41327000 :     *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
      70    41327000 : }
      71             : 
      72             : #define store_quan(q, addr)                               \
      73             :   do {                                                    \
      74             :     __m256i sign_bits = _mm256_srai_epi16(q, 15);         \
      75             :     __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits);     \
      76             :     __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits);     \
      77             :     __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
      78             :     __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
      79             :     _mm256_storeu_si256((__m256i *)addr, x0);             \
      80             :     _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
      81             :   } while (0)
      82             : 
      83             : #define store_two_quan(q, addr1, dq, addr2)      \
      84             :   do {                                           \
      85             :     if (sizeof(TranLow) == 4) {                  \
      86             :       store_quan(q, addr1);                      \
      87             :       store_quan(dq, addr2);                     \
      88             :     } else {                                     \
      89             :       _mm256_storeu_si256((__m256i *)addr1, q);  \
      90             :       _mm256_storeu_si256((__m256i *)addr2, dq); \
      91             :     }                                            \
      92             :   } while (0)
      93             : 
      94    41329200 : static INLINE uint16_t quant_gather_eob(__m256i eob) {
      95    41329200 :     const __m128i eob_lo = _mm256_castsi256_si128(eob);
      96    41329200 :     const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
      97    41329200 :     __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
      98    82658300 :     eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
      99    41329200 :     eob_s = _mm_minpos_epu16(eob_s);
     100    41329200 :     return INT16_MAX - _mm_extract_epi16(eob_s, 0);
     101             : }
     102             : 
     103   135731000 : static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
     104             :     const int16_t *iscan_ptr, TranLow *qcoeff,
     105             :     TranLow *dqcoeff, __m256i *eob) {
     106   135731000 :     const __m256i abs_coeff = _mm256_abs_epi16(*c);
     107   135731000 :     __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
     108   407192000 :     mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
     109   135731000 :     const int nzflag = _mm256_movemask_epi8(mask);
     110             : 
     111   135731000 :     if (nzflag) {
     112    58105100 :         __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
     113    58105100 :         q = _mm256_mulhi_epi16(q, qp[1]);
     114    58105100 :         q = _mm256_sign_epi16(q, *c);
     115   116210000 :         const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
     116             : 
     117   348631000 :         store_two_quan(q, qcoeff, dq, dqcoeff);
     118    58105100 :         const __m256i zero = _mm256_setzero_si256();
     119    58105100 :         const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
     120    58105100 :         const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
     121    58105100 :         const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
     122    58105100 :         __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
     123    58105100 :         cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
     124   116210000 :         *eob = _mm256_max_epi16(*eob, cur_eob);
     125             :     }
     126             :     else {
     127    77625600 :         write_zero(qcoeff);
     128    78098100 :         write_zero(dqcoeff);
     129             :     }
     130   136186000 : }
     131             : 
     132    40668900 : void eb_av1_quantize_fp_avx2(const TranLow *coeff_ptr, intptr_t n_coeffs,
     133             :     const int16_t *zbin_ptr, const int16_t *round_ptr,
     134             :     const int16_t *quant_ptr,
     135             :     const int16_t *quant_shift_ptr,
     136             :     TranLow *qcoeff_ptr, TranLow *dqcoeff_ptr,
     137             :     const int16_t *dequant_ptr, uint16_t *eob_ptr,
     138             :     const int16_t *scan_ptr, const int16_t *iscan_ptr) {
     139             :     (void)scan_ptr;
     140             :     (void)zbin_ptr;
     141             :     (void)quant_shift_ptr;
     142    40668900 :     const unsigned int step = 16;
     143             : 
     144             :     __m256i qp[3];
     145             :     __m256i coeff, thr;
     146    40668900 :     const int log_scale = 0;
     147             : 
     148    40668900 :     init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
     149    40625200 :     read_coeff(coeff_ptr, &coeff);
     150             : 
     151    40639500 :     __m256i eob = _mm256_setzero_si256();
     152    40639500 :     quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
     153             : 
     154    40672900 :     coeff_ptr += step;
     155    40672900 :     qcoeff_ptr += step;
     156    40672900 :     dqcoeff_ptr += step;
     157    40672900 :     iscan_ptr += step;
     158    40672900 :     n_coeffs -= step;
     159             : 
     160    40672900 :     update_qp(log_scale, &thr, qp);
     161             : 
     162   136146000 :     while (n_coeffs > 0) {
     163    95475700 :         read_coeff(coeff_ptr, &coeff);
     164    95453100 :         quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
     165             : 
     166    95484900 :         coeff_ptr += step;
     167    95484900 :         qcoeff_ptr += step;
     168    95484900 :         dqcoeff_ptr += step;
     169    95484900 :         iscan_ptr += step;
     170    95484900 :         n_coeffs -= step;
     171             :     }
     172    40670300 :     *eob_ptr = quant_gather_eob(eob);
     173    40679100 : }
     174             : 
     175    25775600 : static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
     176             :     __m256i *c, const int16_t *iscan_ptr,
     177             :     TranLow *qcoeff, TranLow *dqcoeff,
     178             :     __m256i *eob) {
     179    25775600 :     const __m256i abs_coeff = _mm256_abs_epi16(*c);
     180    25775600 :     __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
     181    77326900 :     mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
     182    25775600 :     const int nzflag = _mm256_movemask_epi8(mask);
     183             : 
     184    25775600 :     if (nzflag) {
     185     5765730 :         __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
     186     5765730 :         q = _mm256_mulhi_epu16(q, qp[1]);
     187             : 
     188    11531500 :         __m256i dq = _mm256_mullo_epi16(q, qp[2]);
     189     5765730 :         dq = _mm256_srli_epi16(dq, 1);
     190             : 
     191     5765730 :         q = _mm256_sign_epi16(q, *c);
     192    11531500 :         dq = _mm256_sign_epi16(dq, *c);
     193             : 
     194    34594400 :         store_two_quan(q, qcoeff, dq, dqcoeff);
     195     5765730 :         const __m256i zero = _mm256_setzero_si256();
     196     5765730 :         const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
     197     5765730 :         const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
     198     5765730 :         const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
     199     5765730 :         __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
     200     5765730 :         cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
     201    11531500 :         *eob = _mm256_max_epi16(*eob, cur_eob);
     202             :     }
     203             :     else {
     204    20009900 :         write_zero(qcoeff);
     205    20025300 :         write_zero(dqcoeff);
     206             :     }
     207    25790000 : }
     208             : 
     209      647900 : void eb_av1_quantize_fp_32x32_avx2(
     210             :     const TranLow *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     211             :     const int16_t *round_ptr, const int16_t *quant_ptr,
     212             :     const int16_t *quant_shift_ptr, TranLow *qcoeff_ptr,
     213             :     TranLow *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     214             :     const int16_t *scan_ptr, const int16_t *iscan_ptr) {
     215             :     (void)scan_ptr;
     216             :     (void)zbin_ptr;
     217             :     (void)quant_shift_ptr;
     218      647900 :     const unsigned int step = 16;
     219             : 
     220             :     __m256i qp[3];
     221             :     __m256i coeff, thr;
     222      647900 :     const int log_scale = 1;
     223             : 
     224      647900 :     init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
     225      647904 :     read_coeff(coeff_ptr, &coeff);
     226             : 
     227      647903 :     __m256i eob = _mm256_setzero_si256();
     228      647903 :     quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
     229             : 
     230      647903 :     coeff_ptr += step;
     231      647903 :     qcoeff_ptr += step;
     232      647903 :     dqcoeff_ptr += step;
     233      647903 :     iscan_ptr += step;
     234      647903 :     n_coeffs -= step;
     235             : 
     236      647903 :     update_qp(log_scale, &thr, qp);
     237             : 
     238    25781500 :     while (n_coeffs > 0) {
     239    25133600 :         read_coeff(coeff_ptr, &coeff);
     240    25131700 :         quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
     241             :             &eob);
     242             : 
     243    25133600 :         coeff_ptr += step;
     244    25133600 :         qcoeff_ptr += step;
     245    25133600 :         dqcoeff_ptr += step;
     246    25133600 :         iscan_ptr += step;
     247    25133600 :         n_coeffs -= step;
     248             :     }
     249      647897 :     *eob_ptr = quant_gather_eob(eob);
     250      647893 : }
     251             : 
     252     1121270 : static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
     253             :     __m256i *c, const int16_t *iscan_ptr,
     254             :     TranLow *qcoeff, TranLow *dqcoeff,
     255             :     __m256i *eob) {
     256     1121270 :     const __m256i abs_coeff = _mm256_abs_epi16(*c);
     257     1121270 :     __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
     258     3363810 :     mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
     259     1121270 :     const int nzflag = _mm256_movemask_epi8(mask);
     260             : 
     261     1121270 :     if (nzflag) {
     262      720668 :         __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
     263      720668 :         __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
     264     1441340 :         __m256i ql = _mm256_mullo_epi16(q, qp[1]);
     265      720668 :         qh = _mm256_slli_epi16(qh, 2);
     266      720668 :         ql = _mm256_srli_epi16(ql, 14);
     267      720668 :         q = _mm256_or_si256(qh, ql);
     268     1441340 :         const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
     269     2162000 :         const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
     270      720668 :         __m256i dq = _mm256_or_si256(dqh, dql);
     271             : 
     272      720668 :         q = _mm256_sign_epi16(q, *c);
     273     1441340 :         dq = _mm256_sign_epi16(dq, *c);
     274             : 
     275     4324010 :         store_two_quan(q, qcoeff, dq, dqcoeff);
     276      720668 :         const __m256i zero = _mm256_setzero_si256();
     277      720668 :         const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
     278      720668 :         const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
     279      720668 :         const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
     280      720668 :         __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
     281      720668 :         cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
     282     1441340 :         *eob = _mm256_max_epi16(*eob, cur_eob);
     283             :     }
     284             :     else {
     285      400603 :         write_zero(qcoeff);
     286      400814 :         write_zero(dqcoeff);
     287             :     }
     288     1121480 : }
     289             : 
     290       17527 : void eb_av1_quantize_fp_64x64_avx2(
     291             :     const TranLow *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     292             :     const int16_t *round_ptr, const int16_t *quant_ptr,
     293             :     const int16_t *quant_shift_ptr, TranLow *qcoeff_ptr,
     294             :     TranLow *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     295             :     const int16_t *scan_ptr, const int16_t *iscan_ptr) {
     296             :     (void)scan_ptr;
     297             :     (void)zbin_ptr;
     298             :     (void)quant_shift_ptr;
     299       17527 :     const unsigned int step = 16;
     300             : 
     301             :     __m256i qp[3];
     302             :     __m256i coeff, thr;
     303       17527 :     const int log_scale = 2;
     304             : 
     305       17527 :     init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
     306       17528 :     read_coeff(coeff_ptr, &coeff);
     307             : 
     308       17527 :     __m256i eob = _mm256_setzero_si256();
     309       17527 :     quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
     310             : 
     311       17527 :     coeff_ptr += step;
     312       17527 :     qcoeff_ptr += step;
     313       17527 :     dqcoeff_ptr += step;
     314       17527 :     iscan_ptr += step;
     315       17527 :     n_coeffs -= step;
     316             : 
     317       17527 :     update_qp(log_scale, &thr, qp);
     318             : 
     319     1121390 :     while (n_coeffs > 0) {
     320     1103860 :         read_coeff(coeff_ptr, &coeff);
     321     1103800 :         quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
     322             :             &eob);
     323             : 
     324     1103860 :         coeff_ptr += step;
     325     1103860 :         qcoeff_ptr += step;
     326     1103860 :         dqcoeff_ptr += step;
     327     1103860 :         iscan_ptr += step;
     328     1103860 :         n_coeffs -= step;
     329             :     }
     330       17527 :     *eob_ptr = quant_gather_eob(eob);
     331       17527 : }

Generated by: LCOV version 1.14