LCOV - code coverage report
Current view: top level - ASM_AVX2 - encodetxb_avx2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 84 84 100.0 %
Date: 2019-11-25 17:38:06 Functions: 2 2 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h> /* AVX2 */
      13             : 
      14             : #include "EbDefinitions.h"
      15             : #include "synonyms.h"
      16             : #include "synonyms_avx2.h"
      17             : 
      18   206676000 : static INLINE __m256i txb_init_levels_avx2(const TranLow *const coeff) {
      19   206676000 :     const __m256i idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
      20   206676000 :     const __m256i c0 = yy_loadu_256(coeff + 0 * 8);
      21   206941000 :     const __m256i c1 = yy_loadu_256(coeff + 1 * 8);
      22   207095000 :     const __m256i c2 = yy_loadu_256(coeff + 2 * 8);
      23   207012000 :     const __m256i c3 = yy_loadu_256(coeff + 3 * 8);
      24   207009000 :     const __m256i c01 = _mm256_packs_epi32(c0, c1);
      25   207009000 :     const __m256i c23 = _mm256_packs_epi32(c2, c3);
      26   207009000 :     const __m256i abs01 = _mm256_abs_epi16(c01);
      27   207009000 :     const __m256i abs23 = _mm256_abs_epi16(c23);
      28   207009000 :     const __m256i res = _mm256_packs_epi16(abs01, abs23);
      29   207009000 :     return _mm256_permutevar8x32_epi32(res, idx);
      30             : }
      31             : 
      32    74170600 : void eb_av1_txb_init_levels_avx2(const TranLow *const coeff,
      33             :     const int32_t width, const int32_t height,
      34             :     uint8_t *const levels) {
      35    74170600 :     const TranLow *cf = coeff;
      36    74170600 :     const __m128i x_zeros = _mm_setzero_si128();
      37    74170600 :     const __m256i y_zeros = _mm256_setzero_si256();
      38    74170600 :     uint8_t *ls = levels;
      39    74170600 :     int32_t i = height;
      40             : 
      41    74170600 :     if (width == 4) {
      42    31364000 :         xx_storeu_128(ls - 16, x_zeros);
      43             : 
      44             :         do {
      45    45504400 :             const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
      46    45504400 :             const __m256i c0 = yy_loadu_256(cf);
      47    45500400 :             const __m256i c1 = yy_loadu_256(cf + 8);
      48    45471700 :             const __m256i c01 = _mm256_packs_epi32(c0, c1);
      49    45471700 :             const __m256i abs01 = _mm256_abs_epi16(c01);
      50    45471700 :             const __m256i res_ = _mm256_packs_epi16(abs01, y_zeros);
      51    45471700 :             const __m256i res = _mm256_permutevar8x32_epi32(res_, idx);
      52    45471700 :             yy_storeu_256(ls, res);
      53    45472200 :             cf += 4 * 4;
      54    45472200 :             ls += 4 * 8;
      55    45472200 :             i -= 4;
      56    45472200 :         } while (i);
      57             : 
      58    31363100 :         yy_storeu_256(ls, y_zeros);
      59             :     }
      60    42806600 :     else if (width == 8) {
      61    27149800 :         yy_storeu_256(ls - 24, y_zeros);
      62             : 
      63             :         do {
      64    62079400 :             const __m256i res = txb_init_levels_avx2(cf);
      65    62174800 :             const __m128i res0 = _mm256_castsi256_si128(res);
      66    62174800 :             const __m128i res1 = _mm256_extracti128_si256(res, 1);
      67    62174800 :             xx_storel_64(ls + 0 * 12 + 0, res0);
      68    62180000 :             *(int32_t *)(ls + 0 * 12 + 8) = 0;
      69    62180000 :             _mm_storeh_epi64((__m128i *)(ls + 1 * 12 + 0), res0);
      70    62134800 :             *(int32_t *)(ls + 1 * 12 + 8) = 0;
      71    62134800 :             xx_storel_64(ls + 2 * 12 + 0, res1);
      72    62096400 :             *(int32_t *)(ls + 2 * 12 + 8) = 0;
      73    62096400 :             _mm_storeh_epi64((__m128i *)(ls + 3 * 12 + 0), res1);
      74    62092200 :             *(int32_t *)(ls + 3 * 12 + 8) = 0;
      75    62092200 :             cf += 4 * 8;
      76    62092200 :             ls += 4 * 12;
      77    62092200 :             i -= 4;
      78    62092200 :         } while (i);
      79             : 
      80    27161200 :         yy_storeu_256(ls + 0 * 32, y_zeros);
      81    27159700 :         xx_storeu_128(ls + 1 * 32, x_zeros);
      82             :     }
      83    15656800 :     else if (width == 16) {
      84    12606100 :         yy_storeu_256(ls - 40, y_zeros);
      85    12605400 :         xx_storel_64(ls - 8, x_zeros);
      86             : 
      87             :         do {
      88    77433800 :             const __m256i res = txb_init_levels_avx2(cf);
      89    77485100 :             const __m128i res0 = _mm256_castsi256_si128(res);
      90    77485100 :             const __m128i res1 = _mm256_extracti128_si256(res, 1);
      91    77485100 :             xx_storeu_128(ls, res0);
      92    77477700 :             *(int32_t *)(ls + 16) = 0;
      93    77477700 :             xx_storeu_128(ls + 20, res1);
      94    77440300 :             *(int32_t *)(ls + 20 + 16) = 0;
      95    77440300 :             cf += 2 * 16;
      96    77440300 :             ls += 2 * 20;
      97    77440300 :             i -= 2;
      98    77440300 :         } while (i);
      99             : 
     100    12611600 :         yy_storeu_256(ls + 0 * 32, y_zeros);
     101    12611100 :         yy_storeu_256(ls + 1 * 32, y_zeros);
     102    12609000 :         xx_storeu_128(ls + 2 * 32, x_zeros);
     103             :     }
     104             :     else {
     105     3050690 :         yy_storeu_256(ls - 72, y_zeros);
     106     3265180 :         yy_storeu_256(ls - 40, y_zeros);
     107     3265060 :         xx_storel_64(ls - 8, x_zeros);
     108             : 
     109             :         do {
     110    68058000 :             const __m256i res = txb_init_levels_avx2(cf);
     111    68088400 :             yy_storeu_256(ls, res);
     112    68058500 :             *(int32_t *)(ls + 32) = 0;
     113    68058500 :             cf += 32;
     114    68058500 :             ls += 36;
     115    68058500 :         } while (--i);
     116             : 
     117     3265620 :         yy_storeu_256(ls + 0 * 32, y_zeros);
     118     3265510 :         yy_storeu_256(ls + 1 * 32, y_zeros);
     119     3265520 :         yy_storeu_256(ls + 2 * 32, y_zeros);
     120     3265480 :         yy_storeu_256(ls + 3 * 32, y_zeros);
     121     3265510 :         xx_storeu_128(ls + 4 * 32, x_zeros);
     122             :     }
     123    74387700 : }

Generated by: LCOV version 1.14