LCOV - code coverage report
Current view: top level - ASM_AVX512 - encodetxb_avx512.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 104 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 3 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include <immintrin.h> /* AVX2 */
       7             : 
       8             : #include "EbDefinitions.h"
       9             : #include "synonyms.h"
      10             : #include "synonyms_avx2.h"
      11             : /* Loads two 512-bit vectors from coeff and returns 32 bytes in source order: each 32-bit element is packed to 16 bits (signed saturation), made absolute, then packed to 8 bits (signed saturation). */
      12           0 : static INLINE __m256i txb_init_levels_32_avx512(const TranLow *const coeff) {
      13             :     const __m512i idx =
      14           0 :         _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0); /* upper 8 indices are don't-care: only the low 256 bits of the permuted result are returned */
      15           0 :     const __m512i c0 = _mm512_loadu_si512((__m512i *)(coeff + 0 * 16));
      16           0 :     const __m512i c1 = _mm512_loadu_si512((__m512i *)(coeff + 1 * 16)); /* starts 16 elements after c0; contiguous with c0 if TranLow is 32-bit (type defined elsewhere — confirm) */
      17           0 :     const __m512i c01 = _mm512_packs_epi32(c0, c1); /* 32-bit -> 16-bit with signed saturation; works per 128-bit lane, so c0/c1 quarters end up interleaved */
      18           0 :     const __m512i abs01 = _mm512_abs_epi16(c01);
      19           0 :     const __m512i abs_8 = _mm512_packs_epi16(abs01, abs01); /* 16-bit -> 8-bit with signed saturation; the second operand just duplicates the first */
      20           0 :     const __m512i res = _mm512_permutexvar_epi32(idx, abs_8); /* gather dwords 0,4,8,12 (from c0) then 1,5,9,13 (from c1) to undo the per-lane interleave */
      21           0 :     return _mm512_castsi512_si256(res);
      22             : }
      23             : /* Loads four 512-bit vectors from coeff and returns 64 bytes in source order: each 32-bit element is packed to 16 bits (signed saturation), made absolute, then packed to 8 bits (signed saturation). */
      24           0 : static INLINE __m512i txb_init_levels_64_avx512(const TranLow *const coeff) {
      25             :     const __m512i idx =
      26           0 :         _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); /* dword gather order that puts the c0, c1, c2, c3 results back to back */
      27           0 :     const __m512i c0 = _mm512_loadu_si512((__m512i *)(coeff + 0 * 16));
      28           0 :     const __m512i c1 = _mm512_loadu_si512((__m512i *)(coeff + 1 * 16));
      29           0 :     const __m512i c2 = _mm512_loadu_si512((__m512i *)(coeff + 2 * 16));
      30           0 :     const __m512i c3 = _mm512_loadu_si512((__m512i *)(coeff + 3 * 16)); /* the four loads are contiguous if TranLow is 32-bit (type defined elsewhere — confirm) */
      31           0 :     const __m512i c01 = _mm512_packs_epi32(c0, c1); /* 32-bit -> 16-bit with signed saturation, per 128-bit lane */
      32           0 :     const __m512i c23 = _mm512_packs_epi32(c2, c3);
      33           0 :     const __m512i abs01 = _mm512_abs_epi16(c01);
      34           0 :     const __m512i abs23 = _mm512_abs_epi16(c23);
      35           0 :     const __m512i abs_8 = _mm512_packs_epi16(abs01, abs23); /* 16-bit -> 8-bit with signed saturation; each 128-bit lane now holds one dword from each of c0..c3 */
      36           0 :     return _mm512_permutexvar_epi32(idx, abs_8); /* undo the per-lane interleave left by the two pack stages */
      37             : }
      38             : /* Fills levels from coeff: each output row holds the narrowed absolute coefficients of one input row followed by 4 zero bytes; the bytes just before the first row and just after the last row are zeroed too. */
      39           0 : void eb_av1_txb_init_levels_avx512(const TranLow *const coeff, /* input coefficients, read contiguously */
      40             :     const int32_t width, const int32_t height, /* width picks the branch (4, 8, 16, otherwise 32 per row); height counts rows and must be a positive multiple of the rows consumed per pass (4, or 2 in the last branch) or the do-while loops never reach zero */
      41             :     uint8_t *const levels) { /* output; also written at negative offsets, so it must point into a larger buffer */
      42           0 :     const TranLow *cf = coeff;
      43           0 :     const __m128i x_zeros = _mm_setzero_si128();
      44           0 :     uint8_t *ls = levels;
      45           0 :     int32_t i = height;
      46             : /* NOTE(review): the xx_/yy_ load/store helpers and _mm_storeh_epi64 are declared outside this listing; byte counts in the comments below assume the 64/128/256-bit unaligned accesses their names suggest. */
      47           0 :     if (width == 4) {
      48           0 :         const __m256i y_zeros = _mm256_setzero_si256();
      49             : /* zero the 16 bytes (2 rows of 8) immediately before levels */
      50           0 :         xx_storeu_128(ls - 16, x_zeros);
      51             : /* each pass turns 16 coefficients (4 rows of 4) into 4 output rows of 8 bytes: 4 levels + 4 zeros */
      52             :         do {
      53           0 :             const __m256i c0 = yy_loadu_256(cf);
      54           0 :             const __m256i c1 = yy_loadu_256(cf + 8);
      55           0 :             const __m256i c01 = _mm256_packs_epi32(c0, c1);
      56           0 :             const __m256i abs01 = _mm256_abs_epi16(c01);
      57           0 :             const __m256i abs_8 = _mm256_packs_epi16(abs01, y_zeros); /* the zero operand supplies the padding bytes */
      58           0 :             const __m256i res_ = _mm256_shuffle_epi32(abs_8, 0xd8); /* within each lane: pair every 4-byte row with a zero dword */
      59           0 :             const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); /* swap the middle qwords so the rows come out in order 0,1,2,3 */
      60           0 :             yy_storeu_256(ls, res);
      61           0 :             cf += 4 * 4;
      62           0 :             ls += 4 * 8;
      63           0 :             i -= 4;
      64           0 :         } while (i);
      65             : /* zero 32 bytes (4 rows of 8) after the last row */
      66           0 :         yy_storeu_256(ls, y_zeros);
      67             :     }
      68           0 :     else if (width == 8) {
      69           0 :         const __m256i y_zeros = _mm256_setzero_si256();
      70             : /* 32-byte zero store starting 24 bytes (2 rows of 12) before levels; it also covers levels[0..7], which the loop then overwrites */
      71           0 :         yy_storeu_256(ls - 24, y_zeros);
      72             : /* each pass: 32 coefficients -> 4 rows of 8 levels, each followed by 4 zero bytes (row stride 12) */
      73             :         do {
      74           0 :             const __m256i res = txb_init_levels_32_avx512(cf);
      75           0 :             const __m128i res0 = _mm256_castsi256_si128(res);
      76           0 :             const __m128i res1 = _mm256_extracti128_si256(res, 1);
      77           0 :             xx_storel_64(ls + 0 * 12 + 0, res0);
      78           0 :             *(int32_t *)(ls + 0 * 12 + 8) = 0; /* NOTE(review): int32_t store through a uint8_t pointer with no visible alignment guarantee — confirm this is acceptable for the target/compiler */
      79           0 :             _mm_storeh_epi64((__m128i *)(ls + 1 * 12 + 0), res0);
      80           0 :             *(int32_t *)(ls + 1 * 12 + 8) = 0;
      81           0 :             xx_storel_64(ls + 2 * 12 + 0, res1);
      82           0 :             *(int32_t *)(ls + 2 * 12 + 8) = 0;
      83           0 :             _mm_storeh_epi64((__m128i *)(ls + 3 * 12 + 0), res1);
      84           0 :             *(int32_t *)(ls + 3 * 12 + 8) = 0;
      85           0 :             cf += 4 * 8;
      86           0 :             ls += 4 * 12;
      87           0 :             i -= 4;
      88           0 :         } while (i);
      89             : /* zero 48 bytes (4 rows of 12) after the last row */
      90           0 :         yy_storeu_256(ls + 0 * 32, y_zeros);
      91           0 :         xx_storeu_128(ls + 1 * 32, x_zeros);
      92             :     }
      93           0 :     else if (width == 16) {
      94           0 :         const __m256i y_zeros = _mm256_setzero_si256();
      95           0 :         const __m512i z_zeros = _mm512_setzero_si512();
      96             : /* zero the 40 bytes (2 rows of 20) before levels: 32 at ls - 40 plus 8 at ls - 8 */
      97           0 :         yy_storeu_256(ls - 40, y_zeros);
      98           0 :         xx_storel_64(ls - 8, x_zeros);
      99             : /* each pass: 64 coefficients -> 4 rows of 16 levels, each followed by 4 zero bytes (row stride 20) */
     100             :         do {
     101           0 :             const __m512i res = txb_init_levels_64_avx512(cf);
     102           0 :             const __m256i r0 = _mm512_castsi512_si256(res);
     103           0 :             const __m256i r1 = _mm512_extracti64x4_epi64(res, 1);
     104           0 :             const __m128i res0 = _mm256_castsi256_si128(r0);
     105           0 :             const __m128i res1 = _mm256_extracti128_si256(r0, 1);
     106           0 :             const __m128i res2 = _mm256_castsi256_si128(r1);
     107           0 :             const __m128i res3 = _mm256_extracti128_si256(r1, 1);
     108           0 :             xx_storeu_128(ls + 0 * 20, res0);
     109           0 :             *(int32_t *)(ls + 0 * 20 + 16) = 0;
     110           0 :             xx_storeu_128(ls + 1 * 20, res1);
     111           0 :             *(int32_t *)(ls + 1 * 20 + 16) = 0;
     112           0 :             xx_storeu_128(ls + 2 * 20, res2);
     113           0 :             *(int32_t *)(ls + 2 * 20 + 16) = 0;
     114           0 :             xx_storeu_128(ls + 3 * 20, res3);
     115           0 :             *(int32_t *)(ls + 3 * 20 + 16) = 0;
     116           0 :             cf += 4 * 16;
     117           0 :             ls += 4 * 20;
     118           0 :             i -= 4;
     119           0 :         } while (i);
     120             : /* zero 80 bytes (4 rows of 20) after the last row */
     121             :         _mm512_storeu_si512((__m512i *)(ls + 0 * 64), z_zeros);
     122           0 :         xx_storeu_128(ls + 1 * 64, x_zeros);
     123             :     }
     124             :     else { /* every other width is processed as 32 values per row (row stride 36) */
     125           0 :         const __m512i z_zeros = _mm512_setzero_si512();
     126             : /* zero the 72 bytes (2 rows of 36) before levels: 64 at ls - 72 plus 8 at ls - 8 */
     127           0 :         _mm512_storeu_si512((__m512i *)(ls - 72), z_zeros);
     128           0 :         xx_storel_64(ls - 8, x_zeros);
     129             : /* each pass: 64 coefficients -> 2 rows of 32 levels, each followed by 4 zero bytes */
     130             :         do {
     131           0 :             const __m512i res = txb_init_levels_64_avx512(cf);
     132           0 :             const __m256i res0 = _mm512_castsi512_si256(res);
     133           0 :             const __m256i res1 = _mm512_extracti64x4_epi64(res, 1);
     134           0 :             yy_storeu_256(ls, res0);
     135           0 :             *(int32_t *)(ls + 32) = 0;
     136           0 :             yy_storeu_256(ls + 36, res1);
     137           0 :             *(int32_t *)(ls + 36 + 32) = 0;
     138           0 :             cf += 2 * 32;
     139           0 :             ls += 2 * 36;
     140           0 :             i -= 2;
     141           0 :         } while (i);
     142             : /* zero 144 bytes (4 rows of 36) after the last row */
     143             :         _mm512_storeu_si512((__m512i *)(ls + 0 * 64), z_zeros);
     144           0 :         _mm512_storeu_si512((__m512i *)(ls + 1 * 64), z_zeros);
     145           0 :         xx_storeu_128(ls + 2 * 64, x_zeros);
     146             :     }
     147           0 : }

Generated by: LCOV version 1.14