LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbTransforms_Intrinsic_AVX2.c (source / functions)
Test:         coverage.info
Date:         2019-11-25 17:38:06

                 Hit    Total    Coverage
Lines:            83      405      20.5 %
Functions:        12       19      63.2 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include "EbTransforms_AVX2.h"
       7             : 
       8             : #include <emmintrin.h>
       9             : #include <immintrin.h>
      10             : 
      11             : // Coefficients for forward 16/32-point transform
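                      : // Each 32-entry row of this table holds 16 signed 16-bit butterfly coefficients
                      : // duplicated across the two 128-bit lanes, so a single 256-bit load feeds the same
                      : // dot product for two input rows at once. transform32_AVX2_INTRIN indexes the table
                      : // as an array of __m256i (coeff32[0..47]), which is why it is declared 32-byte aligned.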
      12             : #ifdef __GNUC__
      13             : __attribute__((aligned(16)))
      14             : #endif
      15             : EB_ALIGN(32) const int16_t coeff_tbl_AVX2[48 * 16] =
      16             : {
      17             :     64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50,
      18             :     64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89,
      19             :     64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75,
      20             :     64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18,
      21             :     90, 87, 87, 57, 80, 9, 70, -43, 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25, 57, -80, 43, -90, 25, -70, 9, -25,
      22             :     80, 70, 9, -43, -70, -87, -87, 9, 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57, -25, 90, 57, 25, 90, -80, 43, -57,
      23             :     57, 43, -80, -90, -25, 57, 90, 25, 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80, -9, -87, -87, 70, 43, 9, 70, -80,
      24             :     25, 9, -70, -25, 90, 43, -80, -57, 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90, 43, 70, 9, -80, -57, 87, 87, -90,
      25             :     90, 90, 90, 82, 88, 67, 85, 46, 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54, 82, 22, 78, -4, 73, -31, 67, -54,
      26             :     61, -73, 54, -85, 46, -90, 38, -88, 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13, 31, -78, 22, -61, 13, -38, 4, -13,
      27             :     88, 85, 67, 46, 31, -13, -13, -67, 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38, -54, -90, -82, -73, -90, -22, -78, 38,
      28             :     -46, 82, -4, 88, 38, 54, 73, -4, -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31, 90, -61, 85, -90, 61, -78, 22, -31,
      29             :     82, 78, 22, -4, -54, -82, -90, -73, 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22, -61, 13, 13, 85, 78, 67, 85, -22,
      30             :     31, -88, -46, -61, -90, 31, -67, 90, 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46, 4, 54, 73, -38, 88, -90, 38, -46,
      31             :     73, 67, -31, -54, -90, -78, -22, 38, 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4, 78, 85, 67, -22, -38, -90, -90, 4,
      32             :     -13, 90, 82, 13, 61, -88, -46, -31, -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61, -88, 82, -4, 46, 85, -73, 54, -61,
      33             :     61, 54, -73, -85, -46, -4, 82, 88, 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13, 31, -46, -88, -61, -13, 82, 90, 13,
      34             :     -4, -90, -90, 38, 22, 67, 85, -78, -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73, -38, -22, -78, 90, 54, -31, 67, -73,
      35             :     46, 38, -90, -88, 38, 73, 54, -4, 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31, -90, -67, 31, 90, 61, -46, -88, -31,
      36             :     22, 85, 67, -78, -85, 13, 13, 61, 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82, 73, -90, -82, 54, 4, 22, 78, -82,
      37             :     31, 22, -78, -61, 90, 85, -61, -90, 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46, 4, 73, 54, -38, -88, -4, 82, 46,
      38             :     -38, -78, -22, 90, 73, -82, -90, 54, -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88, 67, -13, -13, -31, -46, 67, 85, -88,
      39             :     13, 4, -38, -13, 61, 22, -78, -31, 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61, 88, 38, -90, -46, 85, 54, -73, -61,
      40             :     54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90
      41             : };
      42             : 
      43             : /*******************************************************************************
       44             : * Requirement: area_size is 4, 8, or a multiple of 16
      45             : *******************************************************************************/
      46             : 
      47             : // transpose 16x16 block of data
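                      : // The 16x16 block is handled as two 8-row x 16-column panels (the i loop). Within
                      : // each panel, three rounds of 16-bit unpacklo/unpackhi transpose the 8x8 sub-block
                      : // held in each 128-bit lane, and the final 128-bit extracts write the results to
                      : // destination rows 0-7 (lane 0) and rows 8-15 (lane 1).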
      48           0 : void transpose16_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride)
      49             : {
      50             :     uint32_t i;
      51           0 :     for (i = 0; i < 2; i++)
      52             :     {
      53             :         __m256i a0, a1, a2, a3, a4, a5, a6, a7;
      54             :         __m256i b0, b1, b2, b3, b4, b5, b6, b7;
      55             : 
      56           0 :         a0 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 0)*src_stride));
      57           0 :         a1 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 1)*src_stride));
      58           0 :         a2 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 2)*src_stride));
      59           0 :         a3 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 3)*src_stride));
      60           0 :         a4 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 4)*src_stride));
      61           0 :         a5 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 5)*src_stride));
      62           0 :         a6 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 6)*src_stride));
      63           0 :         a7 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 7)*src_stride));
      64             : 
      65           0 :         b0 = _mm256_unpacklo_epi16(a0, a4);
      66           0 :         b1 = _mm256_unpacklo_epi16(a1, a5);
      67           0 :         b2 = _mm256_unpacklo_epi16(a2, a6);
      68           0 :         b3 = _mm256_unpacklo_epi16(a3, a7);
      69           0 :         b4 = _mm256_unpackhi_epi16(a0, a4);
      70           0 :         b5 = _mm256_unpackhi_epi16(a1, a5);
      71           0 :         b6 = _mm256_unpackhi_epi16(a2, a6);
      72           0 :         b7 = _mm256_unpackhi_epi16(a3, a7);
      73             : 
      74           0 :         a0 = _mm256_unpacklo_epi16(b0, b2);
      75           0 :         a1 = _mm256_unpacklo_epi16(b1, b3);
      76           0 :         a2 = _mm256_unpackhi_epi16(b0, b2);
      77           0 :         a3 = _mm256_unpackhi_epi16(b1, b3);
      78           0 :         a4 = _mm256_unpacklo_epi16(b4, b6);
      79           0 :         a5 = _mm256_unpacklo_epi16(b5, b7);
      80           0 :         a6 = _mm256_unpackhi_epi16(b4, b6);
      81           0 :         a7 = _mm256_unpackhi_epi16(b5, b7);
      82             : 
      83           0 :         b0 = _mm256_unpacklo_epi16(a0, a1);
      84           0 :         b1 = _mm256_unpackhi_epi16(a0, a1);
      85           0 :         b2 = _mm256_unpacklo_epi16(a2, a3);
      86           0 :         b3 = _mm256_unpackhi_epi16(a2, a3);
      87           0 :         b4 = _mm256_unpacklo_epi16(a4, a5);
      88           0 :         b5 = _mm256_unpackhi_epi16(a4, a5);
      89           0 :         b6 = _mm256_unpacklo_epi16(a6, a7);
      90           0 :         b7 = _mm256_unpackhi_epi16(a6, a7);
      91             : 
      92           0 :         _mm_storeu_si128((__m128i *)(dst + 0 * dst_stride + 8 * i), _mm256_extracti128_si256(b0, 0));
      93           0 :         _mm_storeu_si128((__m128i *)(dst + 1 * dst_stride + 8 * i), _mm256_extracti128_si256(b1, 0));
      94           0 :         _mm_storeu_si128((__m128i *)(dst + 2 * dst_stride + 8 * i), _mm256_extracti128_si256(b2, 0));
      95           0 :         _mm_storeu_si128((__m128i *)(dst + 3 * dst_stride + 8 * i), _mm256_extracti128_si256(b3, 0));
      96           0 :         _mm_storeu_si128((__m128i *)(dst + 4 * dst_stride + 8 * i), _mm256_extracti128_si256(b4, 0));
      97           0 :         _mm_storeu_si128((__m128i *)(dst + 5 * dst_stride + 8 * i), _mm256_extracti128_si256(b5, 0));
      98           0 :         _mm_storeu_si128((__m128i *)(dst + 6 * dst_stride + 8 * i), _mm256_extracti128_si256(b6, 0));
      99           0 :         _mm_storeu_si128((__m128i *)(dst + 7 * dst_stride + 8 * i), _mm256_extracti128_si256(b7, 0));
     100           0 :         _mm_storeu_si128((__m128i *)(dst + 8 * dst_stride + 8 * i), _mm256_extracti128_si256(b0, 1));
     101           0 :         _mm_storeu_si128((__m128i *)(dst + 9 * dst_stride + 8 * i), _mm256_extracti128_si256(b1, 1));
     102           0 :         _mm_storeu_si128((__m128i *)(dst + 10 * dst_stride + 8 * i), _mm256_extracti128_si256(b2, 1));
     103           0 :         _mm_storeu_si128((__m128i *)(dst + 11 * dst_stride + 8 * i), _mm256_extracti128_si256(b3, 1));
     104           0 :         _mm_storeu_si128((__m128i *)(dst + 12 * dst_stride + 8 * i), _mm256_extracti128_si256(b4, 1));
     105           0 :         _mm_storeu_si128((__m128i *)(dst + 13 * dst_stride + 8 * i), _mm256_extracti128_si256(b5, 1));
     106           0 :         _mm_storeu_si128((__m128i *)(dst + 14 * dst_stride + 8 * i), _mm256_extracti128_si256(b6, 1));
     107           0 :         _mm_storeu_si128((__m128i *)(dst + 15 * dst_stride + 8 * i), _mm256_extracti128_si256(b7, 1));
     108             :     }
     109           0 : }
     110             : 
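                      : // Editor's note: illustrative scalar equivalent (not part of the original file) of
                      : // the result produced by transpose16_AVX2_INTRIN above and transpose32_AVX2_INTRIN
                      : // below; 'size' would be 16 or 32 respectively. The name is hypothetical.
                      : static INLINE void transpose_c_sketch(const int16_t *src, uint32_t src_stride,
                      :     int16_t *dst, uint32_t dst_stride, uint32_t size)
                      : {
                      :     uint32_t r, c;
                      :     for (r = 0; r < size; r++)
                      :         for (c = 0; c < size; c++)
                      :             dst[c * dst_stride + r] = src[r * src_stride + c];
                      : }
                      : 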
     111             : // transpose 32x32 block of data
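                      : // Same panel-by-panel scheme as transpose16_AVX2_INTRIN, applied to a 4x2 grid of
                      : // 8-row x 16-column tiles (the i and j loops).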
     112           0 : void transpose32_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride)
     113             : {
     114             :     uint32_t i, j;
     115           0 :     for (i = 0; i < 4; i++)
     116             :     {
     117           0 :         for (j = 0; j < 2; j++)
     118             :         {
     119             :             __m256i a0, a1, a2, a3, a4, a5, a6, a7;
     120             :             __m256i b0, b1, b2, b3, b4, b5, b6, b7;
     121             : 
     122           0 :             a0 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 0)*src_stride + 16 * j));
     123           0 :             a1 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 1)*src_stride + 16 * j));
     124           0 :             a2 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 2)*src_stride + 16 * j));
     125           0 :             a3 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 3)*src_stride + 16 * j));
     126           0 :             a4 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 4)*src_stride + 16 * j));
     127           0 :             a5 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 5)*src_stride + 16 * j));
     128           0 :             a6 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 6)*src_stride + 16 * j));
     129           0 :             a7 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 7)*src_stride + 16 * j));
     130             : 
     131           0 :             b0 = _mm256_unpacklo_epi16(a0, a4);
     132           0 :             b1 = _mm256_unpacklo_epi16(a1, a5);
     133           0 :             b2 = _mm256_unpacklo_epi16(a2, a6);
     134           0 :             b3 = _mm256_unpacklo_epi16(a3, a7);
     135           0 :             b4 = _mm256_unpackhi_epi16(a0, a4);
     136           0 :             b5 = _mm256_unpackhi_epi16(a1, a5);
     137           0 :             b6 = _mm256_unpackhi_epi16(a2, a6);
     138           0 :             b7 = _mm256_unpackhi_epi16(a3, a7);
     139             : 
     140           0 :             a0 = _mm256_unpacklo_epi16(b0, b2);
     141           0 :             a1 = _mm256_unpacklo_epi16(b1, b3);
     142           0 :             a2 = _mm256_unpackhi_epi16(b0, b2);
     143           0 :             a3 = _mm256_unpackhi_epi16(b1, b3);
     144           0 :             a4 = _mm256_unpacklo_epi16(b4, b6);
     145           0 :             a5 = _mm256_unpacklo_epi16(b5, b7);
     146           0 :             a6 = _mm256_unpackhi_epi16(b4, b6);
     147           0 :             a7 = _mm256_unpackhi_epi16(b5, b7);
     148             : 
     149           0 :             b0 = _mm256_unpacklo_epi16(a0, a1);
     150           0 :             b1 = _mm256_unpackhi_epi16(a0, a1);
     151           0 :             b2 = _mm256_unpacklo_epi16(a2, a3);
     152           0 :             b3 = _mm256_unpackhi_epi16(a2, a3);
     153           0 :             b4 = _mm256_unpacklo_epi16(a4, a5);
     154           0 :             b5 = _mm256_unpackhi_epi16(a4, a5);
     155           0 :             b6 = _mm256_unpacklo_epi16(a6, a7);
     156           0 :             b7 = _mm256_unpackhi_epi16(a6, a7);
     157             : 
     158           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 0)*dst_stride + 8 * i), _mm256_extracti128_si256(b0, 0));
     159           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 1)*dst_stride + 8 * i), _mm256_extracti128_si256(b1, 0));
     160           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 2)*dst_stride + 8 * i), _mm256_extracti128_si256(b2, 0));
     161           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 3)*dst_stride + 8 * i), _mm256_extracti128_si256(b3, 0));
     162           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 4)*dst_stride + 8 * i), _mm256_extracti128_si256(b4, 0));
     163           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 5)*dst_stride + 8 * i), _mm256_extracti128_si256(b5, 0));
     164           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 6)*dst_stride + 8 * i), _mm256_extracti128_si256(b6, 0));
     165           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 7)*dst_stride + 8 * i), _mm256_extracti128_si256(b7, 0));
     166           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 8)*dst_stride + 8 * i), _mm256_extracti128_si256(b0, 1));
     167           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 9)*dst_stride + 8 * i), _mm256_extracti128_si256(b1, 1));
     168           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 10)*dst_stride + 8 * i), _mm256_extracti128_si256(b2, 1));
     169           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 11)*dst_stride + 8 * i), _mm256_extracti128_si256(b3, 1));
     170           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 12)*dst_stride + 8 * i), _mm256_extracti128_si256(b4, 1));
     171           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 13)*dst_stride + 8 * i), _mm256_extracti128_si256(b5, 1));
     172           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 14)*dst_stride + 8 * i), _mm256_extracti128_si256(b6, 1));
     173           0 :             _mm_storeu_si128((__m128i *)(dst + (16 * j + 15)*dst_stride + 8 * i), _mm256_extracti128_si256(b7, 1));
     174             :         }
     175             :     }
     176           0 : }
     177             : 
     178             : // 32-point forward transform (32 rows)
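                      : // Processes two input rows per iteration: row 2i in the low 128-bit lane and row
                      : // 2i+1 in the high lane. Each row is folded by the 32- and 16-point butterflies
                      : // below, then the outputs are formed as 16-bit dot products against coeff_tbl_AVX2
                      : // with _mm256_madd_epi16, rounded by o0 and arithmetically shifted right by s0.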
     179           0 : void transform32_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride, uint32_t shift)
     180             : {
     181             :     uint32_t i;
     182             :     __m128i s0;
     183             :     __m256i o0;
     184           0 :     const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2;
     185             : 
      186           0 :     shift &= 0x0000FFFF; // redundant mask; works around a Visual Studio 2012 AVX2 compiler error
     187           0 :     s0 = _mm_cvtsi32_si128(shift);
     188           0 :     o0 = _mm256_set1_epi32(1 << (shift - 1));
     189             : 
     190           0 :     for (i = 0; i < 16; i++)
     191             :     {
     192             :         __m256i x0, x1, x2, x3;
     193             :         __m256i y0, y1, y2, y3;
     194             :         __m256i a0, a1, a2, a3, a4, a5, a6, a7;
     195             :         __m256i b0, b1, b2, b3, b4, b5, b6, b7;
     196             : 
     197           0 :         x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1);
     198           0 :         x1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x08))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x08)), 0x1);
     199           0 :         x2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x10))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x10)), 0x1);
     200           0 :         x3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x18))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x18)), 0x1);
     201             : 
     202             :         // 32-point butterfly
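                      :         // The byte shuffles reverse the eight 16-bit values in each 128-bit lane, so the
                      :         // add/sub below produces the even-half inputs s[k] + s[31-k] (y0, y1) and the
                      :         // odd-half inputs s[k] - s[31-k] (y2, y3) for each of the two rows.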
     203           0 :         x2 = _mm256_shuffle_epi8(x2, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
     204           0 :         x3 = _mm256_shuffle_epi8(x3, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
     205             : 
     206           0 :         y0 = _mm256_add_epi16(x0, x3);
     207           0 :         y1 = _mm256_add_epi16(x1, x2);
     208             : 
     209           0 :         y2 = _mm256_sub_epi16(x0, x3);
     210           0 :         y3 = _mm256_sub_epi16(x1, x2);
     211             : 
     212             :         // 16-point butterfly
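                      :         // Same folding applied to the even half: x0 = e[k] + e[15-k], x1 = e[k] - e[15-k].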
     213           0 :         y1 = _mm256_shuffle_epi8(y1, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
     214             : 
     215           0 :         x0 = _mm256_add_epi16(y0, y1);
     216           0 :         x1 = _mm256_sub_epi16(y0, y1);
     217             : 
     218           0 :         x2 = y2;
     219           0 :         x3 = y3;
     220             : 
     221           0 :         a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]);
     222           0 :         a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[2]));
     223           0 :         a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[4]));
     224           0 :         a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[6]));
     225             : 
     226           0 :         a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]);
     227           0 :         a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[3]));
     228           0 :         a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[5]));
     229           0 :         a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[7]));
     230             : 
     231           0 :         a2 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[8]);
     232           0 :         a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[10]));
     233           0 :         a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[12]));
     234           0 :         a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[14]));
     235             : 
     236           0 :         a3 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[9]);
     237           0 :         a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[11]));
     238           0 :         a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[13]));
     239           0 :         a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[15]));
     240             : 
     241           0 :         a4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[16]);
     242           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[20]));
     243           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[24]));
     244           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[28]));
     245           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32]));
     246           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[36]));
     247           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[40]));
     248           0 :         a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[44]));
     249             : 
     250           0 :         a5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[17]);
     251           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[21]));
     252           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[25]));
     253           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[29]));
     254           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33]));
     255           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[37]));
     256           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[41]));
     257           0 :         a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[45]));
     258             : 
     259           0 :         a6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[18]);
     260           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[22]));
     261           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[26]));
     262           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[30]));
     263           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34]));
     264           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[38]));
     265           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[42]));
     266           0 :         a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[46]));
     267             : 
     268           0 :         a7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[19]);
     269           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[23]));
     270           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[27]));
     271           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[31]));
     272           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35]));
     273           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[39]));
     274           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[43]));
     275           0 :         a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[47]));
     276             : 
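                      :         // Rounded shift of the 32-bit accumulators: b = (a + (1 << (shift - 1))) >> shift.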
     277           0 :         b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0);
     278           0 :         b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0);
     279           0 :         b2 = _mm256_sra_epi32(_mm256_add_epi32(a2, o0), s0);
     280           0 :         b3 = _mm256_sra_epi32(_mm256_add_epi32(a3, o0), s0);
     281           0 :         b4 = _mm256_sra_epi32(_mm256_add_epi32(a4, o0), s0);
     282           0 :         b5 = _mm256_sra_epi32(_mm256_add_epi32(a5, o0), s0);
     283           0 :         b6 = _mm256_sra_epi32(_mm256_add_epi32(a6, o0), s0);
     284           0 :         b7 = _mm256_sra_epi32(_mm256_add_epi32(a7, o0), s0);
     285             : 
     286           0 :         x0 = _mm256_packs_epi32(b0, b1);
     287           0 :         x1 = _mm256_packs_epi32(b2, b3);
     288           0 :         x2 = _mm256_packs_epi32(b4, b5);
     289           0 :         x3 = _mm256_packs_epi32(b6, b7);
     290             : 
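                      :         // Re-interleave the packed 16-bit results, then split them by lane so that
                      :         // row 2i (lane 0) and row 2i+1 (lane 1) can be stored with the two-row stores below.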
     291           0 :         y0 = _mm256_unpacklo_epi16(x0, x1);
     292           0 :         y1 = _mm256_unpackhi_epi16(x0, x1);
     293           0 :         y2 = x2;
     294           0 :         y3 = x3;
     295           0 :         x0 = _mm256_unpacklo_epi16(y0, y2);
     296           0 :         x1 = _mm256_unpackhi_epi16(y0, y2);
     297           0 :         x2 = _mm256_unpacklo_epi16(y1, y3);
     298           0 :         x3 = _mm256_unpackhi_epi16(y1, y3);
     299             : 
     300           0 :         y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1);
     301           0 :         y1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 0)), _mm256_extracti128_si256(x3, 0), 0x1);
     302           0 :         y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1);
     303           0 :         y3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 1)), _mm256_extracti128_si256(x3, 1), 0x1);
     304             :         _mm256_storeu_si256((__m256i *)(dst + 0x00), y0);
     305           0 :         _mm256_storeu_si256((__m256i *)(dst + 0x10), y1);
     306           0 :         _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x00), y2);
     307           0 :         _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x10), y3);
     308             : 
     309           0 :         src += 2 * src_stride;
     310           0 :         dst += 2 * dst_stride;
     311             :     }
     312           0 : }
     313             : 
     314           0 : void mat_mult4x4_out_buff_avx2_intrin(
     315             :     int16_t*              coeff,
     316             :     const uint32_t         coeff_stride,
     317             :     int16_t*              coeff_out,
     318             :     const uint32_t         coeff_out_stride,
     319             :     const uint16_t        *maskingMatrix,
     320             :     const uint32_t         maskingMatrixStride,
     321             :     const uint32_t         compute_size,
     322             :     const int32_t         offset,
     323             :     const int32_t         shift_num,
     324             :     uint32_t*              nonzerocoeff)
     325             : 
     326             : {
     327           0 :     __m256i z = _mm256_setzero_si256();
     328             :     __m128i a, b;
     329             :     __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, offsetREG, coeffTempORG;
     330             :     (void)compute_size;
     331             : 
     332           0 :     coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256();
     333             : 
     334             :     // prepare Shift REG
      335           0 :     __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // _mm256_sra_epi32 takes its shift count from the low 64 bits
     336             : 
     337             :     //prepare the offset
     338           0 :     offsetREG = _mm256_set1_epi32(offset);
     339             : 
      340             :     // load the 4x4 masking matrix (four rows packed into one 256-bit register)
     341           0 :     MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1);
     342             : 
     343             :     //load coefftemp
     344           0 :     a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeff_stride))); // 1st and 2nd row of the 4x4 block
     345           0 :     b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeff_stride)), _mm_loadl_epi64((__m128i*)(coeff + 3 * coeff_stride))); // 3rd and 4th row of the 4x4 block
     346           0 :     coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1); // the 4x4 block is now loaded
     347             : 
     348           0 :     coeffTempORG = coeffTemp;
     349             :     //Absolute val
     350           0 :     coeffTemp = _mm256_abs_epi16(coeffTemp);
     351             : 
     352           0 :     a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
     353           0 :     a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
     354             : 
     355           0 :     b0 = _mm256_unpacklo_epi16(a0, a1);
     356           0 :     b1 = _mm256_unpackhi_epi16(a0, a1);
     357             : 
     358           0 :     b0 = _mm256_add_epi32(b0, offsetREG);
     359           0 :     b1 = _mm256_add_epi32(b1, offsetREG);
     360             : 
     361             :     //Shift right by PMP_PRECISION_REG
     362           0 :     b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
     363           0 :     b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
     364             : 
      365             :     // pack the 32-bit results back to 16-bit with signed saturation
      366           0 :     ymm_computed = _mm256_packs_epi32(b0, b1);
      367           0 :     z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // count nonzero results: cmpgt returns -1 per nonzero lane, so subtracting accumulates a per-lane count in z
      368             : 
      369           0 :     ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs (lanes whose input was zero stay zero)
     370             : 
     371           0 :     a = _mm256_extracti128_si256(ymm_computed, 0);
     372           0 :     b = _mm256_extracti128_si256(ymm_computed, 1);
     373           0 :     _mm_storel_epi64((__m128i *) coeff_out, a);
     374           0 :     _mm_storel_epi64((__m128i *)(coeff_out + 1 * coeff_out_stride), _mm_srli_si128(a, 8));
     375           0 :     _mm_storel_epi64((__m128i *)(coeff_out + 2 * coeff_out_stride), b);
     376           0 :     _mm_storel_epi64((__m128i *)(coeff_out + 3 * coeff_out_stride), _mm_srli_si128(b, 8));
     377             : 
     378           0 :     z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8));
     379           0 :     *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
     380           0 : }
     381             : 
     382             : /*****************************************************************************************************************************************************************/
     383           0 : void mat_mult4x4_avx2_intrin(
     384             :     int16_t*              coeff,
     385             :     const uint32_t         coeff_stride,
     386             :     const uint16_t        *maskingMatrix,
     387             :     const uint32_t         maskingMatrixStride,  //Matrix size
     388             :     const uint32_t         compute_size,  //Computation area size
     389             :     const int32_t         offset,     //(PMP_MAX >> 1)
     390             :     const int32_t         shift_num, //PMP_PRECISION
     391             :     uint32_t*              nonzerocoeff)
     392             : 
     393             : {
     394           0 :     __m256i z = _mm256_setzero_si256();
     395             :     __m128i a, b;
     396             :     __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, offsetREG, coeffTempORG;
     397             :     (void)compute_size;
     398             : 
     399           0 :     coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256();
     400             : 
     401             :     // prepare Shift REG
      402           0 :     __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // _mm256_sra_epi32 takes its shift count from the low 64 bits
     403             : 
     404             :     //prepare the offset
     405           0 :     offsetREG = _mm256_set1_epi32(offset);
     406             : 
      407             :     // load the 4x4 masking matrix (four rows packed into one 256-bit register)
     408           0 :     MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1);
     409             : 
     410             :     //load coefftemp
     411           0 :     a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeff_stride))); // 1st and 2nd row of the 4x4 block
     412           0 :     b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeff_stride)), _mm_loadl_epi64((__m128i*)(coeff + 3 * coeff_stride))); // 3rd and 4th row of the 4x4 block
     413           0 :     coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1); // the 4x4 block is now loaded
     414             : 
     415           0 :     coeffTempORG = coeffTemp;
     416             :     //Absolute val
     417           0 :     coeffTemp = _mm256_abs_epi16(coeffTemp);
     418             : 
     419           0 :     a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
     420           0 :     a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
     421             : 
     422           0 :     b0 = _mm256_unpacklo_epi16(a0, a1);
     423           0 :     b1 = _mm256_unpackhi_epi16(a0, a1);
     424             : 
     425           0 :     b0 = _mm256_add_epi32(b0, offsetREG);
     426           0 :     b1 = _mm256_add_epi32(b1, offsetREG);
     427             : 
     428             :     //Shift right by PMP_PRECISION_REG
     429           0 :     b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
     430           0 :     b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
     431             : 
      432             :     // pack the 32-bit results back to 16-bit with signed saturation
      433           0 :     ymm_computed = _mm256_packs_epi32(b0, b1);
      434           0 :     z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // count nonzero results: cmpgt returns -1 per nonzero lane, so subtracting accumulates a per-lane count in z
      435             : 
      436           0 :     ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs (lanes whose input was zero stay zero)
     437             : 
     438           0 :     a = _mm256_extracti128_si256(ymm_computed, 0);
     439           0 :     b = _mm256_extracti128_si256(ymm_computed, 1);
     440           0 :     _mm_storel_epi64((__m128i *)coeff, a);
     441           0 :     _mm_storel_epi64((__m128i *)(coeff + coeff_stride), _mm_srli_si128(a, 8));
     442           0 :     _mm_storel_epi64((__m128i *)(coeff + 2 * coeff_stride), b);
     443           0 :     _mm_storel_epi64((__m128i *)(coeff + 3 * coeff_stride), _mm_srli_si128(b, 8));
     444             : 
     445           0 :     z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8));
     446           0 :     *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
     447           0 : }
     448             : /*******************************************mat_mult8x8_avx2_intrin**************************************************/
     449           0 : void mat_mult8x8_avx2_intrin(
     450             :     int16_t*              coeff,
     451             :     const uint32_t         coeff_stride,
     452             :     const uint16_t        *maskingMatrix,
     453             :     const uint32_t         maskingMatrixStride,  //Matrix size
     454             :     const uint32_t         compute_size,  //Computation area size
     455             :     const int32_t         offset,     //(PMP_MAX >> 1)
     456             :     const int32_t         shift_num, //PMP_PRECISION
     457             :     uint32_t*              nonzerocoeff)
     458             : {
     459             :     unsigned row;
     460           0 :     __m256i z = _mm256_setzero_si256();
     461             :     //__m128i a, b;
     462             :     __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, coeffTempORG;
     463             : 
     464           0 :     coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = _mm256_setzero_si256();
     465             : 
     466             :     // prepare Shift REG
      467           0 :     __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // _mm256_sra_epi32 takes its shift count from the low 64 bits
     468             : 
     469             :     //prepare the offset
     470           0 :     __m256i offsetREG = _mm256_set1_epi32(offset);
     471           0 :     row = 0;
     472             :     do {
      473             :         // load two rows of the masking matrix (row and row + 1)
     474           0 :         MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride * row))), _mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride * (row + 1))), 0x1);
     475             : 
     476             :         //load coefftemp
     477           0 :         coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(coeff + coeff_stride * row))),
     478             :             _mm_loadu_si128((__m128i*)(coeff + coeff_stride * (row + 1))), 0x1);
     479             : 
     480           0 :         coeffTempORG = coeffTemp;
     481             :         //Absolute val
     482           0 :         coeffTemp = _mm256_abs_epi16(coeffTemp);
     483             : 
     484             :         //Multiply
     485           0 :         a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
     486           0 :         a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
     487             : 
     488           0 :         b0 = _mm256_unpacklo_epi16(a0, a1);
     489           0 :         b1 = _mm256_unpackhi_epi16(a0, a1);
     490             : 
     491             :         //Add
     492           0 :         b0 = _mm256_add_epi32(b0, offsetREG);
     493           0 :         b1 = _mm256_add_epi32(b1, offsetREG);
     494             : 
     495             :         //Shift right by PMP_PRECISION_REG
     496           0 :         b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
     497           0 :         b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
     498             : 
      499             :         // pack the 32-bit results back to 16-bit with signed saturation
      500           0 :         ymm_computed = _mm256_packs_epi32(b0, b1);
      501           0 :         z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // count nonzero results: cmpgt returns -1 per nonzero lane, so subtracting accumulates a per-lane count in z
      502             : 
      503           0 :         ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs (lanes whose input was zero stay zero)
     504             : 
     505           0 :         _mm_storeu_si128((__m128i *)(coeff + coeff_stride * row), _mm256_extracti128_si256(ymm_computed, 0));
     506           0 :         _mm_storeu_si128((__m128i *)(coeff + coeff_stride * (row + 1)), _mm256_extracti128_si256(ymm_computed, 1));
     507             : 
     508           0 :         row += 2;
     509           0 :     } while (row < compute_size);
     510             : 
     511           0 :     z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7));
     512           0 :     *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
     513           0 : }
     514             : /***************************************mat_mult_nxn_avx2_intrin****************************************************/
     515           0 : void mat_mult_nxn_avx2_intrin(
     516             :     int16_t*              coeff,
     517             :     const uint32_t         coeff_stride,
     518             :     const uint16_t        *maskingMatrix,
     519             :     const uint32_t         maskingMatrixStride,  //Matrix size
     520             :     const uint32_t         compute_size,  //Computation area size
     521             :     const int32_t         offset,     //(PMP_MAX >> 1)
     522             :     const int32_t         shift_num, //PMP_PRECISION
     523             :     uint32_t*              nonzerocoeff)
     524             : {
     525             :     unsigned row, col;
     526           0 :     __m256i z = _mm256_setzero_si256();
     527             :     //__m128i a, b;
     528             :     __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, coeffTempORG;
     529             : 
     530           0 :     coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = _mm256_setzero_si256();
     531             : 
     532             :     // prepare Shift REG
      533           0 :     __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // _mm256_sra_epi32 takes its shift count from the low 64 bits
     534             : 
     535             :     //prepare the offset
     536           0 :     __m256i offsetREG = _mm256_set1_epi32(offset);
     537             : 
     538           0 :     row = 0;
     539             :     do {
     540           0 :         col = 0;
     541             :         do {
     542             :             //load coefftemp
     543           0 :             coeffTemp = _mm256_loadu_si256((__m256i *)(coeff + coeff_stride * row + col));
     544             : 
      545             :             // load 16 masking-matrix values at (row, col)
     546           0 :             MaskingMatrix = _mm256_loadu_si256((__m256i *) (maskingMatrix + maskingMatrixStride * row + col));
     547             : 
     548           0 :             coeffTempORG = coeffTemp;
     549             : 
     550             :             //Absolute val
     551           0 :             coeffTemp = _mm256_abs_epi16(coeffTemp);
     552             : 
     553             :             //Multiply
     554           0 :             a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
     555           0 :             a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
     556             : 
     557           0 :             b0 = _mm256_unpacklo_epi16(a0, a1);
     558           0 :             b1 = _mm256_unpackhi_epi16(a0, a1);
     559             : 
     560             :             //Add
     561           0 :             b0 = _mm256_add_epi32(b0, offsetREG);
     562           0 :             b1 = _mm256_add_epi32(b1, offsetREG);
     563             : 
     564             :             //Shift right by PMP_PRECISION_REG
     565           0 :             b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
     566           0 :             b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
     567             : 
      568             :             // pack the 32-bit results back to 16-bit with signed saturation
      569           0 :             ymm_computed = _mm256_packs_epi32(b0, b1);
      570           0 :             z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // count nonzero results: cmpgt returns -1 per nonzero lane, so subtracting accumulates a per-lane count in z
      571             : 
      572           0 :             ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs (lanes whose input was zero stay zero)
     573             : 
     574           0 :             _mm256_storeu_si256((__m256i *)(coeff + coeff_stride * row + col), ymm_computed);
     575             : 
     576           0 :             col += 16;
     577           0 :         } while (col < compute_size);
     578             : 
     579           0 :         row++;
     580           0 :     } while (row < compute_size);
     581             : 
     582           0 :     z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7));
     583           0 :     *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
     584           0 : }
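                      : 
                      : // Editor's note: illustrative scalar sketch (not part of the original file) of the
                      : // per-coefficient operation that the mat_mult* kernels above vectorize; the function
                      : // name is hypothetical. The AVX2 versions count the nonzero results with a packed
                      : // compare/accumulate instead of the scalar counter used here, and
                      : // mat_mult4x4_out_buff_avx2_intrin writes to a separate output buffer rather than in place.
                      : static INLINE void mat_mult_c_sketch(int16_t *coeff, uint32_t coeff_stride,
                      :     const uint16_t *masking_matrix, uint32_t masking_matrix_stride,
                      :     uint32_t compute_size, int32_t offset, int32_t shift_num, uint32_t *nonzerocoeff)
                      : {
                      :     uint32_t row, col, nz = 0;
                      :     for (row = 0; row < compute_size; row++) {
                      :         for (col = 0; col < compute_size; col++) {
                      :             const int16_t c = coeff[row * coeff_stride + col];
                      :             // scale the magnitude by the masking matrix, add the rounding offset, shift down
                      :             int32_t v = (((c < 0) ? -c : c) * (int32_t)masking_matrix[row * masking_matrix_stride + col] + offset) >> shift_num;
                      :             if (v > 32767) v = 32767;  // mirrors the _mm256_packs_epi32 saturation
                      :             nz += (v > 0);             // mirrors the cmpgt-based nonzero count
                      :             // restore the sign as _mm256_sign_epi16 does: a zero input stays zero
                      :             coeff[row * coeff_stride + col] = (int16_t)((c > 0) ? v : (c < 0) ? -v : 0);
                      :         }
                      :     }
                      :     *nonzerocoeff = nz;
                      : }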
     585             : 
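                      : // The kernel below widens eight int32 coefficients into 64-bit slots, squares them
                      : // with _mm256_mul_epi32, and accumulates four 64-bit partial sums; hadd64_avx2 then
                      : // folds those partial sums into a single scalar.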
     586   361686000 : static INLINE void EnergyComputation_kernel_avx2(const int32_t *const in,
     587             :     __m256i *const sum256)
     588             : {
     589   361686000 :     const __m256i zero = _mm256_setzero_si256();
     590   361686000 :     const __m256i input = _mm256_load_si256((__m256i *)in);
     591   361686000 :     const __m256i in_lo = _mm256_unpacklo_epi32(input, zero);
     592   361686000 :     const __m256i in_hi = _mm256_unpackhi_epi32(input, zero);
     593   361686000 :     const __m256i energy_lo = _mm256_mul_epi32(in_lo, in_lo);
     594   361686000 :     const __m256i energy_hi = _mm256_mul_epi32(in_hi, in_hi);
     595   361686000 :     *sum256 = _mm256_add_epi64(*sum256, energy_lo);
     596   361686000 :     *sum256 = _mm256_add_epi64(*sum256, energy_hi);
     597   361686000 : }
     598             : 
     599     3345080 : static INLINE uint64_t hadd64_avx2(const __m256i sum256) {
     600     3345080 :     const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
     601     3345080 :     const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
     602     3345080 :     const __m128i sum128 = _mm_add_epi64(sum256_lo, sum256_hi);
     603     3345080 :     const __m128i sum128_hi = _mm_srli_si128(sum128, 8);
     604     3345080 :     const __m128i sum = _mm_add_epi64(sum128, sum128_hi);
     605             : 
     606     3345080 :     return _mm_extract_epi64(sum, 0);
     607             : }
     608             : 
     609     1726650 : static INLINE uint64_t EnergyComputation_avx2(const int32_t *const in,
     610             :     const uint32_t size)
     611             : {
     612     1726650 :     const __m256i zero = _mm256_setzero_si256();
     613     1726650 :     uint32_t i = 0;
     614     1726650 :     __m256i sum = zero;
     615             : 
     616             :     do {
     617   205972000 :         EnergyComputation_kernel_avx2(in + i, &sum);
     618   205972000 :         i += 8;
     619   205972000 :     } while (i < size);
     620             : 
     621     1726710 :     return hadd64_avx2(sum);
     622             : }
     623             : 
     624     1618460 : static INLINE uint64_t EnergyComputation64_avx2(const int32_t *in,
     625             :     const uint32_t height)
     626             : {
     627     1618460 :     const __m256i zero = _mm256_setzero_si256();
     628     1618460 :     uint32_t i = height;
     629     1618460 :     __m256i sum = zero;
     630             : 
     631             :     do {
     632    39105000 :         EnergyComputation_kernel_avx2(in + 0 * 8, &sum);
     633    39109200 :         EnergyComputation_kernel_avx2(in + 1 * 8, &sum);
     634    39108500 :         EnergyComputation_kernel_avx2(in + 2 * 8, &sum);
     635    39107800 :         EnergyComputation_kernel_avx2(in + 3 * 8, &sum);
     636    39105000 :         in += 64;
     637    39105000 :     } while (--i);
     638             : 
     639     1618490 :     return hadd64_avx2(sum);
     640             : }
     641             : 
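                      : // Editor's note: illustrative scalar equivalent (not part of the original file) of the
                      : // two energy helpers above; EnergyComputation_avx2 sums 'size' contiguous values,
                      : // while EnergyComputation64_avx2 sums the left 32 columns of 'height' rows stored
                      : // with a row stride of 64. The name is hypothetical.
                      : static INLINE uint64_t energy_c_sketch(const int32_t *in, uint32_t count)
                      : {
                      :     uint64_t sum = 0;
                      :     uint32_t i;
                      :     for (i = 0; i < count; i++)
                      :         sum += (uint64_t)((int64_t)in[i] * in[i]);
                      :     return sum;
                      : }
                      : 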
     642     1618480 : static INLINE void clean_256_bytes_avx2(int32_t *buf, const uint32_t height) {
     643     1618480 :     const __m256i zero = _mm256_setzero_si256();
     644     1618480 :     uint32_t h = height;
     645             : 
     646             :     do {
     647             :         _mm256_store_si256((__m256i *)(buf + 0 * 8), zero);
     648    39132500 :         _mm256_store_si256((__m256i *)(buf + 1 * 8), zero);
     649    39132500 :         _mm256_store_si256((__m256i *)(buf + 2 * 8), zero);
     650    39132500 :         _mm256_store_si256((__m256i *)(buf + 3 * 8), zero);
     651    39132500 :         buf += 64;
     652    39132500 :     } while (--h);
     653     1618480 : }
     654             : 
     655   149964000 : static INLINE void copy_32_bytes_avx2(const int32_t *src, int32_t *dst) {
     656   149964000 :     const __m256i val = _mm256_load_si256((__m256i *)(src + 0 * 8));
     657             :     _mm256_store_si256((__m256i *)(dst + 0 * 8), val);
     658   149964000 : }
     659             : 
     660     1618500 : static INLINE void copy_256x_bytes_avx2(const int32_t *src, int32_t *dst,
     661             :     const uint32_t height) {
     662     1618500 :     uint32_t h = height;
     663             : 
     664             :     do {
     665    37495800 :         copy_32_bytes_avx2(src + 0 * 8, dst + 0 * 8);
     666    37496000 :         copy_32_bytes_avx2(src + 1 * 8, dst + 1 * 8);
     667    37495300 :         copy_32_bytes_avx2(src + 2 * 8, dst + 2 * 8);
     668    37494600 :         copy_32_bytes_avx2(src + 3 * 8, dst + 3 * 8);
     669    37495800 :         src += 64;
     670    37495800 :         dst += 32;
     671    37495800 :     } while (--h);
     672     1618480 : }
     673             : 
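                      : // For transform blocks with a 64-sample dimension, coefficients beyond the first 32
                      : // rows and/or the first 32 columns are discarded. The HandleTransform* helpers below
                      : // return the energy of that discarded high-frequency region, zero it out, and, for
                      : // the 64-wide blocks, compact the kept 32 columns to a row stride of 32 using the
                      : // helpers above.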
     674      855360 : uint64_t HandleTransform16x64_avx2(int32_t *output) {
     675             :     //bottom 16x32 area.
     676             :     const uint64_t three_quad_energy =
     677      855360 :         EnergyComputation_avx2(output + 16 * 32, 16 * 32);
     678             : 
     679             :     // Zero out the bottom 16x32 area.
     680      855375 :     memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
     681             : 
     682      855375 :     return three_quad_energy;
     683             : }
     684             : 
     685      557010 : uint64_t HandleTransform32x64_avx2(int32_t *output) {
     686             :     //bottom 32x32 area.
     687             :     const uint64_t three_quad_energy =
     688      557010 :         EnergyComputation_avx2(output + 32 * 32, 32 * 32);
     689             : 
     690             :     // Zero out the bottom 32x32 area.
     691      557017 :     memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
     692             : 
     693      557017 :     return three_quad_energy;
     694             : }
     695             : 
     696      790603 : uint64_t HandleTransform64x16_avx2(int32_t *output) {
      697             :     // top-right 32x16 area.
     698             :     const uint64_t three_quad_energy =
     699      790603 :         EnergyComputation64_avx2(output + 32, 16);
     700             : 
     701             :     // Zero out right 32x16 area.
     702      790607 :     clean_256_bytes_avx2(output + 32, 16);
     703             : 
     704             :     // Re-pack non-zero coeffs in the first 32x16 indices.
     705      790610 :     copy_256x_bytes_avx2(output + 64, output + 32, 15);
     706             : 
     707      790603 :     return three_quad_energy;
     708             : }
     709             : 
     710      513585 : uint64_t HandleTransform64x32_avx2(int32_t *output) {
      711             :     // top-right 32x32 area.
     712             :     const uint64_t three_quad_energy =
     713      513585 :         EnergyComputation64_avx2(output + 32, 32);
     714             : 
     715             :     // Zero out right 32x32 area.
     716      513589 :     clean_256_bytes_avx2(output + 32, 32);
     717             : 
     718             :     // Re-pack non-zero coeffs in the first 32x32 indices.
     719      513590 :     copy_256x_bytes_avx2(output + 64, output + 32, 31);
     720             : 
     721      513584 :     return three_quad_energy;
     722             : }
     723             : 
     724      314333 : uint64_t HandleTransform64x64_avx2(int32_t *output) {
     725             :     uint64_t three_quad_energy;
     726             : 
      727             :     // top-right 32x32 area.
     728      314333 :     three_quad_energy = EnergyComputation64_avx2(output + 32, 32);
     729             :     //bottom 64x32 area.
     730      314332 :     three_quad_energy += EnergyComputation_avx2(output + 32 * 64, 64 * 32);
     731             : 
     732             :     // Zero out top-right 32x32 area.
     733      314331 :     clean_256_bytes_avx2(output + 32, 32);
     734             : 
     735             :     // Zero out the bottom 64x32 area.
     736      314332 :     memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
     737             : 
     738             :     // Re-pack non-zero coeffs in the first 32x32 indices.
     739      314332 :     copy_256x_bytes_avx2(output + 64, output + 32, 31);
     740             : 
     741      314329 :     return three_quad_energy;
     742             : }

Generated by: LCOV version 1.14