LCOV - code coverage report
Current view: top level - ASM_SSE2 - lpf_common_sse2.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 182 240 75.8 %
Date: 2019-11-25 17:38:06 Functions: 7 13 53.8 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
      13             : #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
      14             : 
      15             : #include <emmintrin.h>  // SSE2
      16             : 
      17             : static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
      18             :                                             __m128i *x2, __m128i *x3,
      19             :                                             __m128i *x4, __m128i *x5,
      20             :                                             __m128i *d0, __m128i *d1,
      21             :                                             __m128i *d2, __m128i *d3,
      22             :                                             __m128i *d4, __m128i *d5) {
      23             :   __m128i w0, w1, w2, w3, w4, w5, ww0;
      24             : 
      25             :   // 00 01 02 03 04 05 xx xx
      26             :   // 10 11 12 13 14 15 xx xx
      27             :   // 20 21 22 23 24 25 xx xx
      28             :   // 30 31 32 33 34 35 xx xx
      29             :   // 40 41 42 43 44 45 xx xx
      30             :   // 50 51 52 53 54 55 xx xx
      31             : 
      32             :   w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
      33             :   w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
      34             :   w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
      35             : 
      36             :   ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
      37             :   *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
      38             :   *d1 = _mm_unpackhi_epi64(ww0,
      39             :                            _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
      40             : 
      41             :   ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
      42             :   *d2 = _mm_unpacklo_epi64(ww0,
      43             :                            _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
      44             : 
      45             :   w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
      46             :   w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
      47             :   w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
      48             : 
      49             :   *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
      50             : 
      51             :   ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
      52             :   *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
      53             :   *d5 = _mm_unpackhi_epi64(ww0,
      54             :                            _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
      55             : }
      56             : 
      57           0 : static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
      58             :                                                     __m128i *x2, __m128i *x3,
      59             :                                                     __m128i *d0, __m128i *d1,
      60             :                                                     __m128i *d2, __m128i *d3) {
      61           0 :   __m128i zero = _mm_setzero_si128();
      62             :   __m128i w0, w1, ww0, ww1;
      63             : 
      64           0 :   w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
      65           0 :   w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
      66             : 
      67           0 :   ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
      68           0 :   ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
      69             : 
      70           0 :   *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
      71           0 :   *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
      72           0 :   *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
      73           0 :   *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
      74           0 : }
      75             : 
      76           0 : static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
      77             :                                                      __m128i *x2, __m128i *x3,
      78             :                                                      __m128i *d4, __m128i *d5,
      79             :                                                      __m128i *d6, __m128i *d7) {
      80             :   __m128i w0, w1, ww2, ww3;
      81           0 :   __m128i zero = _mm_setzero_si128();
      82             : 
      83           0 :   w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
      84           0 :   w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
      85             : 
      86           0 :   ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
      87           0 :   ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
      88             : 
      89           0 :   *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
      90           0 :   *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
      91           0 :   *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
      92           0 :   *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
      93           0 : }
      94             : 
      95             : // here in and out pointers (x and d) should be different! we don't store their
      96             : // values inside
      97           0 : static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
      98             :                                                 __m128i *x2, __m128i *x3,
      99             :                                                 __m128i *d0, __m128i *d1,
     100             :                                                 __m128i *d2, __m128i *d3,
     101             :                                                 __m128i *d4, __m128i *d5,
     102             :                                                 __m128i *d6, __m128i *d7) {
     103             :   // input
     104             :   // x0 00 01 02 03 04 05 06 07
     105             :   // x1 10 11 12 13 14 15 16 17
     106             :   // x2 20 21 22 23 24 25 26 27
     107             :   // x3 30 31 32 33 34 35 36 37
     108             :   // output
     109             :   // 00 10 20 30 xx xx xx xx
     110             :   // 01 11 21 31 xx xx xx xx
     111             :   // 02 12 22 32 xx xx xx xx
     112             :   // 03 13 23 33 xx xx xx xx
     113             :   // 04 14 24 34 xx xx xx xx
     114             :   // 05 15 25 35 xx xx xx xx
     115             :   // 06 16 26 36 xx xx xx xx
     116             :   // 07 17 27 37 xx xx xx xx
     117           0 :   highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
     118           0 :   highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
     119           0 : }
     120             : 
     121           0 : static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
     122             :                                                 __m128i *x2, __m128i *x3,
     123             :                                                 __m128i *x4, __m128i *x5,
     124             :                                                 __m128i *x6, __m128i *x7,
     125             :                                                 __m128i *d0, __m128i *d1,
     126             :                                                 __m128i *d2, __m128i *d3) {
     127             :   __m128i w0, w1, w2, w3, ww0, ww1;
     128             :   // x0 00 01 02 03 04 05 06 07
     129             :   // x1 10 11 12 13 14 15 16 17
     130             :   // x2 20 21 22 23 24 25 26 27
     131             :   // x3 30 31 32 33 34 35 36 37
     132             :   // x4 40 41 42 43 44 45 46 47
     133             :   // x5 50 51 52 53 54 55 56 57
     134             :   // x6 60 61 62 63 64 65 66 67
     135             :   // x7 70 71 72 73 74 75 76 77
     136             : 
     137           0 :   w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
     138           0 :   w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
     139           0 :   w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
     140           0 :   w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
     141             : 
     142           0 :   ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
     143           0 :   ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
     144             : 
     145           0 :   *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
     146           0 :   *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
     147             : 
     148           0 :   ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
     149           0 :   ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
     150             : 
     151           0 :   *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
     152           0 :   *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
     153           0 : }
     154             : 
     155           0 : static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
     156             :                                                  __m128i *x2, __m128i *x3,
     157             :                                                  __m128i *x4, __m128i *x5,
     158             :                                                  __m128i *x6, __m128i *x7,
     159             :                                                  __m128i *d4, __m128i *d5,
     160             :                                                  __m128i *d6, __m128i *d7) {
     161             :   __m128i w0, w1, w2, w3, ww0, ww1;
     162             :   // x0 00 01 02 03 04 05 06 07
     163             :   // x1 10 11 12 13 14 15 16 17
     164             :   // x2 20 21 22 23 24 25 26 27
     165             :   // x3 30 31 32 33 34 35 36 37
     166             :   // x4 40 41 42 43 44 45 46 47
     167             :   // x5 50 51 52 53 54 55 56 57
     168             :   // x6 60 61 62 63 64 65 66 67
     169             :   // x7 70 71 72 73 74 75 76 77
     170           0 :   w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
     171           0 :   w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
     172           0 :   w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
     173           0 :   w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
     174             : 
     175           0 :   ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
     176           0 :   ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
     177             : 
     178           0 :   *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
     179           0 :   *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
     180             : 
     181           0 :   ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
     182           0 :   ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
     183             : 
     184           0 :   *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
     185           0 :   *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
     186           0 : }
     187             : 
     188             : // here in and out pointers (x and d) should be different! we don't store their
     189             : // values inside
     190           0 : static INLINE void highbd_transpose8x8_sse2(
     191             :     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     192             :     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
     193             :     __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
     194             :     __m128i *d7) {
     195           0 :   highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
     196           0 :   highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
     197           0 : }
     198             : 
     199             : // here in and out pointers (x and d arrays) should be different! we don't store
     200             : // their values inside
     201             : static INLINE void highbd_transpose8x16_sse2(
     202             :     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     203             :     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
     204             :     __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
     205             :     __m128i *d7) {
     206             :   highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
     207             :                            d5, d6, d7);
     208             :   highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
     209             :                            x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
     210             :                            d4 + 1, d5 + 1, d6 + 1, d7 + 1);
     211             : }
     212             : 
     213             : // Low bit depth functions
     214      508538 : static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
     215             :                                              __m128i *x2, __m128i *x3,
     216             :                                              __m128i *d0, __m128i *d1,
     217             :                                              __m128i *d2, __m128i *d3) {
     218             :   // input
     219             :   // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
     220             :   // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
     221             :   // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
     222             :   // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
     223             :   // output
     224             :   // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
     225             :   // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
     226             :   // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
     227             :   // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
     228             : 
     229             :   __m128i w0, w1;
     230             : 
     231      508538 :   w0 = _mm_unpacklo_epi8(
     232             :       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     233     1017080 :   w1 = _mm_unpacklo_epi8(
     234             :       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     235             : 
     236      508538 :   *d0 = _mm_unpacklo_epi16(
     237             :       w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     238             : 
     239      508538 :   *d1 = _mm_srli_si128(*d0,
     240             :                        4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
     241      508538 :   *d2 = _mm_srli_si128(*d0,
     242             :                        8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
     243      508538 :   *d3 = _mm_srli_si128(*d0,
     244             :                        12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
     245      508538 : }
     246             : 
     247      262040 : static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
     248             :                                          __m128i *x3, __m128i *d0, __m128i *d1,
     249             :                                          __m128i *d2, __m128i *d3, __m128i *d4,
     250             :                                          __m128i *d5, __m128i *d6,
     251             :                                          __m128i *d7) {
     252             :   // input
     253             :   // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
     254             :   // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
     255             :   // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
     256             :   // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
     257             :   // output
     258             :   // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
     259             :   // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
     260             :   // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
     261             :   // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
     262             :   // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
     263             :   // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
     264             :   // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
     265             :   // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
     266             : 
     267             :   __m128i w0, w1, ww0, ww1;
     268             : 
     269      262040 :   w0 = _mm_unpacklo_epi8(
     270             :       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     271      524080 :   w1 = _mm_unpacklo_epi8(
     272             :       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     273             : 
     274      262040 :   ww0 = _mm_unpacklo_epi16(
     275             :       w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     276      262040 :   ww1 = _mm_unpackhi_epi16(
     277             :       w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
     278             : 
     279      262040 :   *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
     280      262040 :   *d1 = _mm_srli_si128(ww0,
     281             :                        4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
     282      262040 :   *d2 = _mm_srli_si128(ww0,
     283             :                        8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
     284      262040 :   *d3 = _mm_srli_si128(ww0,
     285             :                        12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
     286             : 
     287      262040 :   *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
     288      262040 :   *d5 = _mm_srli_si128(ww1,
     289             :                        4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
     290      262040 :   *d6 = _mm_srli_si128(ww1,
     291             :                        8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
     292      262040 :   *d7 = _mm_srli_si128(ww1,
     293             :                        12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
     294      262040 : }
     295             : 
     296      235052 : static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
     297             :                                          __m128i *x3, __m128i *x4, __m128i *x5,
     298             :                                          __m128i *x6, __m128i *x7, __m128i *d0,
     299             :                                          __m128i *d1, __m128i *d2,
     300             :                                          __m128i *d3) {
     301             :   // input
     302             :   // x0 00 01 02 03 04 05 06 07
     303             :   // x1 10 11 12 13 14 15 16 17
     304             :   // x2 20 21 22 23 24 25 26 27
     305             :   // x3 30 31 32 33 34 35 36 37
     306             :   // x4 40 41 42 43 44 45 46 47
     307             :   // x5  50 51 52 53 54 55 56 57
     308             :   // x6  60 61 62 63 64 65 66 67
     309             :   // x7 70 71 72 73 74 75 76 77
     310             :   // output
     311             :   // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
     312             :   // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
     313             :   // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
     314             :   // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
     315             : 
     316             :   __m128i w0, w1, w2, w3, w4, w5;
     317             : 
     318      235052 :   w0 = _mm_unpacklo_epi8(
     319             :       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     320             : 
     321      235052 :   w1 = _mm_unpacklo_epi8(
     322             :       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     323             : 
     324      235052 :   w2 = _mm_unpacklo_epi8(
     325             :       *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
     326             : 
     327      470104 :   w3 = _mm_unpacklo_epi8(
     328             :       *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
     329             : 
     330      235052 :   w4 = _mm_unpacklo_epi16(
     331             :       w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     332      235052 :   w5 = _mm_unpacklo_epi16(
     333             :       w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
     334             : 
     335      235052 :   *d0 = _mm_unpacklo_epi32(
     336             :       w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     337      235052 :   *d1 = _mm_srli_si128(*d0, 8);
     338      235052 :   *d2 = _mm_unpackhi_epi32(
     339             :       w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
     340      235052 :   *d3 = _mm_srli_si128(*d2, 8);
     341      235052 : }
     342             : 
     343      503028 : static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
     344             :                                      __m128i *x3, __m128i *x4, __m128i *x5,
     345             :                                      __m128i *x6, __m128i *x7, __m128i *d0d1,
     346             :                                      __m128i *d2d3, __m128i *d4d5,
     347             :                                      __m128i *d6d7) {
     348             :   __m128i w0, w1, w2, w3, w4, w5, w6, w7;
     349             :   // x0 00 01 02 03 04 05 06 07
     350             :   // x1 10 11 12 13 14 15 16 17
     351      503028 :   w0 = _mm_unpacklo_epi8(
     352             :       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     353             : 
     354             :   // x2 20 21 22 23 24 25 26 27
     355             :   // x3 30 31 32 33 34 35 36 37
     356      503028 :   w1 = _mm_unpacklo_epi8(
     357             :       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     358             : 
     359             :   // x4 40 41 42 43 44 45 46 47
     360             :   // x5  50 51 52 53 54 55 56 57
     361      503028 :   w2 = _mm_unpacklo_epi8(
     362             :       *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
     363             : 
     364             :   // x6  60 61 62 63 64 65 66 67
     365             :   // x7 70 71 72 73 74 75 76 77
     366     1006060 :   w3 = _mm_unpacklo_epi8(
     367             :       *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
     368             : 
     369      503028 :   w4 = _mm_unpacklo_epi16(
     370             :       w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     371      503028 :   w5 = _mm_unpacklo_epi16(
     372             :       w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
     373             : 
     374      503028 :   *d0d1 = _mm_unpacklo_epi32(
     375             :       w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     376      503028 :   *d2d3 = _mm_unpackhi_epi32(
     377             :       w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
     378             : 
     379      503028 :   w6 = _mm_unpackhi_epi16(
     380             :       w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
     381      503028 :   w7 = _mm_unpackhi_epi16(
     382             :       w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
     383             : 
     384      503028 :   *d4d5 = _mm_unpacklo_epi32(
     385             :       w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
     386      503028 :   *d6d7 = _mm_unpackhi_epi32(
     387             :       w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
     388      503028 : }
     389             : 
     390     1220580 : static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
     391             :     __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
     392             :     __m128i w10, w11, w12, w13, w14, w15;
     393             : 
     394     1220580 :     w0 = _mm_unpacklo_epi8(x[0], x[1]);
     395     1220580 :     w1 = _mm_unpacklo_epi8(x[2], x[3]);
     396     1220580 :     w2 = _mm_unpacklo_epi8(x[4], x[5]);
     397     1220580 :     w3 = _mm_unpacklo_epi8(x[6], x[7]);
     398             : 
     399     1220580 :     w8 = _mm_unpacklo_epi8(x[8], x[9]);
     400     1220580 :     w9 = _mm_unpacklo_epi8(x[10], x[11]);
     401     1220580 :     w10 = _mm_unpacklo_epi8(x[12], x[13]);
     402     2441160 :     w11 = _mm_unpacklo_epi8(x[14], x[15]);
     403             : 
     404     1220580 :     w4 = _mm_unpacklo_epi16(w0, w1);
     405     1220580 :     w5 = _mm_unpacklo_epi16(w2, w3);
     406     1220580 :     w12 = _mm_unpacklo_epi16(w8, w9);
     407     1220580 :     w13 = _mm_unpacklo_epi16(w10, w11);
     408             : 
     409     1220580 :     w6 = _mm_unpacklo_epi32(w4, w5);
     410     1220580 :     w7 = _mm_unpackhi_epi32(w4, w5);
     411     1220580 :     w14 = _mm_unpacklo_epi32(w12, w13);
     412     1220580 :     w15 = _mm_unpackhi_epi32(w12, w13);
     413             : 
     414             :     // Store first 4-line result
     415     1220580 :     d[0] = _mm_unpacklo_epi64(w6, w14);
     416     1220580 :     d[1] = _mm_unpackhi_epi64(w6, w14);
     417     1220580 :     d[2] = _mm_unpacklo_epi64(w7, w15);
     418     2441160 :     d[3] = _mm_unpackhi_epi64(w7, w15);
     419             : 
     420     1220580 :     w4 = _mm_unpackhi_epi16(w0, w1);
     421     1220580 :     w5 = _mm_unpackhi_epi16(w2, w3);
     422     1220580 :     w12 = _mm_unpackhi_epi16(w8, w9);
     423     1220580 :     w13 = _mm_unpackhi_epi16(w10, w11);
     424             : 
     425     1220580 :     w6 = _mm_unpacklo_epi32(w4, w5);
     426     1220580 :     w7 = _mm_unpackhi_epi32(w4, w5);
     427     1220580 :     w14 = _mm_unpacklo_epi32(w12, w13);
     428     1220580 :     w15 = _mm_unpackhi_epi32(w12, w13);
     429             : 
     430             :     // Store second 4-line result
     431     1220580 :     d[4] = _mm_unpacklo_epi64(w6, w14);
     432     1220580 :     d[5] = _mm_unpackhi_epi64(w6, w14);
     433     1220580 :     d[6] = _mm_unpacklo_epi64(w7, w15);
     434     1220580 :     d[7] = _mm_unpackhi_epi64(w7, w15);
     435             : 
     436             :     // upper half
     437     1220580 :     w0 = _mm_unpackhi_epi8(x[0], x[1]);
     438     1220580 :     w1 = _mm_unpackhi_epi8(x[2], x[3]);
     439     1220580 :     w2 = _mm_unpackhi_epi8(x[4], x[5]);
     440     1220580 :     w3 = _mm_unpackhi_epi8(x[6], x[7]);
     441             : 
     442     1220580 :     w8 = _mm_unpackhi_epi8(x[8], x[9]);
     443     1220580 :     w9 = _mm_unpackhi_epi8(x[10], x[11]);
     444     1220580 :     w10 = _mm_unpackhi_epi8(x[12], x[13]);
     445     2441160 :     w11 = _mm_unpackhi_epi8(x[14], x[15]);
     446             : 
     447     1220580 :     w4 = _mm_unpacklo_epi16(w0, w1);
     448     1220580 :     w5 = _mm_unpacklo_epi16(w2, w3);
     449     1220580 :     w12 = _mm_unpacklo_epi16(w8, w9);
     450     1220580 :     w13 = _mm_unpacklo_epi16(w10, w11);
     451             : 
     452     1220580 :     w6 = _mm_unpacklo_epi32(w4, w5);
     453     1220580 :     w7 = _mm_unpackhi_epi32(w4, w5);
     454     1220580 :     w14 = _mm_unpacklo_epi32(w12, w13);
     455     1220580 :     w15 = _mm_unpackhi_epi32(w12, w13);
     456             : 
     457             :     // Store first 4-line result
     458     1220580 :     d[8] = _mm_unpacklo_epi64(w6, w14);
     459     1220580 :     d[9] = _mm_unpackhi_epi64(w6, w14);
     460     1220580 :     d[10] = _mm_unpacklo_epi64(w7, w15);
     461     2441160 :     d[11] = _mm_unpackhi_epi64(w7, w15);
     462             : 
     463     1220580 :     w4 = _mm_unpackhi_epi16(w0, w1);
     464     1220580 :     w5 = _mm_unpackhi_epi16(w2, w3);
     465     1220580 :     w12 = _mm_unpackhi_epi16(w8, w9);
     466     1220580 :     w13 = _mm_unpackhi_epi16(w10, w11);
     467             : 
     468     1220580 :     w6 = _mm_unpacklo_epi32(w4, w5);
     469     1220580 :     w7 = _mm_unpackhi_epi32(w4, w5);
     470     1220580 :     w14 = _mm_unpacklo_epi32(w12, w13);
     471     1220580 :     w15 = _mm_unpackhi_epi32(w12, w13);
     472             : 
     473             :     // Store second 4-line result
     474     1220580 :     d[12] = _mm_unpacklo_epi64(w6, w14);
     475     1220580 :     d[13] = _mm_unpackhi_epi64(w6, w14);
     476     1220580 :     d[14] = _mm_unpacklo_epi64(w7, w15);
     477     1220580 :     d[15] = _mm_unpackhi_epi64(w7, w15);
     478     1220580 : }
     479             : // Transposes the low 8 bytes of 16 rows (x0..x15) into 8 rows of 16 bytes.
     480      457672 : static INLINE void transpose16x8_8x16_sse2(
     481             :     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     482             :     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
     483             :     __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
     484             :     __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
     485             :     __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
     486             :   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
     487             :   __m128i w10, w11, w12, w13, w14, w15;
     488             :   // Byte-interleave row pairs 0-7; only the low 64 bits of each input are read.
     489      457672 :   w0 = _mm_unpacklo_epi8(*x0, *x1);
     490      457672 :   w1 = _mm_unpacklo_epi8(*x2, *x3);
     491      457672 :   w2 = _mm_unpacklo_epi8(*x4, *x5);
     492      457672 :   w3 = _mm_unpacklo_epi8(*x6, *x7);
     493             :   // Same byte interleave for row pairs 8-15.
     494      457672 :   w8 = _mm_unpacklo_epi8(*x8, *x9);
     495      457672 :   w9 = _mm_unpacklo_epi8(*x10, *x11);
     496      457672 :   w10 = _mm_unpacklo_epi8(*x12, *x13);
     497      915344 :   w11 = _mm_unpacklo_epi8(*x14, *x15);
     498             :   // Low 16-bit interleave: builds 4-row groups covering columns 0-3.
     499      457672 :   w4 = _mm_unpacklo_epi16(w0, w1);
     500      457672 :   w5 = _mm_unpacklo_epi16(w2, w3);
     501      457672 :   w12 = _mm_unpacklo_epi16(w8, w9);
     502      457672 :   w13 = _mm_unpacklo_epi16(w10, w11);
     503             :   // 32-bit interleave: w6/w14 hold columns 0,1 and w7/w15 columns 2,3 (8 rows each).
     504      457672 :   w6 = _mm_unpacklo_epi32(w4, w5);
     505      457672 :   w7 = _mm_unpackhi_epi32(w4, w5);
     506      457672 :   w14 = _mm_unpacklo_epi32(w12, w13);
     507      457672 :   w15 = _mm_unpackhi_epi32(w12, w13);
     508             : 
     509             :   // Store first 4-line result: d0..d3 = columns 0..3 (rows 0-7 low, rows 8-15 high)
     510      457672 :   *d0 = _mm_unpacklo_epi64(w6, w14);
     511      457672 :   *d1 = _mm_unpackhi_epi64(w6, w14);
     512      457672 :   *d2 = _mm_unpacklo_epi64(w7, w15);
     513      457672 :   *d3 = _mm_unpackhi_epi64(w7, w15);
     514             :   // High 16-bit interleave: repeat the same steps for columns 4-7.
     515      457672 :   w4 = _mm_unpackhi_epi16(w0, w1);
     516      457672 :   w5 = _mm_unpackhi_epi16(w2, w3);
     517      457672 :   w12 = _mm_unpackhi_epi16(w8, w9);
     518      457672 :   w13 = _mm_unpackhi_epi16(w10, w11);
     519             : 
     520      457672 :   w6 = _mm_unpacklo_epi32(w4, w5);
     521      457672 :   w7 = _mm_unpackhi_epi32(w4, w5);
     522      457672 :   w14 = _mm_unpacklo_epi32(w12, w13);
     523      457672 :   w15 = _mm_unpackhi_epi32(w12, w13);
     524             : 
     525             :   // Store second 4-line result: d4..d7 = columns 4..7 (rows 0-7 low, rows 8-15 high)
     526      457672 :   *d4 = _mm_unpacklo_epi64(w6, w14);
     527      457672 :   *d5 = _mm_unpackhi_epi64(w6, w14);
     528      457672 :   *d6 = _mm_unpacklo_epi64(w7, w15);
     529      457672 :   *d7 = _mm_unpackhi_epi64(w7, w15);
     530      457672 : }
     531             : // Transposes 8 rows of 16 bytes (x0..x7) into 16 columns of 8 bytes, two per output.
     532      143055 : static INLINE void transpose8x16_16x8_sse2(
     533             :     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     534             :     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
     535             :     __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
     536             :     __m128i *d12d13, __m128i *d14d15) {
     537             :   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
     538             :   __m128i w10, w11, w12, w13, w14, w15;
     539             :   // Byte-interleave row pairs; the low halves cover columns 0-7.
     540      143055 :   w0 = _mm_unpacklo_epi8(*x0, *x1);
     541      143055 :   w1 = _mm_unpacklo_epi8(*x2, *x3);
     542      143055 :   w2 = _mm_unpacklo_epi8(*x4, *x5);
     543      143055 :   w3 = _mm_unpacklo_epi8(*x6, *x7);
     544             :   // The high halves of the same row pairs cover columns 8-15.
     545      143055 :   w8 = _mm_unpackhi_epi8(*x0, *x1);
     546      143055 :   w9 = _mm_unpackhi_epi8(*x2, *x3);
     547      143055 :   w10 = _mm_unpackhi_epi8(*x4, *x5);
     548      286110 :   w11 = _mm_unpackhi_epi8(*x6, *x7);
     549             :   // Low 16-bit interleave: 4-row groups for columns 0-3 (w4, w5) and 8-11 (w12, w13).
     550      143055 :   w4 = _mm_unpacklo_epi16(w0, w1);
     551      143055 :   w5 = _mm_unpacklo_epi16(w2, w3);
     552      143055 :   w12 = _mm_unpacklo_epi16(w8, w9);
     553      143055 :   w13 = _mm_unpacklo_epi16(w10, w11);
     554             :   // 32-bit interleave: w6 = columns 0,1; w7 = 2,3; w14 = 8,9; w15 = 10,11.
     555      143055 :   w6 = _mm_unpacklo_epi32(w4, w5);
     556      143055 :   w7 = _mm_unpackhi_epi32(w4, w5);
     557      143055 :   w14 = _mm_unpacklo_epi32(w12, w13);
     558      143055 :   w15 = _mm_unpackhi_epi32(w12, w13);
     559             :   // NOTE(review): despite the dNdM names, outputs pair column k with k + 8; confirm with callers.
     560             :   // Store first 4-line result: low 64 bits = column k, high 64 bits = column k + 8, k = 0..3
     561      143055 :   *d0d1 = _mm_unpacklo_epi64(w6, w14);
     562      143055 :   *d2d3 = _mm_unpackhi_epi64(w6, w14);
     563      143055 :   *d4d5 = _mm_unpacklo_epi64(w7, w15);
     564      143055 :   *d6d7 = _mm_unpackhi_epi64(w7, w15);
     565             :   // High 16-bit interleave: repeat for columns 4-7 (w4, w5) and 12-15 (w12, w13).
     566      143055 :   w4 = _mm_unpackhi_epi16(w0, w1);
     567      143055 :   w5 = _mm_unpackhi_epi16(w2, w3);
     568      143055 :   w12 = _mm_unpackhi_epi16(w8, w9);
     569      143055 :   w13 = _mm_unpackhi_epi16(w10, w11);
     570             : 
     571      143055 :   w6 = _mm_unpacklo_epi32(w4, w5);
     572      143055 :   w7 = _mm_unpackhi_epi32(w4, w5);
     573      143055 :   w14 = _mm_unpacklo_epi32(w12, w13);
     574      143055 :   w15 = _mm_unpackhi_epi32(w12, w13);
     575             : 
     576             :   // Store second 4-line result: low 64 bits = column k, high 64 bits = column k + 8, k = 4..7
     577      143055 :   *d8d9 = _mm_unpacklo_epi64(w6, w14);
     578      143055 :   *d10d11 = _mm_unpackhi_epi64(w6, w14);
     579      143055 :   *d12d13 = _mm_unpacklo_epi64(w7, w15);
     580      143055 :   *d14d15 = _mm_unpackhi_epi64(w7, w15);
     581      143055 : }
     582             : 
     583             : #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_

Generated by: LCOV version 1.14