LCOV - code coverage report
Current view: top level - ASM_SSE2 - transpose_sse2.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 91 259 35.1 %
Date: 2019-11-25 17:38:06 Functions: 6 13 46.2 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
      13             : #define AOM_DSP_X86_TRANSPOSE_SSE2_H_
      14             : 
      15             : #include <emmintrin.h>  // SSE2
      16             : 
      17             : // #include "./aom_config.h"
      18             : 
      19             : #ifdef __cplusplus
      20             : extern "C" {
      21             : #endif
      22             : 
      23             : void transpose_8bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
      24             :                                                 __m128i *const out);
      25             : 
      26             : void transpose_8bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
      27             :                                                 __m128i *const out);
      28             : 
      29             : void transpose_8bit_16x8_reg128bit_instance_sse2(const __m128i *const in,
      30             :                                                  __m128i *const out);
      31             : 
      32             : void transpose_8bit_16x16_reg128bit_instance_sse2(const __m128i *const in,
      33             :                                                   __m128i *const out);
      34             : 
      35             : void partial_transpose_8bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
      36             :                                                         __m128i *const out);
      37             : 
      38             : void transpose_16bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
      39             :                                                  __m128i *const out);
      40             : 
      41             : void transpose_16bit_4x8_reg128bit_instance_sse2(const __m128i *const in,
      42             :                                                  __m128i *const out);
      43             : 
      44             : void transpose_16bit_8x4_reg128bit_instance_sse2(const __m128i *const in,
      45             :                                                  __m128i *const out);
      46             : 
      47             : void transpose_16bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
      48             :                                                  __m128i *const out);
      49             : 
      50             : void transpose_16bit_16x16_reg128bit_instance_sse2(const __m128i *const in,
      51             :                                                    __m128i *const out);
      52             : 
      53             : void transpose_32bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
      54             :                                                  __m128i *const out);
      55             : 
      56             : void transpose_32bit_4x4x2_reg128bit_instance_sse2(const __m128i *const in,
      57             :                                                    __m128i *const out);
      58             : 
      59             : void transpose_32bit_8x4_reg128bit_instance_sse2(const __m128i *const in,
      60             :                                                  __m128i *const out);
      61             : 
      62             : #ifdef __cplusplus
      63             : }
      64             : #endif
      65             : 
      66             : 
      67           0 : static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
      68             :     // Unpack 8 bit elements. Goes from:
      69             :     // in[0]: 00 01 02 03
      70             :     // in[1]: 10 11 12 13
      71             :     // in[2]: 20 21 22 23
      72             :     // in[3]: 30 31 32 33
      73             :     // to:
      74             :     // a0:    00 10 01 11  02 12 03 13
      75             :     // a1:    20 30 21 31  22 32 23 33
      76           0 :     const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
      77           0 :     const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
      78             : 
      79             :     // Unpack 16 bit elements resulting in:
      80             :     // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
      81           0 :     return _mm_unpacklo_epi16(a0, a1);
      82             : }
      83             : 
      84           0 : static INLINE void transpose_8bit_8x8(const __m128i *const in,
      85             :     __m128i *const out) {
      86             :     // Unpack 8 bit elements. Goes from:
      87             :     // in[0]: 00 01 02 03 04 05 06 07
      88             :     // in[1]: 10 11 12 13 14 15 16 17
      89             :     // in[2]: 20 21 22 23 24 25 26 27
      90             :     // in[3]: 30 31 32 33 34 35 36 37
      91             :     // in[4]: 40 41 42 43 44 45 46 47
      92             :     // in[5]: 50 51 52 53 54 55 56 57
      93             :     // in[6]: 60 61 62 63 64 65 66 67
      94             :     // in[7]: 70 71 72 73 74 75 76 77
      95             :     // to:
      96             :     // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
      97             :     // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
      98             :     // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
      99             :     // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
     100           0 :     const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     101           0 :     const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     102           0 :     const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
     103           0 :     const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
     104             : 
     105             :     // Unpack 16 bit elements resulting in:
     106             :     // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
     107             :     // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
     108             :     // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
     109             :     // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
     110           0 :     const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
     111           0 :     const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
     112           0 :     const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
     113           0 :     const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
     114             : 
     115             :     // Unpack 32 bit elements resulting in:
     116             :     // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
     117             :     // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
     118             :     // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
     119             :     // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
     120           0 :     const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
     121           0 :     const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
     122           0 :     const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
     123           0 :     const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
     124             : 
     125             :     // Unpack 64 bit elements resulting in:
     126             :     // out[0]: 00 10 20 30 40 50 60 70
     127             :     // out[1]: 01 11 21 31 41 51 61 71
     128             :     // out[2]: 02 12 22 32 42 52 62 72
     129             :     // out[3]: 03 13 23 33 43 53 63 73
     130             :     // out[4]: 04 14 24 34 44 54 64 74
     131             :     // out[5]: 05 15 25 35 45 55 65 75
     132             :     // out[6]: 06 16 26 36 46 56 66 76
     133             :     // out[7]: 07 17 27 37 47 57 67 77
     134           0 :     out[0] = c0;
     135           0 :     out[1] = _mm_srli_si128(c0, 8);
     136           0 :     out[2] = c1;
     137           0 :     out[3] = _mm_srli_si128(c1, 8);
     138           0 :     out[4] = c2;
     139           0 :     out[5] = _mm_srli_si128(c2, 8);
     140           0 :     out[6] = c3;
     141           0 :     out[7] = _mm_srli_si128(c3, 8);
     142           0 : }
     143             : 
     144     7870570 : static INLINE void partial_transpose_8bit_8x8(const __m128i *const in,
     145             :     __m128i *const out) {
     146             :     // Unpack 8 bit elements. Goes from:
     147             :     // in[0]: 00 01 02 03 04 05 06 07
     148             :     // in[1]: 10 11 12 13 14 15 16 17
     149             :     // in[2]: 20 21 22 23 24 25 26 27
     150             :     // in[3]: 30 31 32 33 34 35 36 37
     151             :     // in[4]: 40 41 42 43 44 45 46 47
     152             :     // in[5]: 50 51 52 53 54 55 56 57
     153             :     // in[6]: 60 61 62 63 64 65 66 67
     154             :     // in[7]: 70 71 72 73 74 75 76 77
     155             :     // to:
     156             :     // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
     157             :     // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
     158             :     // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
     159             :     // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
     160     7870570 :     const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     161     7870570 :     const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     162     7870570 :     const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
     163    15741100 :     const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
     164             : 
     165             :     // Unpack 16 bit elements resulting in:
     166             :     // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
     167             :     // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
     168             :     // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
     169             :     // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
     170     7870570 :     const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
     171     7870570 :     const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
     172     7870570 :     const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
     173     7870570 :     const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
     174             : 
     175             :     // Unpack 32 bit elements resulting in:
     176             :     // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
     177             :     // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
     178             :     // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
     179             :     // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
     180     7870570 :     out[0] = _mm_unpacklo_epi32(b0, b2);
     181     7870570 :     out[1] = _mm_unpackhi_epi32(b0, b2);
     182     7870570 :     out[2] = _mm_unpacklo_epi32(b1, b3);
     183     7870570 :     out[3] = _mm_unpackhi_epi32(b1, b3);
     184     7870570 : }
     185             : 
     186           0 : static INLINE void transpose_8bit_16x8(const __m128i *const in,
     187             :     __m128i *const out) {
     188             :     // Unpack 8 bit elements. Goes from:
     189             :     // in[0]: 00 01 02 03 04 05 06 07  08 09 0A 0B 0C 0D 0E 0F
     190             :     // in[1]: 10 11 12 13 14 15 16 17  18 19 1A 1B 1C 1D 1E 1F
     191             :     // in[2]: 20 21 22 23 24 25 26 27  28 29 2A 2B 2C 2D 2E 2F
     192             :     // in[3]: 30 31 32 33 34 35 36 37  38 39 3A 3B 3C 3D 3E 3F
     193             :     // in[4]: 40 41 42 43 44 45 46 47  48 49 4A 4B 4C 4D 4E 4F
     194             :     // in[5]: 50 51 52 53 54 55 56 57  58 59 5A 5B 5C 5D 5E 5F
     195             :     // in[6]: 60 61 62 63 64 65 66 67  68 69 6A 6B 6C 6D 6E 6F
     196             :     // in[7]: 70 71 72 73 74 75 76 77  78 79 7A 7B 7C 7D 7E 7F
     197             :     // to:
     198             :     // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
     199             :     // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
     200             :     // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
     201             :     // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
     202             :     // a4:    08 18 09 19 0A 1A 0B 1B  0C 1C 0D 1D 0E 1E 0F 1F
     203             :     // a5:    28 38 29 39 2A 3A 2B 3B  2C 3C 2D 3D 2E 3E 2F 3F
     204             :     // a6:    48 58 49 59 4A 5A 4B 5B  4C 5C 4D 5D 4E 5E 4F 5F
     205             :     // a7:    68 78 69 79 6A 7A 6B 7B  6C 7C 6D 7D 6E 7E 6F 7F
     206           0 :     const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     207           0 :     const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     208           0 :     const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
     209           0 :     const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
     210           0 :     const __m128i a4 = _mm_unpackhi_epi8(in[0], in[1]);
     211           0 :     const __m128i a5 = _mm_unpackhi_epi8(in[2], in[3]);
     212           0 :     const __m128i a6 = _mm_unpackhi_epi8(in[4], in[5]);
     213           0 :     const __m128i a7 = _mm_unpackhi_epi8(in[6], in[7]);
     214             : 
     215             :     // Unpack 16 bit elements resulting in:
     216             :     // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
     217             :     // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
     218             :     // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
     219             :     // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
     220             :     // b4: 08 18 28 38 09 19 29 39  0A 1A 2A 3A 0B 1B 2B 3B
     221             :     // b5: 0C 1C 2C 3C 0D 1D 2D 3D  0E 1E 2E 3E 0F 1F 2F 3F
     222             :     // b6: 48 58 68 78 49 59 69 79  4A 5A 6A 7A 4B 5B 6B 7B
     223             :     // b7: 4C 5C 6C 7C 4D 5D 6D 7D  4E 5E 6E 7E 4F 5F 6F 7F
     224           0 :     const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
     225           0 :     const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
     226           0 :     const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
     227           0 :     const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
     228           0 :     const __m128i b4 = _mm_unpacklo_epi16(a4, a5);
     229           0 :     const __m128i b5 = _mm_unpackhi_epi16(a4, a5);
     230           0 :     const __m128i b6 = _mm_unpacklo_epi16(a6, a7);
     231           0 :     const __m128i b7 = _mm_unpackhi_epi16(a6, a7);
     232             : 
     233             :     // Unpack 32 bit elements resulting in:
     234             :     // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
     235             :     // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
     236             :     // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
     237             :     // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
     238             :     // c4: 08 18 28 38 48 58 68 78  09 19 29 39 49 59 69 79
     239             :     // c5: 0A 1A 2A 3A 4A 5A 6A 7A  0B 1B 2B 3B 4B 5B 6B 7B
     240             :     // c6: 0C 1C 2C 3C 4C 5C 6C 7C  0D 1D 2D 3D 4D 5D 6D 7D
     241             :     // c7: 0E 1E 2E 3E 4E 5E 6E 7E  0F 1F 2F 3F 4F 5F 6F 7F
     242           0 :     out[0] = _mm_unpacklo_epi32(b0, b2);
     243           0 :     out[1] = _mm_unpackhi_epi32(b0, b2);
     244           0 :     out[2] = _mm_unpacklo_epi32(b1, b3);
     245           0 :     out[3] = _mm_unpackhi_epi32(b1, b3);
     246           0 :     out[4] = _mm_unpacklo_epi32(b4, b6);
     247           0 :     out[5] = _mm_unpackhi_epi32(b4, b6);
     248           0 :     out[6] = _mm_unpacklo_epi32(b5, b7);
     249           0 :     out[7] = _mm_unpackhi_epi32(b5, b7);
     250           0 : }
     251             : 
     252           0 : static INLINE void transpose_8bit_16x16_sse2(const __m128i *const in,
     253             :     __m128i *const out) {
     254             :     __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
     255             :     __m128i w10, w11, w12, w13, w14, w15;
     256             : 
     257           0 :     w0 = _mm_unpacklo_epi8(in[0], in[1]);
     258           0 :     w1 = _mm_unpacklo_epi8(in[2], in[3]);
     259           0 :     w2 = _mm_unpacklo_epi8(in[4], in[5]);
     260           0 :     w3 = _mm_unpacklo_epi8(in[6], in[7]);
     261             : 
     262           0 :     w8 = _mm_unpacklo_epi8(in[8], in[9]);
     263           0 :     w9 = _mm_unpacklo_epi8(in[10], in[11]);
     264           0 :     w10 = _mm_unpacklo_epi8(in[12], in[13]);
     265           0 :     w11 = _mm_unpacklo_epi8(in[14], in[15]);
     266             : 
     267           0 :     w4 = _mm_unpacklo_epi16(w0, w1);
     268           0 :     w5 = _mm_unpacklo_epi16(w2, w3);
     269           0 :     w12 = _mm_unpacklo_epi16(w8, w9);
     270           0 :     w13 = _mm_unpacklo_epi16(w10, w11);
     271             : 
     272           0 :     w6 = _mm_unpacklo_epi32(w4, w5);
     273           0 :     w7 = _mm_unpackhi_epi32(w4, w5);
     274           0 :     w14 = _mm_unpacklo_epi32(w12, w13);
     275           0 :     w15 = _mm_unpackhi_epi32(w12, w13);
     276             : 
     277             :     // Store first 4-line result
     278           0 :     out[0] = _mm_unpacklo_epi64(w6, w14);
     279           0 :     out[1] = _mm_unpackhi_epi64(w6, w14);
     280           0 :     out[2] = _mm_unpacklo_epi64(w7, w15);
     281           0 :     out[3] = _mm_unpackhi_epi64(w7, w15);
     282             : 
     283           0 :     w4 = _mm_unpackhi_epi16(w0, w1);
     284           0 :     w5 = _mm_unpackhi_epi16(w2, w3);
     285           0 :     w12 = _mm_unpackhi_epi16(w8, w9);
     286           0 :     w13 = _mm_unpackhi_epi16(w10, w11);
     287             : 
     288           0 :     w6 = _mm_unpacklo_epi32(w4, w5);
     289           0 :     w7 = _mm_unpackhi_epi32(w4, w5);
     290           0 :     w14 = _mm_unpacklo_epi32(w12, w13);
     291           0 :     w15 = _mm_unpackhi_epi32(w12, w13);
     292             : 
     293             :     // Store second 4-line result
     294           0 :     out[4] = _mm_unpacklo_epi64(w6, w14);
     295           0 :     out[5] = _mm_unpackhi_epi64(w6, w14);
     296           0 :     out[6] = _mm_unpacklo_epi64(w7, w15);
     297           0 :     out[7] = _mm_unpackhi_epi64(w7, w15);
     298             : 
     299             :     // upper half
     300           0 :     w0 = _mm_unpackhi_epi8(in[0], in[1]);
     301           0 :     w1 = _mm_unpackhi_epi8(in[2], in[3]);
     302           0 :     w2 = _mm_unpackhi_epi8(in[4], in[5]);
     303           0 :     w3 = _mm_unpackhi_epi8(in[6], in[7]);
     304             : 
     305           0 :     w8 = _mm_unpackhi_epi8(in[8], in[9]);
     306           0 :     w9 = _mm_unpackhi_epi8(in[10], in[11]);
     307           0 :     w10 = _mm_unpackhi_epi8(in[12], in[13]);
     308           0 :     w11 = _mm_unpackhi_epi8(in[14], in[15]);
     309             : 
     310           0 :     w4 = _mm_unpacklo_epi16(w0, w1);
     311           0 :     w5 = _mm_unpacklo_epi16(w2, w3);
     312           0 :     w12 = _mm_unpacklo_epi16(w8, w9);
     313           0 :     w13 = _mm_unpacklo_epi16(w10, w11);
     314             : 
     315           0 :     w6 = _mm_unpacklo_epi32(w4, w5);
     316           0 :     w7 = _mm_unpackhi_epi32(w4, w5);
     317           0 :     w14 = _mm_unpacklo_epi32(w12, w13);
     318           0 :     w15 = _mm_unpackhi_epi32(w12, w13);
     319             : 
     320             :     // Store first 4-line result
     321           0 :     out[8] = _mm_unpacklo_epi64(w6, w14);
     322           0 :     out[9] = _mm_unpackhi_epi64(w6, w14);
     323           0 :     out[10] = _mm_unpacklo_epi64(w7, w15);
     324           0 :     out[11] = _mm_unpackhi_epi64(w7, w15);
     325             : 
     326           0 :     w4 = _mm_unpackhi_epi16(w0, w1);
     327           0 :     w5 = _mm_unpackhi_epi16(w2, w3);
     328           0 :     w12 = _mm_unpackhi_epi16(w8, w9);
     329           0 :     w13 = _mm_unpackhi_epi16(w10, w11);
     330             : 
     331           0 :     w6 = _mm_unpacklo_epi32(w4, w5);
     332           0 :     w7 = _mm_unpackhi_epi32(w4, w5);
     333           0 :     w14 = _mm_unpacklo_epi32(w12, w13);
     334           0 :     w15 = _mm_unpackhi_epi32(w12, w13);
     335             : 
     336             :     // Store second 4-line result
     337           0 :     out[12] = _mm_unpacklo_epi64(w6, w14);
     338           0 :     out[13] = _mm_unpackhi_epi64(w6, w14);
     339           0 :     out[14] = _mm_unpacklo_epi64(w7, w15);
     340           0 :     out[15] = _mm_unpackhi_epi64(w7, w15);
     341           0 : }
     342             : 
     343    24231300 : static INLINE void transpose_16bit_4x4(const __m128i *const in,
     344             :     __m128i *const out) {
     345             :     // Unpack 16 bit elements. Goes from:
     346             :     // in[0]: 00 01 02 03  XX XX XX XX
     347             :     // in[1]: 10 11 12 13  XX XX XX XX
     348             :     // in[2]: 20 21 22 23  XX XX XX XX
     349             :     // in[3]: 30 31 32 33  XX XX XX XX
     350             :     // to:
     351             :     // a0:    00 10 01 11  02 12 03 13
     352             :     // a1:    20 30 21 31  22 32 23 33
     353    24231300 :     const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
     354    48462600 :     const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
     355             : 
     356             :     // Unpack 32 bit elements resulting in:
     357             :     // out[0]: 00 10 20 30
     358             :     // out[1]: 01 11 21 31
     359             :     // out[2]: 02 12 22 32
     360             :     // out[3]: 03 13 23 33
     361    24231300 :     out[0] = _mm_unpacklo_epi32(a0, a1);
     362    24231300 :     out[1] = _mm_srli_si128(out[0], 8);
     363    24231300 :     out[2] = _mm_unpackhi_epi32(a0, a1);
     364    24231300 :     out[3] = _mm_srli_si128(out[2], 8);
     365    24231300 : }
     366             : 
     367    13520200 : static INLINE void transpose_16bit_4x8(const __m128i *const in,
     368             :     __m128i *const out) {
     369             :     // Unpack 16 bit elements. Goes from:
     370             :     // in[0]: 00 01 02 03  XX XX XX XX
     371             :     // in[1]: 10 11 12 13  XX XX XX XX
     372             :     // in[2]: 20 21 22 23  XX XX XX XX
     373             :     // in[3]: 30 31 32 33  XX XX XX XX
     374             :     // in[4]: 40 41 42 43  XX XX XX XX
     375             :     // in[5]: 50 51 52 53  XX XX XX XX
     376             :     // in[6]: 60 61 62 63  XX XX XX XX
     377             :     // in[7]: 70 71 72 73  XX XX XX XX
     378             :     // to:
     379             :     // a0:    00 10 01 11  02 12 03 13
     380             :     // a1:    20 30 21 31  22 32 23 33
     381             :     // a2:    40 50 41 51  42 52 43 53
     382             :     // a3:    60 70 61 71  62 72 63 73
     383    13520200 :     const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
     384    13520200 :     const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
     385    13520200 :     const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
     386    27040300 :     const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
     387             : 
     388             :     // Unpack 32 bit elements resulting in:
     389             :     // b0: 00 10 20 30  01 11 21 31
     390             :     // b1: 40 50 60 70  41 51 61 71
     391             :     // b2: 02 12 22 32  03 13 23 33
     392             :     // b3: 42 52 62 72  43 53 63 73
     393    13520200 :     const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
     394    13520200 :     const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
     395    13520200 :     const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
     396    13520200 :     const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
     397             : 
     398             :     // Unpack 64 bit elements resulting in:
     399             :     // out[0]: 00 10 20 30  40 50 60 70
     400             :     // out[1]: 01 11 21 31  41 51 61 71
     401             :     // out[2]: 02 12 22 32  42 52 62 72
     402             :     // out[3]: 03 13 23 33  43 53 63 73
     403    13520200 :     out[0] = _mm_unpacklo_epi64(b0, b1);
     404    13520200 :     out[1] = _mm_unpackhi_epi64(b0, b1);
     405    13520200 :     out[2] = _mm_unpacklo_epi64(b2, b3);
     406    13520200 :     out[3] = _mm_unpackhi_epi64(b2, b3);
     407    13520200 : }
     408             : 
     409    13518900 : static INLINE void transpose_16bit_8x4(const __m128i *const in,
     410             :     __m128i *const out) {
     411             :     // Unpack 16 bit elements. Goes from:
     412             :     // in[0]: 00 01 02 03  04 05 06 07
     413             :     // in[1]: 10 11 12 13  14 15 16 17
     414             :     // in[2]: 20 21 22 23  24 25 26 27
     415             :     // in[3]: 30 31 32 33  34 35 36 37
     416             : 
     417             :     // to:
     418             :     // a0:    00 10 01 11  02 12 03 13
     419             :     // a1:    20 30 21 31  22 32 23 33
     420             :     // a4:    04 14 05 15  06 16 07 17
     421             :     // a5:    24 34 25 35  26 36 27 37
     422    13518900 :     const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
     423    13518900 :     const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
     424    13518900 :     const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
     425    27037800 :     const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
     426             : 
     427             :     // Unpack 32 bit elements resulting in:
     428             :     // b0: 00 10 20 30  01 11 21 31
     429             :     // b2: 04 14 24 34  05 15 25 35
     430             :     // b4: 02 12 22 32  03 13 23 33
     431             :     // b6: 06 16 26 36  07 17 27 37
     432    13518900 :     const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
     433    13518900 :     const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
     434    13518900 :     const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
     435    13518900 :     const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
     436             : 
     437             :     // Unpack 64 bit elements resulting in:
     438             :     // out[0]: 00 10 20 30  XX XX XX XX
     439             :     // out[1]: 01 11 21 31  XX XX XX XX
     440             :     // out[2]: 02 12 22 32  XX XX XX XX
     441             :     // out[3]: 03 13 23 33  XX XX XX XX
     442             :     // out[4]: 04 14 24 34  XX XX XX XX
     443             :     // out[5]: 05 15 25 35  XX XX XX XX
     444             :     // out[6]: 06 16 26 36  XX XX XX XX
     445             :     // out[7]: 07 17 27 37  XX XX XX XX
     446    13518900 :     const __m128i zeros = _mm_setzero_si128();
     447    13518900 :     out[0] = _mm_unpacklo_epi64(b0, zeros);
     448    13518900 :     out[1] = _mm_unpackhi_epi64(b0, zeros);
     449    13518900 :     out[2] = _mm_unpacklo_epi64(b4, zeros);
     450    13518900 :     out[3] = _mm_unpackhi_epi64(b4, zeros);
     451    13518900 :     out[4] = _mm_unpacklo_epi64(b2, zeros);
     452    13518900 :     out[5] = _mm_unpackhi_epi64(b2, zeros);
     453    13518900 :     out[6] = _mm_unpacklo_epi64(b6, zeros);
     454    13518900 :     out[7] = _mm_unpackhi_epi64(b6, zeros);
     455    13518900 : }
     456             : 
     457    45647800 : static INLINE void transpose_16bit_8x8(const __m128i *const in,
     458             :     __m128i *const out) {
     459             :     // Unpack 16 bit elements. Goes from:
     460             :     // in[0]: 00 01 02 03  04 05 06 07
     461             :     // in[1]: 10 11 12 13  14 15 16 17
     462             :     // in[2]: 20 21 22 23  24 25 26 27
     463             :     // in[3]: 30 31 32 33  34 35 36 37
     464             :     // in[4]: 40 41 42 43  44 45 46 47
     465             :     // in[5]: 50 51 52 53  54 55 56 57
     466             :     // in[6]: 60 61 62 63  64 65 66 67
     467             :     // in[7]: 70 71 72 73  74 75 76 77
     468             :     // to:
     469             :     // a0:    00 10 01 11  02 12 03 13
     470             :     // a1:    20 30 21 31  22 32 23 33
     471             :     // a2:    40 50 41 51  42 52 43 53
     472             :     // a3:    60 70 61 71  62 72 63 73
     473             :     // a4:    04 14 05 15  06 16 07 17
     474             :     // a5:    24 34 25 35  26 36 27 37
     475             :     // a6:    44 54 45 55  46 56 47 57
     476             :     // a7:    64 74 65 75  66 76 67 77
     477    45647800 :     const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
     478    45647800 :     const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
     479    45647800 :     const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
     480    45647800 :     const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
     481    45647800 :     const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
     482    45647800 :     const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
     483    45647800 :     const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
     484    91295700 :     const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
     485             : 
     486             :     // Unpack 32 bit elements resulting in:
     487             :     // b0: 00 10 20 30  01 11 21 31
     488             :     // b1: 40 50 60 70  41 51 61 71
     489             :     // b2: 04 14 24 34  05 15 25 35
     490             :     // b3: 44 54 64 74  45 55 65 75
     491             :     // b4: 02 12 22 32  03 13 23 33
     492             :     // b5: 42 52 62 72  43 53 63 73
     493             :     // b6: 06 16 26 36  07 17 27 37
     494             :     // b7: 46 56 66 76  47 57 67 77
     495    45647800 :     const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
     496    45647800 :     const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
     497    45647800 :     const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
     498    45647800 :     const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
     499    45647800 :     const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
     500    45647800 :     const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
     501    45647800 :     const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
     502    45647800 :     const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
     503             : 
     504             :     // Unpack 64 bit elements resulting in:
     505             :     // out[0]: 00 10 20 30  40 50 60 70
     506             :     // out[1]: 01 11 21 31  41 51 61 71
     507             :     // out[2]: 02 12 22 32  42 52 62 72
     508             :     // out[3]: 03 13 23 33  43 53 63 73
     509             :     // out[4]: 04 14 24 34  44 54 64 74
     510             :     // out[5]: 05 15 25 35  45 55 65 75
     511             :     // out[6]: 06 16 26 36  46 56 66 76
     512             :     // out[7]: 07 17 27 37  47 57 67 77
     513    45647800 :     out[0] = _mm_unpacklo_epi64(b0, b1);
     514    45647800 :     out[1] = _mm_unpackhi_epi64(b0, b1);
     515    45647800 :     out[2] = _mm_unpacklo_epi64(b4, b5);
     516    45647800 :     out[3] = _mm_unpackhi_epi64(b4, b5);
     517    45647800 :     out[4] = _mm_unpacklo_epi64(b2, b3);
     518    45647800 :     out[5] = _mm_unpackhi_epi64(b2, b3);
     519    45647800 :     out[6] = _mm_unpacklo_epi64(b6, b7);
     520    45647800 :     out[7] = _mm_unpackhi_epi64(b6, b7);
     521    45647800 : }
     522             : 
     523             : // Transpose in-place. Works on left[0..15] and right[0..15] as four 8-register quadrants: each quadrant is passed through transpose_16bit_8x8, and the off-diagonal quadrants right[0..7] and left[8..15] additionally trade places.
     524           0 : static INLINE void transpose_16bit_16x16(__m128i *const left,
     525             :     __m128i *const right) {
     526             :     __m128i tbuf[8];  // holds the transposed right[0..7] until left[8..15] has been consumed
     527           0 :     transpose_16bit_8x8(left, left);  // left[0..7] onto itself; NOTE(review): in == out relies on the helper reading all inputs before writing - confirm
     528           0 :     transpose_16bit_8x8(right, tbuf);  // right[0..7] -> tbuf (destined for left[8..15])
     529           0 :     transpose_16bit_8x8(left + 8, right);  // left[8..15] -> right[0..7]
     530           0 :     transpose_16bit_8x8(right + 8, right + 8);  // right[8..15] onto itself
     531             : // left[8..15] has been read above, so the buffered quadrant can now overwrite it.
     532           0 :     left[8] = tbuf[0];
     533           0 :     left[9] = tbuf[1];
     534           0 :     left[10] = tbuf[2];
     535           0 :     left[11] = tbuf[3];
     536           0 :     left[12] = tbuf[4];
     537           0 :     left[13] = tbuf[5];
     538           0 :     left[14] = tbuf[6];
     539           0 :     left[15] = tbuf[7];
     540           0 : }
     541             : // Transposes a 4x4 block of 32-bit elements: in[0..3] hold one row each, out[0..3] receive one column each. All four inputs are read before any output is written.
     542         120 : static INLINE void transpose_32bit_4x4(const __m128i *const in,
     543             :     __m128i *const out) {
     544             :     // Unpack 32 bit elements. Goes from:
     545             :     // in[0]: 00 01 02 03
     546             :     // in[1]: 10 11 12 13
     547             :     // in[2]: 20 21 22 23
     548             :     // in[3]: 30 31 32 33
     549             :     // to:
     550             :     // a0:    00 10 01 11
     551             :     // a1:    20 30 21 31
     552             :     // a2:    02 12 03 13
     553             :     // a3:    22 32 23 33
     554             :     // (each two-digit label is <row><column> of the source block)
     555         120 :     const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
     556         120 :     const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
     557         120 :     const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
     558         240 :     const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
     559             : 
     560             :     // Unpack 64 bit elements resulting in:
     561             :     // out[0]: 00 10 20 30
     562             :     // out[1]: 01 11 21 31
     563             :     // out[2]: 02 12 22 32
     564             :     // out[3]: 03 13 23 33
     565         120 :     out[0] = _mm_unpacklo_epi64(a0, a1);
     566         120 :     out[1] = _mm_unpackhi_epi64(a0, a1);
     567         120 :     out[2] = _mm_unpacklo_epi64(a2, a3);
     568         120 :     out[3] = _mm_unpackhi_epi64(a2, a3);
     569         120 : }
     570             : // Transposes two 4x4 blocks of 32-bit elements independently: in[0..3] -> out[0..3] and in[4..7] -> out[4..7]. All eight inputs are read before any output is written.
     571           0 : static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
     572             :     __m128i *const out) {
     573             :     // Unpack 32 bit elements. Goes from:
     574             :     // in[0]: 00 01 02 03
     575             :     // in[1]: 10 11 12 13
     576             :     // in[2]: 20 21 22 23
     577             :     // in[3]: 30 31 32 33
     578             :     // in[4]: 04 05 06 07
     579             :     // in[5]: 14 15 16 17
     580             :     // in[6]: 24 25 26 27
     581             :     // in[7]: 34 35 36 37
     582             :     // to:
     583             :     // a0:    00 10 01 11
     584             :     // a1:    20 30 21 31
     585             :     // a2:    02 12 03 13
     586             :     // a3:    22 32 23 33
     587             :     // a4:    04 14 05 15
     588             :     // a5:    24 34 25 35
     589             :     // a6:    06 16 07 17
     590             :     // a7:    26 36 27 37
     591           0 :     const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);  // a0..a3 draw only on the first block, in[0..3]
     592           0 :     const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
     593           0 :     const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
     594           0 :     const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
     595           0 :     const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);  // a4..a7 draw only on the second block, in[4..7]
     596           0 :     const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
     597           0 :     const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
     598           0 :     const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
     599             : 
     600             :     // Unpack 64 bit elements resulting in:
     601             :     // out[0]: 00 10 20 30
     602             :     // out[1]: 01 11 21 31
     603             :     // out[2]: 02 12 22 32
     604             :     // out[3]: 03 13 23 33
     605             :     // out[4]: 04 14 24 34
     606             :     // out[5]: 05 15 25 35
     607             :     // out[6]: 06 16 26 36
     608             :     // out[7]: 07 17 27 37
     609           0 :     out[0] = _mm_unpacklo_epi64(a0, a1);
     610           0 :     out[1] = _mm_unpackhi_epi64(a0, a1);
     611           0 :     out[2] = _mm_unpacklo_epi64(a2, a3);
     612           0 :     out[3] = _mm_unpackhi_epi64(a2, a3);
     613           0 :     out[4] = _mm_unpacklo_epi64(a4, a5);
     614           0 :     out[5] = _mm_unpackhi_epi64(a4, a5);
     615           0 :     out[6] = _mm_unpacklo_epi64(a6, a7);
     616           0 :     out[7] = _mm_unpackhi_epi64(a6, a7);
     617           0 : }
     618             : // Transposes a block of 4 rows by 8 columns of 32-bit elements: row r is the register pair in[2r] (columns 0-3) and in[2r+1] (columns 4-7); out[c] receives column c. All eight inputs are read before any output is written.
     619           0 : static INLINE void transpose_32bit_8x4(const __m128i *const in,
     620             :     __m128i *const out) {
     621             :     // Unpack 32 bit elements. Goes from:
     622             :     // in[0]: 00 01 02 03
     623             :     // in[1]: 04 05 06 07
     624             :     // in[2]: 10 11 12 13
     625             :     // in[3]: 14 15 16 17
     626             :     // in[4]: 20 21 22 23
     627             :     // in[5]: 24 25 26 27
     628             :     // in[6]: 30 31 32 33
     629             :     // in[7]: 34 35 36 37
     630             :     // to:
     631             :     // a0: 00 10 01 11
     632             :     // a1: 20 30 21 31
     633             :     // a2: 02 12 03 13
     634             :     // a3: 22 32 23 33
     635             :     // a4: 04 14 05 15
     636             :     // a5: 24 34 25 35
     637             :     // a6: 06 16 07 17
     638             :     // a7: 26 36 27 37
     639           0 :     const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);  // even-indexed inputs carry columns 0-3 of each row
     640           0 :     const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
     641           0 :     const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
     642           0 :     const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
     643           0 :     const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);  // odd-indexed inputs carry columns 4-7 of each row
     644           0 :     const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
     645           0 :     const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
     646           0 :     const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
     647             : 
     648             :     // Unpack 64 bit elements resulting in:
     649             :     // out[0]: 00 10 20 30
     650             :     // out[1]: 01 11 21 31
     651             :     // out[2]: 02 12 22 32
     652             :     // out[3]: 03 13 23 33
     653             :     // out[4]: 04 14 24 34
     654             :     // out[5]: 05 15 25 35
     655             :     // out[6]: 06 16 26 36
     656             :     // out[7]: 07 17 27 37
     657           0 :     out[0] = _mm_unpacklo_epi64(a0, a1);
     658           0 :     out[1] = _mm_unpackhi_epi64(a0, a1);
     659           0 :     out[2] = _mm_unpacklo_epi64(a2, a3);
     660           0 :     out[3] = _mm_unpackhi_epi64(a2, a3);
     661           0 :     out[4] = _mm_unpacklo_epi64(a4, a5);
     662           0 :     out[5] = _mm_unpackhi_epi64(a4, a5);
     663           0 :     out[6] = _mm_unpacklo_epi64(a6, a7);
     664           0 :     out[7] = _mm_unpackhi_epi64(a6, a7);
     665           0 : }
     666             : 
     667             : #endif  // AOM_DSP_X86_TRANSPOSE_SSE2_H_

Generated by: LCOV version 1.14