Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
13 : #define AOM_DSP_X86_TRANSPOSE_SSE2_H_
14 :
15 : #include <emmintrin.h> // SSE2
16 :
// #include "./aom_config.h"
18 :
#ifdef __cplusplus
extern "C" {
#endif

// Externally linked transpose entry points. Only the declarations are visible
// here; the definitions live elsewhere. Each takes an array of input registers
// and an array of output registers.
// NOTE(review): presumably out-of-line instances of the static INLINE helpers
// defined below -- confirm against the defining translation unit. Be aware
// that the inline transpose_8bit_4x4() and transpose_16bit_16x16() helpers do
// not share this (in, out) signature.

void transpose_8bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
                                                __m128i *const out);

void transpose_8bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
                                                __m128i *const out);

void transpose_8bit_16x8_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_8bit_16x16_reg128bit_instance_sse2(const __m128i *const in,
                                                  __m128i *const out);

void partial_transpose_8bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
                                                        __m128i *const out);

void transpose_16bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_16bit_4x8_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_16bit_8x4_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_16bit_8x8_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_16bit_16x16_reg128bit_instance_sse2(const __m128i *const in,
                                                   __m128i *const out);

void transpose_32bit_4x4_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

void transpose_32bit_4x4x2_reg128bit_instance_sse2(const __m128i *const in,
                                                   __m128i *const out);

void transpose_32bit_8x4_reg128bit_instance_sse2(const __m128i *const in,
                                                 __m128i *const out);

#ifdef __cplusplus
}
#endif
65 :
66 :
// Transposes a 4x4 block of 8-bit elements.
//
// in: 4 registers, one row each; only the low 4 bytes of every register
//     contribute to the result.
// Returns one register holding the transposed block, column after column
// (bytes 0-3 = column 0, bytes 4-7 = column 1, ...).
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to (low 8 bytes shown):
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);

  // Unpack 16 bit elements resulting in:
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
  return _mm_unpacklo_epi16(a0, a1);
}
83 :
// Transposes an 8x8 block of 8-bit elements.
//
// in:  8 registers, one row each; only the low 8 bytes of every register are
//      used.
// out: 8 registers; the low 8 bytes of out[c] hold column c of the input.
//      Even-indexed outputs keep the following column in their upper 8 bytes;
//      odd-indexed outputs have their upper 8 bytes shifted in as zero.
// All inputs are read before any output is written.
static INLINE void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // in[4]: 40 41 42 43 44 45 46 47
  // in[5]: 50 51 52 53 54 55 56 57
  // in[6]: 60 61 62 63 64 65 66 67
  // in[7]: 70 71 72 73 74 75 76 77
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);

  // Unpack 32 bit elements resulting in:
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);

  // Each c register carries two columns. The even output is the register
  // itself; the odd output is the same register shifted right by 8 bytes so
  // that the second column lands in the low half:
  // out[0]: 00 10 20 30 40 50 60 70
  // out[1]: 01 11 21 31 41 51 61 71
  // out[2]: 02 12 22 32 42 52 62 72
  // out[3]: 03 13 23 33 43 53 63 73
  // out[4]: 04 14 24 34 44 54 64 74
  // out[5]: 05 15 25 35 45 55 65 75
  // out[6]: 06 16 26 36 46 56 66 76
  // out[7]: 07 17 27 37 47 57 67 77
  out[0] = c0;
  out[1] = _mm_srli_si128(c0, 8);
  out[2] = c1;
  out[3] = _mm_srli_si128(c1, 8);
  out[4] = c2;
  out[5] = _mm_srli_si128(c2, 8);
  out[6] = c3;
  out[7] = _mm_srli_si128(c3, 8);
}
143 :
// Transposes an 8x8 block of 8-bit elements but leaves the result packed two
// columns per register (the final split into eight registers is skipped).
//
// in:  8 registers, one row each; only the low 8 bytes of every register are
//      used.
// out: 4 registers; out[k] holds column 2k in its low 8 bytes and column
//      2k + 1 in its high 8 bytes.
static INLINE void partial_transpose_8bit_8x8(const __m128i *const in,
                                              __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // in[4]: 40 41 42 43 44 45 46 47
  // in[5]: 50 51 52 53 54 55 56 57
  // in[6]: 60 61 62 63 64 65 66 67
  // in[7]: 70 71 72 73 74 75 76 77
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);

  // Unpack 32 bit elements resulting in:
  // out[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // out[1]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // out[2]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // out[3]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  out[0] = _mm_unpacklo_epi32(b0, b2);
  out[1] = _mm_unpackhi_epi32(b0, b2);
  out[2] = _mm_unpacklo_epi32(b1, b3);
  out[3] = _mm_unpackhi_epi32(b1, b3);
}
185 :
// Transposes a block of 8 rows by 16 columns of 8-bit elements, leaving the
// result packed two columns per register.
//
// in:  8 registers, one full 16-byte row each.
// out: 8 registers; out[k] holds column 2k in its low 8 bytes and column
//      2k + 1 in its high 8 bytes.
static INLINE void transpose_8bit_16x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07  08 09 0A 0B 0C 0D 0E 0F
  // in[1]: 10 11 12 13 14 15 16 17  18 19 1A 1B 1C 1D 1E 1F
  // in[2]: 20 21 22 23 24 25 26 27  28 29 2A 2B 2C 2D 2E 2F
  // in[3]: 30 31 32 33 34 35 36 37  38 39 3A 3B 3C 3D 3E 3F
  // in[4]: 40 41 42 43 44 45 46 47  48 49 4A 4B 4C 4D 4E 4F
  // in[5]: 50 51 52 53 54 55 56 57  58 59 5A 5B 5C 5D 5E 5F
  // in[6]: 60 61 62 63 64 65 66 67  68 69 6A 6B 6C 6D 6E 6F
  // in[7]: 70 71 72 73 74 75 76 77  78 79 7A 7B 7C 7D 7E 7F
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  // a4:    08 18 09 19 0A 1A 0B 1B  0C 1C 0D 1D 0E 1E 0F 1F
  // a5:    28 38 29 39 2A 3A 2B 3B  2C 3C 2D 3D 2E 3E 2F 3F
  // a6:    48 58 49 59 4A 5A 4B 5B  4C 5C 4D 5D 4E 5E 4F 5F
  // a7:    68 78 69 79 6A 7A 6B 7B  6C 7C 6D 7D 6E 7E 6F 7F
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi8(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi8(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi8(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  // b4: 08 18 28 38 09 19 29 39  0A 1A 2A 3A 0B 1B 2B 3B
  // b5: 0C 1C 2C 3C 0D 1D 2D 3D  0E 1E 2E 3E 0F 1F 2F 3F
  // b6: 48 58 68 78 49 59 69 79  4A 5A 6A 7A 4B 5B 6B 7B
  // b7: 4C 5C 6C 7C 4D 5D 6D 7D  4E 5E 6E 7E 4F 5F 6F 7F
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi16(a4, a5);
  const __m128i b5 = _mm_unpackhi_epi16(a4, a5);
  const __m128i b6 = _mm_unpacklo_epi16(a6, a7);
  const __m128i b7 = _mm_unpackhi_epi16(a6, a7);

  // Unpack 32 bit elements resulting in:
  // out[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // out[1]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // out[2]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // out[3]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  // out[4]: 08 18 28 38 48 58 68 78  09 19 29 39 49 59 69 79
  // out[5]: 0A 1A 2A 3A 4A 5A 6A 7A  0B 1B 2B 3B 4B 5B 6B 7B
  // out[6]: 0C 1C 2C 3C 4C 5C 6C 7C  0D 1D 2D 3D 4D 5D 6D 7D
  // out[7]: 0E 1E 2E 3E 4E 5E 6E 7E  0F 1F 2F 3F 4F 5F 6F 7F
  out[0] = _mm_unpacklo_epi32(b0, b2);
  out[1] = _mm_unpackhi_epi32(b0, b2);
  out[2] = _mm_unpacklo_epi32(b1, b3);
  out[3] = _mm_unpackhi_epi32(b1, b3);
  out[4] = _mm_unpacklo_epi32(b4, b6);
  out[5] = _mm_unpackhi_epi32(b4, b6);
  out[6] = _mm_unpacklo_epi32(b5, b7);
  out[7] = _mm_unpackhi_epi32(b5, b7);
}
251 :
// Transposes a 16x16 block of 8-bit elements.
//
// in:  16 registers, one full 16-byte row each.
// out: 16 registers; out[c] holds column c of the input (bytes 0..15 come
//      from rows 0..15).
// The input rows are re-read for the second half after out[0..7] have been
// written, so `out` must not overlap `in`.
static INLINE void transpose_8bit_16x16_sse2(const __m128i *const in,
                                             __m128i *const out) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m128i w10, w11, w12, w13, w14, w15;

  // ---- Columns 0..7: start from the low 8 bytes of every row. ----
  // w0..w3 interleave rows 0..7 pairwise, w8..w11 interleave rows 8..15.
  w0 = _mm_unpacklo_epi8(in[0], in[1]);
  w1 = _mm_unpacklo_epi8(in[2], in[3]);
  w2 = _mm_unpacklo_epi8(in[4], in[5]);
  w3 = _mm_unpacklo_epi8(in[6], in[7]);

  w8 = _mm_unpacklo_epi8(in[8], in[9]);
  w9 = _mm_unpacklo_epi8(in[10], in[11]);
  w10 = _mm_unpacklo_epi8(in[12], in[13]);
  w11 = _mm_unpacklo_epi8(in[14], in[15]);

  // Low 16-bit unpack selects columns 0..3.
  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Columns 0..3: rows 0..7 (w6/w7) joined with rows 8..15 (w14/w15).
  out[0] = _mm_unpacklo_epi64(w6, w14);
  out[1] = _mm_unpackhi_epi64(w6, w14);
  out[2] = _mm_unpacklo_epi64(w7, w15);
  out[3] = _mm_unpackhi_epi64(w7, w15);

  // High 16-bit unpack selects columns 4..7.
  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Columns 4..7.
  out[4] = _mm_unpacklo_epi64(w6, w14);
  out[5] = _mm_unpackhi_epi64(w6, w14);
  out[6] = _mm_unpacklo_epi64(w7, w15);
  out[7] = _mm_unpackhi_epi64(w7, w15);

  // ---- Columns 8..15: same sequence on the high 8 bytes of every row. ----
  w0 = _mm_unpackhi_epi8(in[0], in[1]);
  w1 = _mm_unpackhi_epi8(in[2], in[3]);
  w2 = _mm_unpackhi_epi8(in[4], in[5]);
  w3 = _mm_unpackhi_epi8(in[6], in[7]);

  w8 = _mm_unpackhi_epi8(in[8], in[9]);
  w9 = _mm_unpackhi_epi8(in[10], in[11]);
  w10 = _mm_unpackhi_epi8(in[12], in[13]);
  w11 = _mm_unpackhi_epi8(in[14], in[15]);

  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Columns 8..11.
  out[8] = _mm_unpacklo_epi64(w6, w14);
  out[9] = _mm_unpackhi_epi64(w6, w14);
  out[10] = _mm_unpacklo_epi64(w7, w15);
  out[11] = _mm_unpackhi_epi64(w7, w15);

  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Columns 12..15.
  out[12] = _mm_unpacklo_epi64(w6, w14);
  out[13] = _mm_unpackhi_epi64(w6, w14);
  out[14] = _mm_unpacklo_epi64(w7, w15);
  out[15] = _mm_unpackhi_epi64(w7, w15);
}
342 :
// Transposes a 4x4 block of 16-bit elements.
//
// in:  4 registers, one row each; only the low four 16-bit lanes are used.
// out: 4 registers; the low four lanes of out[c] hold column c. The upper
//      four lanes are a by-product: the next column for out[0] and out[2],
//      zero for out[1] and out[3].
static INLINE void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);

  // Unpack 32 bit elements, then shift the second column of each pair down
  // by 8 bytes, resulting in:
  // out[0]: 00 10 20 30  01 11 21 31
  // out[1]: 01 11 21 31  00 00 00 00 (zeros shifted in)
  // out[2]: 02 12 22 32  03 13 23 33
  // out[3]: 03 13 23 33  00 00 00 00 (zeros shifted in)
  out[0] = _mm_unpacklo_epi32(a0, a1);
  out[1] = _mm_srli_si128(out[0], 8);
  out[2] = _mm_unpackhi_epi32(a0, a1);
  out[3] = _mm_srli_si128(out[2], 8);
}
366 :
// Transposes a block of 8 rows by 4 columns of 16-bit elements.
//
// in:  8 registers, one row each; only the low four 16-bit lanes are used.
// out: 4 registers; out[c] holds column c (lanes 0..7 come from rows 0..7).
static INLINE void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // in[4]: 40 41 42 43  XX XX XX XX
  // in[5]: 50 51 52 53  XX XX XX XX
  // in[6]: 60 61 62 63  XX XX XX XX
  // in[7]: 70 71 72 73  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 02 12 22 32  03 13 23 33
  // b3: 42 52 62 72  43 53 63 73
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b2, b3);
  out[3] = _mm_unpackhi_epi64(b2, b3);
}
408 :
// Transposes a block of 4 rows by 8 columns of 16-bit elements.
//
// in:  4 registers, one full 8-lane row each.
// out: 8 registers; the low four lanes of out[c] hold column c and the upper
//      four lanes are set to zero.
static INLINE void transpose_16bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b2: 04 14 24 34  05 15 25 35
  // b4: 02 12 22 32  03 13 23 33
  // b6: 06 16 26 36  07 17 27 37
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);

  // Unpack 64 bit elements against a zero register, so each column ends up
  // alone in the low half with a zeroed high half:
  // out[0]: 00 10 20 30  00 00 00 00
  // out[1]: 01 11 21 31  00 00 00 00
  // out[2]: 02 12 22 32  00 00 00 00
  // out[3]: 03 13 23 33  00 00 00 00
  // out[4]: 04 14 24 34  00 00 00 00
  // out[5]: 05 15 25 35  00 00 00 00
  // out[6]: 06 16 26 36  00 00 00 00
  // out[7]: 07 17 27 37  00 00 00 00
  const __m128i zeros = _mm_setzero_si128();
  out[0] = _mm_unpacklo_epi64(b0, zeros);
  out[1] = _mm_unpackhi_epi64(b0, zeros);
  out[2] = _mm_unpacklo_epi64(b4, zeros);
  out[3] = _mm_unpackhi_epi64(b4, zeros);
  out[4] = _mm_unpacklo_epi64(b2, zeros);
  out[5] = _mm_unpackhi_epi64(b2, zeros);
  out[6] = _mm_unpacklo_epi64(b6, zeros);
  out[7] = _mm_unpackhi_epi64(b6, zeros);
}
456 :
// Transposes an 8x8 block of 16-bit elements.
//
// in:  8 registers, one full 8-lane row each.
// out: 8 registers; out[c] holds column c (lanes 0..7 come from rows 0..7).
// Every input register is read before the first output is written.
static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // in[4]: 40 41 42 43  44 45 46 47
  // in[5]: 50 51 52 53  54 55 56 57
  // in[6]: 60 61 62 63  64 65 66 67
  // in[7]: 70 71 72 73  74 75 76 77
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  // a6:    44 54 45 55  46 56 47 57
  // a7:    64 74 65 75  66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 04 14 24 34  05 15 25 35
  // b3: 44 54 64 74  45 55 65 75
  // b4: 02 12 22 32  03 13 23 33
  // b5: 42 52 62 72  43 53 63 73
  // b6: 06 16 26 36  07 17 27 37
  // b7: 46 56 66 76  47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  // out[4]: 04 14 24 34  44 54 64 74
  // out[5]: 05 15 25 35  45 55 65 75
  // out[6]: 06 16 26 36  46 56 66 76
  // out[7]: 07 17 27 37  47 57 67 77
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b4, b5);
  out[3] = _mm_unpackhi_epi64(b4, b5);
  out[4] = _mm_unpacklo_epi64(b2, b3);
  out[5] = _mm_unpackhi_epi64(b2, b3);
  out[6] = _mm_unpacklo_epi64(b6, b7);
  out[7] = _mm_unpackhi_epi64(b6, b7);
}
522 :
// Transposes a 16x16 block of 16-bit elements in place.
//
// The block is passed as two arrays of 16 registers each: left[r] holds
// columns 0..7 of row r and right[r] holds columns 8..15 of row r. On return
// the same layout describes the transposed block.
//
// The work is done as four 8x8 transposes: the two diagonal quadrants
// (left[0..7] and right[8..15]) are transposed where they are, while the two
// off-diagonal quadrants (right[0..7] and left[8..15]) are transposed and
// swapped, using tbuf to hold one of them during the exchange.
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  __m128i tbuf[8];
  transpose_16bit_8x8(left, left);            // top-left, in place
  transpose_16bit_8x8(right, tbuf);           // top-right -> scratch
  transpose_16bit_8x8(left + 8, right);       // bottom-left -> top-right slot
  transpose_16bit_8x8(right + 8, right + 8);  // bottom-right, in place

  // Drop the transposed top-right quadrant into the bottom-left slot.
  for (int i = 0; i < 8; ++i) {
    left[8 + i] = tbuf[i];
  }
}
541 :
// Transposes a 4x4 block of 32-bit elements.
//
// in:  4 registers, one row each.
// out: 4 registers; out[c] holds column c (lanes 0..3 come from rows 0..3).
// Every input register is read before the first output is written.
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
}
570 :
// Transposes two 4x4 blocks of 32-bit elements that together form a block of
// 4 rows by 8 columns.
//
// in:  8 registers; in[0..3] are rows 0..3 of columns 0..3 and in[4..7] are
//      rows 0..3 of columns 4..7 (block-major layout).
// out: 8 registers; out[c] holds column c (lanes 0..3 come from rows 0..3).
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // in[4]: 04 05 06 07
  // in[5]: 14 15 16 17
  // in[6]: 24 25 26 27
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
618 :
// Transposes a block of 4 rows by 8 columns of 32-bit elements stored
// row-major, two registers per row.
//
// in:  8 registers; in[2 * r] holds columns 0..3 of row r and in[2 * r + 1]
//      holds columns 4..7 of row r.
// out: 8 registers; out[c] holds column c (lanes 0..3 come from rows 0..3).
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 04 05 06 07
  // in[2]: 10 11 12 13
  // in[3]: 14 15 16 17
  // in[4]: 20 21 22 23
  // in[5]: 24 25 26 27
  // in[6]: 30 31 32 33
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
666 :
667 : #endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_
|