Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
13 : #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
14 :
15 : #include <emmintrin.h> // SSE2
16 :
// Transposes a 6x6 block of 16-bit pixels.
//
// Inputs:  *x0..*x5 hold rows 0..5; only 16-bit lanes 0-5 of each row
//          contribute to lanes 0-5 of the outputs.
// Outputs: lanes 0-5 of *dN hold column N, i.e. lane r of *dN is the element
//          at (row r, column N).  Lanes 6-7 of every output are left-overs of
//          the shuffles and carry no defined meaning.
//
// *x0..*x5 are read a second time after *d0..*d2 have been stored, so those
// outputs must not alias any of the inputs.
static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  __m128i w0, w1, w2, w3, w4, w5, ww0;

  // 00 01 02 03 04 05 xx xx
  // 10 11 12 13 14 15 xx xx
  // 20 21 22 23 24 25 xx xx
  // 30 31 32 33 34 35 xx xx
  // 40 41 42 43 44 45 xx xx
  // 50 51 52 53 54 55 xx xx

  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53

  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
  // unpackhi_epi64 takes the UPPER 64 bits of its second operand, so the
  // "41 51" pair (bytes 4-7 of w2) has to be moved up by 4 bytes, i.e. a LEFT
  // byte shift.  A right shift would place "43 53 00 00" there instead.
  *d1 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx

  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  *d2 = _mm_unpacklo_epi64(ww0,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx

  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx

  // Here the wanted "43 53" pair sits in bytes 12-15 of w2; shifting right by
  // 4 moves it to bytes 8-11, the start of the upper 64 bits.
  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53

  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
  *d5 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
56 :
// Transposes columns 0-3 of four rows of 16-bit pixels.
//
// *x0..*x3 are rows 0..3 (only their low four lanes are used).  On return the
// low four lanes of *dN hold column N (rows 0..3 in order) and the high four
// lanes are zero.  All inputs are read before any output is written.
static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  // Pair up vertically adjacent pixels.
  const __m128i rows01 =
      _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i rows23 =
      _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33

  // Each 64-bit half now holds one complete 4-pixel column.
  const __m128i cols01 =
      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
  const __m128i cols23 =
      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33

  // Split the halves apart, padding the upper 64 bits with zeros.
  const __m128i zeros = _mm_setzero_si128();
  *d0 = _mm_unpacklo_epi64(cols01, zeros);  // 00 10 20 30 00 00 00 00
  *d1 = _mm_unpackhi_epi64(cols01, zeros);  // 01 11 21 31 00 00 00 00
  *d2 = _mm_unpacklo_epi64(cols23, zeros);  // 02 12 22 32 00 00 00 00
  *d3 = _mm_unpackhi_epi64(cols23, zeros);  // 03 13 23 33 00 00 00 00
}
75 :
// Transposes columns 4-7 of four rows of 16-bit pixels.
//
// *x0..*x3 are rows 0..3 (only their high four lanes are used).  On return
// the low four lanes of *d4..*d7 hold columns 4..7 (rows 0..3 in order) and
// the high four lanes are zero.  All inputs are read before any output is
// written.
static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  // Pair up vertically adjacent pixels from the right half of each row.
  const __m128i rows01 =
      _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i rows23 =
      _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37

  // Each 64-bit half now holds one complete 4-pixel column.
  const __m128i cols45 =
      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
  const __m128i cols67 =
      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37

  // Split the halves apart, padding the upper 64 bits with zeros.
  const __m128i zeros = _mm_setzero_si128();
  *d4 = _mm_unpacklo_epi64(cols45, zeros);  // 04 14 24 34 00 00 00 00
  *d5 = _mm_unpackhi_epi64(cols45, zeros);  // 05 15 25 35 00 00 00 00
  *d6 = _mm_unpacklo_epi64(cols67, zeros);  // 06 16 26 36 00 00 00 00
  *d7 = _mm_unpackhi_epi64(cols67, zeros);  // 07 17 27 37 00 00 00 00
}
94 :
// Transposes a 4-row x 8-column block of 16-bit pixels into eight 4-pixel
// columns.
//
//   input                          output (low four lanes; rest are zero)
//   x0  00 01 02 03 04 05 06 07    d0  00 10 20 30
//   x1  10 11 12 13 14 15 16 17    d1  01 11 21 31
//   x2  20 21 22 23 24 25 26 27    d2  02 12 22 32
//   x3  30 31 32 33 34 35 36 37    d3  03 13 23 33
//                                  d4  04 14 24 34
//                                  d5  05 15 25 35
//                                  d6  06 16 26 36
//                                  d7  07 17 27 37
//
// The input (x) and output (d) pointers must refer to different objects: the
// rows are not cached locally, and they are read again for columns 4-7 after
// d0..d3 have already been stored.
static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // Columns 0-3 first ...
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  // ... then columns 4-7, re-reading the same four rows.
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
120 :
// Produces columns 0-3 of the transpose of an 8x8 block of 16-bit pixels.
//
// *x0..*x7 are rows 0..7 (only their low four lanes are used).  On return
// *dN (N = 0..3) holds column N: lane r is the element at (row r, column N).
// All inputs are read before any output is written.
static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  // Stage 1: interleave vertically adjacent pixels of each row pair.
  const __m128i r01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i r23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  const __m128i r45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  const __m128i r67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73

  // Stage 2: gather four-row column fragments (top = rows 0-3, bot = 4-7).
  const __m128i top_c01 =
      _mm_unpacklo_epi32(r01, r23);  // 00 10 20 30 01 11 21 31
  const __m128i bot_c01 =
      _mm_unpacklo_epi32(r45, r67);  // 40 50 60 70 41 51 61 71
  const __m128i top_c23 =
      _mm_unpackhi_epi32(r01, r23);  // 02 12 22 32 03 13 23 33
  const __m128i bot_c23 =
      _mm_unpackhi_epi32(r45, r67);  // 42 52 62 72 43 53 63 73

  // Stage 3: join the top and bottom fragments into full columns.
  *d0 = _mm_unpacklo_epi64(top_c01, bot_c01);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(top_c01, bot_c01);  // 01 11 21 31 41 51 61 71
  *d2 = _mm_unpacklo_epi64(top_c23, bot_c23);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(top_c23, bot_c23);  // 03 13 23 33 43 53 63 73
}
154 :
// Produces columns 4-7 of the transpose of an 8x8 block of 16-bit pixels.
//
// *x0..*x7 are rows 0..7 (only their high four lanes are used).  On return
// *dN (N = 4..7) holds column N: lane r is the element at (row r, column N).
// All inputs are read before any output is written.
static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  // Stage 1: interleave vertically adjacent pixels of each row pair.
  const __m128i r01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i r23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  const __m128i r45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  const __m128i r67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77

  // Stage 2: gather four-row column fragments (top = rows 0-3, bot = 4-7).
  const __m128i top_c45 =
      _mm_unpacklo_epi32(r01, r23);  // 04 14 24 34 05 15 25 35
  const __m128i bot_c45 =
      _mm_unpacklo_epi32(r45, r67);  // 44 54 64 74 45 55 65 75
  const __m128i top_c67 =
      _mm_unpackhi_epi32(r01, r23);  // 06 16 26 36 07 17 27 37
  const __m128i bot_c67 =
      _mm_unpackhi_epi32(r45, r67);  // 46 56 66 76 47 57 67 77

  // Stage 3: join the top and bottom fragments into full columns.
  *d4 = _mm_unpacklo_epi64(top_c45, bot_c45);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(top_c45, bot_c45);  // 05 15 25 35 45 55 65 75
  *d6 = _mm_unpacklo_epi64(top_c67, bot_c67);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(top_c67, bot_c67);  // 07 17 27 37 47 57 67 77
}
187 :
// Transposes an 8x8 block of 16-bit pixels: *x0..*x7 are the rows and
// *d0..*d7 receive the columns.
//
// The input (x) and output (d) pointers must refer to different objects: the
// rows are not cached locally, and they are read again for columns 4-7 after
// d0..d3 have already been stored.
static INLINE void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // Columns 0-3 first ...
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7,
                               d0, d1, d2, d3);
  // ... then columns 4-7, re-reading the same eight rows.
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7,
                                d4, d5, d6, d7);
}
198 :
// Runs two independent 8x8 transposes of 16-bit pixels.
//
// Every argument is treated as a two-element array of __m128i: element [0] of
// x0..x7 / d0..d7 forms the first 8x8 tile and element [1] forms the second.
// Tile k of the output is the transpose of tile k of the input.
//
// The input (x) and output (d) arrays must not overlap, because the
// underlying 8x8 transpose re-reads its inputs after it has started writing
// outputs.
static INLINE void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // First tile: element [0] of every array.
  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7,
                           d0, d1, d2, d3, d4, d5, d6, d7);
  // Second tile: element [1] of every array.
  highbd_transpose8x8_sse2(&x0[1], &x1[1], &x2[1], &x3[1],
                           &x4[1], &x5[1], &x6[1], &x7[1],
                           &d0[1], &d1[1], &d2[1], &d3[1],
                           &d4[1], &d5[1], &d6[1], &d7[1]);
}
212 :
213 : // Low bit depth functions
// Transposes columns 0-3 of four rows of 8-bit pixels.
//
//   input (bytes 0-7 of each register are used)
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
//
//   output (first four bytes of each register)
//   d0  00 10 20 30
//   d1  01 11 21 31
//   d2  02 12 22 32
//   d3  03 13 23 33
//
// *d0 actually holds all four columns back to back; *d1..*d3 are *d0 shifted
// right by 4, 8 and 12 bytes, so bytes past the first four contain the
// following columns and then zeros.
static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // Interleave bytes of adjacent rows:
  //   rows01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   rows23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  *d0 = _mm_unpacklo_epi16(rows01, rows23);

  // Slide the next column down to byte 0 for each remaining output.
  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
}
246 :
// Transposes a 4-row x 8-column block of 8-bit pixels into eight 4-pixel
// columns.
//
//   input (bytes 0-7 of each register are used)
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
//
//   output (first four bytes of each register)
//   d0  00 10 20 30      d4  04 14 24 34
//   d1  01 11 21 31      d5  05 15 25 35
//   d2  02 12 22 32      d6  06 16 26 36
//   d3  03 13 23 33      d7  07 17 27 37
//
// Bytes past the first four of each output hold whatever follows in the
// packed column vector (further columns, then zeros shifted in).
// All inputs are read before any output is written.
static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // Interleave bytes of adjacent rows:
  //   rows01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   rows23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // Four complete columns packed into each vector:
  //   cols0123 = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   cols4567 = 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i cols0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i cols4567 = _mm_unpackhi_epi16(rows01, rows23);

  // Peel the columns off one at a time with byte shifts.
  *d0 = cols0123;                      // 00 10 20 30 ...
  *d1 = _mm_srli_si128(cols0123, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(cols0123, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(cols0123, 12);  // 03 13 23 33 ...

  *d4 = cols4567;                      // 04 14 24 34 ...
  *d5 = _mm_srli_si128(cols4567, 4);   // 05 15 25 35 ...
  *d6 = _mm_srli_si128(cols4567, 8);   // 06 16 26 36 ...
  *d7 = _mm_srli_si128(cols4567, 12);  // 07 17 27 37 ...
}
295 :
// Produces columns 0-3 of the transpose of an 8x8 block of 8-bit pixels.
//
//   input (bytes 0-7 of each register are used)
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
//   x4  40 41 42 43 44 45 46 47
//   x5  50 51 52 53 54 55 56 57
//   x6  60 61 62 63 64 65 66 67
//   x7  70 71 72 73 74 75 76 77
//
//   output (first eight bytes of each register)
//   d0  00 10 20 30 40 50 60 70
//   d1  01 11 21 31 41 51 61 71
//   d2  02 12 22 32 42 52 62 72
//   d3  03 13 23 33 43 53 63 73
//
// The upper eight bytes of *d0 and *d2 additionally carry columns 1 and 3;
// the upper eight bytes of *d1 and *d3 are zero (shifted in).
static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *x4, __m128i *x5,
                                         __m128i *x6, __m128i *x7, __m128i *d0,
                                         __m128i *d1, __m128i *d2,
                                         __m128i *d3) {
  // Stage 1: interleave bytes of adjacent rows.
  //   r01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   r23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  //   r45 = 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  //   r67 = 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);

  // Stage 2: four-row fragments of columns 0-3.
  //   top = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   bot = 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i top = _mm_unpacklo_epi16(r01, r23);
  const __m128i bot = _mm_unpacklo_epi16(r45, r67);

  // Stage 3: full eight-row columns, two per vector; the odd column is then
  // shifted down into its own output.
  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  *d0 = _mm_unpacklo_epi32(top, bot);
  *d1 = _mm_srli_si128(*d0, 8);
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  *d2 = _mm_unpackhi_epi32(top, bot);
  *d3 = _mm_srli_si128(*d2, 8);
}
342 :
// Transposes an 8x8 block of 8-bit pixels, packing two output columns into
// each result register.
//
// *x0..*x7 are rows 0..7 (bytes 0-7 of each register are used).  On return:
//   *d0d1 = column 0 (bytes 0-7) followed by column 1 (bytes 8-15)
//   *d2d3 = column 2 followed by column 3
//   *d4d5 = column 4 followed by column 5
//   *d6d7 = column 6 followed by column 7
// All inputs are read before any output is written.
static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                     __m128i *x3, __m128i *x4, __m128i *x5,
                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
                                     __m128i *d2d3, __m128i *d4d5,
                                     __m128i *d6d7) {
  // Stage 1: interleave bytes of adjacent rows.
  //   r01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   r23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  //   r45 = 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  //   r67 = 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);

  // Stage 2a: four-row fragments of columns 0-3.
  //   top_lo = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   bot_lo = 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i top_lo = _mm_unpacklo_epi16(r01, r23);
  const __m128i bot_lo = _mm_unpacklo_epi16(r45, r67);

  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  *d0d1 = _mm_unpacklo_epi32(top_lo, bot_lo);
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  *d2d3 = _mm_unpackhi_epi32(top_lo, bot_lo);

  // Stage 2b: four-row fragments of columns 4-7.
  //   top_hi = 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  //   bot_hi = 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const __m128i top_hi = _mm_unpackhi_epi16(r01, r23);
  const __m128i bot_hi = _mm_unpackhi_epi16(r45, r67);

  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  *d4d5 = _mm_unpacklo_epi32(top_hi, bot_hi);
  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  *d6d7 = _mm_unpackhi_epi32(top_hi, bot_hi);
}
389 :
// Transposes a 16x16 block of 8-bit pixels.
//
// x[0..15] are the sixteen input rows; d[0..15] receive the sixteen columns,
// i.e. byte r of d[c] is byte c of x[r].
//
// The work is done in two passes (columns 0-7, then columns 8-15).  x[] is
// read again in the second pass after d[0..7] have been stored, so x and d
// must not overlap.
static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
  __m128i p01, p23, p45, p67, p89, p1011, p1213, p1415;  // byte-interleaved
  __m128i q0_3, q4_7, q8_11, q12_15;  // 4-row column fragments
  __m128i top_a, top_b, bot_a, bot_b;  // 8-row column fragments

  // ---- Pass 1: columns 0-7 (low eight bytes of every row) ----
  p01 = _mm_unpacklo_epi8(x[0], x[1]);
  p23 = _mm_unpacklo_epi8(x[2], x[3]);
  p45 = _mm_unpacklo_epi8(x[4], x[5]);
  p67 = _mm_unpacklo_epi8(x[6], x[7]);
  p89 = _mm_unpacklo_epi8(x[8], x[9]);
  p1011 = _mm_unpacklo_epi8(x[10], x[11]);
  p1213 = _mm_unpacklo_epi8(x[12], x[13]);
  p1415 = _mm_unpacklo_epi8(x[14], x[15]);

  // Columns 0-3.
  q0_3 = _mm_unpacklo_epi16(p01, p23);
  q4_7 = _mm_unpacklo_epi16(p45, p67);
  q8_11 = _mm_unpacklo_epi16(p89, p1011);
  q12_15 = _mm_unpacklo_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  d[0] = _mm_unpacklo_epi64(top_a, bot_a);
  d[1] = _mm_unpackhi_epi64(top_a, bot_a);
  d[2] = _mm_unpacklo_epi64(top_b, bot_b);
  d[3] = _mm_unpackhi_epi64(top_b, bot_b);

  // Columns 4-7.
  q0_3 = _mm_unpackhi_epi16(p01, p23);
  q4_7 = _mm_unpackhi_epi16(p45, p67);
  q8_11 = _mm_unpackhi_epi16(p89, p1011);
  q12_15 = _mm_unpackhi_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  d[4] = _mm_unpacklo_epi64(top_a, bot_a);
  d[5] = _mm_unpackhi_epi64(top_a, bot_a);
  d[6] = _mm_unpacklo_epi64(top_b, bot_b);
  d[7] = _mm_unpackhi_epi64(top_b, bot_b);

  // ---- Pass 2: columns 8-15 (high eight bytes of every row) ----
  p01 = _mm_unpackhi_epi8(x[0], x[1]);
  p23 = _mm_unpackhi_epi8(x[2], x[3]);
  p45 = _mm_unpackhi_epi8(x[4], x[5]);
  p67 = _mm_unpackhi_epi8(x[6], x[7]);
  p89 = _mm_unpackhi_epi8(x[8], x[9]);
  p1011 = _mm_unpackhi_epi8(x[10], x[11]);
  p1213 = _mm_unpackhi_epi8(x[12], x[13]);
  p1415 = _mm_unpackhi_epi8(x[14], x[15]);

  // Columns 8-11.
  q0_3 = _mm_unpacklo_epi16(p01, p23);
  q4_7 = _mm_unpacklo_epi16(p45, p67);
  q8_11 = _mm_unpacklo_epi16(p89, p1011);
  q12_15 = _mm_unpacklo_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  d[8] = _mm_unpacklo_epi64(top_a, bot_a);
  d[9] = _mm_unpackhi_epi64(top_a, bot_a);
  d[10] = _mm_unpacklo_epi64(top_b, bot_b);
  d[11] = _mm_unpackhi_epi64(top_b, bot_b);

  // Columns 12-15.
  q0_3 = _mm_unpackhi_epi16(p01, p23);
  q4_7 = _mm_unpackhi_epi16(p45, p67);
  q8_11 = _mm_unpackhi_epi16(p89, p1011);
  q12_15 = _mm_unpackhi_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  d[12] = _mm_unpacklo_epi64(top_a, bot_a);
  d[13] = _mm_unpackhi_epi64(top_a, bot_a);
  d[14] = _mm_unpacklo_epi64(top_b, bot_b);
  d[15] = _mm_unpackhi_epi64(top_b, bot_b);
}
479 :
// Transposes a 16-row x 8-column block of 8-bit pixels into eight 16-pixel
// columns.
//
// *x0..*x15 are rows 0..15 (bytes 0-7 of each register are used).  On return
// *dN (N = 0..7) holds column N: byte r of *dN is byte N of row r.
// All inputs are read before any output is written.
static INLINE void transpose16x8_8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  __m128i q0_3, q4_7, q8_11, q12_15;  // 4-row column fragments
  __m128i top_a, top_b, bot_a, bot_b;  // 8-row column fragments

  // Interleave bytes of adjacent rows.
  const __m128i p01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i p23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i p45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i p67 = _mm_unpacklo_epi8(*x6, *x7);
  const __m128i p89 = _mm_unpacklo_epi8(*x8, *x9);
  const __m128i p1011 = _mm_unpacklo_epi8(*x10, *x11);
  const __m128i p1213 = _mm_unpacklo_epi8(*x12, *x13);
  const __m128i p1415 = _mm_unpacklo_epi8(*x14, *x15);

  // Columns 0-3.
  q0_3 = _mm_unpacklo_epi16(p01, p23);
  q4_7 = _mm_unpacklo_epi16(p45, p67);
  q8_11 = _mm_unpacklo_epi16(p89, p1011);
  q12_15 = _mm_unpacklo_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  *d0 = _mm_unpacklo_epi64(top_a, bot_a);
  *d1 = _mm_unpackhi_epi64(top_a, bot_a);
  *d2 = _mm_unpacklo_epi64(top_b, bot_b);
  *d3 = _mm_unpackhi_epi64(top_b, bot_b);

  // Columns 4-7.
  q0_3 = _mm_unpackhi_epi16(p01, p23);
  q4_7 = _mm_unpackhi_epi16(p45, p67);
  q8_11 = _mm_unpackhi_epi16(p89, p1011);
  q12_15 = _mm_unpackhi_epi16(p1213, p1415);

  top_a = _mm_unpacklo_epi32(q0_3, q4_7);
  top_b = _mm_unpackhi_epi32(q0_3, q4_7);
  bot_a = _mm_unpacklo_epi32(q8_11, q12_15);
  bot_b = _mm_unpackhi_epi32(q8_11, q12_15);

  *d4 = _mm_unpacklo_epi64(top_a, bot_a);
  *d5 = _mm_unpackhi_epi64(top_a, bot_a);
  *d6 = _mm_unpacklo_epi64(top_b, bot_b);
  *d7 = _mm_unpackhi_epi64(top_b, bot_b);
}
531 :
// Transposes an 8-row x 16-column block of 8-bit pixels, packing two 8-pixel
// output columns into each result register.
//
// *x0..*x7 are rows 0..7 (all sixteen bytes are used).  The k-th output
// register (k = 0 for d0d1, 1 for d2d3, ... 7 for d14d15) receives
//   bytes 0-7  : column k      (rows 0..7)
//   bytes 8-15 : column k + 8  (rows 0..7)
// NOTE(review): this pairing (k with k + 8) is what the shuffles below
// produce; it is not the adjacent-column pairing that the parameter names
// might suggest -- confirm against callers before relying on the names.
// All inputs are read before any output is written.
static INLINE void transpose8x16_16x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    __m128i *d12d13, __m128i *d14d15) {
  __m128i lo_top, lo_bot, hi_top, hi_bot;  // 4-row column fragments
  __m128i lo_a, lo_b, hi_a, hi_b;  // 8-row column fragments

  // Interleave bytes of adjacent rows; "lo" covers columns 0-7 and "hi"
  // covers columns 8-15.
  const __m128i lo01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i lo23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i lo45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i lo67 = _mm_unpacklo_epi8(*x6, *x7);

  const __m128i hi01 = _mm_unpackhi_epi8(*x0, *x1);
  const __m128i hi23 = _mm_unpackhi_epi8(*x2, *x3);
  const __m128i hi45 = _mm_unpackhi_epi8(*x4, *x5);
  const __m128i hi67 = _mm_unpackhi_epi8(*x6, *x7);

  // Columns 0-3 and 8-11.
  lo_top = _mm_unpacklo_epi16(lo01, lo23);
  lo_bot = _mm_unpacklo_epi16(lo45, lo67);
  hi_top = _mm_unpacklo_epi16(hi01, hi23);
  hi_bot = _mm_unpacklo_epi16(hi45, hi67);

  lo_a = _mm_unpacklo_epi32(lo_top, lo_bot);
  lo_b = _mm_unpackhi_epi32(lo_top, lo_bot);
  hi_a = _mm_unpacklo_epi32(hi_top, hi_bot);
  hi_b = _mm_unpackhi_epi32(hi_top, hi_bot);

  *d0d1 = _mm_unpacklo_epi64(lo_a, hi_a);
  *d2d3 = _mm_unpackhi_epi64(lo_a, hi_a);
  *d4d5 = _mm_unpacklo_epi64(lo_b, hi_b);
  *d6d7 = _mm_unpackhi_epi64(lo_b, hi_b);

  // Columns 4-7 and 12-15.
  lo_top = _mm_unpackhi_epi16(lo01, lo23);
  lo_bot = _mm_unpackhi_epi16(lo45, lo67);
  hi_top = _mm_unpackhi_epi16(hi01, hi23);
  hi_bot = _mm_unpackhi_epi16(hi45, hi67);

  lo_a = _mm_unpacklo_epi32(lo_top, lo_bot);
  lo_b = _mm_unpackhi_epi32(lo_top, lo_bot);
  hi_a = _mm_unpacklo_epi32(hi_top, hi_bot);
  hi_b = _mm_unpackhi_epi32(hi_top, hi_bot);

  *d8d9 = _mm_unpacklo_epi64(lo_a, hi_a);
  *d10d11 = _mm_unpackhi_epi64(lo_a, hi_a);
  *d12d13 = _mm_unpacklo_epi64(lo_b, hi_b);
  *d14d15 = _mm_unpackhi_epi64(lo_b, hi_b);
}
582 :
583 : #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
|