Line data Source code
1 : /* Copyright (C) 2007-2019 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : Under Section 7 of GPL version 3, you are granted additional
16 : permissions described in the GCC Runtime Library Exception, version
17 : 3.1, as published by the Free Software Foundation.
18 :
19 : You should have received a copy of the GNU General Public License and
20 : a copy of the GCC Runtime Library Exception along with this program;
21 : see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 : <http://www.gnu.org/licenses/>. */
23 :
24 : /* Implemented from the specification included in the Intel C++ Compiler
25 : User Guide and Reference, version 10.0. */
26 :
27 : #ifndef _SMMINTRIN_H_INCLUDED
28 : #define _SMMINTRIN_H_INCLUDED
29 :
30 : /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
31 : files. */
32 : #include <tmmintrin.h>
33 :
34 : #ifndef __SSE4_1__
35 : #pragma GCC push_options
36 : #pragma GCC target("sse4.1")
37 : #define __DISABLE_SSE4_1__
38 : #endif /* __SSE4_1__ */
39 :
40 : /* Rounding mode macros. */
41 : #define _MM_FROUND_TO_NEAREST_INT 0x00
42 : #define _MM_FROUND_TO_NEG_INF 0x01
43 : #define _MM_FROUND_TO_POS_INF 0x02
44 : #define _MM_FROUND_TO_ZERO 0x03
45 : #define _MM_FROUND_CUR_DIRECTION 0x04
46 :
47 : #define _MM_FROUND_RAISE_EXC 0x00
48 : #define _MM_FROUND_NO_EXC 0x08
49 :
50 : #define _MM_FROUND_NINT \
51 : (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
52 : #define _MM_FROUND_FLOOR \
53 : (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
54 : #define _MM_FROUND_CEIL \
55 : (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
56 : #define _MM_FROUND_TRUNC \
57 : (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
58 : #define _MM_FROUND_RINT \
59 : (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
60 : #define _MM_FROUND_NEARBYINT \
61 : (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
62 :
63 : /* Test Instruction */
64 : /* Packed integer 128-bit bitwise comparison. Return 1 if
65 : (__V & __M) == 0. */
66 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
67 : _mm_testz_si128 (__m128i __M, __m128i __V)
68 : {
69 450254400 : return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
70 : }
71 :
72 : /* Packed integer 128-bit bitwise comparison. Return 1 if
73 : (__V & ~__M) == 0. */
74 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 : _mm_testc_si128 (__m128i __M, __m128i __V)
76 : {
77 : return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
78 : }
79 :
80 : /* Packed integer 128-bit bitwise comparison. Return 1 if
81 : (__V & __M) != 0 && (__V & ~__M) != 0. */
82 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 : _mm_testnzc_si128 (__m128i __M, __m128i __V)
84 : {
85 : return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
86 : }
87 :
88 : /* Macros for packed integer 128-bit comparison intrinsics. */
89 : #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
90 :
91 : #define _mm_test_all_ones(V) \
92 : _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
93 :
94 : #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
95 :
96 : /* Packed/scalar double precision floating point rounding. */
97 :
98 : #ifdef __OPTIMIZE__
99 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 : _mm_round_pd (__m128d __V, const int __M)
101 : {
102 : return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
103 : }
104 :
105 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 : _mm_round_sd(__m128d __D, __m128d __V, const int __M)
107 : {
108 : return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
109 : (__v2df)__V,
110 : __M);
111 : }
112 : #else
113 : #define _mm_round_pd(V, M) \
114 : ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))
115 :
116 : #define _mm_round_sd(D, V, M) \
117 : ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \
118 : (__v2df)(__m128d)(V), (int)(M)))
119 : #endif
120 :
121 : /* Packed/scalar single precision floating point rounding. */
122 :
123 : #ifdef __OPTIMIZE__
124 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 : _mm_round_ps (__m128 __V, const int __M)
126 : {
127 : return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
128 : }
129 :
130 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 : _mm_round_ss (__m128 __D, __m128 __V, const int __M)
132 : {
133 : return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
134 : (__v4sf)__V,
135 : __M);
136 : }
137 : #else
138 : #define _mm_round_ps(V, M) \
139 : ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))
140 :
141 : #define _mm_round_ss(D, V, M) \
142 : ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \
143 : (__v4sf)(__m128)(V), (int)(M)))
144 : #endif
145 :
146 : /* Macros for ceil/floor intrinsics. */
147 : #define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
148 : #define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
149 :
150 : #define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
151 : #define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
152 :
153 : #define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
154 : #define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
155 :
156 : #define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
157 : #define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
158 :
159 : /* SSE4.1 */
160 :
161 : /* Integer blend instructions - select data from 2 sources using
162 : constant/variable mask. */
163 :
164 : #ifdef __OPTIMIZE__
165 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166 : _mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
167 : {
168 : return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
169 : (__v8hi)__Y,
170 : __M);
171 : }
172 : #else
173 : #define _mm_blend_epi16(X, Y, M) \
174 : ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \
175 : (__v8hi)(__m128i)(Y), (int)(M)))
176 : #endif
177 :
178 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 : _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
180 : {
181 778243100 : return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
182 : (__v16qi)__Y,
183 : (__v16qi)__M);
184 : }
185 :
186 : /* Single precision floating point blend instructions - select data
187 : from 2 sources using constant/variable mask. */
188 :
189 : #ifdef __OPTIMIZE__
190 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 : _mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
192 : {
193 : return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
194 : (__v4sf)__Y,
195 : __M);
196 : }
197 : #else
198 : #define _mm_blend_ps(X, Y, M) \
199 : ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \
200 : (__v4sf)(__m128)(Y), (int)(M)))
201 : #endif
202 :
203 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
204 : _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
205 : {
206 : return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
207 : (__v4sf)__Y,
208 : (__v4sf)__M);
209 : }
210 :
211 : /* Double precision floating point blend instructions - select data
212 : from 2 sources using constant/variable mask. */
213 :
214 : #ifdef __OPTIMIZE__
215 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 : _mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
217 : {
218 : return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
219 : (__v2df)__Y,
220 : __M);
221 : }
222 : #else
223 : #define _mm_blend_pd(X, Y, M) \
224 : ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \
225 : (__v2df)(__m128d)(Y), (int)(M)))
226 : #endif
227 :
228 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 : _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
230 : {
231 : return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
232 : (__v2df)__Y,
233 : (__v2df)__M);
234 : }
235 :
236 : /* Dot product instructions with mask-defined summing and zeroing parts
237 : of result. */
238 :
239 : #ifdef __OPTIMIZE__
240 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 : _mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
242 : {
243 : return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
244 : (__v4sf)__Y,
245 : __M);
246 : }
247 :
248 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 : _mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
250 : {
251 : return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
252 : (__v2df)__Y,
253 : __M);
254 : }
255 : #else
256 : #define _mm_dp_ps(X, Y, M) \
257 : ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \
258 : (__v4sf)(__m128)(Y), (int)(M)))
259 :
260 : #define _mm_dp_pd(X, Y, M) \
261 : ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \
262 : (__v2df)(__m128d)(Y), (int)(M)))
263 : #endif
264 :
265 : /* Packed integer 64-bit equality comparison: each result element is
266 : all ones where the operands are equal and zero otherwise. */
267 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 : _mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
269 : {
270 : return (__m128i) ((__v2di)__X == (__v2di)__Y);
271 : }
272 :
273 : /* Min/max packed integer instructions. */
274 :
275 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276 : _mm_min_epi8 (__m128i __X, __m128i __Y)
277 : {
278 : return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
279 : }
280 :
281 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282 : _mm_max_epi8 (__m128i __X, __m128i __Y)
283 : {
284 : return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
285 : }
286 :
287 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 : _mm_min_epu16 (__m128i __X, __m128i __Y)
289 : {
290 2206660 : return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
291 : }
292 :
293 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294 : _mm_max_epu16 (__m128i __X, __m128i __Y)
295 : {
296 : return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
297 : }
298 :
299 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 : _mm_min_epi32 (__m128i __X, __m128i __Y)
301 : {
302 545050000 : return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
303 : }
304 :
305 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
306 : _mm_max_epi32 (__m128i __X, __m128i __Y)
307 : {
308 0 : return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
309 : }
310 :
311 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 : _mm_min_epu32 (__m128i __X, __m128i __Y)
313 : {
314 11400300 : return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
315 : }
316 :
317 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318 : _mm_max_epu32 (__m128i __X, __m128i __Y)
319 : {
320 : return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
321 : }
322 :
323 : /* Packed integer 32-bit multiplication with truncation of upper
324 : halves of results. */
325 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 : _mm_mullo_epi32 (__m128i __X, __m128i __Y)
327 : {
328 1132109459 : return (__m128i) ((__v4su)__X * (__v4su)__Y);
329 : }
330 :
331 : /* Packed integer 32-bit multiplication of 2 pairs of operands
332 : with two 64-bit results. */
333 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
334 : _mm_mul_epi32 (__m128i __X, __m128i __Y)
335 : {
336 : return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
337 : }
338 :
339 : /* Insert single precision float into packed single precision array
340 : element selected by index N. Bits [7:6] of N select the element
341 : of S, bits [5:4] select the element of D, and bits [3:0] are the
342 : zeroing mask for D. */
343 :
344 : #ifdef __OPTIMIZE__
345 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
346 : _mm_insert_ps (__m128 __D, __m128 __S, const int __N)
347 : {
348 : return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
349 : (__v4sf)__S,
350 : __N);
351 : }
352 : #else
353 : #define _mm_insert_ps(D, S, N) \
354 : ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \
355 : (__v4sf)(__m128)(S), (int)(N)))
356 : #endif
357 :
358 : /* Helper macro to create the N value for _mm_insert_ps. */
359 : #define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
360 :
361 : /* Extract binary representation of single precision float from packed
362 : single precision array element of X selected by index N. */
363 :
364 : #ifdef __OPTIMIZE__
365 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
366 : _mm_extract_ps (__m128 __X, const int __N)
367 : {
368 : union { int i; float f; } __tmp;
369 : __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
370 : return __tmp.i;
371 : }
372 : #else
373 : #define _mm_extract_ps(X, N) \
374 : (__extension__ \
375 : ({ \
376 : union { int i; float f; } __tmp; \
377 : __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
378 : __tmp.i; \
379 : }))
380 : #endif
381 :
382 : /* Extract the single precision float element of packed array S
383 : selected by index N and assign it to D as a float value (not as
384 : its integer bit pattern, unlike _mm_extract_ps). */
385 : #define _MM_EXTRACT_FLOAT(D, S, N) \
386 : { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
387 :
388 : /* Extract specified single precision float element into the lower
389 : part of __m128. */
390 : #define _MM_PICK_OUT_PS(X, N) \
391 : _mm_insert_ps (_mm_setzero_ps (), (X), \
392 : _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
393 :
394 : /* Insert integer, S, into packed integer array element of D
395 : selected by index N. */
396 :
397 : #ifdef __OPTIMIZE__
398 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
399 : _mm_insert_epi8 (__m128i __D, int __S, const int __N)
400 : {
401 : return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
402 : __S, __N);
403 : }
404 :
405 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 : _mm_insert_epi32 (__m128i __D, int __S, const int __N)
407 : {
408 : return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
409 : __S, __N);
410 : }
411 :
412 : #ifdef __x86_64__
413 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
414 : _mm_insert_epi64 (__m128i __D, long long __S, const int __N)
415 : {
416 : return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
417 : __S, __N);
418 : }
419 : #endif
420 : #else
421 : #define _mm_insert_epi8(D, S, N) \
422 : ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \
423 : (int)(S), (int)(N)))
424 :
425 : #define _mm_insert_epi32(D, S, N) \
426 : ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \
427 : (int)(S), (int)(N)))
428 :
429 : #ifdef __x86_64__
430 : #define _mm_insert_epi64(D, S, N) \
431 : ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \
432 : (long long)(S), (int)(N)))
433 : #endif
434 : #endif
435 :
436 : /* Extract integer from packed integer array element of X selected by
437 : index N. */
438 :
439 : #ifdef __OPTIMIZE__
440 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 : _mm_extract_epi8 (__m128i __X, const int __N)
442 : {
443 : return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
444 : }
445 :
446 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 : _mm_extract_epi32 (__m128i __X, const int __N)
448 : {
449 : return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
450 : }
451 :
452 : #ifdef __x86_64__
453 : extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454 : _mm_extract_epi64 (__m128i __X, const int __N)
455 : {
456 : return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
457 : }
458 : #endif
459 : #else
460 : #define _mm_extract_epi8(X, N) \
461 : ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
462 : #define _mm_extract_epi32(X, N) \
463 : ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
464 :
465 : #ifdef __x86_64__
466 : #define _mm_extract_epi64(X, N) \
467 : ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
468 : #endif
469 : #endif
470 :
471 : /* Return horizontal packed word minimum and its index in bits [15:0]
472 : and bits [18:16] respectively. */
473 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
474 : _mm_minpos_epu16 (__m128i __X)
475 : {
476 2823509200 : return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
477 : }
478 :
479 : /* Packed integer sign-extension. */
480 :
481 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482 : _mm_cvtepi8_epi32 (__m128i __X)
483 : {
484 : return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
485 : }
486 :
487 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 : _mm_cvtepi16_epi32 (__m128i __X)
489 : {
490 91869859 : return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
491 : }
492 :
493 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
494 : _mm_cvtepi8_epi64 (__m128i __X)
495 : {
496 : return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
497 : }
498 :
499 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
500 : _mm_cvtepi32_epi64 (__m128i __X)
501 : {
502 26400 : return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
503 : }
504 :
505 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
506 : _mm_cvtepi16_epi64 (__m128i __X)
507 : {
508 : return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
509 : }
510 :
511 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512 : _mm_cvtepi8_epi16 (__m128i __X)
513 : {
514 : return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
515 : }
516 :
517 : /* Packed integer zero-extension. */
518 :
519 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
520 : _mm_cvtepu8_epi32 (__m128i __X)
521 : {
522 0 : return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
523 : }
524 :
525 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526 : _mm_cvtepu16_epi32 (__m128i __X)
527 : {
528 2208620 : return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
529 : }
530 :
531 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 : _mm_cvtepu8_epi64 (__m128i __X)
533 : {
534 : return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
535 : }
536 :
537 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 : _mm_cvtepu32_epi64 (__m128i __X)
539 : {
540 : return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
541 : }
542 :
543 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 : _mm_cvtepu16_epi64 (__m128i __X)
545 : {
546 : return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
547 : }
548 :
549 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
550 : _mm_cvtepu8_epi16 (__m128i __X)
551 : {
552 882540260 : return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
553 : }
554 :
555 : /* Pack 8 double words from 2 operands into 8 words of result with
556 : unsigned saturation. */
557 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 : _mm_packus_epi32 (__m128i __X, __m128i __Y)
559 : {
560 1155839 : return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
561 : }
562 :
563 : /* Sum absolute 8-bit integer difference of adjacent groups of 4
564 : byte integers in the first 2 operands. Starting offsets within
565 : operands are determined by the 3rd mask operand. */
566 :
567 : #ifdef __OPTIMIZE__
568 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
569 : _mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
570 : {
571 : return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
572 : (__v16qi)__Y, __M);
573 : }
574 : #else
575 : #define _mm_mpsadbw_epu8(X, Y, M) \
576 : ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \
577 : (__v16qi)(__m128i)(Y), (int)(M)))
578 : #endif
579 :
580 : /* Load double quadword using non-temporal aligned hint. */
581 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
582 : _mm_stream_load_si128 (__m128i *__X)
583 : {
584 : return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
585 : }
586 :
587 : #ifndef __SSE4_2__
588 : #pragma GCC push_options
589 : #pragma GCC target("sse4.2")
590 : #define __DISABLE_SSE4_2__
591 : #endif /* __SSE4_2__ */
592 :
593 : /* These macros specify the source data format. */
594 : #define _SIDD_UBYTE_OPS 0x00
595 : #define _SIDD_UWORD_OPS 0x01
596 : #define _SIDD_SBYTE_OPS 0x02
597 : #define _SIDD_SWORD_OPS 0x03
598 :
599 : /* These macros specify the comparison operation. */
600 : #define _SIDD_CMP_EQUAL_ANY 0x00
601 : #define _SIDD_CMP_RANGES 0x04
602 : #define _SIDD_CMP_EQUAL_EACH 0x08
603 : #define _SIDD_CMP_EQUAL_ORDERED 0x0c
604 :
605 : /* These macros specify the polarity. */
606 : #define _SIDD_POSITIVE_POLARITY 0x00
607 : #define _SIDD_NEGATIVE_POLARITY 0x10
608 : #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
609 : #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
610 :
611 : /* These macros specify the output selection in _mm_cmpXstri (). */
612 : #define _SIDD_LEAST_SIGNIFICANT 0x00
613 : #define _SIDD_MOST_SIGNIFICANT 0x40
614 :
615 : /* These macros specify the output selection in _mm_cmpXstrm (). */
616 : #define _SIDD_BIT_MASK 0x00
617 : #define _SIDD_UNIT_MASK 0x40
618 :
619 : /* Intrinsics for text/string processing. */
620 :
621 : #ifdef __OPTIMIZE__
622 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 : _mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
624 : {
625 : return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
626 : (__v16qi)__Y,
627 : __M);
628 : }
629 :
630 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 : _mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
632 : {
633 : return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
634 : (__v16qi)__Y,
635 : __M);
636 : }
637 :
638 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639 : _mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
640 : {
641 : return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
642 : (__v16qi)__Y, __LY,
643 : __M);
644 : }
645 :
646 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647 : _mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
648 : {
649 : return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
650 : (__v16qi)__Y, __LY,
651 : __M);
652 : }
653 : #else
654 : #define _mm_cmpistrm(X, Y, M) \
655 : ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \
656 : (__v16qi)(__m128i)(Y), (int)(M)))
657 : #define _mm_cmpistri(X, Y, M) \
658 : ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \
659 : (__v16qi)(__m128i)(Y), (int)(M)))
660 :
661 : #define _mm_cmpestrm(X, LX, Y, LY, M) \
662 : ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \
663 : (int)(LX), (__v16qi)(__m128i)(Y), \
664 : (int)(LY), (int)(M)))
665 : #define _mm_cmpestri(X, LX, Y, LY, M) \
666 : ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \
667 : (__v16qi)(__m128i)(Y), (int)(LY), \
668 : (int)(M)))
669 : #endif
670 :
671 : /* Intrinsics for text/string processing and reading values of
672 : EFlags. */
673 :
674 : #ifdef __OPTIMIZE__
675 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676 : _mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
677 : {
678 : return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
679 : (__v16qi)__Y,
680 : __M);
681 : }
682 :
683 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
684 : _mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
685 : {
686 : return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
687 : (__v16qi)__Y,
688 : __M);
689 : }
690 :
691 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 : _mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
693 : {
694 : return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
695 : (__v16qi)__Y,
696 : __M);
697 : }
698 :
699 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
700 : _mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
701 : {
702 : return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
703 : (__v16qi)__Y,
704 : __M);
705 : }
706 :
707 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
708 : _mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
709 : {
710 : return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
711 : (__v16qi)__Y,
712 : __M);
713 : }
714 :
715 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 : _mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
717 : {
718 : return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
719 : (__v16qi)__Y, __LY,
720 : __M);
721 : }
722 :
723 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
724 : _mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
725 : {
726 : return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
727 : (__v16qi)__Y, __LY,
728 : __M);
729 : }
730 :
731 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
732 : _mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
733 : {
734 : return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
735 : (__v16qi)__Y, __LY,
736 : __M);
737 : }
738 :
739 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 : _mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
741 : {
742 : return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
743 : (__v16qi)__Y, __LY,
744 : __M);
745 : }
746 :
747 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
748 : _mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
749 : {
750 : return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
751 : (__v16qi)__Y, __LY,
752 : __M);
753 : }
754 : #else
755 : #define _mm_cmpistra(X, Y, M) \
756 : ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \
757 : (__v16qi)(__m128i)(Y), (int)(M)))
758 : #define _mm_cmpistrc(X, Y, M) \
759 : ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \
760 : (__v16qi)(__m128i)(Y), (int)(M)))
761 : #define _mm_cmpistro(X, Y, M) \
762 : ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \
763 : (__v16qi)(__m128i)(Y), (int)(M)))
764 : #define _mm_cmpistrs(X, Y, M) \
765 : ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \
766 : (__v16qi)(__m128i)(Y), (int)(M)))
767 : #define _mm_cmpistrz(X, Y, M) \
768 : ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \
769 : (__v16qi)(__m128i)(Y), (int)(M)))
770 :
771 : #define _mm_cmpestra(X, LX, Y, LY, M) \
772 : ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
773 : (__v16qi)(__m128i)(Y), (int)(LY), \
774 : (int)(M)))
775 : #define _mm_cmpestrc(X, LX, Y, LY, M) \
776 : ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
777 : (__v16qi)(__m128i)(Y), (int)(LY), \
778 : (int)(M)))
779 : #define _mm_cmpestro(X, LX, Y, LY, M) \
780 : ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
781 : (__v16qi)(__m128i)(Y), (int)(LY), \
782 : (int)(M)))
783 : #define _mm_cmpestrs(X, LX, Y, LY, M) \
784 : ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
785 : (__v16qi)(__m128i)(Y), (int)(LY), \
786 : (int)(M)))
787 : #define _mm_cmpestrz(X, LX, Y, LY, M) \
788 : ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
789 : (__v16qi)(__m128i)(Y), (int)(LY), \
790 : (int)(M)))
791 : #endif
792 :
793 : /* Packed integer 64-bit greater-than comparison: each result element
794 : is all ones where __X > __Y and zero otherwise. */
795 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796 : _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
797 : {
798 : return (__m128i) ((__v2di)__X > (__v2di)__Y);
799 : }
800 :
801 : #ifdef __DISABLE_SSE4_2__
802 : #undef __DISABLE_SSE4_2__
803 : #pragma GCC pop_options
804 : #endif /* __DISABLE_SSE4_2__ */
805 :
806 : #ifdef __DISABLE_SSE4_1__
807 : #undef __DISABLE_SSE4_1__
808 : #pragma GCC pop_options
809 : #endif /* __DISABLE_SSE4_1__ */
810 :
811 : #include <popcntintrin.h>
812 :
813 : #ifndef __SSE4_1__
814 : #pragma GCC push_options
815 : #pragma GCC target("sse4.1")
816 : #define __DISABLE_SSE4_1__
817 : #endif /* __SSE4_1__ */
818 :
819 : #ifndef __SSE4_2__
820 : #pragma GCC push_options
821 : #pragma GCC target("sse4.2")
822 : #define __DISABLE_SSE4_2__
823 : #endif /* __SSE4_2__ */
824 :
825 : /* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */
826 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 : _mm_crc32_u8 (unsigned int __C, unsigned char __V)
828 : {
829 : return __builtin_ia32_crc32qi (__C, __V);
830 : }
831 :
832 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
833 : _mm_crc32_u16 (unsigned int __C, unsigned short __V)
834 : {
835 : return __builtin_ia32_crc32hi (__C, __V);
836 : }
837 :
838 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839 : _mm_crc32_u32 (unsigned int __C, unsigned int __V)
840 : {
841 : return __builtin_ia32_crc32si (__C, __V);
842 : }
843 :
844 : #ifdef __x86_64__
845 : extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 : _mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
847 : {
848 : return __builtin_ia32_crc32di (__C, __V);
849 : }
850 : #endif
851 :
852 : #ifdef __DISABLE_SSE4_2__
853 : #undef __DISABLE_SSE4_2__
854 : #pragma GCC pop_options
855 : #endif /* __DISABLE_SSE4_2__ */
856 :
857 : #ifdef __DISABLE_SSE4_1__
858 : #undef __DISABLE_SSE4_1__
859 : #pragma GCC pop_options
860 : #endif /* __DISABLE_SSE4_1__ */
861 :
862 : #endif /* _SMMINTRIN_H_INCLUDED */
|