Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_SYNONYMS_H_
13 : #define AOM_DSP_X86_SYNONYMS_H_
14 :
#include <immintrin.h>
#include <string.h>
#include "EbDefinitions.h"
17 :
18 : /**
19 : * Various reusable shorthands for x86 SIMD intrinsics.
20 : *
21 : * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
22 : * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
23 : */
24 :
25 3270493500 : static INLINE __m128i xx_loadl_64(const void *a) {
26 3270493500 : return _mm_loadl_epi64((const __m128i *)a);
27 : }
28 :
29 2117031143 : static INLINE __m128i xx_loadu_128(const void *a) {
30 2117031143 : return _mm_loadu_si128((const __m128i *)a);
31 : }
32 :
33 322560553 : static INLINE void xx_storel_32(void *const a, const __m128i v) {
34 322560553 : *(uint32_t *)a = _mm_cvtsi128_si32(v);
35 322560553 : }
36 :
37 926129000 : static INLINE void xx_storel_64(void *const a, const __m128i v) {
38 926129000 : _mm_storel_epi64((__m128i *)a, v);
39 926129000 : }
40 :
41 696154000 : static INLINE void xx_storeu_128(void *const a, const __m128i v) {
42 : _mm_storeu_si128((__m128i *)a, v);
43 696154000 : }
44 :
45 31471800 : static INLINE __m128i _mm_loadh_epi64(const void *const p, const __m128i s) {
46 94414700 : return _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(s), (double *)p));
47 : }
48 :
49 212645742 : static INLINE void _mm_storeh_epi64(__m128i *const p, const __m128i x) {
50 212645742 : _mm_storeh_pd((double *)p, _mm_castsi128_pd(x));
51 212645742 : }
52 :
53 31470600 : static INLINE __m128i load8bit_8x2_sse2(const void *const src,
54 : const ptrdiff_t strideInByte) {
55 31470600 : const __m128i s = _mm_loadl_epi64((__m128i *)src);
56 31470600 : return _mm_loadh_epi64((__m128i *)((uint8_t *)src + strideInByte), s);
57 : }
58 :
59 30893290 : static INLINE __m128i load_u8_8x2_sse2(const uint8_t *const src,
60 : const ptrdiff_t stride) {
61 30893290 : return load8bit_8x2_sse2(src, sizeof(*src) * stride);
62 : }
63 :
64 576822 : static INLINE __m128i load_u16_4x2_sse2(const uint16_t *const src,
65 : const ptrdiff_t stride) {
66 576822 : return load8bit_8x2_sse2(src, sizeof(*src) * stride);
67 : }
68 :
69 : SIMD_INLINE void store_u8_4x2_sse2(const __m128i src, uint8_t *const dst,
70 : const int32_t stride) {
71 35268622 : xx_storel_32(dst, src);
72 35268722 : *(int32_t *)(dst + stride) = _mm_extract_epi32(src, 1);
73 35268722 : }
74 :
75 : SIMD_INLINE void store_u16_2x2_sse2(const __m128i src, uint16_t *const dst,
76 : const int32_t stride) {
77 0 : xx_storel_32(dst, src);
78 0 : *(int32_t *)(dst + stride) = _mm_extract_epi32(src, 1);
79 0 : }
80 :
81 : SIMD_INLINE void store_s16_4x2_sse2(const __m128i src, int16_t *const dst,
82 : const int32_t stride) {
83 72419400 : _mm_storel_epi64((__m128i *)dst, src);
84 72419400 : _mm_storeh_epi64((__m128i *)(dst + stride), src);
85 72413800 : }
86 :
87 : SIMD_INLINE void store_u16_4x2_sse2(const __m128i src, uint16_t *const dst,
88 : const int32_t stride) {
89 733244 : _mm_storel_epi64((__m128i *)dst, src);
90 733244 : _mm_storeh_epi64((__m128i *)(dst + stride), src);
91 733245 : }
92 :
93 : // The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
94 : // compilers. The following function is equivalent to _mm_set_epi64x()
95 : // acting on 32-bit integers.
96 : static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
97 : #if defined(_MSC_VER) && _MSC_VER < 1900
98 : return _mm_set_epi32(0, e1, 0, e0);
99 : #else
100 : return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
101 : #endif
102 : }
103 :
104 : // The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
105 : // compilers. The following function is equivalent to _mm_set1_epi64x()
106 : // acting on a 32-bit integer.
107 23050000 : static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
108 : #if defined(_MSC_VER) && _MSC_VER < 1900
109 : return _mm_set_epi32(0, a, 0, a);
110 : #else
111 46099900 : return _mm_set1_epi64x((uint32_t)a);
112 : #endif
113 : }
114 :
115 0 : static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
116 0 : return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
117 : }
118 :
119 293965719 : static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int32_t bits) {
120 587931438 : const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
121 587931438 : return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
122 : }
123 :
124 : static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int32_t bits) {
125 : const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
126 : const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
127 : return _mm_srli_epi32(v_tmp_d, bits);
128 : }
129 :
130 : // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
131 : static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int32_t bits) {
132 : const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
133 : const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
134 : return _mm_srai_epi32(v_tmp_d, bits);
135 : }
136 :
137 : // This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
138 0 : static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int32_t bits) {
139 0 : const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
140 0 : const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
141 : const __m128i v_tmp_d =
142 0 : _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
143 0 : return _mm_srai_epi32(v_tmp_d, bits);
144 : }
145 :
146 : static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int32_t bits) {
147 : const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
148 : const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
149 : const __m128i v_tmp_d =
150 : _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
151 : return _mm_srai_epi16(v_tmp_d, bits);
152 : }
153 :
// This function fails to build with gcc under the Linux ABI.
// The workaround is to replace each call with the body of the function.
156 :
157 : //static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
158 : // const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
159 : // return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
160 : //}
161 : //
162 :
163 : // Note:
164 : // _mm256_insert_epi16 intrinsics is available from vs2017.
165 : // We define this macro for vs2015 and earlier. The
166 : // intrinsics used here are in vs2015 document:
167 : // https://msdn.microsoft.com/en-us/library/hh977022.aspx
168 : // Input parameters:
169 : // a: __m256i,
170 : // d: int16_t,
171 : // indx: imm8 (0 - 15)
172 : //#if _MSC_VER <= 1900
173 : #if defined(_MSC_VER) && _MSC_VER < 1910
// Fallback for _mm256_insert_epi16 on older MSVC: extracts the 128-bit half
// selected by (indx >> 3), inserts `d` at 16-bit position (indx % 8) within
// it, and writes that half back into `a`.
//
// Every argument is parenthesized so expression arguments (e.g. `i + 1`)
// expand correctly. `a` and `indx` are still evaluated more than once, so
// they must not have side effects.
#define _mm256_insert_epi16(a, d, indx)                                   \
    _mm256_insertf128_si256(                                              \
        (a),                                                              \
        _mm_insert_epi16(_mm256_extractf128_si256((a), (indx) >> 3), (d), \
                         (indx) % 8),                                     \
        (indx) >> 3)
179 :
180 : static INLINE int32_t _mm256_extract_epi32(__m256i a, const int32_t i) {
181 : return a.m256i_i32[i & 7];
182 : }
183 :
184 : static INLINE int32_t _mm256_extract_epi16(__m256i a, const int32_t i) {
185 : return a.m256i_i16[i & 15];
186 : }
187 :
188 : static INLINE __m256i _mm256_insert_epi32(__m256i a, int32_t b, const int32_t i) {
189 : __m256i c = a;
190 : c.m256i_i32[i & 7] = b;
191 : return c;
192 : }
193 : #endif
194 :
195 : #endif // AOM_DSP_X86_SYNONYMS_H_
|