/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_

#include <emmintrin.h>  // SSE2

// Note: INLINE, TxType, NewSqrt2 and NewSqrt2Bits are assumed to be provided
// by the project's common transform headers before this file is included.

#ifdef __cplusplus
extern "C" {
#endif
// Pack the 16-bit pair (a, b) into every 32-bit lane: lane = (b << 16) | a.
#define pair_set_epi16(a, b) \
  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
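
// Illustrative sketch (not part of this header's API): the pair layout above
// lets _mm_madd_epi16() evaluate a 2-tap dot product per 32-bit lane. With
// hypothetical int16 vectors x, y and weights c0, c1:
//
//   const __m128i w = pair_set_epi16(c0, c1);    // each lane: (c1 << 16) | c0
//   const __m128i t = _mm_unpacklo_epi16(x, y);  // x0, y0, x1, y1, ...
//   const __m128i r = _mm_madd_epi16(t, w);      // r[i] = c0 * xi + c1 * yi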

// Butterfly on four 16-bit samples: out0 and out1 receive the two weighted
// sums of (in0, in1), rounded and right-shifted by cos_bit.
static INLINE void btf_16_w4_sse2(
    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
    __m128i *const out0, __m128i *const out1) {
  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
  const __m128i u0 = _mm_madd_epi16(t0, *w0);
  const __m128i v0 = _mm_madd_epi16(t0, *w1);
  const __m128i a0 = _mm_add_epi32(u0, __rounding);
  const __m128i b0 = _mm_add_epi32(v0, __rounding);
  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

  *out0 = _mm_packs_epi32(c0, c0);
  *out1 = _mm_packs_epi32(d0, d0);
}

// 4-pixel butterfly as a macro. Note: __rounding and cos_bit are read from
// the enclosing scope.
#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
  {                                                  \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
    __m128i u0 = _mm_madd_epi16(t0, w0);             \
    __m128i v0 = _mm_madd_epi16(t0, w1);             \
                                                     \
    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
                                                     \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
                                                     \
    out0 = _mm_packs_epi32(c0, c0);                  \
    out1 = _mm_packs_epi32(d0, d0);                  \
  }

// 8-pixel butterfly as a macro. Note: __rounding and cos_bit are read from
// the enclosing scope.
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
  {                                               \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
    __m128i u0 = _mm_madd_epi16(t0, w0);          \
    __m128i u1 = _mm_madd_epi16(t1, w0);          \
    __m128i v0 = _mm_madd_epi16(t0, w1);          \
    __m128i v1 = _mm_madd_epi16(t1, w1);          \
                                                  \
    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
                                                  \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
                                                  \
    out0 = _mm_packs_epi32(c0, c1);               \
    out1 = _mm_packs_epi32(d0, d1);               \
  }
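
// Minimal usage sketch (values hypothetical; real callers pull the weights
// from the stage's cospi table and define __rounding/cos_bit locally):
//
//   const int8_t cos_bit = 12;
//   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
//   const __m128i w_pp = pair_set_epi16(c, c);   // e.g. cospi[32], cospi[32]
//   const __m128i w_pm = pair_set_epi16(c, -c);  // e.g. cospi[32], -cospi[32]
//   btf_16_sse2(w_pp, w_pm, x0, x1, y0, y1);     // y0/y1 receive the results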

static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
  return _mm_load_si128((const __m128i *)a);
}

// Load 8 32-bit values and narrow them to 16 bits with signed saturation.
static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
}

// Load 4 32-bit values and narrow them to 16 bits with signed saturation.
static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(a_low, a_low);
}

// Store 4 16-bit values, sign-extended to 32 bits.
static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  _mm_store_si128((__m128i *)b, a_1);
}

// Store 8 16-bit values, sign-extended to 32 bits.
static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_hi = _mm_unpackhi_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
  _mm_store_si128((__m128i *)b, a_1);
  _mm_store_si128((__m128i *)(b + 4), a_2);
}

static INLINE __m128i scale_round_sse2(const __m128i a, const int32_t scale) {
  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
  const __m128i b = _mm_madd_epi16(a, scale_rounding);
  return _mm_srai_epi32(b, NewSqrt2Bits);
}
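
// scale_round_sse2() expects its input interleaved as (x, 1) 16-bit pairs,
// so the madd above yields x * scale + (1 << (NewSqrt2Bits - 1)) per lane.
// A scalar sketch of the same arithmetic (name hypothetical):
//
//   static int32_t scale_round_c(int16_t x, int16_t scale) {
//     return (x * scale + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits;
//   }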

static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
                                                int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
}

static INLINE void store_rect_16bit_to_32bit(const __m128i a,
                                             int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
  _mm_store_si128((__m128i *)(b + 4), b_hi);
}
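
// The rect stores scale by NewSqrt2 / 2^NewSqrt2Bits before widening; with
// the usual AV1 constants (NewSqrt2 = 5793, NewSqrt2Bits = 12), the factor
// 5793 / 4096 ~= 1.4142 is the sqrt(2) correction applied to rectangular
// (non-square) transforms.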

static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int32_t stride,
                                                 __m128i *const out,
                                                 const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
}

static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int32_t stride,
                                                      __m128i *const out,
                                                      const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
}

static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int32_t stride,
                                              __m128i *out, int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[i] = load_16bit_to_16bit(in + i * stride);
}

static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int32_t stride, __m128i *out,
                                                   int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
}

static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int32_t stride,
                                              __m128i *out, int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[i] = load_32bit_to_16bit(in + i * stride);
}

static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in,
                                                 int32_t stride, __m128i *out,
                                                 int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
}

static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                   int32_t stride, __m128i *out,
                                                   int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
}

static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int32_t stride,
                                                  const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    store_16bit_to_32bit_w4(in[i], out + i * stride);
}

static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int32_t stride,
                                                  const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    store_16bit_to_32bit(in[i], out + i * stride);
}

static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int32_t stride,
                                                       const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
}

static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int32_t stride,
                                                       const int32_t out_size) {
  for (int32_t i = 0; i < out_size; ++i)
    store_rect_16bit_to_32bit(in[i], out + i * stride);
}

static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                   uint16_t *out,
                                                   const int32_t stride) {
  for (int32_t i = 0; i < 8; ++i)
    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
}

static INLINE void round_shift_16bit(__m128i *in, int32_t size, int32_t bit) {
  if (bit < 0) {
    bit = -bit;
    __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
    for (int32_t i = 0; i < size; ++i) {
      in[i] = _mm_adds_epi16(in[i], rounding);
      in[i] = _mm_srai_epi16(in[i], bit);
    }
  } else if (bit > 0) {
    for (int32_t i = 0; i < size; ++i)
      in[i] = _mm_slli_epi16(in[i], bit);
  }
}
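
// Per-element scalar sketch of round_shift_16bit() (illustrative only):
//
//   if (bit < 0) x = (int16_t)((x + (1 << (-bit - 1))) >> -bit);  // round
//   else if (bit > 0) x = (int16_t)(x << bit);                    // scale up
//
// The vector path uses a saturating add (_mm_adds_epi16) so the rounding
// offset cannot wrap a value near INT16_MAX.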

static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int32_t size) {
  for (int32_t i = 0; i < size; ++i)
    out[size - i - 1] = in[i];
}

void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                   int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                   int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                   int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                   int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                    int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                     int32_t stride, TxType tx_type, int32_t bd);

typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                  int8_t cos_bit);

typedef struct {
  transform_1d_sse2 col, row;  // vertical and horizontal
} transform_2d_sse2;
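
// Hedged composition sketch: a 2D forward transform runs the column pass,
// round-shifts, transposes, then runs the row pass. The transpose helper and
// the shift amount below are assumptions, not part of this header's API:
//
//   __m128i buf[8];
//   load_buffer_16bit_to_16bit(input, stride, buf, 8);
//   tx.col(buf, buf, cos_bit_col);
//   round_shift_16bit(buf, 8, shift);
//   transpose_16bit_8x8(buf, buf);  // assumed companion transpose helper
//   tx.row(buf, buf, cos_bit_row);
//   store_buffer_16bit_to_32bit_w8(buf, output, 8, 8);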

#ifdef __cplusplus
}
#endif  // __cplusplus
#endif  // AV1_COMMON_X86_AV1_TXFM_SSE2_H_