Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : /*
7 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
8 : *
9 : * This source code is subject to the terms of the BSD 2 Clause License and
10 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
11 : * was not distributed with this source code in the LICENSE file, you can
12 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
13 : * Media Patent License 1.0 was not distributed with this source code in the
14 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
15 : */
16 :
17 : #include <assert.h>
18 : #include <smmintrin.h> /* SSE4.1 */
19 :
20 : #include "EbDefinitions.h"
21 : #include "aom_dsp_rtcd.h"
22 : #include "emmintrin.h"
23 : #include "EbTransforms.h"
24 : #include "highbd_txfm_utility_sse4.h"
25 :
26 : #include "av1_txfm_sse4.h"
27 :
28 : static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
29 : fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
30 : fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
31 : fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
32 : fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
33 : fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
34 : };
35 :
36 : typedef void(*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int32_t bit,
37 : const int32_t num_cols);
38 :
39 22754100 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
40 : int32_t stride, int32_t flipud, int32_t fliplr,
41 : int32_t shift) {
42 22754100 : if (!flipud) {
43 21627900 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
44 21627900 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
45 21627900 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
46 43255700 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
47 : }
48 : else {
49 1126220 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
50 1126220 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
51 1126220 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
52 2252440 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
53 : }
54 :
55 22754100 : if (fliplr) {
56 1149590 : in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
57 1149590 : in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
58 1149590 : in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
59 1149590 : in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
60 : }
61 :
62 22754100 : in[0] = _mm_cvtepi16_epi32(in[0]);
63 22754100 : in[1] = _mm_cvtepi16_epi32(in[1]);
64 22754100 : in[2] = _mm_cvtepi16_epi32(in[2]);
65 22754100 : in[3] = _mm_cvtepi16_epi32(in[3]);
66 :
67 22754100 : in[0] = _mm_slli_epi32(in[0], shift);
68 22754100 : in[1] = _mm_slli_epi32(in[1], shift);
69 22754100 : in[2] = _mm_slli_epi32(in[2], shift);
70 22754100 : in[3] = _mm_slli_epi32(in[3], shift);
71 22754100 : }
72 :
73 6187440 : static void fidtx4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t col_num) {
74 : (void)bit;
75 6187440 : __m128i fact = _mm_set1_epi32(NewSqrt2);
76 6187440 : __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
77 : __m128i a_low;
78 : __m128i v[4];
79 :
80 30933400 : for (int32_t i = 0; i < 4; i++) {
81 49491800 : a_low = _mm_mullo_epi32(in[i * col_num], fact);
82 24745900 : a_low = _mm_add_epi32(a_low, offset);
83 49491800 : out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
84 : }
85 :
86 : // Transpose for 4x4
87 6187440 : v[0] = _mm_unpacklo_epi32(out[0], out[1]);
88 6187440 : v[1] = _mm_unpackhi_epi32(out[0], out[1]);
89 6187440 : v[2] = _mm_unpacklo_epi32(out[2], out[3]);
90 6187440 : v[3] = _mm_unpackhi_epi32(out[2], out[3]);
91 :
92 6187440 : out[0] = _mm_unpacklo_epi64(v[0], v[2]);
93 6187440 : out[1] = _mm_unpackhi_epi64(v[0], v[2]);
94 6187440 : out[2] = _mm_unpacklo_epi64(v[1], v[3]);
95 6187440 : out[3] = _mm_unpackhi_epi64(v[1], v[3]);
96 6187440 : }
97 :
98 : // We only use stage-2 bit;
99 : // shift[0] is used in load_buffer_4x4()
100 : // shift[1] is used in txfm_func_col()
101 : // shift[2] is used in txfm_func_row()
102 27355300 : static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit,
103 : const int32_t num_col) {
104 27355300 : const int32_t *cospi = cospi_arr(bit);
105 27356200 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
106 27356200 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
107 27356200 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
108 27356200 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
109 : __m128i s0, s1, s2, s3;
110 : __m128i u0, u1, u2, u3;
111 : __m128i v0, v1, v2, v3;
112 :
113 27356200 : int32_t endidx = 3 * num_col;
114 27356200 : s0 = _mm_add_epi32(in[0], in[endidx]);
115 27356200 : s3 = _mm_sub_epi32(in[0], in[endidx]);
116 27356200 : endidx -= num_col;
117 27356200 : s1 = _mm_add_epi32(in[num_col], in[endidx]);
118 54712500 : s2 = _mm_sub_epi32(in[num_col], in[endidx]);
119 :
120 : // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
121 27356200 : u0 = _mm_mullo_epi32(s0, cospi32);
122 27356200 : u1 = _mm_mullo_epi32(s1, cospi32);
123 27356200 : u2 = _mm_add_epi32(u0, u1);
124 27356200 : v0 = _mm_sub_epi32(u0, u1);
125 :
126 27356200 : u3 = _mm_add_epi32(u2, rnding);
127 27356200 : v1 = _mm_add_epi32(v0, rnding);
128 :
129 27356200 : u0 = _mm_srai_epi32(u3, bit);
130 27356200 : u2 = _mm_srai_epi32(v1, bit);
131 :
132 : // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
133 27356200 : v0 = _mm_mullo_epi32(s2, cospi48);
134 27356200 : v1 = _mm_mullo_epi32(s3, cospi16);
135 27356200 : v2 = _mm_add_epi32(v0, v1);
136 :
137 27356200 : v3 = _mm_add_epi32(v2, rnding);
138 27356200 : u1 = _mm_srai_epi32(v3, bit);
139 :
140 27356200 : v0 = _mm_mullo_epi32(s2, cospi16);
141 27356200 : v1 = _mm_mullo_epi32(s3, cospi48);
142 27356200 : v2 = _mm_sub_epi32(v1, v0);
143 :
144 27356200 : v3 = _mm_add_epi32(v2, rnding);
145 27356200 : u3 = _mm_srai_epi32(v3, bit);
146 :
147 : // Note: shift[1] and shift[2] are zeros
148 :
149 : // Transpose 4x4 32-bit
150 27356200 : v0 = _mm_unpacklo_epi32(u0, u1);
151 27356200 : v1 = _mm_unpackhi_epi32(u0, u1);
152 27356200 : v2 = _mm_unpacklo_epi32(u2, u3);
153 27356200 : v3 = _mm_unpackhi_epi32(u2, u3);
154 :
155 27356200 : out[0] = _mm_unpacklo_epi64(v0, v2);
156 27356200 : out[1] = _mm_unpackhi_epi64(v0, v2);
157 27356200 : out[2] = _mm_unpacklo_epi64(v1, v3);
158 27356200 : out[3] = _mm_unpackhi_epi64(v1, v3);
159 27356200 : }
160 :
161 22771100 : static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
162 22771100 : _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
163 22771100 : _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
164 22771100 : _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
165 22771100 : _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
166 22771100 : }
167 :
168 11994900 : static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit,
169 : const int32_t num_col) {
170 11994900 : const int32_t *sinpi = sinpi_arr(bit);
171 11993800 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
172 11993800 : const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
173 11993800 : const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
174 11993800 : const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
175 11993800 : const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
176 : __m128i t;
177 : __m128i s0, s1, s2, s3, s4, s5, s6, s7;
178 : __m128i x0, x1, x2, x3;
179 : __m128i u0, u1, u2, u3;
180 : __m128i v0, v1, v2, v3;
181 :
182 11993800 : int32_t idx = 0 * num_col;
183 11993800 : s0 = _mm_mullo_epi32(in[idx], sinpi1);
184 11993800 : s1 = _mm_mullo_epi32(in[idx], sinpi4);
185 11993800 : t = _mm_add_epi32(in[idx], in[idx + num_col]);
186 11993800 : idx += num_col;
187 11993800 : s2 = _mm_mullo_epi32(in[idx], sinpi2);
188 11993800 : s3 = _mm_mullo_epi32(in[idx], sinpi1);
189 11993800 : idx += num_col;
190 11993800 : s4 = _mm_mullo_epi32(in[idx], sinpi3);
191 11993800 : idx += num_col;
192 11993800 : s5 = _mm_mullo_epi32(in[idx], sinpi4);
193 11993800 : s6 = _mm_mullo_epi32(in[idx], sinpi2);
194 23987600 : s7 = _mm_sub_epi32(t, in[idx]);
195 :
196 11993800 : t = _mm_add_epi32(s0, s2);
197 11993800 : x0 = _mm_add_epi32(t, s5);
198 11993800 : x1 = _mm_mullo_epi32(s7, sinpi3);
199 11993800 : t = _mm_sub_epi32(s1, s3);
200 11993800 : x2 = _mm_add_epi32(t, s6);
201 11993800 : x3 = s4;
202 :
203 11993800 : s0 = _mm_add_epi32(x0, x3);
204 11993800 : s1 = x1;
205 11993800 : s2 = _mm_sub_epi32(x2, x3);
206 11993800 : t = _mm_sub_epi32(x2, x0);
207 11993800 : s3 = _mm_add_epi32(t, x3);
208 :
209 11993800 : u0 = _mm_add_epi32(s0, rnding);
210 11993800 : u0 = _mm_srai_epi32(u0, bit);
211 :
212 11993800 : u1 = _mm_add_epi32(s1, rnding);
213 11993800 : u1 = _mm_srai_epi32(u1, bit);
214 :
215 11993800 : u2 = _mm_add_epi32(s2, rnding);
216 11993800 : u2 = _mm_srai_epi32(u2, bit);
217 :
218 11993800 : u3 = _mm_add_epi32(s3, rnding);
219 11993800 : u3 = _mm_srai_epi32(u3, bit);
220 :
221 11993800 : v0 = _mm_unpacklo_epi32(u0, u1);
222 11993800 : v1 = _mm_unpackhi_epi32(u0, u1);
223 11993800 : v2 = _mm_unpacklo_epi32(u2, u3);
224 11993800 : v3 = _mm_unpackhi_epi32(u2, u3);
225 :
226 11993800 : out[0] = _mm_unpacklo_epi64(v0, v2);
227 11993800 : out[1] = _mm_unpackhi_epi64(v0, v2);
228 11993800 : out[2] = _mm_unpacklo_epi64(v1, v3);
229 11993800 : out[3] = _mm_unpackhi_epi64(v1, v3);
230 11993800 : }
231 :
232 22760500 : void eb_av1_fwd_txfm2d_4x4_sse4_1(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t bd)
233 : {
234 : __m128i in[4];
235 22760500 : const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
236 22760500 : const int32_t txw_idx = get_txw_idx(TX_4X4);
237 22760100 : const int32_t txh_idx = get_txh_idx(TX_4X4);
238 :
239 22779700 : switch (tx_type) {
240 9814640 : case DCT_DCT:
241 9814640 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
242 9817500 : fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
243 9817690 : fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
244 9817680 : write_buffer_4x4(in, coeff);
245 9816920 : break;
246 2282410 : case ADST_DCT:
247 2282410 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
248 2282620 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
249 2282630 : fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
250 2282690 : write_buffer_4x4(in, coeff);
251 2282630 : break;
252 2341050 : case DCT_ADST:
253 2341050 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
254 2341260 : fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
255 2341280 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
256 2341270 : write_buffer_4x4(in, coeff);
257 2341190 : break;
258 1966580 : case ADST_ADST:
259 1966580 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
260 1966730 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
261 1966760 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
262 1966810 : write_buffer_4x4(in, coeff);
263 1966780 : break;
264 275069 : case FLIPADST_DCT:
265 275069 : load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
266 275074 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
267 275072 : fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
268 275072 : write_buffer_4x4(in, coeff);
269 275072 : break;
270 279377 : case DCT_FLIPADST:
271 279377 : load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
272 279381 : fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
273 279383 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
274 279384 : write_buffer_4x4(in, coeff);
275 279383 : break;
276 284573 : case FLIPADST_FLIPADST:
277 284573 : load_buffer_4x4(input, in, stride, 1, 1, shift[0]);
278 284575 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
279 284574 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
280 284573 : write_buffer_4x4(in, coeff);
281 284573 : break;
282 291436 : case ADST_FLIPADST:
283 291436 : load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
284 291438 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
285 291437 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
286 291437 : write_buffer_4x4(in, coeff);
287 291436 : break;
288 294438 : case FLIPADST_ADST:
289 294438 : load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
290 294439 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
291 294440 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
292 294441 : write_buffer_4x4(in, coeff);
293 294440 : break;
294 1238040 : case IDTX:
295 1238040 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
296 1238120 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
297 1238100 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
298 1238120 : write_buffer_4x4(in, coeff);
299 1238110 : break;
300 1251030 : case V_DCT:
301 1251030 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
302 1251100 : fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
303 1251100 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
304 1251100 : write_buffer_4x4(in, coeff);
305 1251080 : break;
306 1315130 : case H_DCT:
307 1315130 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
308 1315190 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
309 1315180 : fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
310 1315220 : write_buffer_4x4(in, coeff);
311 1315200 : break;
312 277724 : case V_ADST:
313 277724 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
314 277724 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
315 277725 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
316 277725 : write_buffer_4x4(in, coeff);
317 277725 : break;
318 300236 : case H_ADST:
319 300236 : load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
320 300238 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
321 300238 : fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
322 300237 : write_buffer_4x4(in, coeff);
323 300237 : break;
324 273713 : case V_FLIPADST:
325 273713 : load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
326 273716 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
327 273718 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
328 273718 : write_buffer_4x4(in, coeff);
329 273718 : break;
330 294212 : case H_FLIPADST:
331 294212 : load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
332 294214 : fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
333 294211 : fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
334 294212 : write_buffer_4x4(in, coeff);
335 294211 : break;
336 0 : default: assert(0);
337 : }
338 : (void)bd;
339 22782700 : }
|