Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 :
14 : #include "aom_dsp_rtcd.h"
15 :
16 162404000 : static INLINE void read_coeff(const TranLow *coeff, __m256i *c) {
17 : if (sizeof(TranLow) == 4) {
18 162404000 : const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
19 324807000 : const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
20 162404000 : *c = _mm256_packs_epi32(x0, x1);
21 162404000 : *c = _mm256_permute4x64_epi64(*c, 0xD8);
22 : }
23 : else
24 : *c = _mm256_loadu_si256((const __m256i *)coeff);
25 162404000 : }
26 :
27 196743000 : static INLINE void write_zero(TranLow *qcoeff) {
28 196743000 : const __m256i zero = _mm256_setzero_si256();
29 : if (sizeof(TranLow) == 4) {
30 : _mm256_storeu_si256((__m256i *)qcoeff, zero);
31 196743000 : _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
32 : }
33 : else
34 : _mm256_storeu_si256((__m256i *)qcoeff, zero);
35 196743000 : }
36 :
37 123743000 : static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
38 123743000 : const __m128i ac = _mm_unpackhi_epi64(*p, *p);
39 123743000 : *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
40 123743000 : }
41 :
42 41320900 : static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
43 : const int16_t *dequant_ptr, int log_scale,
44 : __m256i *thr, __m256i *qp) {
45 41320900 : __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
46 41320900 : const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
47 41320900 : const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
48 :
49 41320900 : if (log_scale > 0) {
50 665428 : const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
51 665428 : round = _mm_add_epi16(round, rnd);
52 1330860 : round = _mm_srai_epi16(round, log_scale);
53 : }
54 :
55 41320900 : init_one_qp(&round, &qp[0]);
56 41324700 : init_one_qp(&quant, &qp[1]);
57 :
58 41290800 : if (log_scale == 1)
59 1295800 : qp[1] = _mm256_slli_epi16(qp[1], log_scale);
60 :
61 41290800 : init_one_qp(&dequant, &qp[2]);
62 41289900 : *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
63 41289900 : }
64 :
65 41327000 : static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
66 41327000 : qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
67 41327000 : qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
68 41327000 : qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
69 41327000 : *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
70 41327000 : }
71 :
/*
 * store_quan(q, addr): sign-extend the sixteen int16 lanes of `q` to 32 bits
 * and store them, in lane order, as 64 bytes starting at `addr` (unaligned
 * stores).
 *
 * Each argument is evaluated exactly once and is fully parenthesized, so
 * expressions such as `ptr + off` can be passed safely: the original form
 * `(__m256i *)addr + 1` applied the cast to `ptr` only and then scaled `off`
 * by 32 bytes.
 *
 * The unpack intrinsics interleave within each 128-bit lane, so the two
 * results are recombined with permute2x128 (0x20 = both low lanes,
 * 0x31 = both high lanes) to restore memory order.
 */
#define store_quan(q, addr)                                                 \
    do {                                                                    \
        const __m256i sq_val_ = (q);                                        \
        __m256i *const sq_dst_ = (__m256i *)(addr);                         \
        const __m256i sq_sign_ = _mm256_srai_epi16(sq_val_, 15);            \
        const __m256i sq_lo_ = _mm256_unpacklo_epi16(sq_val_, sq_sign_);    \
        const __m256i sq_hi_ = _mm256_unpackhi_epi16(sq_val_, sq_sign_);    \
        _mm256_storeu_si256(                                                \
            sq_dst_, _mm256_permute2x128_si256(sq_lo_, sq_hi_, 0x20));      \
        _mm256_storeu_si256(                                                \
            sq_dst_ + 1, _mm256_permute2x128_si256(sq_lo_, sq_hi_, 0x31));  \
    } while (0)
82 :
/*
 * store_two_quan(q, addr1, dq, addr2): store the sixteen 16-bit lanes of `q`
 * at `addr1` and of `dq` at `addr2`.
 *
 * When TranLow is 4 bytes wide the lanes are sign-extended to 32 bits via
 * store_quan() (64 bytes per destination); otherwise each vector is written
 * with a single unaligned 256-bit store.
 *
 * All macro parameters are parenthesized so that pointer expressions such as
 * `base + off` are cast as a whole instead of having the cast bind to `base`
 * alone.
 */
#define store_two_quan(q, addr1, dq, addr2)                    \
    do {                                                       \
        if (sizeof(TranLow) == 4) {                            \
            store_quan((q), (addr1));                          \
            store_quan((dq), (addr2));                         \
        } else {                                               \
            _mm256_storeu_si256((__m256i *)(addr1), (q));      \
            _mm256_storeu_si256((__m256i *)(addr2), (dq));     \
        }                                                      \
    } while (0)
93 :
94 41329200 : static INLINE uint16_t quant_gather_eob(__m256i eob) {
95 41329200 : const __m128i eob_lo = _mm256_castsi256_si128(eob);
96 41329200 : const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
97 41329200 : __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
98 82658300 : eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
99 41329200 : eob_s = _mm_minpos_epu16(eob_s);
100 41329200 : return INT16_MAX - _mm_extract_epi16(eob_s, 0);
101 : }
102 :
103 135731000 : static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
104 : const int16_t *iscan_ptr, TranLow *qcoeff,
105 : TranLow *dqcoeff, __m256i *eob) {
106 135731000 : const __m256i abs_coeff = _mm256_abs_epi16(*c);
107 135731000 : __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
108 407192000 : mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
109 135731000 : const int nzflag = _mm256_movemask_epi8(mask);
110 :
111 135731000 : if (nzflag) {
112 58105100 : __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
113 58105100 : q = _mm256_mulhi_epi16(q, qp[1]);
114 58105100 : q = _mm256_sign_epi16(q, *c);
115 116210000 : const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
116 :
117 348631000 : store_two_quan(q, qcoeff, dq, dqcoeff);
118 58105100 : const __m256i zero = _mm256_setzero_si256();
119 58105100 : const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
120 58105100 : const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
121 58105100 : const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
122 58105100 : __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
123 58105100 : cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
124 116210000 : *eob = _mm256_max_epi16(*eob, cur_eob);
125 : }
126 : else {
127 77625600 : write_zero(qcoeff);
128 78098100 : write_zero(dqcoeff);
129 : }
130 136186000 : }
131 :
132 40668900 : void eb_av1_quantize_fp_avx2(const TranLow *coeff_ptr, intptr_t n_coeffs,
133 : const int16_t *zbin_ptr, const int16_t *round_ptr,
134 : const int16_t *quant_ptr,
135 : const int16_t *quant_shift_ptr,
136 : TranLow *qcoeff_ptr, TranLow *dqcoeff_ptr,
137 : const int16_t *dequant_ptr, uint16_t *eob_ptr,
138 : const int16_t *scan_ptr, const int16_t *iscan_ptr) {
139 : (void)scan_ptr;
140 : (void)zbin_ptr;
141 : (void)quant_shift_ptr;
142 40668900 : const unsigned int step = 16;
143 :
144 : __m256i qp[3];
145 : __m256i coeff, thr;
146 40668900 : const int log_scale = 0;
147 :
148 40668900 : init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
149 40625200 : read_coeff(coeff_ptr, &coeff);
150 :
151 40639500 : __m256i eob = _mm256_setzero_si256();
152 40639500 : quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
153 :
154 40672900 : coeff_ptr += step;
155 40672900 : qcoeff_ptr += step;
156 40672900 : dqcoeff_ptr += step;
157 40672900 : iscan_ptr += step;
158 40672900 : n_coeffs -= step;
159 :
160 40672900 : update_qp(log_scale, &thr, qp);
161 :
162 136146000 : while (n_coeffs > 0) {
163 95475700 : read_coeff(coeff_ptr, &coeff);
164 95453100 : quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
165 :
166 95484900 : coeff_ptr += step;
167 95484900 : qcoeff_ptr += step;
168 95484900 : dqcoeff_ptr += step;
169 95484900 : iscan_ptr += step;
170 95484900 : n_coeffs -= step;
171 : }
172 40670300 : *eob_ptr = quant_gather_eob(eob);
173 40679100 : }
174 :
175 25775600 : static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
176 : __m256i *c, const int16_t *iscan_ptr,
177 : TranLow *qcoeff, TranLow *dqcoeff,
178 : __m256i *eob) {
179 25775600 : const __m256i abs_coeff = _mm256_abs_epi16(*c);
180 25775600 : __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
181 77326900 : mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
182 25775600 : const int nzflag = _mm256_movemask_epi8(mask);
183 :
184 25775600 : if (nzflag) {
185 5765730 : __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
186 5765730 : q = _mm256_mulhi_epu16(q, qp[1]);
187 :
188 11531500 : __m256i dq = _mm256_mullo_epi16(q, qp[2]);
189 5765730 : dq = _mm256_srli_epi16(dq, 1);
190 :
191 5765730 : q = _mm256_sign_epi16(q, *c);
192 11531500 : dq = _mm256_sign_epi16(dq, *c);
193 :
194 34594400 : store_two_quan(q, qcoeff, dq, dqcoeff);
195 5765730 : const __m256i zero = _mm256_setzero_si256();
196 5765730 : const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
197 5765730 : const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
198 5765730 : const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
199 5765730 : __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
200 5765730 : cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
201 11531500 : *eob = _mm256_max_epi16(*eob, cur_eob);
202 : }
203 : else {
204 20009900 : write_zero(qcoeff);
205 20025300 : write_zero(dqcoeff);
206 : }
207 25790000 : }
208 :
209 647900 : void eb_av1_quantize_fp_32x32_avx2(
210 : const TranLow *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
211 : const int16_t *round_ptr, const int16_t *quant_ptr,
212 : const int16_t *quant_shift_ptr, TranLow *qcoeff_ptr,
213 : TranLow *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
214 : const int16_t *scan_ptr, const int16_t *iscan_ptr) {
215 : (void)scan_ptr;
216 : (void)zbin_ptr;
217 : (void)quant_shift_ptr;
218 647900 : const unsigned int step = 16;
219 :
220 : __m256i qp[3];
221 : __m256i coeff, thr;
222 647900 : const int log_scale = 1;
223 :
224 647900 : init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
225 647904 : read_coeff(coeff_ptr, &coeff);
226 :
227 647903 : __m256i eob = _mm256_setzero_si256();
228 647903 : quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
229 :
230 647903 : coeff_ptr += step;
231 647903 : qcoeff_ptr += step;
232 647903 : dqcoeff_ptr += step;
233 647903 : iscan_ptr += step;
234 647903 : n_coeffs -= step;
235 :
236 647903 : update_qp(log_scale, &thr, qp);
237 :
238 25781500 : while (n_coeffs > 0) {
239 25133600 : read_coeff(coeff_ptr, &coeff);
240 25131700 : quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
241 : &eob);
242 :
243 25133600 : coeff_ptr += step;
244 25133600 : qcoeff_ptr += step;
245 25133600 : dqcoeff_ptr += step;
246 25133600 : iscan_ptr += step;
247 25133600 : n_coeffs -= step;
248 : }
249 647897 : *eob_ptr = quant_gather_eob(eob);
250 647893 : }
251 :
252 1121270 : static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
253 : __m256i *c, const int16_t *iscan_ptr,
254 : TranLow *qcoeff, TranLow *dqcoeff,
255 : __m256i *eob) {
256 1121270 : const __m256i abs_coeff = _mm256_abs_epi16(*c);
257 1121270 : __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
258 3363810 : mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
259 1121270 : const int nzflag = _mm256_movemask_epi8(mask);
260 :
261 1121270 : if (nzflag) {
262 720668 : __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
263 720668 : __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
264 1441340 : __m256i ql = _mm256_mullo_epi16(q, qp[1]);
265 720668 : qh = _mm256_slli_epi16(qh, 2);
266 720668 : ql = _mm256_srli_epi16(ql, 14);
267 720668 : q = _mm256_or_si256(qh, ql);
268 1441340 : const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
269 2162000 : const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
270 720668 : __m256i dq = _mm256_or_si256(dqh, dql);
271 :
272 720668 : q = _mm256_sign_epi16(q, *c);
273 1441340 : dq = _mm256_sign_epi16(dq, *c);
274 :
275 4324010 : store_two_quan(q, qcoeff, dq, dqcoeff);
276 720668 : const __m256i zero = _mm256_setzero_si256();
277 720668 : const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
278 720668 : const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
279 720668 : const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
280 720668 : __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
281 720668 : cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
282 1441340 : *eob = _mm256_max_epi16(*eob, cur_eob);
283 : }
284 : else {
285 400603 : write_zero(qcoeff);
286 400814 : write_zero(dqcoeff);
287 : }
288 1121480 : }
289 :
290 17527 : void eb_av1_quantize_fp_64x64_avx2(
291 : const TranLow *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
292 : const int16_t *round_ptr, const int16_t *quant_ptr,
293 : const int16_t *quant_shift_ptr, TranLow *qcoeff_ptr,
294 : TranLow *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
295 : const int16_t *scan_ptr, const int16_t *iscan_ptr) {
296 : (void)scan_ptr;
297 : (void)zbin_ptr;
298 : (void)quant_shift_ptr;
299 17527 : const unsigned int step = 16;
300 :
301 : __m256i qp[3];
302 : __m256i coeff, thr;
303 17527 : const int log_scale = 2;
304 :
305 17527 : init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
306 17528 : read_coeff(coeff_ptr, &coeff);
307 :
308 17527 : __m256i eob = _mm256_setzero_si256();
309 17527 : quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
310 :
311 17527 : coeff_ptr += step;
312 17527 : qcoeff_ptr += step;
313 17527 : dqcoeff_ptr += step;
314 17527 : iscan_ptr += step;
315 17527 : n_coeffs -= step;
316 :
317 17527 : update_qp(log_scale, &thr, qp);
318 :
319 1121390 : while (n_coeffs > 0) {
320 1103860 : read_coeff(coeff_ptr, &coeff);
321 1103800 : quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
322 : &eob);
323 :
324 1103860 : coeff_ptr += step;
325 1103860 : qcoeff_ptr += step;
326 1103860 : dqcoeff_ptr += step;
327 1103860 : iscan_ptr += step;
328 1103860 : n_coeffs -= step;
329 : }
330 17527 : *eob_ptr = quant_gather_eob(eob);
331 17527 : }
|