/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h>

#include "EbDefinitions.h"
#include "aom_dsp_rtcd.h"

static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                        __m256i alpha_sign, __m256i dc_q0) {
    __m256i ac_q3 = _mm256_loadu_si256(input);
    __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
    __m256i scaled_luma_q0 =
        _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
    scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
    return _mm256_add_epi16(scaled_luma_q0, dc_q0);
}

static INLINE __m128i predict_unclipped_ssse3(const __m128i *input, __m128i alpha_q12,
                                              __m128i alpha_sign, __m128i dc_q0) {
    __m128i ac_q3 = _mm_loadu_si128(input);
    __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
    __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
    scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
    return _mm_add_epi16(scaled_luma_q0, dc_q0);
}

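/*
 * For reference, a scalar sketch of what predict_unclipped computes per lane
 * (illustrative only; this helper name is hypothetical and not part of the
 * codec API). With alpha_q12 == |alpha_q3| << 9, _mm_mulhrs_epi16 yields
 * (|ac_q3| * (|alpha_q3| << 9) + (1 << 14)) >> 15
 *   == (|ac_q3| * |alpha_q3| + 32) >> 6,
 * and the two sign operations restore the sign of alpha_q3 * ac_q3.
 */
static INLINE int16_t predict_unclipped_scalar_sketch(int16_t ac_q3,
                                                      int16_t alpha_q3,
                                                      int16_t dc_q0) {
    const int32_t abs_ac = ac_q3 < 0 ? -ac_q3 : ac_q3;
    const int32_t abs_alpha = alpha_q3 < 0 ? -alpha_q3 : alpha_q3;
    // Same fixed-point rounding as _mm_mulhrs_epi16 on |ac| * alpha_q12.
    const int32_t scaled = (abs_ac * (abs_alpha << 9) + (1 << 14)) >> 15;
    const int32_t negate = (ac_q3 < 0) != (alpha_q3 < 0);
    return (int16_t)(dc_q0 + (negate ? -scaled : scaled));
}
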
// Store the 32-bit integer from the first (lowest) element of a into memory.
static INLINE void _mm_storeh_epi32(__m128i *mem_addr, __m128i a) {
    *((int32_t *)mem_addr) = _mm_cvtsi128_si32(a);
}

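/*
 * CfL low-bitdepth prediction: for every pixel, add the alpha-scaled AC
 * contribution from pred_buf_q3 to the DC prediction broadcast from pred[0],
 * then saturate to [0, 255] via the packus instructions. Widths <= 16 take
 * the SSSE3 path; wider blocks take the AVX2 path.
 */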
void eb_cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
                             uint8_t *pred,
                             int32_t pred_stride,
                             uint8_t *dst,
                             int32_t dst_stride,
                             int32_t alpha_q3,
                             int32_t bit_depth,
                             int32_t width,
                             int32_t height) {
    (void)bit_depth;
    if (width <= 16)
    {
        const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
        const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
        const __m128i dc_q0 = _mm_set1_epi16(*pred);
        __m128i *row = (__m128i *)pred_buf_q3;
        const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
        do {
            __m128i res = predict_unclipped_ssse3(row, alpha_q12, alpha_sign, dc_q0);
            if (width < 16) {
                res = _mm_packus_epi16(res, res);
                if (width == 4)
                    _mm_storeh_epi32((__m128i *)dst, res);
                else
                    _mm_storel_epi64((__m128i *)dst, res);
            }
            else {
                __m128i next = predict_unclipped_ssse3(row + 1, alpha_q12, alpha_sign, dc_q0);
                res = _mm_packus_epi16(res, next);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += dst_stride;
        } while ((row += CFL_BUF_LINE_I128) < row_end);
    }
    else
    {
        const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
        const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
        const __m256i dc_q0 = _mm256_set1_epi16(*pred);
        __m256i *row = (__m256i *)pred_buf_q3;
        const __m256i *row_end = row + height * CFL_BUF_LINE_I256;

        do {
            __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
            __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
            res = _mm256_packus_epi16(res, next);
            res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
            _mm256_storeu_si256((__m256i *)dst, res);
            dst += dst_stride;
            pred += pred_stride;
        } while ((row += CFL_BUF_LINE_I256) < row_end);
    }
}

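/*
 * Minimal usage sketch (a hypothetical driver, not the project's actual call
 * site): predict an 8x8 chroma block. It assumes dst[0] already holds the DC
 * prediction (the kernel broadcasts it) and that pred_buf_q3 has already been
 * zero-meaned (see eb_subtract_average_avx2 below), with rows spaced
 * 8 * CFL_BUF_LINE_I128 int16_t samples apart.
 */
static INLINE void example_cfl_predict_8x8(const int16_t *pred_buf_q3,
                                           uint8_t *dst, int32_t dst_stride,
                                           int32_t alpha_q3) {
    eb_cfl_predict_lbd_avx2(pred_buf_q3, dst, dst_stride, dst, dst_stride,
                            alpha_q3, /*bit_depth=*/8, /*width=*/8,
                            /*height=*/8);
}
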
static __m256i highbd_max_epi16(int32_t bd) {
    const __m256i neg_one = _mm256_set1_epi16(-1);
    // (1 << bd) - 1 => -(-1 << bd) - 1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
    // e.g. bd == 10: (-1 << 10) ^ -1 == 0x03FF == 1023 == (1 << 10) - 1.
    return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
}

static INLINE __m128i highbd_max_epi16_ssse3(int32_t bd) {
    const __m128i neg_one = _mm_set1_epi16(-1);
    // (1 << bd) - 1 => -(-1 << bd) - 1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
    return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
}

static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
    return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
}

static INLINE __m128i highbd_clamp_epi16_ssse3(__m128i u, __m128i zero, __m128i max) {
    return _mm_max_epi16(_mm_min_epi16(u, max), zero);
}

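/*
 * CfL high-bitdepth prediction. The SSSE3 path (width < 16) broadcasts the
 * single DC value pred[0]; the AVX2 path loads its DC vector from the first
 * 16 samples of pred. Note dc_q0 is hoisted out of the loop, so the per-row
 * pred += pred_stride below does not change it.
 */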
void eb_cfl_predict_hbd_avx2(
    const int16_t *pred_buf_q3,
    uint16_t *pred,
    int32_t pred_stride,
    uint16_t *dst,
    int32_t dst_stride,
    int32_t alpha_q3,
    int32_t bit_depth,
    int32_t width,
    int32_t height) {
    // Use the SSSE3 version for smaller widths.
    if (width < 16)
    {
        const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
        const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
        const __m128i dc_q0 = _mm_set1_epi16(*pred);
        const __m128i max = highbd_max_epi16_ssse3(bit_depth);
        const __m128i zeros = _mm_setzero_si128();
        __m128i *row = (__m128i *)pred_buf_q3;
        const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
        do {
            __m128i res = predict_unclipped_ssse3(row, alpha_q12, alpha_sign, dc_q0);
            res = highbd_clamp_epi16_ssse3(res, zeros, max);
            if (width == 4)
                _mm_storel_epi64((__m128i *)dst, res);
            else
                _mm_storeu_si128((__m128i *)dst, res);
            dst += dst_stride;
        } while ((row += CFL_BUF_LINE_I128) < row_end);
    }
    else
    {
        assert(width == 16 || width == 32);
        const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
        const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
        const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)pred);
        const __m256i max = highbd_max_epi16(bit_depth);

        __m256i *row = (__m256i *)pred_buf_q3;
        const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
        do {
            const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
            _mm256_storeu_si256((__m256i *)dst,
                                highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
            if (width == 32) {
                const __m256i res_1 =
                    predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
                _mm256_storeu_si256(
                    (__m256i *)(dst + 16),
                    highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
            }
            dst += dst_stride;
            pred += pred_stride;
        } while ((row += CFL_BUF_LINE_I256) < row_end);
    }
}

// Returns a vector where all the (32-bit) elements are the sum of all the
// lanes in a.
static INLINE __m256i fill_sum_epi32(__m256i a) {
    // Given that a == [A, B, C, D, E, F, G, H]
    a = _mm256_hadd_epi32(a, a);
    // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
    // a == [A', C', A', C', E', G', E', G']
    a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
    // a == [A', C', E', G', A', C', E', G']
    a = _mm256_hadd_epi32(a, a);
    // Given that A'' == A' + C' and E'' == E' + G'
    // a == [A'', E'', A'', E'', A'', E'', A'', E'']
    return _mm256_hadd_epi32(a, a);
    // Given that A''' == A'' + E''
    // a == [A''', A''', A''', A''', A''', A''', A''', A''']
}
// 128-bit variant: broadcast the horizontal sum of the four 32-bit lanes
// using two shuffle-and-add steps.
static INLINE __m128i fill_sum_epi32_sse2(__m128i l0) {
    l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
}
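// Zero-extend the sixteen 16-bit lanes of a to 32 bits and add the low and
// high half of each 128-bit lane, yielding eight 32-bit partial sums. The
// extension is unsigned, which is fine here: this is only used on the Q3
// buffer before the average is subtracted, when every sample is non-negative.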
static INLINE __m256i _mm256_addl_epi16(__m256i a) {
    return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
                            _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
}

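/*
 * Zero-mean the Q3 prediction buffer in place: sum all width x height
 * samples, derive the rounded average, and subtract it from every sample.
 */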
/* static INLINE */ void eb_subtract_average_avx2(int16_t *pred_buf_q3, int32_t width,
                                                  int32_t height, int32_t round_offset,
                                                  int32_t num_pel_log2) {
    // Use the SSE2 version for smaller widths.
    if ((width == 4) || (width == 8))
    {
        const __m128i zeros = _mm_setzero_si128();
        const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
        const __m128i *src = (__m128i *)pred_buf_q3;
        const __m128i *const end = src + height * CFL_BUF_LINE_I128;
        // Width 4 sums four rows per iteration, width 8 sums two.
        const int32_t step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));

        __m128i sum = zeros;
        do {
            __m128i l0;
            if (width == 4) {
                l0 = _mm_add_epi16(_mm_loadl_epi64(src),
                                   _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
                __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
                                           _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
                sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
                                                       _mm_unpacklo_epi16(l1, zeros)));
            }
            else {
                l0 = _mm_add_epi16(_mm_loadu_si128(src),
                                   _mm_loadu_si128(src + CFL_BUF_LINE_I128));
                sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
                                                       _mm_unpackhi_epi16(l0, zeros)));
            }
            src += step;
        } while (src < end);

        sum = fill_sum_epi32_sse2(sum);

        __m128i avg_epi16 =
            _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
        avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);

        src = (__m128i *)pred_buf_q3;
        __m128i *dst = (__m128i *)pred_buf_q3;
        do {
            if (width == 4)
                _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
            else {
                _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
                if (width > 8) {
                    _mm_storeu_si128(dst + 1,
                                     _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
                    if (width == 32) {
                        _mm_storeu_si128(dst + 2,
                                         _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
                        _mm_storeu_si128(dst + 3,
                                         _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
                    }
                }
            }
            src += CFL_BUF_LINE_I128;
            dst += CFL_BUF_LINE_I128;
        } while (src < end);
    }
    else
    {
        const __m256i *src = (__m256i *)pred_buf_q3;
        const __m256i *const end = src + height * CFL_BUF_LINE_I256;
        // To maximize usage of the AVX2 registers, we sum two rows per loop
        // iteration.
        const int32_t step = 2 * CFL_BUF_LINE_I256;

        __m256i sum = _mm256_setzero_si256();
        // For width 32, we use a second sum accumulator to reduce accumulator
        // dependencies in the loop.
        __m256i sum2;
        if (width == 32) sum2 = _mm256_setzero_si256();

        do {
            // Add the top row to the bottom row.
            __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
                                          _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
            sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
            if (width == 32) { /* Don't worry: this `if` gets optimized out. */
                // Add the second half of the top row to the second half of
                // the bottom row.
                __m256i l1 =
                    _mm256_add_epi16(_mm256_loadu_si256(src + 1),
                                     _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
                sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
            }
            src += step;
        } while (src < end);
        // Combine both sum accumulators.
        if (width == 32) sum = _mm256_add_epi32(sum, sum2);

        __m256i fill = fill_sum_epi32(sum);

        __m256i avg_epi16 = _mm256_srli_epi32(
            _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
        avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);

        // Subtract-and-store loop.
        src = (__m256i *)pred_buf_q3;
        __m256i *dst = (__m256i *)pred_buf_q3;
        do {
            _mm256_storeu_si256(dst,
                                _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
            if (width == 32) {
                _mm256_storeu_si256(
                    dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
            }
            src += CFL_BUF_LINE_I256;
            dst += CFL_BUF_LINE_I256;
        } while (src < end);
    }
}
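
/*
 * For reference, a scalar sketch of the same computation (illustrative only;
 * this helper name is hypothetical). Rows of the Q3 buffer are spaced
 * 8 * CFL_BUF_LINE_I128 int16_t samples apart, matching the vector pointer
 * arithmetic above.
 */
static INLINE void subtract_average_scalar_sketch(int16_t *pred_buf_q3,
                                                  int32_t width, int32_t height,
                                                  int32_t round_offset,
                                                  int32_t num_pel_log2) {
    const int32_t stride = 8 * CFL_BUF_LINE_I128; // int16_t elements per row
    int32_t sum = 0;
    for (int32_t j = 0; j < height; j++)
        for (int32_t i = 0; i < width; i++)
            sum += pred_buf_q3[j * stride + i];
    const int16_t avg = (int16_t)((sum + round_offset) >> num_pel_log2);
    for (int32_t j = 0; j < height; j++)
        for (int32_t i = 0; i < width; i++)
            pred_buf_q3[j * stride + i] -= avg;
}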