Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "EbDefinitions.h"
13 : #include "aom_dsp_rtcd.h"
14 : #include <tmmintrin.h>
15 :
// Smooth-predictor weights are 8-bit fixed-point values: a stored weight w
// stands for w / 2^sm_weight_log2_scale. For each block size they fall off
// quadratically from '1' to '1 / BlockSize'.
static const int32_t sm_weight_log2_scale = 8;

// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
#define MAX_BLOCK_DIM 64

/* clang-format off */
// All per-size weight tables stored back to back, laid out so that the table
// for block size bs begins at index bs.
static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
    // Indices 0..1 are padding: lookups are always offset by bs, and bs >= 2.
    0, 0,
    // bs = 2
    255, 128,
    // bs = 4
    255, 149, 85, 64,
    // bs = 8
    255, 197, 146, 105, 73, 50, 37, 32,
    // bs = 16
    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
    // bs = 32
    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
    // bs = 64
    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, 150,
    144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
    65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20,
    18, 16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
};
44 :
45 : // -----------------------------------------------------------------------------
46 : // PAETH_PRED
47 :
48 : // -----------------------------------------------------------------------------
49 : // SMOOTH_PRED
50 :
51 : // pixels[0]: above and below_pred interleave vector
52 : // pixels[1]: left vector
53 : // pixels[2]: right_pred vector
54 267394 : static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
55 : int32_t height, __m128i *pixels) {
56 267394 : __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
57 267394 : if (height == 4)
58 372746 : pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
59 81021 : else if (height == 8)
60 109728 : pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
61 : else
62 52314 : pixels[1] = _mm_loadu_si128(((const __m128i *)left));
63 :
64 267394 : pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
65 :
66 534788 : const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
67 267394 : const __m128i zero = _mm_setzero_si128();
68 267394 : d = _mm_unpacklo_epi8(d, zero);
69 267394 : pixels[0] = _mm_unpacklo_epi16(d, bp);
70 267394 : }
71 :
72 : // weight_h[0]: weight_h vector
73 : // weight_h[1]: scale - weight_h vector
74 : // weight_h[2]: same as [0], second half for height = 16 only
75 : // weight_h[3]: same as [1], second half for height = 16 only
76 : // weight_w[0]: weights_w and scale - weights_w interleave vector
77 267393 : static INLINE void load_weight_w4(const uint8_t *weight_array, int32_t height,
78 : __m128i *weight_h, __m128i *weight_w) {
79 267393 : const __m128i zero = _mm_setzero_si128();
80 267393 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
81 534786 : const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
82 267393 : weight_h[0] = _mm_unpacklo_epi8(t, zero);
83 267393 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
84 267393 : weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
85 :
86 267393 : if (height == 8) {
87 109728 : const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
88 54864 : weight_h[0] = _mm_unpacklo_epi8(weight, zero);
89 109728 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
90 : }
91 212529 : else if (height == 16) {
92 52318 : const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
93 26159 : weight_h[0] = _mm_unpacklo_epi8(weight, zero);
94 26159 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
95 26159 : weight_h[2] = _mm_unpackhi_epi8(weight, zero);
96 52318 : weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
97 : }
98 267393 : }
99 :
100 293553 : static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
101 : const __m128i *ww, int32_t h, uint8_t *dst,
102 : ptrdiff_t stride, int32_t second_half) {
103 587106 : const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
104 293553 : const __m128i one = _mm_set1_epi16(1);
105 293553 : const __m128i inc = _mm_set1_epi16(0x202);
106 293553 : const __m128i gat = _mm_set1_epi32(0xc080400);
107 587106 : __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
108 293553 : __m128i d = _mm_set1_epi16(0x100);
109 :
110 1896430 : for (int32_t i = 0; i < h; ++i) {
111 1602880 : const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
112 3205750 : const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
113 1602880 : const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
114 1602880 : __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
115 :
116 1602880 : __m128i b = _mm_shuffle_epi8(pixel[1], rep);
117 1602880 : b = _mm_unpacklo_epi16(b, pixel[2]);
118 3205750 : __m128i sum = _mm_madd_epi16(b, ww[0]);
119 :
120 1602880 : sum = _mm_add_epi32(s, sum);
121 1602880 : sum = _mm_add_epi32(sum, round);
122 3205750 : sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
123 :
124 1602880 : sum = _mm_shuffle_epi8(sum, gat);
125 1602880 : *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
126 1602880 : dst += stride;
127 :
128 1602880 : rep = _mm_add_epi16(rep, one);
129 1602880 : d = _mm_add_epi16(d, inc);
130 : }
131 293553 : }
132 :
133 186373 : void eb_aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
134 : const uint8_t *above, const uint8_t *left) {
135 : __m128i pixels[3];
136 186373 : load_pixel_w4(above, left, 4, pixels);
137 :
138 : __m128i wh[4], ww[2];
139 186373 : load_weight_w4(sm_weight_arrays, 4, wh, ww);
140 :
141 186373 : smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
142 186375 : }
143 :
144 54864 : void eb_aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
145 : const uint8_t *above, const uint8_t *left) {
146 : __m128i pixels[3];
147 54864 : load_pixel_w4(above, left, 8, pixels);
148 :
149 : __m128i wh[4], ww[2];
150 54864 : load_weight_w4(sm_weight_arrays, 8, wh, ww);
151 :
152 54864 : smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
153 54864 : }
154 :
155 26159 : void eb_aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
156 : const uint8_t *above,
157 : const uint8_t *left) {
158 : __m128i pixels[3];
159 26159 : load_pixel_w4(above, left, 16, pixels);
160 :
161 : __m128i wh[4], ww[2];
162 26159 : load_weight_w4(sm_weight_arrays, 16, wh, ww);
163 :
164 26159 : smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
165 26159 : dst += stride << 3;
166 26159 : smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
167 26159 : }
168 :
169 : // pixels[0]: above and below_pred interleave vector, first half
170 : // pixels[1]: above and below_pred interleave vector, second half
171 : // pixels[2]: left vector
172 : // pixels[3]: right_pred vector
173 : // pixels[4]: above and below_pred interleave vector, first half
174 : // pixels[5]: above and below_pred interleave vector, second half
175 : // pixels[6]: left vector + 16
176 : // pixels[7]: right_pred vector
177 826684 : static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
178 : int32_t height, __m128i *pixels) {
179 826684 : const __m128i zero = _mm_setzero_si128();
180 1653370 : const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
181 826684 : __m128i d = _mm_loadl_epi64((const __m128i *)above);
182 826684 : d = _mm_unpacklo_epi8(d, zero);
183 826684 : pixels[0] = _mm_unpacklo_epi16(d, bp);
184 826684 : pixels[1] = _mm_unpackhi_epi16(d, bp);
185 :
186 826684 : pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
187 :
188 826684 : if (height == 4)
189 105128 : pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
190 774120 : else if (height == 8)
191 793418 : pixels[2] = _mm_loadl_epi64((const __m128i *)left);
192 377411 : else if (height == 16)
193 687440 : pixels[2] = _mm_load_si128((const __m128i *)left);
194 : else {
195 33691 : pixels[2] = _mm_load_si128((const __m128i *)left);
196 33691 : pixels[4] = pixels[0];
197 33691 : pixels[5] = pixels[1];
198 33691 : pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
199 33691 : pixels[7] = pixels[3];
200 : }
201 826684 : }
202 :
203 : // weight_h[0]: weight_h vector
204 : // weight_h[1]: scale - weight_h vector
205 : // weight_h[2]: same as [0], offset 8
206 : // weight_h[3]: same as [1], offset 8
207 : // weight_h[4]: same as [0], offset 16
208 : // weight_h[5]: same as [1], offset 16
209 : // weight_h[6]: same as [0], offset 24
210 : // weight_h[7]: same as [1], offset 24
211 : // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
212 : // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
213 826687 : static INLINE void load_weight_w8(const uint8_t *weight_array, int32_t height,
214 : __m128i *weight_h, __m128i *weight_w) {
215 826687 : const __m128i zero = _mm_setzero_si128();
216 826687 : const int32_t we_offset = height < 8 ? 4 : 8;
217 1653370 : __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
218 826687 : weight_h[0] = _mm_unpacklo_epi8(we, zero);
219 826687 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
220 826687 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
221 :
222 826687 : if (height == 4) {
223 52564 : we = _mm_srli_si128(we, 4);
224 52564 : __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
225 52564 : __m128i tmp2 = _mm_sub_epi16(d, tmp1);
226 52564 : weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
227 105128 : weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
228 : }
229 : else {
230 774123 : weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
231 1548250 : weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
232 : }
233 :
234 826687 : if (height == 16) {
235 687438 : we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
236 343719 : weight_h[0] = _mm_unpacklo_epi8(we, zero);
237 343719 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
238 343719 : weight_h[2] = _mm_unpackhi_epi8(we, zero);
239 687438 : weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
240 : }
241 482968 : else if (height == 32) {
242 : const __m128i weight_lo =
243 67404 : _mm_loadu_si128((const __m128i *)&weight_array[32]);
244 33702 : weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
245 33702 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
246 33702 : weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
247 33702 : weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
248 : const __m128i weight_hi =
249 33702 : _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
250 33702 : weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
251 33702 : weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
252 33702 : weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
253 67404 : weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
254 : }
255 826687 : }
256 :
257 1271490 : static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
258 : const __m128i *ww, int32_t h, uint8_t *dst,
259 : ptrdiff_t stride, int32_t second_half) {
260 2542990 : const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
261 1271490 : const __m128i one = _mm_set1_epi16(1);
262 1271490 : const __m128i inc = _mm_set1_epi16(0x202);
263 1271490 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
264 :
265 2542990 : __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
266 1271490 : __m128i d = _mm_set1_epi16(0x100);
267 :
268 : int32_t i;
269 11231900 : for (i = 0; i < h; ++i) {
270 9960430 : const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
271 19920900 : const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
272 9960430 : const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
273 9960430 : __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
274 9960430 : __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
275 :
276 9960430 : __m128i b = _mm_shuffle_epi8(pixels[2], rep);
277 9960430 : b = _mm_unpacklo_epi16(b, pixels[3]);
278 9960430 : __m128i sum0 = _mm_madd_epi16(b, ww[0]);
279 19920900 : __m128i sum1 = _mm_madd_epi16(b, ww[1]);
280 :
281 9960430 : s0 = _mm_add_epi32(s0, sum0);
282 9960430 : s0 = _mm_add_epi32(s0, round);
283 19920900 : s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
284 :
285 9960430 : s1 = _mm_add_epi32(s1, sum1);
286 9960430 : s1 = _mm_add_epi32(s1, round);
287 19920900 : s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
288 :
289 9960430 : sum0 = _mm_packus_epi16(s0, s1);
290 9960430 : sum0 = _mm_shuffle_epi8(sum0, gat);
291 9960430 : _mm_storel_epi64((__m128i *)dst, sum0);
292 9960430 : dst += stride;
293 :
294 9960430 : rep = _mm_add_epi16(rep, one);
295 9960430 : d = _mm_add_epi16(d, inc);
296 : }
297 1271490 : }
298 :
299 52564 : void eb_aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
300 : const uint8_t *above, const uint8_t *left) {
301 : __m128i pixels[4];
302 52564 : load_pixel_w8(above, left, 4, pixels);
303 :
304 : __m128i wh[4], ww[2];
305 52564 : load_weight_w8(sm_weight_arrays, 4, wh, ww);
306 :
307 52564 : smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
308 52564 : }
309 :
310 396711 : void eb_aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
311 : const uint8_t *above, const uint8_t *left) {
312 : __m128i pixels[4];
313 396711 : load_pixel_w8(above, left, 8, pixels);
314 :
315 : __m128i wh[4], ww[2];
316 396709 : load_weight_w8(sm_weight_arrays, 8, wh, ww);
317 :
318 396713 : smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
319 396714 : }
320 :
321 343719 : void eb_aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
322 : const uint8_t *above,
323 : const uint8_t *left) {
324 : __m128i pixels[4];
325 343719 : load_pixel_w8(above, left, 16, pixels);
326 :
327 : __m128i wh[4], ww[2];
328 343720 : load_weight_w8(sm_weight_arrays, 16, wh, ww);
329 :
330 343720 : smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
331 343721 : dst += stride << 3;
332 343721 : smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
333 343722 : }
334 :
335 33702 : void eb_aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
336 : const uint8_t *above,
337 : const uint8_t *left) {
338 : __m128i pixels[8];
339 33702 : load_pixel_w8(above, left, 32, pixels);
340 :
341 : __m128i wh[8], ww[2];
342 33702 : load_weight_w8(sm_weight_arrays, 32, wh, ww);
343 :
344 33702 : smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
345 33702 : dst += stride << 3;
346 33702 : smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
347 33702 : dst += stride << 3;
348 33702 : smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
349 33702 : dst += stride << 3;
350 33702 : smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
351 33702 : }
352 :
353 870553 : static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
354 : const uint8_t *above,
355 : const uint8_t *left, uint32_t bw,
356 : uint32_t bh) {
357 870553 : const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
358 870553 : const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
359 870553 : const __m128i zero = _mm_setzero_si128();
360 : const __m128i scale_value =
361 870553 : _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
362 1741110 : const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
363 870553 : const __m128i dup16 = _mm_set1_epi32(0x01000100);
364 : const __m128i top_right =
365 2611660 : _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
366 870553 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
367 870553 : const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
368 :
369 15931400 : for (uint32_t y = 0; y < bh; ++y) {
370 15060800 : const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
371 30121700 : const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
372 15060800 : const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
373 15060800 : __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
374 15060800 : const __m128i wl_y =
375 15060800 : _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
376 15060800 : pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
377 15060800 : pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
378 :
379 57215500 : for (uint32_t x = 0; x < bw; x += 8) {
380 42154600 : const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
381 : const __m128i weights_x =
382 84309300 : _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
383 42154600 : const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
384 42154600 : const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
385 42154600 : const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
386 :
387 42154600 : __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
388 42154600 : __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
389 :
390 : const __m128i scale_m_weights_x =
391 84309300 : _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
392 42154600 : const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
393 42154600 : const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
394 42154600 : const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
395 :
396 42154600 : pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
397 42154600 : pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
398 :
399 42154600 : pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
400 42154600 : pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
401 :
402 42154600 : pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
403 84309300 : pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
404 :
405 42154600 : __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
406 42154600 : pred = _mm_shuffle_epi8(pred, gat);
407 42154600 : _mm_storel_epi64((__m128i *)(dst + x), pred);
408 : }
409 15060800 : dst += stride;
410 : }
411 870553 : }
412 :
// SMOOTH intra predictor for a 16x4 block.
void eb_aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 4;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
418 :
// SMOOTH intra predictor for a 16x8 block.
void eb_aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 8;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
424 :
// SMOOTH intra predictor for a 16x16 block.
void eb_aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 16;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
430 :
// SMOOTH intra predictor for a 16x32 block.
void eb_aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 32;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
436 :
// SMOOTH intra predictor for a 32x8 block.
void eb_aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 8;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
442 :
// SMOOTH intra predictor for a 32x16 block.
void eb_aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 16;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
448 :
// SMOOTH intra predictor for a 32x32 block.
void eb_aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 32;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
454 :
// SMOOTH intra predictor for a 32x64 block.
void eb_aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 64;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
460 :
// SMOOTH intra predictor for a 64x64 block.
void eb_aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 64;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
466 :
// SMOOTH intra predictor for a 64x32 block.
void eb_aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 32;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
472 :
// SMOOTH intra predictor for a 64x16 block.
void eb_aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 16;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
478 :
// SMOOTH intra predictor for a 16x64 block.
void eb_aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 64;
    smooth_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
484 :
485 : // -----------------------------------------------------------------------------
486 : // SMOOTH_V_PRED
487 :
488 : // pixels[0]: above and below_pred interleave vector
489 197579 : static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
490 : int32_t height, __m128i *pixels) {
491 197579 : const __m128i zero = _mm_setzero_si128();
492 197579 : __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
493 395158 : const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
494 197579 : d = _mm_unpacklo_epi8(d, zero);
495 197579 : pixels[0] = _mm_unpacklo_epi16(d, bp);
496 197579 : }
497 :
498 : // weights[0]: weights_h vector
499 : // weights[1]: scale - weights_h vector
500 197580 : static INLINE void load_weight_v_w4(const uint8_t *weight_array, int32_t height,
501 : __m128i *weights) {
502 197580 : const __m128i zero = _mm_setzero_si128();
503 197580 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
504 :
505 197580 : if (height == 4) {
506 : const __m128i weight =
507 268940 : _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
508 134470 : weights[0] = _mm_unpacklo_epi8(weight, zero);
509 268940 : weights[1] = _mm_sub_epi16(d, weights[0]);
510 : }
511 63110 : else if (height == 8) {
512 85340 : const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
513 42670 : weights[0] = _mm_unpacklo_epi8(weight, zero);
514 85340 : weights[1] = _mm_sub_epi16(d, weights[0]);
515 : }
516 : else {
517 40880 : const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
518 20440 : weights[0] = _mm_unpacklo_epi8(weight, zero);
519 20440 : weights[1] = _mm_sub_epi16(d, weights[0]);
520 20440 : weights[2] = _mm_unpackhi_epi8(weight, zero);
521 40880 : weights[3] = _mm_sub_epi16(d, weights[2]);
522 : }
523 197580 : }
524 :
525 218021 : static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
526 : const __m128i *weight, int32_t h, uint8_t *dst,
527 : ptrdiff_t stride) {
528 436042 : const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
529 218021 : const __m128i inc = _mm_set1_epi16(0x202);
530 218021 : const __m128i gat = _mm_set1_epi32(0xc080400);
531 218021 : __m128i d = _mm_set1_epi16(0x100);
532 :
533 1424300 : for (int32_t i = 0; i < h; ++i) {
534 1206270 : const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
535 2412550 : const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
536 1206270 : const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
537 2412550 : __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
538 1206270 : sum = _mm_add_epi32(sum, pred_round);
539 2412550 : sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
540 1206270 : sum = _mm_shuffle_epi8(sum, gat);
541 1206270 : *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
542 1206270 : dst += stride;
543 1206270 : d = _mm_add_epi16(d, inc);
544 : }
545 218021 : }
546 :
547 134469 : void eb_aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
548 : const uint8_t *above,
549 : const uint8_t *left) {
550 : __m128i pixels;
551 134469 : load_pixel_v_w4(above, left, 4, &pixels);
552 :
553 : __m128i weights[2];
554 134470 : load_weight_v_w4(sm_weight_arrays, 4, weights);
555 :
556 134471 : smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
557 134471 : }
558 :
559 42670 : void eb_aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
560 : const uint8_t *above,
561 : const uint8_t *left) {
562 : __m128i pixels;
563 42670 : load_pixel_v_w4(above, left, 8, &pixels);
564 :
565 : __m128i weights[2];
566 42670 : load_weight_v_w4(sm_weight_arrays, 8, weights);
567 :
568 42670 : smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
569 42670 : }
570 :
571 20440 : void eb_aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
572 : const uint8_t *above,
573 : const uint8_t *left) {
574 : __m128i pixels;
575 20440 : load_pixel_v_w4(above, left, 16, &pixels);
576 :
577 : __m128i weights[4];
578 20440 : load_weight_v_w4(sm_weight_arrays, 16, weights);
579 :
580 20440 : smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
581 20440 : dst += stride << 3;
582 20440 : smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
583 20440 : }
584 :
585 : // pixels[0]: above and below_pred interleave vector, first half
586 : // pixels[1]: above and below_pred interleave vector, second half
587 133577 : static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
588 : int32_t height, __m128i *pixels) {
589 133577 : const __m128i zero = _mm_setzero_si128();
590 133577 : __m128i d = _mm_loadl_epi64((const __m128i *)above);
591 267154 : const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
592 133577 : d = _mm_unpacklo_epi8(d, zero);
593 133577 : pixels[0] = _mm_unpacklo_epi16(d, bp);
594 133577 : pixels[1] = _mm_unpackhi_epi16(d, bp);
595 133577 : }
596 :
597 : // weight_h[0]: weight_h vector
598 : // weight_h[1]: scale - weight_h vector
599 : // weight_h[2]: same as [0], offset 8
600 : // weight_h[3]: same as [1], offset 8
601 : // weight_h[4]: same as [0], offset 16
602 : // weight_h[5]: same as [1], offset 16
603 : // weight_h[6]: same as [0], offset 24
604 : // weight_h[7]: same as [1], offset 24
605 133577 : static INLINE void load_weight_v_w8(const uint8_t *weight_array, int32_t height,
606 : __m128i *weight_h) {
607 133577 : const __m128i zero = _mm_setzero_si128();
608 133577 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
609 :
610 133577 : if (height < 16) {
611 104335 : const int32_t offset = height < 8 ? 4 : 8;
612 : const __m128i weight =
613 208670 : _mm_loadu_si128((const __m128i *)&weight_array[offset]);
614 104335 : weight_h[0] = _mm_unpacklo_epi8(weight, zero);
615 208670 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
616 : }
617 29242 : else if (height == 16) {
618 31926 : const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
619 15963 : weight_h[0] = _mm_unpacklo_epi8(weight, zero);
620 15963 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
621 15963 : weight_h[2] = _mm_unpackhi_epi8(weight, zero);
622 31926 : weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
623 : }
624 : else {
625 : const __m128i weight_lo =
626 26558 : _mm_loadu_si128((const __m128i *)&weight_array[32]);
627 13279 : weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
628 13279 : weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
629 13279 : weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
630 13279 : weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
631 : const __m128i weight_hi =
632 13279 : _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
633 13279 : weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
634 13279 : weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
635 13279 : weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
636 26558 : weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
637 : }
638 133577 : }
639 :
640 189379 : static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
641 : int32_t h, uint8_t *dst, ptrdiff_t stride) {
642 378758 : const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
643 189379 : const __m128i inc = _mm_set1_epi16(0x202);
644 189379 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
645 189379 : __m128i d = _mm_set1_epi16(0x100);
646 :
647 1541810 : for (int32_t i = 0; i < h; ++i) {
648 1352430 : const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
649 2704860 : const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
650 1352430 : const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
651 1352430 : __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
652 2704860 : __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
653 :
654 1352430 : s0 = _mm_add_epi32(s0, pred_round);
655 2704860 : s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
656 :
657 1352430 : s1 = _mm_add_epi32(s1, pred_round);
658 2704860 : s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
659 :
660 1352430 : __m128i sum01 = _mm_packus_epi16(s0, s1);
661 1352430 : sum01 = _mm_shuffle_epi8(sum01, gat);
662 1352430 : _mm_storel_epi64((__m128i *)dst, sum01);
663 1352430 : dst += stride;
664 :
665 1352430 : d = _mm_add_epi16(d, inc);
666 : }
667 189379 : }
668 :
669 40646 : void eb_aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
670 : const uint8_t *above,
671 : const uint8_t *left) {
672 : __m128i pixels[2];
673 40646 : load_pixel_v_w8(above, left, 4, pixels);
674 :
675 : __m128i wh[2];
676 40646 : load_weight_v_w8(sm_weight_arrays, 4, wh);
677 :
678 40646 : smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
679 40646 : }
680 :
681 63689 : void eb_aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
682 : const uint8_t *above,
683 : const uint8_t *left) {
684 : __m128i pixels[2];
685 63689 : load_pixel_v_w8(above, left, 8, pixels);
686 :
687 : __m128i wh[2];
688 63689 : load_weight_v_w8(sm_weight_arrays, 8, wh);
689 :
690 63689 : smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
691 63689 : }
692 :
693 15963 : void eb_aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
694 : const uint8_t *above,
695 : const uint8_t *left) {
696 : __m128i pixels[2];
697 15963 : load_pixel_v_w8(above, left, 16, pixels);
698 :
699 : __m128i wh[4];
700 15963 : load_weight_v_w8(sm_weight_arrays, 16, wh);
701 :
702 15963 : smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
703 15963 : dst += stride << 3;
704 15963 : smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
705 15963 : }
706 :
707 13280 : void eb_aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
708 : const uint8_t *above,
709 : const uint8_t *left) {
710 : __m128i pixels[2];
711 13280 : load_pixel_v_w8(above, left, 32, pixels);
712 :
713 : __m128i wh[8];
714 13280 : load_weight_v_w8(sm_weight_arrays, 32, wh);
715 :
716 13280 : smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
717 13280 : dst += stride << 3;
718 13280 : smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
719 13280 : dst += stride << 3;
720 13280 : smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
721 13280 : dst += stride << 3;
722 13280 : smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
723 13280 : }
724 :
725 136903 : static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
726 : const uint8_t *above,
727 : const uint8_t *left, uint32_t bw,
728 : uint32_t bh) {
729 136903 : const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
730 136903 : const __m128i zero = _mm_setzero_si128();
731 : const __m128i scale_value =
732 273806 : _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
733 136903 : const __m128i dup16 = _mm_set1_epi32(0x01000100);
734 : const __m128i bottom_left =
735 410709 : _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
736 136903 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
737 : const __m128i round =
738 136903 : _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
739 :
740 2718280 : for (uint32_t y = 0; y < bh; ++y) {
741 5162740 : const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
742 : const __m128i scale_m_weights_y =
743 5162740 : _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
744 2581370 : const __m128i wl_y =
745 2581370 : _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
746 :
747 11659000 : for (uint32_t x = 0; x < bw; x += 8) {
748 18155200 : const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
749 : // 8 -> 16
750 9077590 : const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
751 9077590 : const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
752 9077590 : const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
753 : // top_x * weights_y + scale_m_weights_y * bottom_left
754 9077590 : __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
755 9077590 : __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
756 :
757 9077590 : pred_lo = _mm_add_epi32(pred_lo, round);
758 9077590 : pred_hi = _mm_add_epi32(pred_hi, round);
759 9077590 : pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
760 18155200 : pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
761 :
762 9077590 : __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
763 9077590 : pred = _mm_shuffle_epi8(pred, gat);
764 9077590 : _mm_storel_epi64((__m128i *)(dst + x), pred);
765 : }
766 2581370 : dst += stride;
767 : }
768 136903 : }
769 :
// SMOOTH_V prediction for a block 16 pixels wide and 4 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 4;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
775 :
// SMOOTH_V prediction for a block 16 pixels wide and 8 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 8;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
781 :
// SMOOTH_V prediction for a 16x16 block, computed by the generic
// smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 16;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
787 :
// SMOOTH_V prediction for a block 16 pixels wide and 32 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 32;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
793 :
// SMOOTH_V prediction for a block 32 pixels wide and 8 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 8;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
799 :
// SMOOTH_V prediction for a block 32 pixels wide and 16 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 16;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
805 :
// SMOOTH_V prediction for a 32x32 block, computed by the generic
// smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 32;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
811 :
// SMOOTH_V prediction for a block 32 pixels wide and 64 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 64;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
817 :
// SMOOTH_V prediction for a 64x64 block, computed by the generic
// smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 64;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
823 :
// SMOOTH_V prediction for a block 64 pixels wide and 32 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 32;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
829 :
// SMOOTH_V prediction for a block 64 pixels wide and 16 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 16;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
835 :
// SMOOTH_V prediction for a block 16 pixels wide and 64 pixels tall, computed
// by the generic smooth_v_predictor_wxh() kernel.
void eb_aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 64;
    smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
841 :
842 : // -----------------------------------------------------------------------------
843 : // SMOOTH_H_PRED
844 :
845 : // pixels[0]: left vector
846 : // pixels[1]: right_pred vector
847 216365 : static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
848 : int32_t height, __m128i *pixels) {
849 216365 : if (height == 4)
850 299778 : pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
851 66476 : else if (height == 8)
852 44486 : pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
853 : else
854 21990 : pixels[0] = _mm_loadu_si128(((const __m128i *)left));
855 216365 : pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
856 216365 : }
857 :
858 : // weights[0]: weights_w and scale - weights_w interleave vector
859 216365 : static INLINE void load_weight_h_w4(const uint8_t *weight_array, int32_t height,
860 : __m128i *weights) {
861 : (void)height;
862 432730 : const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
863 216365 : const __m128i zero = _mm_setzero_si128();
864 :
865 216365 : const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
866 432730 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
867 216365 : const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
868 216365 : weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
869 216365 : }
870 :
871 238356 : static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
872 : const __m128i *weight, int32_t h, uint8_t *dst,
873 : ptrdiff_t stride) {
874 476712 : const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
875 238356 : const __m128i one = _mm_set1_epi16(1);
876 238356 : const __m128i gat = _mm_set1_epi32(0xc080400);
877 238356 : __m128i rep = _mm_set1_epi16((short)0x8000);
878 :
879 1545630 : for (int32_t i = 0; i < h; ++i) {
880 1307280 : __m128i b = _mm_shuffle_epi8(pixel[0], rep);
881 1307280 : b = _mm_unpacklo_epi16(b, pixel[1]);
882 2614550 : __m128i sum = _mm_madd_epi16(b, weight[0]);
883 :
884 1307280 : sum = _mm_add_epi32(sum, pred_round);
885 2614550 : sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
886 :
887 1307280 : sum = _mm_shuffle_epi8(sum, gat);
888 1307280 : *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
889 1307280 : dst += stride;
890 :
891 1307280 : rep = _mm_add_epi16(rep, one);
892 : }
893 238356 : }
894 :
895 149889 : void eb_aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
896 : const uint8_t *above,
897 : const uint8_t *left) {
898 : __m128i pixels[2];
899 149889 : load_pixel_h_w4(above, left, 4, pixels);
900 :
901 : __m128i weights;
902 149889 : load_weight_h_w4(sm_weight_arrays, 4, &weights);
903 :
904 149889 : smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
905 149889 : }
906 :
907 44486 : void eb_aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
908 : const uint8_t *above,
909 : const uint8_t *left) {
910 : __m128i pixels[2];
911 44486 : load_pixel_h_w4(above, left, 8, pixels);
912 :
913 : __m128i weights;
914 44486 : load_weight_h_w4(sm_weight_arrays, 8, &weights);
915 :
916 44486 : smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
917 44486 : }
918 :
919 21991 : void eb_aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
920 : const uint8_t *above,
921 : const uint8_t *left) {
922 : __m128i pixels[2];
923 21991 : load_pixel_h_w4(above, left, 16, pixels);
924 :
925 : __m128i weights;
926 21991 : load_weight_h_w4(sm_weight_arrays, 8, &weights);
927 :
928 21991 : smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
929 21991 : dst += stride << 3;
930 :
931 21991 : pixels[0] = _mm_srli_si128(pixels[0], 8);
932 21991 : smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
933 21991 : }
934 :
935 : // pixels[0]: left vector
936 : // pixels[1]: right_pred vector
937 : // pixels[2]: left vector + 16
938 : // pixels[3]: right_pred vector
939 151179 : static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
940 : int32_t height, __m128i *pixels) {
941 151179 : pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
942 :
943 151179 : if (height == 4)
944 83884 : pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
945 109237 : else if (height == 8)
946 70118 : pixels[0] = _mm_loadl_epi64((const __m128i *)left);
947 39119 : else if (height == 16)
948 16968 : pixels[0] = _mm_load_si128((const __m128i *)left);
949 : else {
950 22151 : pixels[0] = _mm_load_si128((const __m128i *)left);
951 22151 : pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
952 22151 : pixels[3] = pixels[1];
953 : }
954 151179 : }
955 :
956 : // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
957 : // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
958 151180 : static INLINE void load_weight_h_w8(const uint8_t *weight_array, int32_t height,
959 : __m128i *weight_w) {
960 : (void)height;
961 151180 : const __m128i zero = _mm_setzero_si128();
962 151180 : const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
963 302360 : const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
964 151180 : const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
965 151180 : const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
966 151180 : weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
967 151180 : weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
968 151180 : }
969 :
970 234604 : static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
971 : int32_t h, uint8_t *dst, ptrdiff_t stride,
972 : int32_t second_half) {
973 469208 : const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
974 234604 : const __m128i one = _mm_set1_epi16(1);
975 234604 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
976 469208 : __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
977 :
978 1943610 : for (int32_t i = 0; i < h; ++i) {
979 1709000 : __m128i b = _mm_shuffle_epi8(pixels[0], rep);
980 1709000 : b = _mm_unpacklo_epi16(b, pixels[1]);
981 1709000 : __m128i sum0 = _mm_madd_epi16(b, ww[0]);
982 3418010 : __m128i sum1 = _mm_madd_epi16(b, ww[1]);
983 :
984 1709000 : sum0 = _mm_add_epi32(sum0, pred_round);
985 3418010 : sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
986 :
987 1709000 : sum1 = _mm_add_epi32(sum1, pred_round);
988 3418010 : sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
989 :
990 1709000 : sum0 = _mm_packus_epi16(sum0, sum1);
991 1709000 : sum0 = _mm_shuffle_epi8(sum0, gat);
992 1709000 : _mm_storel_epi64((__m128i *)dst, sum0);
993 1709000 : dst += stride;
994 :
995 1709000 : rep = _mm_add_epi16(rep, one);
996 : }
997 234604 : }
998 :
999 41942 : void eb_aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1000 : const uint8_t *above,
1001 : const uint8_t *left) {
1002 : __m128i pixels[2];
1003 41942 : load_pixel_h_w8(above, left, 4, pixels);
1004 :
1005 : __m128i ww[2];
1006 41942 : load_weight_h_w8(sm_weight_arrays, 4, ww);
1007 :
1008 41942 : smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
1009 41942 : }
1010 :
1011 70118 : void eb_aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1012 : const uint8_t *above,
1013 : const uint8_t *left) {
1014 : __m128i pixels[2];
1015 70118 : load_pixel_h_w8(above, left, 8, pixels);
1016 :
1017 : __m128i ww[2];
1018 70118 : load_weight_h_w8(sm_weight_arrays, 8, ww);
1019 :
1020 70118 : smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1021 70118 : }
1022 :
1023 16968 : void eb_aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1024 : const uint8_t *above,
1025 : const uint8_t *left) {
1026 : __m128i pixels[2];
1027 16968 : load_pixel_h_w8(above, left, 16, pixels);
1028 :
1029 : __m128i ww[2];
1030 16968 : load_weight_h_w8(sm_weight_arrays, 16, ww);
1031 :
1032 16968 : smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
1033 16968 : dst += stride << 3;
1034 16968 : smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
1035 16968 : }
1036 :
1037 22152 : void eb_aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1038 : const uint8_t *above,
1039 : const uint8_t *left) {
1040 : __m128i pixels[4];
1041 22152 : load_pixel_h_w8(above, left, 32, pixels);
1042 :
1043 : __m128i ww[2];
1044 22152 : load_weight_h_w8(sm_weight_arrays, 32, ww);
1045 :
1046 22152 : smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
1047 22152 : dst += stride << 3;
1048 22152 : smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
1049 22152 : dst += stride << 3;
1050 22152 : smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
1051 22152 : dst += stride << 3;
1052 22152 : smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
1053 22152 : }
1054 :
1055 153675 : static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
1056 : const uint8_t *above,
1057 : const uint8_t *left, uint32_t bw,
1058 : uint32_t bh) {
1059 153675 : const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
1060 153675 : const __m128i zero = _mm_setzero_si128();
1061 : const __m128i scale_value =
1062 153675 : _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1063 307350 : const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
1064 153675 : const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1065 153675 : const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1066 :
1067 3044320 : for (uint32_t y = 0; y < bh; ++y) {
1068 5781280 : const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
1069 2890640 : const __m128i tr_ly =
1070 2890640 : _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
1071 :
1072 12781400 : for (uint32_t x = 0; x < bw; x += 8) {
1073 : const __m128i weights_x =
1074 19781400 : _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
1075 9890720 : const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
1076 9890720 : const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
1077 9890720 : const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
1078 9890720 : const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
1079 9890720 : __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
1080 9890720 : __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
1081 :
1082 9890720 : pred_lo = _mm_add_epi32(pred_lo, pred_round);
1083 9890720 : pred_hi = _mm_add_epi32(pred_hi, pred_round);
1084 :
1085 9890720 : pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
1086 19781400 : pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
1087 :
1088 9890720 : __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
1089 9890720 : pred = _mm_shuffle_epi8(pred, gat);
1090 9890720 : _mm_storel_epi64((__m128i *)(dst + x), pred);
1091 : }
1092 2890640 : dst += stride;
1093 : }
1094 153675 : }
1095 :
// SMOOTH_H prediction for a block 16 pixels wide and 4 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 4;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1101 :
// SMOOTH_H prediction for a block 16 pixels wide and 8 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 8;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1107 :
// SMOOTH_H prediction for a 16x16 block, computed by the generic
// smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 16;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1113 :
// SMOOTH_H prediction for a block 16 pixels wide and 32 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 32;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1119 :
// SMOOTH_H prediction for a block 16 pixels wide and 64 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 16;
    const uint32_t block_h = 64;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1125 :
// SMOOTH_H prediction for a block 32 pixels wide and 8 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 8;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1131 :
// SMOOTH_H prediction for a block 32 pixels wide and 16 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 16;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1137 :
// SMOOTH_H prediction for a 32x32 block, computed by the generic
// smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 32;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1143 :
// SMOOTH_H prediction for a block 32 pixels wide and 64 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 32;
    const uint32_t block_h = 64;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1149 :
// SMOOTH_H prediction for a 64x64 block, computed by the generic
// smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 64;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1155 :
// SMOOTH_H prediction for a block 64 pixels wide and 32 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 32;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1161 :
// SMOOTH_H prediction for a block 64 pixels wide and 16 pixels tall, computed
// by the generic smooth_h_predictor_wxh() kernel.
void eb_aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
    const uint32_t block_w = 64;
    const uint32_t block_h = 16;
    smooth_h_predictor_wxh(dst, stride, above, left, block_w, block_h);
}
1167 :
// Dispatches a SMOOTH_V prediction to the square kernel selected by `bw`.
// `bh` is ignored, so a width of N always runs the NxN predictor. Widths
// other than 4, 8, 16, 32 and 64 leave `dst` untouched.
void eb_smooth_v_predictor_all_ssse3(uint8_t *dst, ptrdiff_t stride, int32_t bw,
                                     int32_t bh, const uint8_t *above,
                                     const uint8_t *left) {
    (void)bh;

    if (bw == 4) {
        eb_aom_smooth_v_predictor_4x4_ssse3(dst, stride, above, left);
    } else if (bw == 8) {
        eb_aom_smooth_v_predictor_8x8_ssse3(dst, stride, above, left);
    } else if (bw == 16) {
        eb_aom_smooth_v_predictor_16x16_ssse3(dst, stride, above, left);
    } else if (bw == 32) {
        eb_aom_smooth_v_predictor_32x32_ssse3(dst, stride, above, left);
    } else if (bw == 64) {
        eb_aom_smooth_v_predictor_64x64_ssse3(dst, stride, above, left);
    }
}
// Dispatches a SMOOTH_H prediction to the square kernel selected by `bw`.
// `bh` is ignored, so a width of N always runs the NxN predictor. Widths
// other than 4, 8, 16, 32 and 64 leave `dst` untouched.
void eb_smooth_h_predictor_all_ssse3(uint8_t *dst, ptrdiff_t stride, int32_t bw,
                                     int32_t bh, const uint8_t *above,
                                     const uint8_t *left) {
    (void)bh;

    if (bw == 4) {
        eb_aom_smooth_h_predictor_4x4_ssse3(dst, stride, above, left);
    } else if (bw == 8) {
        eb_aom_smooth_h_predictor_8x8_ssse3(dst, stride, above, left);
    } else if (bw == 16) {
        eb_aom_smooth_h_predictor_16x16_ssse3(dst, stride, above, left);
    } else if (bw == 32) {
        eb_aom_smooth_h_predictor_32x32_ssse3(dst, stride, above, left);
    } else if (bw == 64) {
        eb_aom_smooth_h_predictor_64x64_ssse3(dst, stride, above, left);
    }
}
|