Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 :
#include <string.h>

#include "EbDefinitions.h"
#include "emmintrin.h"
#include "aom_dsp_rtcd.h"
9 :
// Adds up the 16 bytes starting at `ref` (an unaligned load is used, so `ref`
// needs no particular alignment). The total ends up in the low 16 bits of the
// returned vector.
static INLINE __m128i dc_sum_16(const uint8_t *ref) {
    const __m128i pixels = _mm_loadu_si128((const __m128i *)ref);
    // SAD against zero produces one partial byte-sum per 64-bit half.
    const __m128i halves = _mm_sad_epu8(pixels, _mm_setzero_si128());
    // Fold the upper half's partial sum onto the lower one.
    return _mm_add_epi16(halves, _mm_unpackhi_epi64(halves, halves));
}
17 :
// Writes the 4-byte pattern `dc` to the start of consecutive rows of `dst`,
// `stride` bytes apart. Rows are written in pairs, so `height` is expected to
// be even (an odd `height` is rounded up to the next even row count).
//
// The bytes are copied with memcpy rather than stored through a casted
// uint32_t pointer: `dst` is a byte buffer with no 4-byte alignment guarantee,
// and a type-punned store would also break the strict-aliasing rule.
static INLINE void dc_store_4xh(uint32_t dc, int32_t height, uint8_t *dst,
                                ptrdiff_t stride) {
    for (int32_t i = 0; i < height; i += 2) {
        memcpy(dst, &dc, sizeof(dc));
        dst += stride;
        memcpy(dst, &dc, sizeof(dc));
        dst += stride;
    }
}
27 :
// Adds up the 32 bytes starting at `ref`; the total is returned in the low
// 16 bits of the result vector.
//
// Unaligned loads are used (as dc_sum_16 already does) so that a reference
// pointer that is not 16-byte aligned does not fault.
static INLINE __m128i dc_sum_32(const uint8_t *ref) {
    const __m128i zero = _mm_setzero_si128();
    __m128i x0 = _mm_loadu_si128((__m128i const *)ref);
    __m128i x1 = _mm_loadu_si128((__m128i const *)(ref + 16));
    // Each SAD against zero yields two partial sums (one per 64-bit half).
    x0 = _mm_sad_epu8(x0, zero);
    x1 = _mm_sad_epu8(x1, zero);
    x0 = _mm_add_epi16(x0, x1);
    // Fold the upper half onto the lower half.
    const __m128i high = _mm_unpackhi_epi64(x0, x0);
    return _mm_add_epi16(x0, high);
}
38 :
// Adds up the 64 bytes starting at `ref`; the total is returned in the low
// 16 bits of the result vector.
//
// Unaligned loads are used (as dc_sum_16 already does) so that a reference
// pointer that is not 16-byte aligned does not fault.
static INLINE __m128i dc_sum_64(const uint8_t *ref) {
    const __m128i zero = _mm_setzero_si128();
    __m128i x0 = _mm_loadu_si128((__m128i const *)ref);
    __m128i x1 = _mm_loadu_si128((__m128i const *)(ref + 16));
    __m128i x2 = _mm_loadu_si128((__m128i const *)(ref + 32));
    __m128i x3 = _mm_loadu_si128((__m128i const *)(ref + 48));
    // Per-register partial sums (one per 64-bit half).
    x0 = _mm_sad_epu8(x0, zero);
    x1 = _mm_sad_epu8(x1, zero);
    x2 = _mm_sad_epu8(x2, zero);
    x3 = _mm_sad_epu8(x3, zero);
    // Pairwise reduction of the four registers.
    x0 = _mm_add_epi16(x0, x1);
    x2 = _mm_add_epi16(x2, x3);
    x0 = _mm_add_epi16(x0, x2);
    // Fold the upper half onto the lower half.
    const __m128i high = _mm_unpackhi_epi64(x0, x0);
    return _mm_add_epi16(x0, high);
}
55 :
// Fills `height` rows of a 32-byte-wide block: the 16-byte vector `*row` is
// written twice per row (bytes 0-15 and 16-31), rows are `stride` bytes apart.
static INLINE void dc_store_32xh(const __m128i *row, int32_t height, uint8_t *dst,
                                 ptrdiff_t stride) {
    int32_t remaining = height;
    while (remaining > 0) {
        _mm_storeu_si128((__m128i *)(dst + 0), *row);
        _mm_storeu_si128((__m128i *)(dst + 16), *row);
        dst += stride;
        --remaining;
    }
}
65 :
// Fills `height` rows of a 16-byte-wide block with the vector `*row`;
// consecutive rows are `stride` bytes apart. Stores are unaligned.
static INLINE void dc_store_16xh(const __m128i *row, int32_t height, uint8_t *dst,
                                 ptrdiff_t stride) {
    int32_t remaining = height;
    while (remaining > 0) {
        _mm_storeu_si128((__m128i *)dst, *row);
        dst += stride;
        --remaining;
    }
}
74 :
75 0 : void intra_mode_dc_16x16_av1_sse2_intrin(
76 : EbBool is_left_availble,
77 : EbBool is_above_availble,
78 : const uint32_t size, //input parameter, denotes the size of the current PU
79 : uint8_t *ref_samples, //input parameter, pointer to the reference samples
80 : uint8_t *dst, //output parameter, pointer to the prediction
81 : const uint32_t prediction_buffer_stride, //input parameter, denotes the stride for the prediction ptr
82 : const EbBool skip) //skip half rows
83 : {
84 0 : uint32_t leftOffset = 0;
85 0 : uint32_t topOffset = (size << 1) + 1;
86 0 : uint32_t rowStride = skip ? 2 : 1;
87 :
88 0 : if (is_left_availble && !is_above_availble) {
89 0 : __m128i sum_left = dc_sum_16(&ref_samples[leftOffset]);
90 0 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
91 0 : sum_left = _mm_add_epi16(sum_left, eight);
92 0 : sum_left = _mm_srai_epi16(sum_left, 4);
93 0 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
94 0 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
95 0 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
96 0 : dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
97 : }
98 0 : else if (is_above_availble && !is_left_availble) {
99 0 : __m128i sum_above = dc_sum_16(&ref_samples[topOffset]);
100 0 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
101 0 : sum_above = _mm_add_epi16(sum_above, eight);
102 0 : sum_above = _mm_srai_epi16(sum_above, 4);
103 0 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
104 0 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
105 0 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
106 0 : dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
107 : }
108 : else
109 : {
110 0 : const __m128i sum_left = dc_sum_16(&ref_samples[leftOffset]);
111 0 : __m128i sum_above = dc_sum_16(&ref_samples[topOffset]);
112 0 : sum_above = _mm_add_epi16(sum_above, sum_left);
113 :
114 0 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
115 0 : sum += 16;
116 0 : sum >>= 5;
117 0 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
118 0 : dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
119 : }
120 0 : }
// Fills `height` rows of an 8-byte-wide block with the low 8 bytes of `*row`;
// consecutive rows are `stride` bytes apart.
static INLINE void dc_store_8xh(const __m128i *row, int32_t height, uint8_t *dst,
                                ptrdiff_t stride) {
    int32_t remaining = height;
    while (remaining > 0) {
        _mm_storel_epi64((__m128i *)dst, *row);
        dst += stride;
        --remaining;
    }
}
// Adds up the 8 bytes starting at `ref`; the total is returned in the low
// 16 bits of the result vector (the upper 64 bits are zero).
static INLINE __m128i dc_sum_8(const uint8_t *ref) {
    const __m128i pixels = _mm_loadl_epi64((const __m128i *)ref);
    return _mm_sad_epu8(pixels, _mm_setzero_si128());
}
134 :
135 0 : void intra_mode_dc_8x8_av1_sse2_intrin(
136 : EbBool is_left_availble,
137 : EbBool is_above_availble,
138 : const uint32_t size, //input parameter, denotes the size of the current PU
139 : uint8_t *ref_samples, //input parameter, pointer to the reference samples
140 : uint8_t *dst, //output parameter, pointer to the prediction
141 : const uint32_t prediction_buffer_stride, //input parameter, denotes the stride for the prediction ptr
142 : const EbBool skip) //skip half rows
143 : {
144 0 : uint32_t leftOffset = 0;
145 0 : uint32_t topOffset = (size << 1) + 1;
146 0 : uint32_t rowStride = skip ? 2 : 1;
147 :
148 0 : if (is_left_availble && !is_above_availble) {
149 0 : __m128i sum_left = dc_sum_8(&ref_samples[leftOffset]);
150 0 : const __m128i four = _mm_set1_epi16((uint16_t)4);
151 0 : sum_left = _mm_add_epi16(sum_left, four);
152 0 : sum_left = _mm_srai_epi16(sum_left, 3);
153 0 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
154 0 : const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
155 0 : dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
156 : }
157 0 : else if (is_above_availble && !is_left_availble) {
158 0 : __m128i sum_above = dc_sum_8(&ref_samples[topOffset]);
159 0 : const __m128i four = _mm_set1_epi16((uint16_t)4);
160 0 : sum_above = _mm_add_epi16(sum_above, four);
161 0 : sum_above = _mm_srai_epi16(sum_above, 3);
162 0 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
163 0 : const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
164 0 : dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
165 : }
166 : else
167 : {
168 0 : const __m128i sum_left = dc_sum_8(&ref_samples[leftOffset]);
169 0 : __m128i sum_above = dc_sum_8(&ref_samples[topOffset]);
170 0 : sum_above = _mm_add_epi16(sum_above, sum_left);
171 :
172 0 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
173 0 : sum += 8;
174 0 : sum >>= 4;
175 0 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
176 0 : dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
177 : }
178 0 : }
// Adds up the 4 bytes ref[0..3]; the total is returned in the low 16 bits of
// the result vector.
//
// Note that 8 bytes are loaded from `ref`. After widening each byte to 16
// bits, ref[0..3] occupy the low 64 bits, so the low SAD lane holds their sum
// (the high lane holds the sum of ref[4..7]).
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i widened =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref), zero);
    return _mm_sad_epu8(widened, zero);
}
// Writes the 4-byte pattern `dc` to the start of 4 consecutive rows of `dst`,
// `stride` bytes apart.
//
// memcpy is used instead of a store through a casted uint32_t pointer: `dst`
// is a byte buffer with no 4-byte alignment guarantee, and the type-punned
// store would also break the strict-aliasing rule.
static INLINE void dc_store_4x4(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
    for (int32_t row = 0; row < 4; ++row) {
        memcpy(dst + row * stride, &dc, sizeof(dc));
    }
}
194 0 : void intra_mode_dc_4x4_av1_sse2_intrin(
195 : EbBool is_left_availble,
196 : EbBool is_above_availble,
197 : const uint32_t size, //input parameter, denotes the size of the current PU
198 : uint8_t *ref_samples, //input parameter, pointer to the reference samples
199 : uint8_t *dst, //output parameter, pointer to the prediction
200 : const uint32_t prediction_buffer_stride, //input parameter, denotes the stride for the prediction ptr
201 : const EbBool skip) //skip half rows
202 : {
203 0 : uint32_t leftOffset = 0;
204 0 : uint32_t topOffset = (size << 1) + 1;
205 0 : uint32_t rowStride = skip ? 2 : 1;
206 :
207 0 : if (is_left_availble && !is_above_availble) {
208 0 : __m128i sum_left = dc_sum_4(&ref_samples[leftOffset]);
209 0 : const __m128i two = _mm_set1_epi16((uint16_t)2);
210 0 : sum_left = _mm_add_epi16(sum_left, two);
211 0 : sum_left = _mm_srai_epi16(sum_left, 2);
212 0 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
213 0 : sum_left = _mm_packus_epi16(sum_left, sum_left);
214 :
215 0 : const uint32_t pred = _mm_cvtsi128_si32(sum_left);
216 0 : dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
217 : }
218 0 : else if (is_above_availble && !is_left_availble) {
219 0 : __m128i sum_above = dc_sum_4(&ref_samples[topOffset]);
220 0 : const __m128i two = _mm_set1_epi16((int16_t)2);
221 0 : sum_above = _mm_add_epi16(sum_above, two);
222 0 : sum_above = _mm_srai_epi16(sum_above, 2);
223 0 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
224 0 : sum_above = _mm_packus_epi16(sum_above, sum_above);
225 :
226 0 : const uint32_t pred = _mm_cvtsi128_si32(sum_above);
227 0 : dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
228 : }
229 : else
230 : {
231 0 : const __m128i sum_left = dc_sum_4(&ref_samples[leftOffset]);
232 0 : __m128i sum_above = dc_sum_4(&ref_samples[topOffset]);
233 0 : sum_above = _mm_add_epi16(sum_left, sum_above);
234 :
235 0 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
236 0 : sum += 4;
237 0 : sum >>= 3;
238 :
239 0 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
240 0 : const uint32_t pred = _mm_cvtsi128_si32(row);
241 0 : dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
242 : }
243 0 : }
244 :
#define DC_SHIFT2 16
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

// Fixed-point division: returns ((num >> shift1) * multiplier) >> DC_SHIFT2.
// With DC_SHIFT2 == 16 the multiplier acts as a 16-bit reciprocal:
// 0x5556 is 65536/3 rounded up and 0x3334 is 65536/5 rounded up.
static INLINE int32_t divide_using_multiply_shift(int32_t num, int32_t shift1,
                                                  int32_t multiplier) {
    const int32_t scaled = (num >> shift1) * multiplier;
    return scaled >> DC_SHIFT2;
}
254 :
// Vertical prediction for a 32-byte-wide block: copies the 32 bytes at
// `above` into each of `height` rows of `dst`, `stride` bytes apart.
// `above` is read with aligned loads, so it must be 16-byte aligned.
static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int32_t height) {
    const __m128i *src = (const __m128i *)above;
    const __m128i left_half = _mm_load_si128(src);
    const __m128i right_half = _mm_load_si128(src + 1);
    int32_t remaining = height;
    while (remaining > 0) {
        _mm_storeu_si128((__m128i *)dst, left_half);
        _mm_storeu_si128((__m128i *)(dst + 16), right_half);
        dst += stride;
        --remaining;
    }
}
265 :
// Stores row[0..h-1] as `h` consecutive 16-byte rows of `dst`, `stride` bytes
// apart (one distinct vector per row).
static INLINE void h_pred_store_16xh(const __m128i *row, int32_t h, uint8_t *dst,
                                     ptrdiff_t stride) {
    const __m128i *src = row;
    for (int32_t left_to_do = h; left_to_do > 0; --left_to_do) {
        _mm_storeu_si128((__m128i *)dst, *src++);
        dst += stride;
    }
}
// For each of the four low 16-bit lanes of `*x` (lanes 0..3), produces a
// vector in which that lane is replicated across all eight lanes:
// row[k] = broadcast of lane k.
static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
    const __m128i src = *x;
    // Replicate one lane over the low 64 bits...
    const __m128i lane0 = _mm_shufflelo_epi16(src, 0x00);
    const __m128i lane1 = _mm_shufflelo_epi16(src, 0x55);
    const __m128i lane2 = _mm_shufflelo_epi16(src, 0xaa);
    const __m128i lane3 = _mm_shufflelo_epi16(src, 0xff);

    // ...then duplicate the low 64 bits into the high 64 bits.
    row[0] = _mm_unpacklo_epi64(lane0, lane0);
    row[1] = _mm_unpacklo_epi64(lane1, lane1);
    row[2] = _mm_unpacklo_epi64(lane2, lane2);
    row[3] = _mm_unpacklo_epi64(lane3, lane3);
}
// For each of the four high 16-bit lanes of `*x` (lanes 4..7), produces a
// vector in which that lane is replicated across all eight lanes:
// row[k] = broadcast of lane 4 + k.
static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
    const __m128i src = *x;
    // Replicate one lane over the high 64 bits...
    const __m128i lane4 = _mm_shufflehi_epi16(src, 0x00);
    const __m128i lane5 = _mm_shufflehi_epi16(src, 0x55);
    const __m128i lane6 = _mm_shufflehi_epi16(src, 0xaa);
    const __m128i lane7 = _mm_shufflehi_epi16(src, 0xff);

    // ...then duplicate the high 64 bits into the low 64 bits.
    row[0] = _mm_unpackhi_epi64(lane4, lane4);
    row[1] = _mm_unpackhi_epi64(lane5, lane5);
    row[2] = _mm_unpackhi_epi64(lane6, lane6);
    row[3] = _mm_unpackhi_epi64(lane7, lane7);
}
// 16-wide horizontal prediction, rows 0-3 of an 8-row group.
// `*left` holds each left pixel duplicated into a 16-bit lane; the four low
// lanes (register layout xxxxxxxx33221100) supply one pixel per output row.
static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
    __m128i splat_rows[4];
    repeat_low_4pixels(left, splat_rows);
    h_pred_store_16xh(splat_rows, 4, dst, stride);
}
304 :
// 16-wide horizontal prediction, rows 4-7 of an 8-row group.
// `*left` holds each left pixel duplicated into a 16-bit lane; the four high
// lanes (register layout 77665544xxxxxxxx) supply one pixel per output row.
static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
    __m128i splat_rows[4];
    repeat_high_4pixels(left, splat_rows);
    h_pred_store_16xh(splat_rows, 4, dst, stride);
}
313 :
// 16-wide horizontal prediction over `count` groups of 16 rows.
// Each group consumes 16 pixels from `left` (aligned load, so `left` must be
// 16-byte aligned) and fills 16 rows of `dst` with the matching left pixel.
// The body always runs at least once, even when `count` is not positive.
static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int32_t count) {
    const ptrdiff_t four_rows = stride << 2;
    int32_t done = 0;
    do {
        const __m128i col = _mm_load_si128((const __m128i *)left);

        // Left pixels 0-7 of this group, each duplicated into a 16-bit lane.
        const __m128i col_lo = _mm_unpacklo_epi8(col, col);
        h_prediction_16x8_1(&col_lo, dst, stride);
        dst += four_rows;
        h_prediction_16x8_2(&col_lo, dst, stride);
        dst += four_rows;

        // Left pixels 8-15 of this group.
        const __m128i col_hi = _mm_unpackhi_epi8(col, col);
        h_prediction_16x8_1(&col_hi, dst, stride);
        dst += four_rows;
        h_prediction_16x8_2(&col_hi, dst, stride);
        dst += four_rows;

        left += 16;
    } while (++done < count);
}
335 :
// Stores `h` rows of a 32-byte-wide block: row i of `dst` receives row[i]
// twice (bytes 0-15 and 16-31). Rows are `stride` bytes apart.
static INLINE void h_pred_store_32xh(const __m128i *row, int32_t h, uint8_t *dst,
                                     ptrdiff_t stride) {
    const __m128i *src = row;
    for (int32_t left_to_do = h; left_to_do > 0; --left_to_do) {
        const __m128i value = *src++;
        _mm_storeu_si128((__m128i *)dst, value);
        _mm_storeu_si128((__m128i *)(dst + 16), value);
        dst += stride;
    }
}
345 :
// 32-wide horizontal prediction, rows 0-3 of an 8-row group.
// `*left` holds each left pixel duplicated into a 16-bit lane; the four low
// lanes (register layout xxxxxxxx33221100) supply one pixel per output row.
static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
    __m128i splat_rows[4];
    repeat_low_4pixels(left, splat_rows);
    h_pred_store_32xh(splat_rows, 4, dst, stride);
}
354 :
// 32-wide horizontal prediction, rows 4-7 of an 8-row group.
// `*left` holds each left pixel duplicated into a 16-bit lane; the four high
// lanes (register layout 77665544xxxxxxxx) supply one pixel per output row.
static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
    __m128i splat_rows[4];
    repeat_high_4pixels(left, splat_rows);
    h_pred_store_32xh(splat_rows, 4, dst, stride);
}
363 :
// 32-wide horizontal prediction: row r of `dst` is filled with 32 copies of
// left[r]. Rows are processed four at a time, so `height` must be a positive
// multiple of 4 (the loop body always runs at least once).
//
// The four left pixels are fetched with memcpy instead of dereferencing
// `left` through a casted uint32_t pointer; that cast discarded const, assumed
// 4-byte alignment and violated the strict-aliasing rule.
static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int32_t height) {
    int32_t i = height >> 2;
    do {
        int32_t packed_left;
        memcpy(&packed_left, left, sizeof(packed_left));
        __m128i left4 = _mm_cvtsi32_si128(packed_left);
        // Two byte-interleaves turn l0 l1 l2 l3 into l0x4 l1x4 l2x4 l3x4.
        left4 = _mm_unpacklo_epi8(left4, left4);
        left4 = _mm_unpacklo_epi8(left4, left4);
        // Broadcast each 32-bit group to a full register, one per row.
        const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
        const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
        _mm_storeu_si128((__m128i *)dst, r0);
        _mm_storeu_si128((__m128i *)(dst + 16), r0);
        _mm_storeu_si128((__m128i *)(dst + stride), r1);
        _mm_storeu_si128((__m128i *)(dst + stride + 16), r1);
        const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
        const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
        _mm_storeu_si128((__m128i *)(dst + stride * 2), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 2 + 16), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 3), r3);
        _mm_storeu_si128((__m128i *)(dst + stride * 3 + 16), r3);
        left += 4;
        dst += stride * 4;
    } while (--i);
}
387 :
// 8-wide horizontal prediction over `count` groups of 16 rows: row r of `dst`
// is filled with 8 copies of left[r]. Each group reads 16 pixels from `left`
// with an aligned load, so `left` must be 16-byte aligned. `above` is unused.
static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int32_t count) {
    (void)above;
    for (int32_t group = 0; group < count; ++group) {
        const __m128i col = _mm_load_si128((__m128i const *)left);
        // Each left pixel duplicated into a 16-bit lane.
        const __m128i lo = _mm_unpacklo_epi8(col, col);
        const __m128i hi = _mm_unpackhi_epi8(col, col);
        // Four registers whose low four lanes hold pixels 0-3, 4-7, 8-11
        // and 12-15 respectively.
        const __m128i quad[4] = {lo, _mm_unpackhi_epi64(lo, lo),
                                 hi, _mm_unpackhi_epi64(hi, hi)};

        for (int32_t q = 0; q < 4; ++q) {
            // Broadcast one lane over the low 8 bytes and store it as a row.
            _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0x00));
            dst += stride;
            _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0x55));
            dst += stride;
            _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0xaa));
            dst += stride;
            _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0xff));
            dst += stride;
        }
        left += 16;
    }
}
453 :
// 64-wide horizontal prediction: row r of `dst` is filled with 64 copies of
// left[r]. Rows are processed four at a time, so `height` must be a positive
// multiple of 4 (the loop body always runs at least once).
//
// The four left pixels are fetched with memcpy instead of dereferencing
// `left` through a casted uint32_t pointer; that cast discarded const, assumed
// 4-byte alignment and violated the strict-aliasing rule.
static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int32_t height) {
    int32_t i = height >> 2;
    do {
        int32_t packed_left;
        memcpy(&packed_left, left, sizeof(packed_left));
        __m128i left4 = _mm_cvtsi32_si128(packed_left);
        // Two byte-interleaves turn l0 l1 l2 l3 into l0x4 l1x4 l2x4 l3x4.
        left4 = _mm_unpacklo_epi8(left4, left4);
        left4 = _mm_unpacklo_epi8(left4, left4);
        // Broadcast each 32-bit group to a full register, one per row.
        const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
        const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
        _mm_storeu_si128((__m128i *)dst, r0);
        _mm_storeu_si128((__m128i *)(dst + 16), r0);
        _mm_storeu_si128((__m128i *)(dst + 32), r0);
        _mm_storeu_si128((__m128i *)(dst + 48), r0);
        _mm_storeu_si128((__m128i *)(dst + stride), r1);
        _mm_storeu_si128((__m128i *)(dst + stride + 16), r1);
        _mm_storeu_si128((__m128i *)(dst + stride + 32), r1);
        _mm_storeu_si128((__m128i *)(dst + stride + 48), r1);
        const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
        const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
        _mm_storeu_si128((__m128i *)(dst + stride * 2), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 2 + 16), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 2 + 32), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 2 + 48), r2);
        _mm_storeu_si128((__m128i *)(dst + stride * 3), r3);
        _mm_storeu_si128((__m128i *)(dst + stride * 3 + 16), r3);
        _mm_storeu_si128((__m128i *)(dst + stride * 3 + 32), r3);
        _mm_storeu_si128((__m128i *)(dst + stride * 3 + 48), r3);
        left += 4;
        dst += stride * 4;
    } while (--i);
}
485 :
// Horizontal prediction, 64 columns x 64 rows: each row of `dst` is filled
// with the corresponding `left` pixel. `above` is not used.
void eb_aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t block_rows = 64;
    (void)above;
    h_predictor_64xh(dst, stride, left, block_rows);
}
491 :
// Horizontal prediction, 64 columns x 32 rows: each row of `dst` is filled
// with the corresponding `left` pixel. `above` is not used.
void eb_aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t block_rows = 32;
    (void)above;
    h_predictor_64xh(dst, stride, left, block_rows);
}
497 :
// Horizontal prediction, 32 columns x 64 rows: each row of `dst` is filled
// with the corresponding `left` pixel. `above` is not used.
void eb_aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t block_rows = 64;
    (void)above;
    h_predictor_32xh(dst, stride, left, block_rows);
}
503 :
// Horizontal prediction, 64 columns x 16 rows: each row of `dst` is filled
// with the corresponding `left` pixel. `above` is not used.
void eb_aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t block_rows = 16;
    (void)above;
    h_predictor_64xh(dst, stride, left, block_rows);
}
509 :
// Horizontal prediction, 16 columns x 64 rows, done as four 16-row groups.
// `above` is not used.
void eb_aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t groups_of_16_rows = 4;
    (void)above;
    h_predictor_16xh(dst, stride, left, groups_of_16_rows);
}
515 :
// Horizontal prediction, 16 columns x 32 rows, done as two 16-row groups.
// `above` is not used.
void eb_aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    const int32_t groups_of_16_rows = 2;
    (void)above;
    h_predictor_16xh(dst, stride, left, groups_of_16_rows);
}
521 :
// Horizontal prediction, 32 columns x 16 rows: each row of `dst` is filled
// with the corresponding `left` pixel. The 16 left pixels are read with an
// aligned load, so `left` must be 16-byte aligned. `above` is not used.
void eb_aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    (void)above;
    const ptrdiff_t four_rows = stride << 2;
    const __m128i col = _mm_load_si128((const __m128i *)left);

    // Rows 0-7: left pixels 0-7, each duplicated into a 16-bit lane.
    const __m128i col_lo = _mm_unpacklo_epi8(col, col);
    h_prediction_32x8_1(&col_lo, dst, stride);
    dst += four_rows;
    h_prediction_32x8_2(&col_lo, dst, stride);
    dst += four_rows;

    // Rows 8-15: left pixels 8-15.
    const __m128i col_hi = _mm_unpackhi_epi8(col, col);
    h_prediction_32x8_1(&col_hi, dst, stride);
    dst += four_rows;
    h_prediction_32x8_2(&col_hi, dst, stride);
}
540 :
// Horizontal prediction, 16 columns x 4 rows: each row of `dst` is filled with
// the corresponding `left` pixel. 8 bytes are loaded from `left`, of which the
// first 4 are used. `above` is not used.
void eb_aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)above;
    const __m128i col = _mm_loadl_epi64((const __m128i *)left);
    // Duplicate each pixel into a 16-bit lane.
    const __m128i col_dup = _mm_unpacklo_epi8(col, col);
    h_prediction_16x8_1(&col_dup, dst, stride);
}
548 :
// Horizontal prediction, 16 columns x 8 rows: each row of `dst` is filled with
// the corresponding `left` pixel (8 bytes are loaded from `left`).
// `above` is not used.
void eb_aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)above;
    const __m128i col = _mm_loadl_epi64((const __m128i *)left);
    // Duplicate each pixel into a 16-bit lane.
    const __m128i col_dup = _mm_unpacklo_epi8(col, col);
    uint8_t *const lower_half = dst + (stride << 2);
    h_prediction_16x8_1(&col_dup, dst, stride);
    h_prediction_16x8_2(&col_dup, lower_half, stride);
}
558 :
// Horizontal prediction, 32 columns x 8 rows: each row of `dst` is filled with
// the corresponding `left` pixel. `above` is not used.
//
// Only the low 8 bytes of the left column are consumed (via unpacklo), so
// just those 8 bytes are loaded, as eb_aom_h_predictor_16x8_sse2 does. The
// previous aligned 16-byte load read 8 bytes that were never used and faulted
// when `left` was not 16-byte aligned.
void eb_aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)above;

    const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);

    // Duplicate each pixel into a 16-bit lane.
    const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
    h_prediction_32x8_1(&left_col_8p, dst, stride);
    dst += stride << 2;
    h_prediction_32x8_2(&left_col_8p, dst, stride);
}
571 :
// Horizontal prediction, 4 columns x 16 rows: row r of `dst` is filled with
// 4 copies of left[r]. The 16 left pixels are read with an aligned load, so
// `left` must be 16-byte aligned. `above` is not used.
//
// Each 4-byte row is written with memcpy instead of a store through a casted
// uint32_t pointer: `dst` has no 4-byte alignment guarantee and the
// type-punned store violated the strict-aliasing rule.
void eb_aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)above;
    const __m128i left_col = _mm_load_si128((__m128i const *)left);
    // Each left pixel duplicated into a 16-bit lane.
    const __m128i lo = _mm_unpacklo_epi8(left_col, left_col);
    const __m128i hi = _mm_unpackhi_epi8(left_col, left_col);
    // Four registers whose low four lanes hold pixels 0-3, 4-7, 8-11, 12-15.
    const __m128i quad[4] = {lo, _mm_unpackhi_epi64(lo, lo),
                             hi, _mm_unpackhi_epi64(hi, hi)};

    for (int32_t q = 0; q < 4; ++q) {
        // Low 32 bits after the shuffle hold one pixel repeated 4 times.
        const int32_t rows[4] = {
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0x00)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0x55)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0xaa)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0xff))};
        for (int32_t r = 0; r < 4; ++r) {
            memcpy(dst + (ptrdiff_t)(4 * q + r) * stride, &rows[r], sizeof(rows[r]));
        }
    }
}
632 :
// Horizontal prediction, 4 columns x 8 rows: row r of `dst` is filled with
// 4 copies of left[r] (8 bytes are loaded from `left`). `above` is not used.
//
// Each 4-byte row is written with memcpy instead of a store through a casted
// uint32_t pointer: `dst` has no 4-byte alignment guarantee and the
// type-punned store violated the strict-aliasing rule.
void eb_aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
    (void)above;
    __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
    // Each left pixel duplicated into a 16-bit lane.
    left_col = _mm_unpacklo_epi8(left_col, left_col);
    // Two registers whose low four lanes hold pixels 0-3 and 4-7.
    const __m128i quad[2] = {left_col, _mm_unpackhi_epi64(left_col, left_col)};

    for (int32_t q = 0; q < 2; ++q) {
        // Low 32 bits after the shuffle hold one pixel repeated 4 times.
        const int32_t rows[4] = {
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0x00)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0x55)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0xaa)),
            _mm_cvtsi128_si32(_mm_shufflelo_epi16(quad[q], 0xff))};
        for (int32_t r = 0; r < 4; ++r) {
            memcpy(dst + (ptrdiff_t)(4 * q + r) * stride, &rows[r], sizeof(rows[r]));
        }
    }
}
663 :
// Horizontal prediction, 8 columns x 16 rows (a single 16-row group).
void eb_aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    const int32_t groups_of_16_rows = 1;
    h_predictor_8x16xc(dst, stride, above, left, groups_of_16_rows);
}
// Horizontal prediction, 8 columns x 32 rows (two 16-row groups).
void eb_aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    const int32_t groups_of_16_rows = 2;
    h_predictor_8x16xc(dst, stride, above, left, groups_of_16_rows);
}
// Horizontal prediction, 8 columns x 4 rows: row r of `dst` is filled with
// 8 copies of left[r]. 8 bytes are loaded from `left`, of which the first 4
// are used. `above` is not used.
void eb_aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
    (void)above;
    const __m128i raw = _mm_loadl_epi64((__m128i const *)left);
    // Each left pixel duplicated into a 16-bit lane.
    const __m128i dup = _mm_unpacklo_epi8(raw, raw);

    // Broadcast lane r over the low 8 bytes and store it as row r.
    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0x00));
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0x55));
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0xaa));
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0xff));
}
689 :
// Vertical prediction, 16 columns x 32 rows: every row of `dst` receives the
// 16 bytes at `above`. `above` is read with an aligned load and must be
// 16-byte aligned. `left` is not used.
void eb_aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    (void)left;
    const __m128i above_row = _mm_load_si128((const __m128i *)above);
    dc_store_16xh(&above_row, 32, dst, stride);
}
696 :
// Vertical prediction, 16 columns x 4 rows: every row of `dst` receives the
// 16 bytes at `above`. `above` is read with an aligned load and must be
// 16-byte aligned. `left` is not used.
void eb_aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)left;
    const __m128i above_row = _mm_load_si128((const __m128i *)above);
    dc_store_16xh(&above_row, 4, dst, stride);
}
703 :
// Vertical prediction, 16 columns x 64 rows: every row of `dst` receives the
// 16 bytes at `above`. `above` is read with an aligned load and must be
// 16-byte aligned. `left` is not used.
void eb_aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
    (void)left;
    const __m128i above_row = _mm_load_si128((const __m128i *)above);
    dc_store_16xh(&above_row, 64, dst, stride);
}
// Vertical prediction, 16 columns x 8 rows: every row of `dst` receives the
// 16 bytes at `above`. `above` is read with an aligned load and must be
// 16-byte aligned. `left` is not used.
void eb_aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)left;
    const __m128i above_row = _mm_load_si128((const __m128i *)above);
    dc_store_16xh(&above_row, 8, dst, stride);
}
716 :
// Vertical prediction, 32 columns x 8 rows: every row of `dst` receives the
// 32 bytes at `above`. `left` is not used.
void eb_aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    const int32_t block_rows = 8;
    (void)left;
    v_predictor_32xh(dst, stride, above, block_rows);
}
// Vertical prediction, 4 columns x 16 rows: every row of `dst` receives the
// 4 bytes at `above`. `left` is not used.
//
// The 4 bytes are fetched with memcpy instead of dereferencing `above`
// through a casted uint32_t pointer; that cast discarded const, assumed
// 4-byte alignment and violated the strict-aliasing rule.
void eb_aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    uint32_t pred;
    memcpy(&pred, above, sizeof(pred));
    (void)left;
    dc_store_4xh(pred, 16, dst, stride);
}
// Vertical prediction, 4 columns x 8 rows: every row of `dst` receives the
// 4 bytes at `above`. `left` is not used.
//
// The 4 bytes are fetched with memcpy instead of dereferencing `above`
// through a casted uint32_t pointer; that cast discarded const, assumed
// 4-byte alignment and violated the strict-aliasing rule.
void eb_aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
    uint32_t pred;
    memcpy(&pred, above, sizeof(pred));
    (void)left;
    dc_store_4xh(pred, 8, dst, stride);
}
// Vertical predictor, 8x16 block: loads the 8 bytes at `above` and passes
// them to dc_store_8xh (defined elsewhere in this file) with a height of 16.
// `left` is unused.
void eb_aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)left;
    // 64-bit load into the low half of the register; the upper half is zeroed.
    const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
    dc_store_8xh(&top_row, 16, dst, stride);
}
// Vertical predictor, 8x32 block: loads the 8 bytes at `above` and passes
// them to dc_store_8xh (defined elsewhere in this file) with a height of 32.
// `left` is unused.
void eb_aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
    (void)left;
    // 64-bit load into the low half of the register; the upper half is zeroed.
    const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
    dc_store_8xh(&top_row, 32, dst, stride);
}
// Vertical predictor, 8x4 block: loads the 8 bytes at `above` and passes
// them to dc_store_8xh (defined elsewhere in this file) with a height of 4.
// `left` is unused.
void eb_aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
    (void)left;
    // 64-bit load into the low half of the register; the upper half is zeroed.
    const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
    dc_store_8xh(&top_row, 4, dst, stride);
}
752 : // -----------------------------------------------------------------------------
753 : // DC_128
754 :
// DC_128 predictor, 4x8 block: ignores both neighbour arrays and has
// dc_store_4xh write the constant 128 (0x80 in every byte) to 8 rows.
void eb_aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left) {
    const uint32_t mid_gray = 0x80808080u;  // four bytes of 128
    (void)left;
    (void)above;
    dc_store_4xh(mid_gray, 8, dst, stride);
}
762 :
// DC_128 predictor, 4x16 block: ignores both neighbour arrays and has
// dc_store_4xh write the constant 128 (0x80 in every byte) to 16 rows.
void eb_aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    const uint32_t mid_gray = 0x80808080u;  // four bytes of 128
    (void)left;
    (void)above;
    dc_store_4xh(mid_gray, 16, dst, stride);
}
770 :
// DC_128 predictor, 8x4 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_8xh with a height of 4.
void eb_aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_8xh(&mid_gray, 4, dst, stride);
}
778 :
// DC_128 predictor, 8x16 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_8xh with a height of 16.
void eb_aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_8xh(&mid_gray, 16, dst, stride);
}
786 :
// DC_128 predictor, 8x32 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_8xh with a height of 32.
void eb_aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_8xh(&mid_gray, 32, dst, stride);
}
794 :
// DC_128 predictor, 16x4 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_16xh with a height of 4.
void eb_aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_16xh(&mid_gray, 4, dst, stride);
}
802 :
// DC_128 predictor, 16x8 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_16xh with a height of 8.
void eb_aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_16xh(&mid_gray, 8, dst, stride);
}
810 :
// DC_128 predictor, 16x32 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_16xh with a height of 32.
void eb_aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_16xh(&mid_gray, 32, dst, stride);
}
819 :
// DC_128 predictor, 16x64 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_16xh with a height of 64.
void eb_aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_16xh(&mid_gray, 64, dst, stride);
}
828 :
// DC_128 predictor, 32x8 block: ignores both neighbour arrays and passes a
// register filled with the byte 0x80 (128) to dc_store_32xh with a height of 8.
void eb_aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left) {
    // -128 as a signed byte has the bit pattern 0x80, i.e. 128 unsigned.
    const __m128i mid_gray = _mm_set1_epi8((int8_t)-128);
    (void)left;
    (void)above;
    dc_store_32xh(&mid_gray, 8, dst, stride);
}
836 :
837 : // -----------------------------------------------------------------------------
838 : // DC_TOP
839 :
840 106255 : void eb_aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
841 : const uint8_t *above, const uint8_t *left) {
842 : (void)left;
843 106255 : __m128i sum_above = dc_sum_4(above);
844 106255 : const __m128i two = _mm_set1_epi16((int16_t)2);
845 106255 : sum_above = _mm_add_epi16(sum_above, two);
846 106255 : sum_above = _mm_srai_epi16(sum_above, 2);
847 106255 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
848 106255 : sum_above = _mm_packus_epi16(sum_above, sum_above);
849 :
850 106255 : const uint32_t pred = _mm_cvtsi128_si32(sum_above);
851 106255 : dc_store_4xh(pred, 8, dst, stride);
852 106255 : }
853 :
854 19213 : void eb_aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
855 : const uint8_t *above, const uint8_t *left) {
856 : (void)left;
857 19213 : __m128i sum_above = dc_sum_4(above);
858 19213 : const __m128i two = _mm_set1_epi16((int16_t)2);
859 19213 : sum_above = _mm_add_epi16(sum_above, two);
860 19213 : sum_above = _mm_srai_epi16(sum_above, 2);
861 19213 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
862 19213 : sum_above = _mm_packus_epi16(sum_above, sum_above);
863 :
864 19213 : const uint32_t pred = _mm_cvtsi128_si32(sum_above);
865 19213 : dc_store_4xh(pred, 16, dst, stride);
866 19213 : }
867 :
868 227262 : void eb_aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
869 : const uint8_t *above, const uint8_t *left) {
870 : (void)left;
871 227262 : __m128i sum_above = dc_sum_8(above);
872 227261 : const __m128i four = _mm_set1_epi16((uint16_t)4);
873 227261 : sum_above = _mm_add_epi16(sum_above, four);
874 227261 : sum_above = _mm_srai_epi16(sum_above, 3);
875 227261 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
876 227261 : const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
877 227261 : dc_store_8xh(&row, 4, dst, stride);
878 227265 : }
879 :
880 27594 : void eb_aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
881 : const uint8_t *above, const uint8_t *left) {
882 : (void)left;
883 27594 : __m128i sum_above = dc_sum_8(above);
884 27594 : const __m128i four = _mm_set1_epi16((uint16_t)4);
885 27594 : sum_above = _mm_add_epi16(sum_above, four);
886 27594 : sum_above = _mm_srai_epi16(sum_above, 3);
887 27594 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
888 27594 : const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
889 27594 : dc_store_8xh(&row, 16, dst, stride);
890 27594 : }
891 :
892 2039 : void eb_aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
893 : const uint8_t *above, const uint8_t *left) {
894 : (void)left;
895 2039 : __m128i sum_above = dc_sum_8(above);
896 2039 : const __m128i four = _mm_set1_epi16((uint16_t)4);
897 2039 : sum_above = _mm_add_epi16(sum_above, four);
898 2039 : sum_above = _mm_srai_epi16(sum_above, 3);
899 2039 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
900 2039 : const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
901 2039 : dc_store_8xh(&row, 32, dst, stride);
902 2039 : }
903 :
904 74793 : void eb_aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
905 : const uint8_t *above, const uint8_t *left) {
906 : (void)left;
907 74793 : __m128i sum_above = dc_sum_16(above);
908 74793 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
909 74793 : sum_above = _mm_add_epi16(sum_above, eight);
910 74793 : sum_above = _mm_srai_epi16(sum_above, 4);
911 74793 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
912 74793 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
913 74793 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
914 74793 : dc_store_16xh(&row, 4, dst, stride);
915 74793 : }
916 :
917 72995 : void eb_aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
918 : const uint8_t *above, const uint8_t *left) {
919 : (void)left;
920 72995 : __m128i sum_above = dc_sum_16(above);
921 72995 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
922 72995 : sum_above = _mm_add_epi16(sum_above, eight);
923 72995 : sum_above = _mm_srai_epi16(sum_above, 4);
924 72995 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
925 72995 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
926 72995 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
927 72995 : dc_store_16xh(&row, 8, dst, stride);
928 72995 : }
929 :
930 7800 : void eb_aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
931 : const uint8_t *above,
932 : const uint8_t *left) {
933 : (void)left;
934 7800 : __m128i sum_above = dc_sum_16(above);
935 7800 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
936 7800 : sum_above = _mm_add_epi16(sum_above, eight);
937 7800 : sum_above = _mm_srai_epi16(sum_above, 4);
938 7800 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
939 7800 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
940 7800 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
941 7800 : dc_store_16xh(&row, 32, dst, stride);
942 7800 : }
943 :
944 138 : void eb_aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
945 : const uint8_t *above,
946 : const uint8_t *left) {
947 : (void)left;
948 138 : __m128i sum_above = dc_sum_16(above);
949 138 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
950 138 : sum_above = _mm_add_epi16(sum_above, eight);
951 138 : sum_above = _mm_srai_epi16(sum_above, 4);
952 138 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
953 138 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
954 138 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
955 138 : dc_store_16xh(&row, 64, dst, stride);
956 138 : }
957 :
958 7751 : void eb_aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
959 : const uint8_t *above, const uint8_t *left) {
960 : (void)left;
961 7751 : __m128i sum_above = dc_sum_32(above);
962 7751 : const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
963 7751 : sum_above = _mm_add_epi16(sum_above, sixteen);
964 7751 : sum_above = _mm_srai_epi16(sum_above, 5);
965 7751 : sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
966 7751 : sum_above = _mm_shufflelo_epi16(sum_above, 0);
967 7751 : const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
968 7751 : dc_store_32xh(&row, 8, dst, stride);
969 7751 : }
970 :
971 : // -----------------------------------------------------------------------------
972 : // DC_LEFT
973 :
974 121184 : void eb_aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
975 : const uint8_t *above, const uint8_t *left) {
976 : (void)above;
977 121184 : __m128i sum_left = dc_sum_8(left);
978 121184 : const __m128i four = _mm_set1_epi16((uint16_t)4);
979 121184 : sum_left = _mm_add_epi16(sum_left, four);
980 121184 : sum_left = _mm_srai_epi16(sum_left, 3);
981 121184 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
982 121184 : sum_left = _mm_packus_epi16(sum_left, sum_left);
983 :
984 121184 : const uint32_t pred = _mm_cvtsi128_si32(sum_left);
985 121184 : dc_store_4xh(pred, 8, dst, stride);
986 121184 : }
987 :
988 61725 : void eb_aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
989 : const uint8_t *above,
990 : const uint8_t *left) {
991 : (void)above;
992 61725 : __m128i sum_left = dc_sum_16(left);
993 61725 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
994 61725 : sum_left = _mm_add_epi16(sum_left, eight);
995 61725 : sum_left = _mm_srai_epi16(sum_left, 4);
996 61725 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
997 61725 : sum_left = _mm_packus_epi16(sum_left, sum_left);
998 :
999 61725 : const uint32_t pred = _mm_cvtsi128_si32(sum_left);
1000 61725 : dc_store_4xh(pred, 16, dst, stride);
1001 61725 : }
1002 :
1003 50950 : void eb_aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
1004 : const uint8_t *above, const uint8_t *left) {
1005 : (void)above;
1006 50950 : __m128i sum_left = dc_sum_4(left);
1007 50950 : const __m128i two = _mm_set1_epi16((uint16_t)2);
1008 50950 : sum_left = _mm_add_epi16(sum_left, two);
1009 50950 : sum_left = _mm_srai_epi16(sum_left, 2);
1010 50950 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1011 50950 : const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
1012 50950 : dc_store_8xh(&row, 4, dst, stride);
1013 50950 : }
1014 :
1015 64721 : void eb_aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
1016 : const uint8_t *above,
1017 : const uint8_t *left) {
1018 : (void)above;
1019 64721 : __m128i sum_left = dc_sum_16(left);
1020 64721 : const __m128i eight = _mm_set1_epi16((uint16_t)8);
1021 64721 : sum_left = _mm_add_epi16(sum_left, eight);
1022 64721 : sum_left = _mm_srai_epi16(sum_left, 4);
1023 64721 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1024 64721 : const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
1025 64721 : dc_store_8xh(&row, 16, dst, stride);
1026 64721 : }
1027 :
1028 7394 : void eb_aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
1029 : const uint8_t *above,
1030 : const uint8_t *left) {
1031 : (void)above;
1032 7394 : __m128i sum_left = dc_sum_32(left);
1033 7394 : const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
1034 7394 : sum_left = _mm_add_epi16(sum_left, sixteen);
1035 7394 : sum_left = _mm_srai_epi16(sum_left, 5);
1036 7394 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1037 7394 : const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
1038 7394 : dc_store_8xh(&row, 32, dst, stride);
1039 7394 : }
1040 :
1041 15905 : void eb_aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
1042 : const uint8_t *above,
1043 : const uint8_t *left) {
1044 : (void)above;
1045 15905 : __m128i sum_left = dc_sum_4(left);
1046 15905 : const __m128i two = _mm_set1_epi16((uint16_t)2);
1047 15905 : sum_left = _mm_add_epi16(sum_left, two);
1048 15905 : sum_left = _mm_srai_epi16(sum_left, 2);
1049 15905 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1050 15905 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
1051 15905 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
1052 15905 : dc_store_16xh(&row, 4, dst, stride);
1053 15905 : }
1054 :
1055 23277 : void eb_aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
1056 : const uint8_t *above,
1057 : const uint8_t *left) {
1058 : (void)above;
1059 23277 : __m128i sum_left = dc_sum_8(left);
1060 23277 : const __m128i four = _mm_set1_epi16((uint16_t)4);
1061 23277 : sum_left = _mm_add_epi16(sum_left, four);
1062 23277 : sum_left = _mm_srai_epi16(sum_left, 3);
1063 23277 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1064 23277 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
1065 23277 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
1066 23277 : dc_store_16xh(&row, 8, dst, stride);
1067 23277 : }
1068 :
1069 17698 : void eb_aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
1070 : const uint8_t *above,
1071 : const uint8_t *left) {
1072 : (void)above;
1073 17698 : __m128i sum_left = dc_sum_32(left);
1074 17698 : const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
1075 17698 : sum_left = _mm_add_epi16(sum_left, sixteen);
1076 17698 : sum_left = _mm_srai_epi16(sum_left, 5);
1077 17698 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1078 17698 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
1079 17698 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
1080 17698 : dc_store_16xh(&row, 32, dst, stride);
1081 17698 : }
1082 :
1083 358 : void eb_aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
1084 : const uint8_t *above,
1085 : const uint8_t *left) {
1086 : (void)above;
1087 358 : __m128i sum_left = dc_sum_64(left);
1088 358 : const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
1089 358 : sum_left = _mm_add_epi16(sum_left, thirtytwo);
1090 358 : sum_left = _mm_srai_epi16(sum_left, 6);
1091 358 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1092 358 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
1093 358 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
1094 358 : dc_store_16xh(&row, 64, dst, stride);
1095 358 : }
1096 :
1097 2922 : void eb_aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
1098 : const uint8_t *above,
1099 : const uint8_t *left) {
1100 : (void)above;
1101 2922 : __m128i sum_left = dc_sum_8(left);
1102 2922 : const __m128i four = _mm_set1_epi16((uint16_t)4);
1103 2922 : sum_left = _mm_add_epi16(sum_left, four);
1104 2922 : sum_left = _mm_srai_epi16(sum_left, 3);
1105 2922 : sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
1106 2922 : sum_left = _mm_shufflelo_epi16(sum_left, 0);
1107 2922 : const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
1108 2922 : dc_store_32xh(&row, 8, dst, stride);
1109 2922 : }
1110 :
1111 : // -----------------------------------------------------------------------------
1112 : // DC_PRED
1113 :
1114 1646180 : void eb_aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
1115 : const uint8_t *above, const uint8_t *left) {
1116 1646180 : const __m128i sum_left = dc_sum_8(left);
1117 1646190 : __m128i sum_above = dc_sum_4(above);
1118 1646180 : sum_above = _mm_add_epi16(sum_left, sum_above);
1119 :
1120 1646180 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1121 1646180 : sum += 6;
1122 1646180 : sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
1123 :
1124 3292360 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1125 1646180 : const uint32_t pred = _mm_cvtsi128_si32(row);
1126 1646180 : dc_store_4xh(pred, 8, dst, stride);
1127 1646220 : }
1128 :
1129 427359 : void eb_aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
1130 : const uint8_t *above, const uint8_t *left) {
1131 427359 : const __m128i sum_left = dc_sum_16(left);
1132 427358 : __m128i sum_above = dc_sum_4(above);
1133 427357 : sum_above = _mm_add_epi16(sum_left, sum_above);
1134 :
1135 427357 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1136 427357 : sum += 10;
1137 427357 : sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
1138 :
1139 854714 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1140 427357 : const uint32_t pred = _mm_cvtsi128_si32(row);
1141 427357 : dc_store_4xh(pred, 16, dst, stride);
1142 427354 : }
1143 :
1144 1589560 : void eb_aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
1145 : const uint8_t *above, const uint8_t *left) {
1146 1589560 : const __m128i sum_left = dc_sum_4(left);
1147 1589560 : __m128i sum_above = dc_sum_8(above);
1148 1589550 : sum_above = _mm_add_epi16(sum_above, sum_left);
1149 :
1150 1589550 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1151 1589550 : sum += 6;
1152 1589550 : sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
1153 :
1154 1589550 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1155 1589550 : dc_store_8xh(&row, 4, dst, stride);
1156 1589560 : }
1157 :
1158 537524 : void eb_aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
1159 : const uint8_t *above, const uint8_t *left) {
1160 537524 : const __m128i sum_left = dc_sum_16(left);
1161 537524 : __m128i sum_above = dc_sum_8(above);
1162 537525 : sum_above = _mm_add_epi16(sum_above, sum_left);
1163 :
1164 537525 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1165 537525 : sum += 12;
1166 537525 : sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
1167 537525 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1168 537525 : dc_store_8xh(&row, 16, dst, stride);
1169 537527 : }
1170 :
1171 30836 : void eb_aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
1172 : const uint8_t *above, const uint8_t *left) {
1173 30836 : const __m128i sum_left = dc_sum_32(left);
1174 30836 : __m128i sum_above = dc_sum_8(above);
1175 30836 : sum_above = _mm_add_epi16(sum_above, sum_left);
1176 :
1177 30836 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1178 30836 : sum += 20;
1179 30836 : sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
1180 30836 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1181 30836 : dc_store_8xh(&row, 32, dst, stride);
1182 30836 : }
1183 :
1184 417239 : void eb_aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
1185 : const uint8_t *above, const uint8_t *left) {
1186 417239 : const __m128i sum_left = dc_sum_4(left);
1187 417240 : __m128i sum_above = dc_sum_16(above);
1188 417238 : sum_above = _mm_add_epi16(sum_above, sum_left);
1189 :
1190 417238 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1191 417238 : sum += 10;
1192 417238 : sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
1193 417238 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1194 417238 : dc_store_16xh(&row, 4, dst, stride);
1195 417238 : }
1196 :
1197 490644 : void eb_aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
1198 : const uint8_t *above, const uint8_t *left) {
1199 490644 : const __m128i sum_left = dc_sum_8(left);
1200 490643 : __m128i sum_above = dc_sum_16(above);
1201 490642 : sum_above = _mm_add_epi16(sum_above, sum_left);
1202 :
1203 490642 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1204 490642 : sum += 12;
1205 490642 : sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
1206 490642 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1207 490642 : dc_store_16xh(&row, 8, dst, stride);
1208 490642 : }
1209 :
1210 110441 : void eb_aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
1211 : const uint8_t *above, const uint8_t *left) {
1212 110441 : const __m128i sum_left = dc_sum_32(left);
1213 110440 : __m128i sum_above = dc_sum_16(above);
1214 110440 : sum_above = _mm_add_epi16(sum_left, sum_above);
1215 :
1216 110440 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1217 110440 : sum += 24;
1218 110440 : sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
1219 110440 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1220 110440 : dc_store_16xh(&row, 32, dst, stride);
1221 110442 : }
1222 :
1223 2907 : void eb_aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
1224 : const uint8_t *above, const uint8_t *left) {
1225 2907 : const __m128i sum_left = dc_sum_64(left);
1226 2907 : __m128i sum_above = dc_sum_16(above);
1227 2907 : sum_above = _mm_add_epi16(sum_left, sum_above);
1228 :
1229 2907 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1230 2907 : sum += 40;
1231 2907 : sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
1232 2907 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1233 2907 : dc_store_16xh(&row, 64, dst, stride);
1234 2907 : }
1235 :
1236 21086 : void eb_aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
1237 : const uint8_t *above, const uint8_t *left) {
1238 21086 : __m128i sum_above = dc_sum_32(above);
1239 21086 : const __m128i sum_left = dc_sum_8(left);
1240 21086 : sum_above = _mm_add_epi16(sum_above, sum_left);
1241 :
1242 21086 : uint32_t sum = _mm_cvtsi128_si32(sum_above);
1243 21086 : sum += 20;
1244 21086 : sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
1245 21086 : const __m128i row = _mm_set1_epi8((uint8_t)sum);
1246 21086 : dc_store_32xh(&row, 8, dst, stride);
1247 21086 : }
|