Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 : #include <tmmintrin.h>
6 : #include "EbDefinitions.h"
7 : #include "aom_dsp_rtcd.h"
8 :
// Log2 of the smooth-weight fixed-point scale: each weight pair sums to 256.
static const int32_t sm_weight_log2_scale = 8;

// =============================================================================

// SMOOTH_PRED

// Interleaved (w, 256 - w) 16-bit weight pairs, laid out for _mm_madd_epi16:
// each pair blends a near sample with its complement toward the far edge.
// Alignment matches the _mm_load_si128 loads below.

// bs = 4
EB_ALIGN(16) static const uint16_t sm_weights_4[8] = {
    255, 1, 149, 107, 85, 171, 64, 192
};

// bs = 8
EB_ALIGN(32) static const uint16_t sm_weights_8[16] = {
    255, 1, 197, 59, 146, 110, 105, 151,
    73, 183, 50, 206, 37, 219, 32, 224
};

// bs = 16
EB_ALIGN(32) static const uint16_t sm_weights_16[32] = {
    255, 1, 225, 31, 196, 60, 170, 86,
    145, 111, 123, 133, 102, 154, 84, 172,
    68, 188, 54, 202, 43, 213, 33, 223,
    26, 230, 20, 236, 17, 239, 16, 240,
};
33 :
34 : // 4xN
35 :
36 0 : static INLINE void load_right_weights_4(const uint16_t *const above,
37 : __m128i *const r, __m128i *const weights)
38 : {
39 0 : *r = _mm_set1_epi16((uint16_t)above[3]);
40 0 : *weights = _mm_load_si128((const __m128i *)sm_weights_4);
41 0 : }
42 :
43 0 : static INLINE void init_4(const uint16_t *const above,
44 : const uint16_t *const left, const int32_t h, __m128i *const ab,
45 : __m128i *const r, __m128i *const weights_w, __m128i *const rep)
46 : {
47 0 : const __m128i a = _mm_loadl_epi64((const __m128i *)above);
48 0 : const __m128i b = _mm_set1_epi16((uint16_t)left[h - 1]);
49 0 : *ab = _mm_unpacklo_epi16(a, b);
50 0 : load_right_weights_4(above, r, weights_w);
51 :
52 0 : rep[0] = _mm_set1_epi32(0x03020100);
53 0 : rep[1] = _mm_set1_epi32(0x07060504);
54 0 : rep[2] = _mm_set1_epi32(0x0B0A0908);
55 0 : rep[3] = _mm_set1_epi32(0x0F0E0D0C);
56 0 : }
57 :
58 0 : static INLINE void load_left_8(const uint16_t *const left,
59 : const __m128i r, __m128i *const lr)
60 : {
61 0 : const __m128i l = _mm_load_si128((const __m128i *)left);
62 0 : lr[0] = _mm_unpacklo_epi16(l, r); // 0 1 2 3
63 0 : lr[1] = _mm_unpackhi_epi16(l, r); // 4 5 6 7
64 0 : }
65 :
// Compute one 4-pixel SMOOTH row, returned as four 32-bit lanes.
// ab holds interleaved (above[x], left[h-1]) pairs; lr holds interleaved
// (left[y], above[w-1]) pairs; rep broadcasts this row's entries.
// The vertical and horizontal blends are summed, rounded, and scaled back
// by 2^(1 + sm_weight_log2_scale) - the extra bit averages the two passes.
static INLINE __m128i smooth_pred_4(const __m128i weights_w,
    const __m128i weights_h, const __m128i rep,
    const __m128i ab, const __m128i lr)
{
    const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
    const __m128i w = _mm_shuffle_epi8(weights_h, rep); // row's (w, 256-w) pair
    const __m128i t = _mm_shuffle_epi8(lr, rep);        // row's (left, right) pair
    const __m128i s0 = _mm_madd_epi16(ab, w);           // vertical blend
    const __m128i s1 = _mm_madd_epi16(t, weights_w);    // horizontal blend
    __m128i sum;

    sum = _mm_add_epi32(s0, s1);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
    return sum;
}
82 :
// Emit two 4-pixel SMOOTH rows: compute both 32-bit row results, pack them
// to 16-bit, then store the low half to row 0 and the high half to row 1.
// *dst is advanced past both rows.
static INLINE void smooth_pred_4x2(const __m128i weights_w,
    const __m128i weights_h, const __m128i *const rep, const __m128i ab,
    const __m128i lr, uint16_t **const dst, const ptrdiff_t stride)
{
    const __m128i sum0 = smooth_pred_4(weights_w, weights_h, rep[0], ab, lr);
    const __m128i sum1 = smooth_pred_4(weights_w, weights_h, rep[1], ab, lr);
    const __m128i sum = _mm_packs_epi32(sum0, sum1);
    _mm_storel_epi64((__m128i *)*dst, sum);
    *dst += stride;
    // Second row lives in the high 8 bytes; store it via the double view.
    _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum));
    *dst += stride;
}
95 :
96 0 : static INLINE void smooth_pred_4x4(const __m128i weights_w,
97 : const __m128i weights_h, const __m128i *const rep, const __m128i ab,
98 : const __m128i lr, uint16_t **const dst, const ptrdiff_t stride)
99 : {
100 0 : smooth_pred_4x2(weights_w, weights_h, rep + 0, ab, lr, dst, stride);
101 0 : smooth_pred_4x2(weights_w, weights_h, rep + 2, ab, lr, dst, stride);
102 0 : }
103 :
104 : // 4x4
105 :
// Highbd SMOOTH predictor for a 4x4 block. Because width == height == 4,
// the same 4-entry weight table serves as both the horizontal and the
// vertical weights - weights_w is intentionally passed twice below.
void eb_aom_highbd_smooth_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i l = _mm_loadl_epi64((const __m128i *)left);
    __m128i ab, r, lr, weights_w, rep[4];
    (void)bd; // bit depth unused: arithmetic stays within 16-bit lanes

    init_4(above, left, 4, &ab, &r, &weights_w, rep);
    lr = _mm_unpacklo_epi16(l, r);
    smooth_pred_4x4(weights_w, weights_w, rep, ab, lr, &dst, stride);
}
117 :
118 : // 4x8
119 :
120 0 : void eb_aom_highbd_smooth_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride,
121 : const uint16_t *above, const uint16_t *left, int32_t bd)
122 : {
123 : __m128i ab, r, lr[2], weights_w, weights_h, rep[4];
124 : (void)bd;
125 :
126 0 : init_4(above, left, 8, &ab, &r, &weights_w, rep);
127 0 : load_left_8(left, r, lr);
128 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_8 + 0));
129 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
130 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_8 + 8));
131 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
132 0 : }
133 :
134 : // 4x16
135 :
136 0 : void eb_aom_highbd_smooth_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride,
137 : const uint16_t *above, const uint16_t *left, int32_t bd)
138 : {
139 : __m128i ab, r, lr[2], weights_w, weights_h, rep[4];
140 : (void)bd;
141 :
142 0 : init_4(above, left, 16, &ab, &r, &weights_w, rep);
143 :
144 0 : load_left_8(left + 0, r, lr);
145 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 0));
146 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
147 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 8));
148 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
149 :
150 0 : load_left_8(left + 8, r, lr);
151 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 16));
152 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
153 0 : weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 24));
154 0 : smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
155 0 : }
156 :
157 : // =============================================================================
158 :
159 : // SMOOTH_H_PRED
160 :
161 : // 4xN
162 :
// Compute one 4-pixel SMOOTH_H row and advance *lr to the next row.
// *lr holds interleaved (left[y], above[w-1]) pairs; the lowest pair is
// broadcast, madd'ed against the (w, 256-w) width weights, rounded, and
// scaled back by 2^sm_weight_log2_scale. The 4-byte shift consumes the
// pair just used so the next call sees the following row.
static INLINE __m128i smooth_h_pred_4(const __m128i weights,
    __m128i *const lr)
{
    const __m128i round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i rep = _mm_set1_epi32(0x03020100); // broadcast lowest pair
    const __m128i t = _mm_shuffle_epi8(*lr, rep);
    const __m128i sum0 = _mm_madd_epi16(t, weights);
    const __m128i sum1 = _mm_add_epi32(sum0, round);
    const __m128i sum2 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
    *lr = _mm_srli_si128(*lr, 4); // drop this row's (left, right) pair
    return sum2;
}
175 :
// Emit two SMOOTH_H rows: smooth_h_pred_4 advances *lr between the calls,
// so sum0/sum1 are consecutive rows. Pack to 16-bit, store the low half to
// row 0 and the high half to row 1, advancing *dst past both.
static INLINE void smooth_h_pred_4x2(const __m128i weights, __m128i *const lr,
    uint16_t **const dst, const ptrdiff_t stride)
{
    const __m128i sum0 = smooth_h_pred_4(weights, lr);
    const __m128i sum1 = smooth_h_pred_4(weights, lr);
    const __m128i sum = _mm_packs_epi32(sum0, sum1);
    _mm_storel_epi64((__m128i *)*dst, sum);
    *dst += stride;
    // Second row lives in the high 8 bytes; store it via the double view.
    _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum));
    *dst += stride;
}
187 :
188 0 : static INLINE void smooth_h_pred_4x4(const __m128i weights, __m128i *const lr,
189 : uint16_t **const dst, const ptrdiff_t stride)
190 : {
191 0 : smooth_h_pred_4x2(weights, lr, dst, stride);
192 0 : smooth_h_pred_4x2(weights, lr, dst, stride);
193 0 : }
194 :
195 : // 4x4
196 :
197 0 : void eb_aom_highbd_smooth_h_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
198 : const uint16_t *above, const uint16_t *left, int32_t bd)
199 : {
200 0 : const __m128i l = _mm_loadl_epi64((const __m128i *)left);
201 : __m128i r, weights;
202 : (void)bd;
203 :
204 0 : load_right_weights_4(above, &r, &weights);
205 0 : __m128i lr = _mm_unpacklo_epi16(l, r);
206 0 : smooth_h_pred_4x4(weights, &lr, &dst, stride);
207 0 : }
208 :
209 : // 4x8
210 :
211 0 : void eb_aom_highbd_smooth_h_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride,
212 : const uint16_t *above, const uint16_t *left, int32_t bd)
213 : {
214 : __m128i r, lr[2], weights;
215 : (void)bd;
216 :
217 0 : load_right_weights_4(above, &r, &weights);
218 0 : load_left_8(left, r, lr);
219 0 : smooth_h_pred_4x4(weights, &lr[0], &dst, stride);
220 0 : smooth_h_pred_4x4(weights, &lr[1], &dst, stride);
221 0 : }
222 :
223 : // 4x16
224 :
225 0 : void eb_aom_highbd_smooth_h_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride,
226 : const uint16_t *above, const uint16_t *left, int32_t bd)
227 : {
228 : __m128i r, lr[2], weights;
229 : (void)bd;
230 :
231 0 : load_right_weights_4(above, &r, &weights);
232 0 : load_left_8(left + 0, r, lr);
233 0 : smooth_h_pred_4x4(weights, &lr[0], &dst, stride);
234 0 : smooth_h_pred_4x4(weights, &lr[1], &dst, stride);
235 0 : load_left_8(left + 8, r, lr);
236 0 : smooth_h_pred_4x4(weights, &lr[0], &dst, stride);
237 0 : smooth_h_pred_4x4(weights, &lr[1], &dst, stride);
238 0 : }
239 :
240 : // =============================================================================
241 :
242 : // SMOOTH_V_PRED
243 :
244 : // 4xN
245 :
246 0 : static INLINE void smooth_v_init_4(const uint16_t *const above,
247 : const uint16_t *const left, const int32_t h, __m128i *const ab,
248 : __m128i *const rep)
249 : {
250 0 : const __m128i a = _mm_loadl_epi64((const __m128i *)above);
251 0 : const __m128i b = _mm_set1_epi16((uint16_t)left[h - 1]);
252 0 : *ab = _mm_unpacklo_epi16(a, b);
253 :
254 0 : rep[0] = _mm_set1_epi32(0x03020100);
255 0 : rep[1] = _mm_set1_epi32(0x07060504);
256 0 : rep[2] = _mm_set1_epi32(0x0B0A0908);
257 0 : rep[3] = _mm_set1_epi32(0x0F0E0D0C);
258 0 : }
259 :
// Compute one 4-pixel SMOOTH_V row (four 32-bit lanes). ab holds
// interleaved (above[x], left[h-1]) pairs; rep broadcasts the current
// row's (w, 256-w) height-weight pair. Result is rounded and scaled back
// by 2^sm_weight_log2_scale.
static INLINE __m128i smooth_v_pred_4(const __m128i weights, const __m128i rep,
    const __m128i ab)
{
    const __m128i round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i w = _mm_shuffle_epi8(weights, rep);
    const __m128i sum0 = _mm_madd_epi16(ab, w);
    __m128i sum;

    sum = _mm_add_epi32(sum0, round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    return sum;
}
272 :
// Emit two SMOOTH_V rows: pack the two 32-bit row results to 16-bit and
// store the low/high halves to consecutive rows of *dst, advancing it.
static INLINE void smooth_v_pred_4x2(const __m128i weights,
    const __m128i *const rep, const __m128i ab, uint16_t **const dst,
    const ptrdiff_t stride)
{
    const __m128i sum0 = smooth_v_pred_4(weights, rep[0], ab);
    const __m128i sum1 = smooth_v_pred_4(weights, rep[1], ab);
    const __m128i sum = _mm_packs_epi32(sum0, sum1);
    _mm_storel_epi64((__m128i *)*dst, sum);
    *dst += stride;
    // Second row lives in the high 8 bytes; store it via the double view.
    _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum));
    *dst += stride;
}
285 :
286 0 : static INLINE void smooth_v_pred_4x4(const __m128i weights,
287 : const __m128i *const rep, const __m128i ab, uint16_t **const dst,
288 : const ptrdiff_t stride)
289 : {
290 0 : smooth_v_pred_4x2(weights, rep + 0, ab, dst, stride);
291 0 : smooth_v_pred_4x2(weights, rep + 2, ab, dst, stride);
292 0 : }
293 :
294 : // 4x4
295 :
296 0 : void eb_aom_highbd_smooth_v_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
297 : const uint16_t *above, const uint16_t *left, int32_t bd)
298 : {
299 : __m128i ab, rep[4];
300 : (void)bd;
301 :
302 0 : smooth_v_init_4(above, left, 4, &ab, rep);
303 0 : const __m128i weights = _mm_load_si128((const __m128i *)sm_weights_4);
304 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
305 0 : }
306 :
307 : // 4x8
308 :
309 0 : void eb_aom_highbd_smooth_v_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride,
310 : const uint16_t *above, const uint16_t *left, int32_t bd)
311 : {
312 : __m128i ab, weights, rep[4];
313 : (void)bd;
314 :
315 0 : smooth_v_init_4(above, left, 8, &ab, rep);
316 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_8 + 0));
317 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
318 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_8 + 8));
319 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
320 0 : }
321 :
322 : // 4x16
323 :
324 0 : void eb_aom_highbd_smooth_v_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride,
325 : const uint16_t *above, const uint16_t *left, int32_t bd)
326 : {
327 : __m128i ab, weights, rep[4];
328 : (void)bd;
329 :
330 0 : smooth_v_init_4(above, left, 16, &ab, rep);
331 :
332 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 0));
333 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
334 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 8));
335 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
336 :
337 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 16));
338 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
339 0 : weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 24));
340 0 : smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
341 0 : }
342 :
// Return 8 16-bit pixels in one row.
// Branchless Paeth filter: base = left + top - topleft, then per lane pick
// the neighbor (left, top, or topleft) whose value is closest to base.
// Ties resolve in the order left, top, topleft, matching the scalar
// reference implementation.
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
    const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  // Absolute prediction error for each candidate neighbor.
  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  // mask1: lanes where left is NOT the (tie-broken) minimum.
  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  // mask2: lanes where topleft strictly beats top.
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);  // keep left where it wins

  ptl = _mm_and_si128(mask2, *topleft); // topleft where it beats top
  pt = _mm_andnot_si128(mask2, *top);   // otherwise top
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);        // only where left lost

  return _mm_or_si128(pl, pt);          // merge the disjoint selections
}
365 :
366 131225 : void eb_aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
367 : const uint8_t *above, const uint8_t *left) {
368 131225 : __m128i l = _mm_loadl_epi64((const __m128i *)left);
369 131225 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
370 131225 : const __m128i zero = _mm_setzero_si128();
371 131225 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
372 262450 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
373 131225 : __m128i rep = _mm_set1_epi16(0x8000);
374 131225 : const __m128i one = _mm_set1_epi16(1);
375 :
376 : int i;
377 656119 : for (i = 0; i < 4; ++i) {
378 524896 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
379 524896 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
380 :
381 524894 : *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
382 524894 : dst += stride;
383 524894 : rep = _mm_add_epi16(rep, one);
384 : }
385 131223 : }
386 :
387 44528 : void eb_aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
388 : const uint8_t *above, const uint8_t *left) {
389 44528 : __m128i l = _mm_loadl_epi64((const __m128i *)left);
390 44528 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
391 44528 : const __m128i zero = _mm_setzero_si128();
392 44528 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
393 89056 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
394 44528 : __m128i rep = _mm_set1_epi16(0x8000);
395 44528 : const __m128i one = _mm_set1_epi16(1);
396 :
397 : int i;
398 400747 : for (i = 0; i < 8; ++i) {
399 356220 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
400 356220 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
401 :
402 356219 : *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
403 356219 : dst += stride;
404 356219 : rep = _mm_add_epi16(rep, one);
405 : }
406 44527 : }
407 :
408 20899 : void eb_aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
409 : const uint8_t *above, const uint8_t *left) {
410 20899 : __m128i l = _mm_load_si128((const __m128i *)left);
411 41798 : const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
412 20899 : const __m128i zero = _mm_setzero_si128();
413 20899 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
414 41798 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
415 20899 : __m128i rep = _mm_set1_epi16(0x8000);
416 20899 : const __m128i one = _mm_set1_epi16(1);
417 :
418 355275 : for (int i = 0; i < 16; ++i) {
419 334374 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
420 334374 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
421 :
422 334376 : *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
423 334376 : dst += stride;
424 334376 : rep = _mm_add_epi16(rep, one);
425 : }
426 20901 : }
427 :
428 42186 : void eb_aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
429 : const uint8_t *above, const uint8_t *left) {
430 42186 : __m128i l = _mm_loadl_epi64((const __m128i *)left);
431 42186 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
432 42186 : const __m128i zero = _mm_setzero_si128();
433 42186 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
434 84372 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
435 42186 : __m128i rep = _mm_set1_epi16(0x8000);
436 42186 : const __m128i one = _mm_set1_epi16(1);
437 :
438 : int i;
439 210930 : for (i = 0; i < 4; ++i) {
440 168744 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
441 168744 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
442 :
443 168744 : _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
444 168744 : dst += stride;
445 168744 : rep = _mm_add_epi16(rep, one);
446 : }
447 42186 : }
448 :
449 57007 : void eb_aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
450 : const uint8_t *above, const uint8_t *left) {
451 57007 : __m128i l = _mm_loadl_epi64((const __m128i *)left);
452 57007 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
453 57007 : const __m128i zero = _mm_setzero_si128();
454 57007 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
455 114014 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
456 57007 : __m128i rep = _mm_set1_epi16(0x8000);
457 57007 : const __m128i one = _mm_set1_epi16(1);
458 :
459 : int i;
460 513051 : for (i = 0; i < 8; ++i) {
461 456045 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
462 456045 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
463 :
464 456044 : _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
465 456044 : dst += stride;
466 456044 : rep = _mm_add_epi16(rep, one);
467 : }
468 57006 : }
469 :
470 15579 : void eb_aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
471 : const uint8_t *above, const uint8_t *left) {
472 15579 : __m128i l = _mm_load_si128((const __m128i *)left);
473 15579 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
474 15579 : const __m128i zero = _mm_setzero_si128();
475 15579 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
476 31158 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
477 15579 : __m128i rep = _mm_set1_epi16(0x8000);
478 15579 : const __m128i one = _mm_set1_epi16(1);
479 :
480 : int i;
481 264836 : for (i = 0; i < 16; ++i) {
482 249256 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
483 249256 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
484 :
485 249257 : _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
486 249257 : dst += stride;
487 249257 : rep = _mm_add_epi16(rep, one);
488 : }
489 15580 : }
490 :
491 14275 : void eb_aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
492 : const uint8_t *above, const uint8_t *left) {
493 14275 : const __m128i t = _mm_loadl_epi64((const __m128i *)above);
494 14275 : const __m128i zero = _mm_setzero_si128();
495 14275 : const __m128i t16 = _mm_unpacklo_epi8(t, zero);
496 28550 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
497 14275 : const __m128i one = _mm_set1_epi16(1);
498 :
499 42825 : for (int j = 0; j < 2; ++j) {
500 57100 : const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
501 28550 : __m128i rep = _mm_set1_epi16(0x8000);
502 485350 : for (int i = 0; i < 16; ++i) {
503 456800 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
504 456800 : const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
505 :
506 456800 : _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
507 456800 : dst += stride;
508 456800 : rep = _mm_add_epi16(rep, one);
509 : }
510 : }
511 14275 : }
512 :
513 : // Return 16 8-bit pixels in one row
514 347360 : static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
515 : const __m128i *top1,
516 : const __m128i *topleft) {
517 347360 : const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
518 347360 : const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
519 347360 : return _mm_packus_epi16(p0, p1);
520 : }
521 :
522 23576 : void eb_aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
523 : const uint8_t *above, const uint8_t *left) {
524 47152 : __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
525 23576 : const __m128i t = _mm_load_si128((const __m128i *)above);
526 23576 : const __m128i zero = _mm_setzero_si128();
527 23576 : const __m128i top0 = _mm_unpacklo_epi8(t, zero);
528 23576 : const __m128i top1 = _mm_unpackhi_epi8(t, zero);
529 47152 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
530 23576 : __m128i rep = _mm_set1_epi16(0x8000);
531 23576 : const __m128i one = _mm_set1_epi16(1);
532 :
533 117880 : for (int i = 0; i < 4; ++i) {
534 94304 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
535 94304 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
536 :
537 : _mm_storeu_si128((__m128i *)dst, row);
538 94304 : dst += stride;
539 94304 : rep = _mm_add_epi16(rep, one);
540 : }
541 23576 : }
542 :
543 0 : void eb_aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
544 : const uint8_t *above, const uint8_t *left) {
545 0 : __m128i l = _mm_loadl_epi64((const __m128i *)left);
546 0 : const __m128i t = _mm_load_si128((const __m128i *)above);
547 0 : const __m128i zero = _mm_setzero_si128();
548 0 : const __m128i top0 = _mm_unpacklo_epi8(t, zero);
549 0 : const __m128i top1 = _mm_unpackhi_epi8(t, zero);
550 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
551 0 : __m128i rep = _mm_set1_epi16(0x8000);
552 0 : const __m128i one = _mm_set1_epi16(1);
553 :
554 : int i;
555 0 : for (i = 0; i < 8; ++i) {
556 0 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
557 0 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
558 :
559 : _mm_storeu_si128((__m128i *)dst, row);
560 0 : dst += stride;
561 0 : rep = _mm_add_epi16(rep, one);
562 : }
563 0 : }
564 :
565 0 : void eb_aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
566 : const uint8_t *above,
567 : const uint8_t *left) {
568 0 : __m128i l = _mm_load_si128((const __m128i *)left);
569 0 : const __m128i t = _mm_load_si128((const __m128i *)above);
570 0 : const __m128i zero = _mm_setzero_si128();
571 0 : const __m128i top0 = _mm_unpacklo_epi8(t, zero);
572 0 : const __m128i top1 = _mm_unpackhi_epi8(t, zero);
573 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
574 0 : __m128i rep = _mm_set1_epi16(0x8000);
575 0 : const __m128i one = _mm_set1_epi16(1);
576 :
577 : int i;
578 0 : for (i = 0; i < 16; ++i) {
579 0 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
580 0 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
581 :
582 : _mm_storeu_si128((__m128i *)dst, row);
583 0 : dst += stride;
584 0 : rep = _mm_add_epi16(rep, one);
585 : }
586 0 : }
587 :
588 0 : void eb_aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
589 : const uint8_t *above,
590 : const uint8_t *left) {
591 0 : __m128i l = _mm_load_si128((const __m128i *)left);
592 0 : const __m128i t = _mm_load_si128((const __m128i *)above);
593 0 : const __m128i zero = _mm_setzero_si128();
594 0 : const __m128i top0 = _mm_unpacklo_epi8(t, zero);
595 0 : const __m128i top1 = _mm_unpackhi_epi8(t, zero);
596 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
597 0 : __m128i rep = _mm_set1_epi16(0x8000);
598 0 : const __m128i one = _mm_set1_epi16(1);
599 : __m128i l16;
600 :
601 : int i;
602 0 : for (i = 0; i < 16; ++i) {
603 0 : l16 = _mm_shuffle_epi8(l, rep);
604 0 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
605 :
606 : _mm_storeu_si128((__m128i *)dst, row);
607 0 : dst += stride;
608 0 : rep = _mm_add_epi16(rep, one);
609 : }
610 :
611 0 : l = _mm_load_si128((const __m128i *)(left + 16));
612 0 : rep = _mm_set1_epi16(0x8000);
613 0 : for (i = 0; i < 16; ++i) {
614 0 : l16 = _mm_shuffle_epi8(l, rep);
615 0 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
616 :
617 : _mm_storeu_si128((__m128i *)dst, row);
618 0 : dst += stride;
619 0 : rep = _mm_add_epi16(rep, one);
620 : }
621 0 : }
622 :
623 0 : void eb_aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
624 : const uint8_t *above,
625 : const uint8_t *left) {
626 0 : const __m128i t = _mm_load_si128((const __m128i *)above);
627 0 : const __m128i zero = _mm_setzero_si128();
628 0 : const __m128i top0 = _mm_unpacklo_epi8(t, zero);
629 0 : const __m128i top1 = _mm_unpackhi_epi8(t, zero);
630 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
631 0 : const __m128i one = _mm_set1_epi16(1);
632 :
633 0 : for (int j = 0; j < 4; ++j) {
634 0 : const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
635 0 : __m128i rep = _mm_set1_epi16(0x8000);
636 0 : for (int i = 0; i < 16; ++i) {
637 0 : const __m128i l16 = _mm_shuffle_epi8(l, rep);
638 0 : const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
639 : _mm_storeu_si128((__m128i *)dst, row);
640 0 : dst += stride;
641 0 : rep = _mm_add_epi16(rep, one);
642 : }
643 : }
644 0 : }
645 :
646 15816 : void eb_aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
647 : const uint8_t *above, const uint8_t *left) {
648 15816 : const __m128i a = _mm_load_si128((const __m128i *)above);
649 31632 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
650 15816 : const __m128i zero = _mm_setzero_si128();
651 15816 : const __m128i al = _mm_unpacklo_epi8(a, zero);
652 15816 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
653 15816 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
654 15816 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
655 :
656 31632 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
657 15816 : __m128i rep = _mm_set1_epi16(0x8000);
658 15816 : const __m128i one = _mm_set1_epi16(1);
659 15816 : const __m128i l = _mm_loadl_epi64((const __m128i *)left);
660 : __m128i l16;
661 :
662 142344 : for (int i = 0; i < 8; ++i) {
663 126528 : l16 = _mm_shuffle_epi8(l, rep);
664 126528 : const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
665 126528 : const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
666 :
667 : _mm_storeu_si128((__m128i *)dst, r32l);
668 126528 : _mm_storeu_si128((__m128i *)(dst + 16), r32h);
669 126528 : dst += stride;
670 126528 : rep = _mm_add_epi16(rep, one);
671 : }
672 15816 : }
673 :
674 0 : void eb_aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
675 : const uint8_t *above,
676 : const uint8_t *left) {
677 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
678 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
679 0 : const __m128i zero = _mm_setzero_si128();
680 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
681 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
682 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
683 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
684 :
685 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
686 0 : __m128i rep = _mm_set1_epi16(0x8000);
687 0 : const __m128i one = _mm_set1_epi16(1);
688 0 : __m128i l = _mm_load_si128((const __m128i *)left);
689 : __m128i l16;
690 :
691 : int i;
692 0 : for (i = 0; i < 16; ++i) {
693 0 : l16 = _mm_shuffle_epi8(l, rep);
694 0 : const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
695 0 : const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
696 :
697 : _mm_storeu_si128((__m128i *)dst, r32l);
698 0 : _mm_storeu_si128((__m128i *)(dst + 16), r32h);
699 0 : dst += stride;
700 0 : rep = _mm_add_epi16(rep, one);
701 : }
702 0 : }
703 :
704 0 : void eb_aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
705 : const uint8_t *above,
706 : const uint8_t *left) {
707 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
708 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
709 0 : const __m128i zero = _mm_setzero_si128();
710 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
711 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
712 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
713 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
714 :
715 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
716 0 : __m128i rep = _mm_set1_epi16(0x8000);
717 0 : const __m128i one = _mm_set1_epi16(1);
718 0 : __m128i l = _mm_load_si128((const __m128i *)left);
719 : __m128i l16;
720 :
721 : int i;
722 0 : for (i = 0; i < 16; ++i) {
723 0 : l16 = _mm_shuffle_epi8(l, rep);
724 0 : const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
725 0 : const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
726 :
727 : _mm_storeu_si128((__m128i *)dst, r32l);
728 0 : _mm_storeu_si128((__m128i *)(dst + 16), r32h);
729 0 : dst += stride;
730 0 : rep = _mm_add_epi16(rep, one);
731 : }
732 :
733 0 : rep = _mm_set1_epi16(0x8000);
734 0 : l = _mm_load_si128((const __m128i *)(left + 16));
735 0 : for (i = 0; i < 16; ++i) {
736 0 : l16 = _mm_shuffle_epi8(l, rep);
737 0 : const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
738 0 : const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
739 :
740 : _mm_storeu_si128((__m128i *)dst, r32l);
741 0 : _mm_storeu_si128((__m128i *)(dst + 16), r32h);
742 0 : dst += stride;
743 0 : rep = _mm_add_epi16(rep, one);
744 : }
745 0 : }
746 :
747 0 : void eb_aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
748 : const uint8_t *above,
749 : const uint8_t *left) {
750 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
751 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
752 0 : const __m128i zero = _mm_setzero_si128();
753 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
754 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
755 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
756 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
757 :
758 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
759 0 : const __m128i one = _mm_set1_epi16(1);
760 : __m128i l16;
761 :
762 : int i, j;
763 0 : for (j = 0; j < 4; ++j) {
764 0 : const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
765 0 : __m128i rep = _mm_set1_epi16(0x8000);
766 0 : for (i = 0; i < 16; ++i) {
767 0 : l16 = _mm_shuffle_epi8(l, rep);
768 0 : const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
769 0 : const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
770 :
771 : _mm_storeu_si128((__m128i *)dst, r32l);
772 0 : _mm_storeu_si128((__m128i *)(dst + 16), r32h);
773 0 : dst += stride;
774 0 : rep = _mm_add_epi16(rep, one);
775 : }
776 : }
777 0 : }
778 :
779 0 : void eb_aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
780 : const uint8_t *above,
781 : const uint8_t *left) {
782 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
783 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
784 0 : const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
785 0 : const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
786 0 : const __m128i zero = _mm_setzero_si128();
787 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
788 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
789 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
790 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
791 0 : const __m128i cl = _mm_unpacklo_epi8(c, zero);
792 0 : const __m128i ch = _mm_unpackhi_epi8(c, zero);
793 0 : const __m128i dl = _mm_unpacklo_epi8(d, zero);
794 0 : const __m128i dh = _mm_unpackhi_epi8(d, zero);
795 :
796 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
797 0 : const __m128i one = _mm_set1_epi16(1);
798 : __m128i l16;
799 :
800 : int i, j;
801 0 : for (j = 0; j < 2; ++j) {
802 0 : const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
803 0 : __m128i rep = _mm_set1_epi16(0x8000);
804 0 : for (i = 0; i < 16; ++i) {
805 0 : l16 = _mm_shuffle_epi8(l, rep);
806 0 : const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
807 0 : const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
808 0 : const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
809 0 : const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
810 :
811 : _mm_storeu_si128((__m128i *)dst, r0);
812 0 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
813 0 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
814 0 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
815 0 : dst += stride;
816 0 : rep = _mm_add_epi16(rep, one);
817 : }
818 : }
819 0 : }
820 :
821 0 : void eb_aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
822 : const uint8_t *above,
823 : const uint8_t *left) {
824 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
825 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
826 0 : const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
827 0 : const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
828 0 : const __m128i zero = _mm_setzero_si128();
829 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
830 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
831 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
832 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
833 0 : const __m128i cl = _mm_unpacklo_epi8(c, zero);
834 0 : const __m128i ch = _mm_unpackhi_epi8(c, zero);
835 0 : const __m128i dl = _mm_unpacklo_epi8(d, zero);
836 0 : const __m128i dh = _mm_unpackhi_epi8(d, zero);
837 :
838 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
839 0 : const __m128i one = _mm_set1_epi16(1);
840 : __m128i l16;
841 :
842 : int i, j;
843 0 : for (j = 0; j < 4; ++j) {
844 0 : const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
845 0 : __m128i rep = _mm_set1_epi16(0x8000);
846 0 : for (i = 0; i < 16; ++i) {
847 0 : l16 = _mm_shuffle_epi8(l, rep);
848 0 : const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
849 0 : const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
850 0 : const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
851 0 : const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
852 :
853 : _mm_storeu_si128((__m128i *)dst, r0);
854 0 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
855 0 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
856 0 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
857 0 : dst += stride;
858 0 : rep = _mm_add_epi16(rep, one);
859 : }
860 : }
861 0 : }
862 :
863 0 : void eb_aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
864 : const uint8_t *above,
865 : const uint8_t *left) {
866 0 : const __m128i a = _mm_load_si128((const __m128i *)above);
867 0 : const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
868 0 : const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
869 0 : const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
870 0 : const __m128i zero = _mm_setzero_si128();
871 0 : const __m128i al = _mm_unpacklo_epi8(a, zero);
872 0 : const __m128i ah = _mm_unpackhi_epi8(a, zero);
873 0 : const __m128i bl = _mm_unpacklo_epi8(b, zero);
874 0 : const __m128i bh = _mm_unpackhi_epi8(b, zero);
875 0 : const __m128i cl = _mm_unpacklo_epi8(c, zero);
876 0 : const __m128i ch = _mm_unpackhi_epi8(c, zero);
877 0 : const __m128i dl = _mm_unpacklo_epi8(d, zero);
878 0 : const __m128i dh = _mm_unpackhi_epi8(d, zero);
879 :
880 0 : const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
881 0 : const __m128i one = _mm_set1_epi16(1);
882 : __m128i l16;
883 :
884 : int i;
885 0 : const __m128i l = _mm_load_si128((const __m128i *)left);
886 0 : __m128i rep = _mm_set1_epi16(0x8000);
887 0 : for (i = 0; i < 16; ++i) {
888 0 : l16 = _mm_shuffle_epi8(l, rep);
889 0 : const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
890 0 : const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
891 0 : const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
892 0 : const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
893 :
894 : _mm_storeu_si128((__m128i *)dst, r0);
895 0 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
896 0 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
897 0 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
898 0 : dst += stride;
899 0 : rep = _mm_add_epi16(rep, one);
900 : }
901 0 : }
|