/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbPictureOperators_SSE2.h"
#include <emmintrin.h>
#include "EbDefinitions.h"

/*******************************************************************************
PictureAdditionKernel_INTRIN
*******************************************************************************/
void picture_addition_kernel4x4_sse_intrin(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    uint32_t y;
    __m128i xmm0, recon_0_3;
    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 4; ++y) {
        recon_0_3 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred_ptr), xmm0), _mm_loadl_epi64((__m128i *)residual_ptr)), xmm0);

        *(uint32_t *)recon_ptr = _mm_cvtsi128_si32(recon_0_3);
        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
    (void)width;
    (void)height;

    return;
}

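/*
 * Illustrative scalar equivalent of the picture-addition kernels in this file
 * (a sketch added for documentation; picture_addition_kernel_c is a
 * hypothetical name, not an API of this library). Each kernel computes
 * recon = clip(pred + residual, 0, 255) per pixel; in the SIMD versions the
 * _mm_packus_epi16 pack supplies the unsigned 8-bit saturation.
 */
static void picture_addition_kernel_c(
    uint8_t *pred_ptr, uint32_t pred_stride,
    int16_t *residual_ptr, uint32_t residual_stride,
    uint8_t *recon_ptr, uint32_t recon_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; ++y) {
        for (uint32_t x = 0; x < width; ++x) {
            const int32_t sum = (int32_t)pred_ptr[x] + residual_ptr[x];
            recon_ptr[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
}
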
void picture_addition_kernel8x8_sse2_intrin(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    __m128i recon_0_7, xmm0;
    uint32_t y;

    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 8; ++y) {
        recon_0_7 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred_ptr), xmm0), _mm_loadu_si128((__m128i *)residual_ptr)), xmm0);

        //store the low 8 bytes; _mm_storel_epi64 is portable to 32-bit builds,
        //unlike the x64-only _mm_cvtsi128_si64
        _mm_storel_epi64((__m128i *)recon_ptr, recon_0_7);
        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
    (void)width;
    (void)height;

    return;
}

void picture_addition_kernel16x16_sse2_intrin(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    __m128i xmm0, xmm_clip_U8, pred_0_15, recon_0_7, recon_8_15;
    uint32_t y;

    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 16; ++y) {
        pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
        recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
        recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
        xmm_clip_U8 = _mm_packus_epi16(recon_0_7, recon_8_15);

        _mm_storeu_si128((__m128i *)recon_ptr, xmm_clip_U8);

        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
    (void)width;
    (void)height;

    return;
}

void picture_addition_kernel32x32_sse2_intrin(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    uint32_t y;
    __m128i xmm0, pred_0_15, pred_16_31, recon_0_15_clipped, recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_16_31_clipped;
    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 32; ++y) {
        pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
        pred_16_31 = _mm_loadu_si128((__m128i *)(pred_ptr + 16));

        recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
        recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
        recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
        recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));

        recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
        recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);

        _mm_storeu_si128((__m128i *)recon_ptr, recon_0_15_clipped);
        _mm_storeu_si128((__m128i *)(recon_ptr + 16), recon_16_31_clipped);

        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
    (void)width;
    (void)height;

    return;
}

void picture_addition_kernel64x64_sse2_intrin(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    uint32_t y;

    __m128i xmm0, pred_0_15, pred_16_31, pred_32_47, pred_48_63;
    __m128i recon_0_15_clipped, recon_16_31_clipped, recon_32_47_clipped, recon_48_63_clipped;
    __m128i recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_32_39, recon_40_47, recon_48_55, recon_56_63;

    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 64; ++y) {
        pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
        pred_16_31 = _mm_loadu_si128((__m128i *)(pred_ptr + 16));
        pred_32_47 = _mm_loadu_si128((__m128i *)(pred_ptr + 32));
        pred_48_63 = _mm_loadu_si128((__m128i *)(pred_ptr + 48));

        recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
        recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
        recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
        recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));
        recon_32_39 = _mm_add_epi16(_mm_unpacklo_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 32)));
        recon_40_47 = _mm_add_epi16(_mm_unpackhi_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 40)));
        recon_48_55 = _mm_add_epi16(_mm_unpacklo_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 48)));
        recon_56_63 = _mm_add_epi16(_mm_unpackhi_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 56)));

        recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
        recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
        recon_32_47_clipped = _mm_packus_epi16(recon_32_39, recon_40_47);
        recon_48_63_clipped = _mm_packus_epi16(recon_48_55, recon_56_63);

        _mm_storeu_si128((__m128i *)recon_ptr, recon_0_15_clipped);
        _mm_storeu_si128((__m128i *)(recon_ptr + 16), recon_16_31_clipped);
        _mm_storeu_si128((__m128i *)(recon_ptr + 32), recon_32_47_clipped);
        _mm_storeu_si128((__m128i *)(recon_ptr + 48), recon_48_63_clipped);

        pred_ptr += pred_stride;
        residual_ptr += residual_stride;
        recon_ptr += recon_stride;
    }
    (void)width;
    (void)height;

    return;
}

/*******************************************************************************
residual_kernel
*******************************************************************************/
void residual_kernel_sub_sampled4x4_sse_intrin(
    uint8_t *input,
    uint32_t input_stride,
    uint8_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height,
    uint8_t last_line)
{
    __m128i residual_0_3, xmm0 = _mm_setzero_si128();
    uint32_t y;
    //hard-code the subsampling dimensions, keep residual_stride
    area_height >>= 1;
    input_stride <<= 1;
    pred_stride <<= 1;

    for (y = 0; y < area_height; ++y) {
        residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)input), xmm0),
                                     _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred), xmm0));

        _mm_storel_epi64((__m128i *)residual, residual_0_3);

        //duplicate the top-field residual into the bottom-field row
        residual += residual_stride;
        _mm_storel_epi64((__m128i *)residual, residual_0_3);

        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
    (void)area_width;
    //compute the last line
    if (last_line) {
        input -= input_stride >> 1;
        pred -= pred_stride >> 1;
        residual -= residual_stride;
        residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)input), xmm0),
                                     _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred), xmm0));

        _mm_storel_epi64((__m128i *)residual, residual_0_3);
    }

    return;
}

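/*
 * Illustrative scalar equivalent of the sub-sampled residual kernels (a
 * sketch for documentation; residual_kernel_sub_sampled_c is a hypothetical
 * name, and the SIMD versions above hard-code area_width). These kernels
 * difference every other source row, write it to two consecutive residual
 * rows so the bottom field reuses the top-field residual, and last_line
 * recomputes the final row from its true source line.
 */
static void residual_kernel_sub_sampled_c(
    uint8_t *input, uint32_t input_stride,
    uint8_t *pred, uint32_t pred_stride,
    int16_t *residual, uint32_t residual_stride,
    uint32_t area_width, uint32_t area_height, uint8_t last_line)
{
    for (uint32_t y = 0; y < area_height; y += 2) {
        for (uint32_t x = 0; x < area_width; ++x) {
            const int16_t diff = (int16_t)input[x] - (int16_t)pred[x];
            residual[x] = diff;                   //top-field row
            residual[residual_stride + x] = diff; //duplicated bottom-field row
        }
        input += input_stride << 1;
        pred += pred_stride << 1;
        residual += residual_stride << 1;
    }
    if (last_line) { //recompute the last row from the actual pixels
        input -= input_stride;
        pred -= pred_stride;
        residual -= residual_stride;
        for (uint32_t x = 0; x < area_width; ++x)
            residual[x] = (int16_t)input[x] - (int16_t)pred[x];
    }
}
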
void residual_kernel_sub_sampled8x8_sse2_intrin(
    uint8_t *input,
    uint32_t input_stride,
    uint8_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height,
    uint8_t last_line)
{
    __m128i xmm0, residual_0_7;
    uint32_t y;

    xmm0 = _mm_setzero_si128();
    //hard-code the subsampling dimensions, keep residual_stride
    area_height >>= 1;
    input_stride <<= 1;
    pred_stride <<= 1;

    for (y = 0; y < area_height; ++y) {
        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);

        //duplicate the top-field residual into the bottom-field row
        residual += residual_stride;
        _mm_storeu_si128((__m128i *)residual, residual_0_7);

        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
    (void)area_width;
    //compute the last line
    if (last_line) {
        input -= input_stride >> 1;
        pred -= pred_stride >> 1;
        residual -= residual_stride;

        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
    }

    return;
}

void residual_kernel_sub_sampled16x16_sse2_intrin(
    uint8_t *input,
    uint32_t input_stride,
    uint8_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height,
    uint8_t last_line)
{
    __m128i xmm0, residual_0_7, residual_8_15;
    uint32_t y;

    xmm0 = _mm_setzero_si128();
    //hard-code the subsampling dimensions, keep residual_stride
    area_height >>= 1;
    input_stride <<= 1;
    pred_stride <<= 1;

    for (y = 0; y < area_height; ++y) {
        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);

        //duplicate the top-field residual into the bottom-field row
        residual += residual_stride;
        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);

        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
    (void)area_width;
    //compute the last line
    if (last_line) {
        input -= input_stride >> 1;
        pred -= pred_stride >> 1;
        residual -= residual_stride;

        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
    }
    return;
}

void residual_kernel_sub_sampled32x32_sse2_intrin(
    uint8_t *input,
    uint32_t input_stride,
    uint8_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height,
    uint8_t last_line)
{
    __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
    uint32_t y;

    xmm0 = _mm_setzero_si128();

    //hard-code the subsampling dimensions, keep residual_stride
    area_height >>= 1;
    input_stride <<= 1;
    pred_stride <<= 1;

    for (y = 0; y < area_height; ++y) {
        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);

        //duplicate the top-field residual into the bottom-field row
        residual += residual_stride;
        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);

        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
    (void)area_width;
    //compute the last line
    if (last_line) {
        input -= input_stride >> 1;
        pred -= pred_stride >> 1;
        residual -= residual_stride;

        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);
    }

    return;
}

void residual_kernel_sub_sampled64x64_sse2_intrin(
    uint8_t *input,
    uint32_t input_stride,
    uint8_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height,
    uint8_t last_line)
{
    __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, residual_32_39, residual_40_47, residual_48_55, residual_56_63;
    uint32_t y;

    xmm0 = _mm_setzero_si128();

    //hard-code the subsampling dimensions, keep residual_stride
    area_height >>= 1;
    input_stride <<= 1;
    pred_stride <<= 1;

    for (y = 0; y < area_height; ++y) {
        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
        residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
        residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
        residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i *)(residual + 32), residual_32_39);
        _mm_storeu_si128((__m128i *)(residual + 40), residual_40_47);
        _mm_storeu_si128((__m128i *)(residual + 48), residual_48_55);
        _mm_storeu_si128((__m128i *)(residual + 56), residual_56_63);

        //duplicate the top-field residual into the bottom-field row
        residual += residual_stride;
        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i *)(residual + 32), residual_32_39);
        _mm_storeu_si128((__m128i *)(residual + 40), residual_40_47);
        _mm_storeu_si128((__m128i *)(residual + 48), residual_48_55);
        _mm_storeu_si128((__m128i *)(residual + 56), residual_56_63);

        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
    (void)area_width;
    //compute the last line
    if (last_line) {
        input -= input_stride >> 1;
        pred -= pred_stride >> 1;
        residual -= residual_stride;

        residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
        residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
        residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
        residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
        residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
        residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));

        _mm_storeu_si128((__m128i *)residual, residual_0_7);
        _mm_storeu_si128((__m128i *)(residual + 8), residual_8_15);
        _mm_storeu_si128((__m128i *)(residual + 16), residual_16_23);
        _mm_storeu_si128((__m128i *)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i *)(residual + 32), residual_32_39);
        _mm_storeu_si128((__m128i *)(residual + 40), residual_40_47);
        _mm_storeu_si128((__m128i *)(residual + 48), residual_48_55);
        _mm_storeu_si128((__m128i *)(residual + 56), residual_56_63);
    }

    return;
}

/*******************************************************************************
residual_kernel16bit_sse2_intrin
*******************************************************************************/
void residual_kernel16bit_sse2_intrin(
    uint16_t *input,
    uint32_t input_stride,
    uint16_t *pred,
    uint32_t pred_stride,
    int16_t *residual,
    uint32_t residual_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    uint32_t x, y;
    __m128i residual0, residual1;

    if (area_width == 4)
    {
        for (y = 0; y < area_height; y += 2) {
            residual0 = _mm_sub_epi16(_mm_loadl_epi64((__m128i *)input), _mm_loadl_epi64((__m128i *)pred));
            residual1 = _mm_sub_epi16(_mm_loadl_epi64((__m128i *)(input + input_stride)), _mm_loadl_epi64((__m128i *)(pred + pred_stride)));

            _mm_storel_epi64((__m128i *)residual, residual0);
            _mm_storel_epi64((__m128i *)(residual + residual_stride), residual1);

            input += input_stride << 1;
            pred += pred_stride << 1;
            residual += residual_stride << 1;
        }
    }
    else if (area_width == 8) {
        for (y = 0; y < area_height; y += 2) {
            residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)input), _mm_loadu_si128((__m128i *)pred));
            residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride)), _mm_loadu_si128((__m128i *)(pred + pred_stride)));

            _mm_storeu_si128((__m128i *)residual, residual0);
            _mm_storeu_si128((__m128i *)(residual + residual_stride), residual1);

            input += input_stride << 1;
            pred += pred_stride << 1;
            residual += residual_stride << 1;
        }
    }
    else if (area_width == 16) {
        __m128i residual2, residual3;

        for (y = 0; y < area_height; y += 2) {
            residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)input), _mm_loadu_si128((__m128i *)pred));
            residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 8)), _mm_loadu_si128((__m128i *)(pred + 8)));
            residual2 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride)), _mm_loadu_si128((__m128i *)(pred + pred_stride)));
            residual3 = _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 8)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 8)));

            _mm_storeu_si128((__m128i *)residual, residual0);
            _mm_storeu_si128((__m128i *)(residual + 8), residual1);
            _mm_storeu_si128((__m128i *)(residual + residual_stride), residual2);
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 8), residual3);

            input += input_stride << 1;
            pred += pred_stride << 1;
            residual += residual_stride << 1;
        }
    }
    else if (area_width == 32) {
        for (y = 0; y < area_height; y += 2) {
            //residual[columnIndex] = (int16_t)input[columnIndex] - (int16_t)pred[columnIndex]
            _mm_storeu_si128((__m128i *)residual, _mm_sub_epi16(_mm_loadu_si128((__m128i *)input), _mm_loadu_si128((__m128i *)pred)));
            _mm_storeu_si128((__m128i *)(residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 8)), _mm_loadu_si128((__m128i *)(pred + 8))));
            _mm_storeu_si128((__m128i *)(residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 16)), _mm_loadu_si128((__m128i *)(pred + 16))));
            _mm_storeu_si128((__m128i *)(residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 24)), _mm_loadu_si128((__m128i *)(pred + 24))));

            _mm_storeu_si128((__m128i *)(residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride)), _mm_loadu_si128((__m128i *)(pred + pred_stride))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 8)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 8))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 16)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 16))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 24)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 24))));

            input += input_stride << 1;
            pred += pred_stride << 1;
            residual += residual_stride << 1;
        }
    }
    else if (area_width == 64) { // Branch was not tested because the encoder had a max txb_size of 32
        for (y = 0; y < area_height; y += 2) {
            //residual[columnIndex] = (int16_t)input[columnIndex] - (int16_t)pred[columnIndex], 8 values per _mm_sub_epi16
            _mm_storeu_si128((__m128i *)residual, _mm_sub_epi16(_mm_loadu_si128((__m128i *)input), _mm_loadu_si128((__m128i *)pred)));
            _mm_storeu_si128((__m128i *)(residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 8)), _mm_loadu_si128((__m128i *)(pred + 8))));
            _mm_storeu_si128((__m128i *)(residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 16)), _mm_loadu_si128((__m128i *)(pred + 16))));
            _mm_storeu_si128((__m128i *)(residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 24)), _mm_loadu_si128((__m128i *)(pred + 24))));
            _mm_storeu_si128((__m128i *)(residual + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 32)), _mm_loadu_si128((__m128i *)(pred + 32))));
            _mm_storeu_si128((__m128i *)(residual + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 40)), _mm_loadu_si128((__m128i *)(pred + 40))));
            _mm_storeu_si128((__m128i *)(residual + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 48)), _mm_loadu_si128((__m128i *)(pred + 48))));
            _mm_storeu_si128((__m128i *)(residual + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + 56)), _mm_loadu_si128((__m128i *)(pred + 56))));

            _mm_storeu_si128((__m128i *)(residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride)), _mm_loadu_si128((__m128i *)(pred + pred_stride))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 8)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 8))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 16)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 16))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 24)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 24))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 32)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 32))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 40)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 40))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 48)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 48))));
            _mm_storeu_si128((__m128i *)(residual + residual_stride + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride + 56)), _mm_loadu_si128((__m128i *)(pred + pred_stride + 56))));

            input += input_stride << 1;
            pred += pred_stride << 1;
            residual += residual_stride << 1;
        }
    }
    else {
        uint32_t inputStrideDiff = 2 * input_stride;
        uint32_t predStrideDiff = 2 * pred_stride;
        uint32_t residualStrideDiff = 2 * residual_stride;
        inputStrideDiff -= area_width;
        predStrideDiff -= area_width;
        residualStrideDiff -= area_width;

        if (!(area_width & 7)) {
            for (x = 0; x < area_height; x += 2) {
                for (y = 0; y < area_width; y += 8) {
                    _mm_storeu_si128((__m128i *)residual, _mm_sub_epi16(_mm_loadu_si128((__m128i *)input), _mm_loadu_si128((__m128i *)pred)));
                    _mm_storeu_si128((__m128i *)(residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i *)(input + input_stride)), _mm_loadu_si128((__m128i *)(pred + pred_stride))));

                    input += 8;
                    pred += 8;
                    residual += 8;
                }
                input = input + inputStrideDiff;
                pred = pred + predStrideDiff;
                residual = residual + residualStrideDiff;
            }
        }
        else {
            for (x = 0; x < area_height; x += 2) {
                for (y = 0; y < area_width; y += 4) {
                    //only 4 values are stored per step; 64-bit loads avoid
                    //reading past the end of the block
                    _mm_storel_epi64((__m128i *)residual, _mm_sub_epi16(_mm_loadl_epi64((__m128i *)input), _mm_loadl_epi64((__m128i *)pred)));
                    _mm_storel_epi64((__m128i *)(residual + residual_stride), _mm_sub_epi16(_mm_loadl_epi64((__m128i *)(input + input_stride)), _mm_loadl_epi64((__m128i *)(pred + pred_stride))));

                    input += 4;
                    pred += 4;
                    residual += 4;
                }
                input = input + inputStrideDiff;
                pred = pred + predStrideDiff;
                residual = residual + residualStrideDiff;
            }
        }
    }
    return;
}

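/*
 * Scalar model of residual_kernel16bit (a sketch for documentation;
 * residual_kernel16bit_c is a hypothetical name): a plain elementwise 16-bit
 * subtraction over the block, two rows at a time in the SIMD version above.
 */
static void residual_kernel16bit_c(
    uint16_t *input, uint32_t input_stride,
    uint16_t *pred, uint32_t pred_stride,
    int16_t *residual, uint32_t residual_stride,
    uint32_t area_width, uint32_t area_height)
{
    for (uint32_t y = 0; y < area_height; ++y) {
        for (uint32_t x = 0; x < area_width; ++x)
            residual[x] = (int16_t)(input[x] - pred[x]);
        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
}
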
/*******************************************************************************
picture_addition_kernel16bit_sse2_intrin
*******************************************************************************/
void picture_addition_kernel16bit_sse2_intrin(
    uint16_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint16_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    __m128i xmm_0, xmm_Max10bit;

    uint32_t y, x;

    xmm_0 = _mm_setzero_si128();
    xmm_Max10bit = _mm_set1_epi16(1023); //clip the sums to the 10-bit range [0, 1023]

    if (width == 4)
    {
        __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
        for (y = 0; y < height; y += 2) {
            xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i *)pred_ptr), _mm_loadl_epi64((__m128i *)residual_ptr));
            xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i *)(pred_ptr + pred_stride)), _mm_loadl_epi64((__m128i *)(residual_ptr + residual_stride)));

            xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
            xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);

            _mm_storel_epi64((__m128i *)recon_ptr, xmm_clip3_0_3);
            _mm_storel_epi64((__m128i *)(recon_ptr + recon_stride), xmm_clip3_s0_s3);

            pred_ptr += pred_stride << 1;
            residual_ptr += residual_stride << 1;
            recon_ptr += recon_stride << 1;
        }
    }
    else if (width == 8) {
        __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;

        for (y = 0; y < height; y += 2) {
            xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)pred_ptr), _mm_loadu_si128((__m128i *)residual_ptr));
            xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride)));

            xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
            xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);

            _mm_storeu_si128((__m128i *)recon_ptr, xmm_clip3_0_7);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride), xmm_clip3_s0_s7);

            pred_ptr += pred_stride << 1;
            residual_ptr += residual_stride << 1;
            recon_ptr += recon_stride << 1;
        }
    }
    else if (width == 16) {
        __m128i sum_0_7, sum_8_15, sum_s0_s7, sum_s8_s15, clip3_0_7, clip3_8_15, clip3_s0_s7, clip3_s8_s15;

        for (y = 0; y < height; y += 2) {
            sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)pred_ptr), _mm_loadu_si128((__m128i *)residual_ptr));
            sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 8)), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
            sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride)));
            sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride + 8)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride + 8)));

            clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
            clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
            clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
            clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);

            _mm_storeu_si128((__m128i *)recon_ptr, clip3_0_7);
            _mm_storeu_si128((__m128i *)(recon_ptr + 8), clip3_8_15);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride), clip3_s0_s7);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride + 8), clip3_s8_s15);

            pred_ptr += pred_stride << 1;
            residual_ptr += residual_stride << 1;
            recon_ptr += recon_stride << 1;
        }
    }
    else if (width == 32) {
        __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_s0_s7, sum_s8_s15, sum_s16_s23, sum_s24_s31;
        __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_s0_s7, clip3_s8_s15, clip3_s16_s23, clip3_s24_s31;

        for (y = 0; y < height; y += 2) {
            sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)pred_ptr), _mm_loadu_si128((__m128i *)residual_ptr));
            sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 8)), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
            sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 16)), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
            sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 24)), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));

            sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride)));
            sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride + 8)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride + 8)));
            sum_s16_s23 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride + 16)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride + 16)));
            sum_s24_s31 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride + 24)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride + 24)));

            clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
            clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
            clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
            clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);

            clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
            clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
            clip3_s16_s23 = _mm_max_epi16(_mm_min_epi16(sum_s16_s23, xmm_Max10bit), xmm_0);
            clip3_s24_s31 = _mm_max_epi16(_mm_min_epi16(sum_s24_s31, xmm_Max10bit), xmm_0);

            _mm_storeu_si128((__m128i *)recon_ptr, clip3_0_7);
            _mm_storeu_si128((__m128i *)(recon_ptr + 8), clip3_8_15);
            _mm_storeu_si128((__m128i *)(recon_ptr + 16), clip3_16_23);
            _mm_storeu_si128((__m128i *)(recon_ptr + 24), clip3_24_31);

            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride), clip3_s0_s7);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride + 8), clip3_s8_s15);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride + 16), clip3_s16_s23);
            _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride + 24), clip3_s24_s31);

            pred_ptr += pred_stride << 1;
            residual_ptr += residual_stride << 1;
            recon_ptr += recon_stride << 1;
        }
    }
    else if (width == 64) { // Branch not tested because the maximum TU size was 32 at the time of development
        __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_32_39, sum_40_47, sum_48_55, sum_56_63;
        __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_32_39, clip3_40_47, clip3_48_55, clip3_56_63;

        for (y = 0; y < height; ++y) {
            sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)pred_ptr), _mm_loadu_si128((__m128i *)residual_ptr));
            sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 8)), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
            sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 16)), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
            sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 24)), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));
            sum_32_39 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 32)), _mm_loadu_si128((__m128i *)(residual_ptr + 32)));
            sum_40_47 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 40)), _mm_loadu_si128((__m128i *)(residual_ptr + 40)));
            sum_48_55 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 48)), _mm_loadu_si128((__m128i *)(residual_ptr + 48)));
            sum_56_63 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + 56)), _mm_loadu_si128((__m128i *)(residual_ptr + 56)));

            clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
            clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
            clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
            clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
            clip3_32_39 = _mm_max_epi16(_mm_min_epi16(sum_32_39, xmm_Max10bit), xmm_0);
            clip3_40_47 = _mm_max_epi16(_mm_min_epi16(sum_40_47, xmm_Max10bit), xmm_0);
            clip3_48_55 = _mm_max_epi16(_mm_min_epi16(sum_48_55, xmm_Max10bit), xmm_0);
            clip3_56_63 = _mm_max_epi16(_mm_min_epi16(sum_56_63, xmm_Max10bit), xmm_0);

            _mm_storeu_si128((__m128i *)recon_ptr, clip3_0_7);
            _mm_storeu_si128((__m128i *)(recon_ptr + 8), clip3_8_15);
            _mm_storeu_si128((__m128i *)(recon_ptr + 16), clip3_16_23);
            _mm_storeu_si128((__m128i *)(recon_ptr + 24), clip3_24_31);
            _mm_storeu_si128((__m128i *)(recon_ptr + 32), clip3_32_39);
            _mm_storeu_si128((__m128i *)(recon_ptr + 40), clip3_40_47);
            _mm_storeu_si128((__m128i *)(recon_ptr + 48), clip3_48_55);
            _mm_storeu_si128((__m128i *)(recon_ptr + 56), clip3_56_63);

            pred_ptr += pred_stride;
            residual_ptr += residual_stride;
            recon_ptr += recon_stride;
        }
    }
    else
    {
        uint32_t predStrideDiff = 2 * pred_stride;
        uint32_t residualStrideDiff = 2 * residual_stride;
        uint32_t reconStrideDiff = 2 * recon_stride;
        predStrideDiff -= width;
        residualStrideDiff -= width;
        reconStrideDiff -= width;

        if (!(width & 7)) {
            __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;

            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 8) {
                    xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)pred_ptr), _mm_loadu_si128((__m128i *)residual_ptr));
                    xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i *)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i *)(residual_ptr + residual_stride)));

                    xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
                    xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);

                    _mm_storeu_si128((__m128i *)recon_ptr, xmm_clip3_0_7);
                    _mm_storeu_si128((__m128i *)(recon_ptr + recon_stride), xmm_clip3_s0_s7);

                    pred_ptr += 8;
                    residual_ptr += 8;
                    recon_ptr += 8;
                }
                pred_ptr += predStrideDiff;
                residual_ptr += residualStrideDiff;
                recon_ptr += reconStrideDiff;
            }
        }
        else {
            __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 4) {
                    //only 4 values are stored per step; 64-bit loads avoid
                    //reading past the end of the block
                    xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i *)pred_ptr), _mm_loadl_epi64((__m128i *)residual_ptr));
                    xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i *)(pred_ptr + pred_stride)), _mm_loadl_epi64((__m128i *)(residual_ptr + residual_stride)));

                    xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
                    xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);

                    _mm_storel_epi64((__m128i *)recon_ptr, xmm_clip3_0_3);
                    _mm_storel_epi64((__m128i *)(recon_ptr + recon_stride), xmm_clip3_s0_s3);

                    pred_ptr += 4;
                    residual_ptr += 4;
                    recon_ptr += 4;
                }
                pred_ptr += predStrideDiff;
                residual_ptr += residualStrideDiff;
                recon_ptr += reconStrideDiff;
            }
        }
    }
    return;
}

static INLINE __m128i Distortion_SSE2_INTRIN(const __m128i input,
    const __m128i recon, const __m128i sum) {
    //squared error of the low 8 bytes of input/recon, accumulated into the
    //four 32-bit lanes of sum
    const __m128i in = _mm_unpacklo_epi8(input, _mm_setzero_si128());
    const __m128i re = _mm_unpacklo_epi8(recon, _mm_setzero_si128());
    const __m128i diff = _mm_sub_epi16(in, re);
    const __m128i dist = _mm_madd_epi16(diff, diff);
    return _mm_add_epi32(sum, dist);
}

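/*
 * The Hadd32_SSE2_INTRIN helper used below is not defined in this file; it is
 * assumed to come from an included header and to reduce the four 32-bit lanes
 * of its argument to a single scalar sum. A minimal sketch of such a
 * reduction, under that assumption (hadd32_sse2_sketch is a hypothetical
 * name):
 *
 *     static INLINE uint32_t hadd32_sse2_sketch(__m128i v) {
 *         v = _mm_add_epi32(v, _mm_srli_si128(v, 8)); // lanes {0+2, 1+3, x, x}
 *         v = _mm_add_epi32(v, _mm_srli_si128(v, 4)); // lane 0 holds the total
 *         return (uint32_t)_mm_cvtsi128_si32(v);
 *     }
 */
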
uint64_t spatial_full_distortion_kernel4x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        const __m128i in = _mm_cvtsi32_si128(*(uint32_t *)input);
        const __m128i re = _mm_cvtsi32_si128(*(uint32_t *)recon);
        sum = Distortion_SSE2_INTRIN(in, re, sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    //for 4-wide rows only the two low 32-bit lanes are populated, so a single
    //shift-and-add completes the horizontal reduction
    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

    return _mm_cvtsi128_si32(sum);
}

uint64_t spatial_full_distortion_kernel8x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        const __m128i in = _mm_loadl_epi64((__m128i *)input);
        const __m128i re = _mm_loadl_epi64((__m128i *)recon);
        sum = Distortion_SSE2_INTRIN(in, re, sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    return Hadd32_SSE2_INTRIN(sum);
}

static INLINE void SpatialFullDistortionKernel16_SSE2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
{
    const __m128i in = _mm_loadu_si128((__m128i *)input);
    const __m128i re = _mm_loadu_si128((__m128i *)recon);
    //max(a,b) - min(a,b) yields |a - b| without widening the 8-bit lanes
    const __m128i max = _mm_max_epu8(in, re);
    const __m128i min = _mm_min_epu8(in, re);
    const __m128i diff = _mm_sub_epi8(max, min);
    const __m128i diff_L = _mm_unpacklo_epi8(diff, _mm_setzero_si128());
    const __m128i diff_H = _mm_unpackhi_epi8(diff, _mm_setzero_si128());
    const __m128i dist_L = _mm_madd_epi16(diff_L, diff_L);
    const __m128i dist_H = _mm_madd_epi16(diff_H, diff_H);
    const __m128i dist = _mm_add_epi32(dist_L, dist_H);
    *sum = _mm_add_epi32(*sum, dist);
}

static INLINE void SpatialFullDistortionKernel32_SSE2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
{
    SpatialFullDistortionKernel16_SSE2_INTRIN(input + 0 * 16, recon + 0 * 16, sum);
    SpatialFullDistortionKernel16_SSE2_INTRIN(input + 1 * 16, recon + 1 * 16, sum);
}

static INLINE void SpatialFullDistortionKernel64_SSE2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
{
    SpatialFullDistortionKernel32_SSE2_INTRIN(input + 0 * 32, recon + 0 * 32, sum);
    SpatialFullDistortionKernel32_SSE2_INTRIN(input + 1 * 32, recon + 1 * 32, sum);
}

uint64_t spatial_full_distortion_kernel16x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        SpatialFullDistortionKernel16_SSE2_INTRIN(input, recon, &sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    return Hadd32_SSE2_INTRIN(sum);
}

uint64_t spatial_full_distortion_kernel32x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        SpatialFullDistortionKernel32_SSE2_INTRIN(input, recon, &sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    return Hadd32_SSE2_INTRIN(sum);
}

uint64_t spatial_full_distortion_kernel64x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        SpatialFullDistortionKernel64_SSE2_INTRIN(input, recon, &sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    return Hadd32_SSE2_INTRIN(sum);
}

uint64_t spatial_full_distortion_kernel128x_n_sse2_intrin(
    uint8_t *input,
    uint32_t input_offset,
    uint32_t input_stride,
    uint8_t *recon,
    uint32_t recon_offset,
    uint32_t recon_stride,
    uint32_t area_width,
    uint32_t area_height)
{
    int32_t row_count = area_height;
    __m128i sum = _mm_setzero_si128();
    input += input_offset;
    recon += recon_offset;
    (void)area_width;

    do {
        SpatialFullDistortionKernel64_SSE2_INTRIN(input + 0 * 64, recon + 0 * 64, &sum);
        SpatialFullDistortionKernel64_SSE2_INTRIN(input + 1 * 64, recon + 1 * 64, &sum);
        input += input_stride;
        recon += recon_stride;
    } while (--row_count);

    return Hadd32_SSE2_INTRIN(sum);
}

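/*
 * Scalar model of the spatial distortion kernels above (a sketch for
 * documentation; spatial_full_distortion_c is a hypothetical name): the sum
 * of squared differences (SSD) between the input and reconstructed blocks.
 */
static uint64_t spatial_full_distortion_c(
    uint8_t *input, uint32_t input_offset, uint32_t input_stride,
    uint8_t *recon, uint32_t recon_offset, uint32_t recon_stride,
    uint32_t area_width, uint32_t area_height)
{
    uint64_t sum = 0;
    input += input_offset;
    recon += recon_offset;
    for (uint32_t y = 0; y < area_height; ++y) {
        for (uint32_t x = 0; x < area_width; ++x) {
            const int32_t diff = (int32_t)input[x] - (int32_t)recon[x];
            sum += (uint64_t)(diff * diff);
        }
        input += input_stride;
        recon += recon_stride;
    }
    return sum;
}
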
/*********************************
* x86 implementation of Picture Addition
*********************************/
void picture_addition_sse2(
    uint8_t *pred_ptr,
    uint32_t pred_stride,
    int16_t *residual_ptr,
    uint32_t residual_stride,
    uint8_t *recon_ptr,
    uint32_t recon_stride,
    uint32_t width,
    uint32_t height)
{
    switch (width) {
    case 4:
        picture_addition_kernel4x4_sse_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    case 8:
        picture_addition_kernel8x8_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    case 16:
        picture_addition_kernel16x16_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    case 32:
        picture_addition_kernel32x32_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    case 64:
        picture_addition_kernel64x64_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    default: //widths other than 4, 8, 16, 32 and 64 are not handled
        break;
    }

    return;
}
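
/*
 * Usage sketch (hypothetical buffers, not part of the original file): add a
 * 16x16 residual block onto its prediction, writing the reconstruction:
 *
 *     picture_addition_sse2(pred, 16, residual, 16, recon, 16, 16, 16);
 */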
|