Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include <assert.h>
7 :
8 : #include "EbComputeSAD_SSE4_1.h"
9 : #include "EbDefinitions.h"
10 : #include "smmintrin.h"
11 :
/*
 * UPDATE_BEST(s, k, offset)
 *
 * Reads 32-bit lane `k` of the vector `s` into `temSum` and, when that value
 * is strictly smaller than `lowSum`, records it as the new minimum together
 * with its position: xBest = j + offset + k, yBest = i.
 *
 * The macro deliberately operates on the caller's local variables
 * (temSum, lowSum, xBest, yBest, i, j); they must all be in scope at the
 * point of use. `k` is forwarded as the lane selector of _mm_extract_epi32.
 *
 * The body is wrapped in do { ... } while (0) so that one invocation followed
 * by ';' behaves as a single statement (safe inside unbraced if/else), and the
 * arguments are parenthesized so that expressions can be passed safely.
 */
#define UPDATE_BEST(s, k, offset)                 \
    do {                                          \
        temSum = _mm_extract_epi32((s), (k));     \
        if (temSum < lowSum) {                    \
            lowSum = temSum;                      \
            xBest  = j + (offset) + (k);          \
            yBest  = i;                           \
        }                                         \
    } while (0)
19 :
/*******************************************************************************
 * Builds four 32x32 SADs and one 64x64 SAD from sixteen 16x16 SADs, then
 * refreshes the running "best SAD / best MV" records for those five blocks.
 *
 *   p_sad16x16      in      16 values; entries 4n..4n+3 are summed to form
 *                           32x32 SAD number n (n = 0..3)
 *   p_best_sad32x32 in/out  4 values; entry n is overwritten with 32x32 SAD n
 *                           when that SAD is strictly smaller
 *   p_best_sad64x64 in/out  only element 0 is used; overwritten with the sum of
 *                           all sixteen inputs when that sum is strictly smaller
 *   p_best_mv32x32  in/out  4 values; entry n receives `mv` whenever
 *                           p_best_sad32x32[n] is updated
 *   p_best_mv64x64  in/out  only element 0 is used; receives `mv` whenever
 *                           p_best_sad64x64[0] is updated
 *   mv              in      candidate motion vector, stored verbatim
 *   p_sad32x32      out     4 values; always written with the four 32x32 SADs
 *
 * All SAD comparisons are unsigned, so a stored best value with the top bit
 * set (for example a 0xFFFFFFFF "no candidate yet" sentinel) is handled the
 * same way for the 32x32 entries as for the 64x64 entry.
 ******************************************************************************/
void ext_sad_calculation_32x32_64x64_sse4_intrin(
    uint32_t *p_sad16x16,
    uint32_t *p_best_sad32x32,
    uint32_t *p_best_sad64x64,
    uint32_t *p_best_mv32x32,
    uint32_t *p_best_mv64x64,
    uint32_t  mv,
    uint32_t *p_sad32x32)
{
    const __m128i sad_0_3   = _mm_loadu_si128((const __m128i *)p_sad16x16);
    const __m128i sad_4_7   = _mm_loadu_si128((const __m128i *)(p_sad16x16 + 4));
    const __m128i sad_8_11  = _mm_loadu_si128((const __m128i *)(p_sad16x16 + 8));
    const __m128i sad_12_15 = _mm_loadu_si128((const __m128i *)(p_sad16x16 + 12));

    /* 4x4 transpose of the 32-bit lanes so that the four members of every
     * group end up in the same lane of four different vectors. */
    const __m128i t0 = _mm_unpacklo_epi32(sad_0_3, sad_4_7);    /* s0  s4  s1  s5  */
    const __m128i t1 = _mm_unpackhi_epi32(sad_0_3, sad_4_7);    /* s2  s6  s3  s7  */
    const __m128i t2 = _mm_unpacklo_epi32(sad_8_11, sad_12_15); /* s8  s12 s9  s13 */
    const __m128i t3 = _mm_unpackhi_epi32(sad_8_11, sad_12_15); /* s10 s14 s11 s15 */

    /* Lane n = s[4n] + s[4n+1] + s[4n+2] + s[4n+3]. */
    const __m128i sad32x32 = _mm_add_epi32(
        _mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0, t2)),
        _mm_add_epi32(_mm_unpacklo_epi64(t1, t3), _mm_unpackhi_epi64(t1, t3)));

    _mm_storeu_si128((__m128i *)p_sad32x32, sad32x32);

    /* Horizontal add: lane 0 ends up holding the sum of all four 32x32 SADs. */
    __m128i sad64x64_acc = _mm_add_epi32(_mm_srli_si128(sad32x32, 8), sad32x32);
    sad64x64_acc = _mm_add_epi32(_mm_srli_si128(sad64x64_acc, 4), sad64x64_acc);

    const __m128i mv_x4 = _mm_set1_epi32((int32_t)mv);

    const __m128i best_sad = _mm_loadu_si128((const __m128i *)p_best_sad32x32);
    const __m128i best_mv  = _mm_loadu_si128((const __m128i *)p_best_mv32x32);

    /* _mm_cmpgt_epi32 is a signed compare; flipping the sign bit of both
     * operands turns it into the unsigned "best > candidate" test. */
    const __m128i sign_bit = _mm_set1_epi32(INT32_MIN);
    const __m128i improved = _mm_cmpgt_epi32(_mm_xor_si128(best_sad, sign_bit),
                                             _mm_xor_si128(sad32x32, sign_bit));

    /* Per-lane select: take the candidate where it improved, keep the stored
     * value everywhere else. */
    const __m128i new_best_sad = _mm_or_si128(_mm_andnot_si128(improved, best_sad),
                                              _mm_and_si128(improved, sad32x32));
    const __m128i new_best_mv  = _mm_or_si128(_mm_andnot_si128(improved, best_mv),
                                              _mm_and_si128(improved, mv_x4));

    _mm_storeu_si128((__m128i *)p_best_sad32x32, new_best_sad);
    _mm_storeu_si128((__m128i *)p_best_mv32x32, new_best_mv);

    const uint32_t sad64x64 = (uint32_t)_mm_cvtsi128_si32(sad64x64_acc);
    if (sad64x64 < p_best_sad64x64[0]) {
        p_best_sad64x64[0] = sad64x64;
        p_best_mv64x64[0]  = mv;
    }
}
73 :
74 : /*******************************************************************************
75 : * Requirement: width = 4, 8, 16, 24, 32, 48 or 64
76 : * Requirement: block_height <= 64
77 : * Requirement: block_height % 2 = 0 when width = 4 or 8
78 : *******************************************************************************/
79 0 : void sad_loop_kernel_sse4_1_intrin(
80 : uint8_t *src, // input parameter, source samples Ptr
81 : uint32_t src_stride, // input parameter, source stride
82 : uint8_t *ref, // input parameter, reference samples Ptr
83 : uint32_t ref_stride, // input parameter, reference stride
84 : uint32_t block_height, // input parameter, block height (M)
85 : uint32_t block_width, // input parameter, block width (N)
86 : uint64_t *best_sad,
87 : int16_t *x_search_center,
88 : int16_t *y_search_center,
89 : uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
90 : int16_t search_area_width,
91 : int16_t search_area_height)
92 : {
93 0 : int16_t xBest = *x_search_center, yBest = *y_search_center;
94 0 : uint32_t lowSum = 0xffffff;
95 0 : uint32_t temSum = 0;
96 : int16_t i, j;
97 : uint32_t k, l;
98 0 : uint32_t leftover = search_area_width & 7;
99 : const uint8_t *pRef, *pSrc;
100 0 : __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8 = _mm_set1_epi32(-1);
101 :
102 0 : if (leftover) {
103 0 : for (k = 0; k < leftover; k++)
104 0 : s8 = _mm_slli_si128(s8, 2);
105 : }
106 :
107 0 : switch (block_width) {
108 0 : case 4:
109 0 : for (i = 0; i < search_area_height; i++) {
110 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
111 0 : pSrc = src;
112 0 : pRef = ref + j;
113 0 : s3 = _mm_setzero_si128();
114 0 : for (k = 0; k < block_height; k += 2) {
115 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
116 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
117 0 : s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
118 0 : s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
119 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
120 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
121 0 : pSrc += src_stride << 1;
122 0 : pRef += ref_stride << 1;
123 : }
124 0 : s3 = _mm_minpos_epu16(s3);
125 0 : temSum = _mm_extract_epi16(s3, 0);
126 0 : if (temSum < lowSum) {
127 0 : lowSum = temSum;
128 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
129 0 : yBest = i;
130 : }
131 : }
132 :
133 0 : if (leftover) {
134 0 : pSrc = src;
135 0 : pRef = ref + j;
136 0 : s3 = _mm_setzero_si128();
137 0 : for (k = 0; k < block_height; k += 2) {
138 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
139 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
140 0 : s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
141 0 : s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
142 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
143 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
144 0 : pSrc += src_stride << 1;
145 0 : pRef += ref_stride << 1;
146 : }
147 0 : s3 = _mm_or_si128(s3, s8);
148 0 : s3 = _mm_minpos_epu16(s3);
149 0 : temSum = _mm_extract_epi16(s3, 0);
150 0 : if (temSum < lowSum) {
151 0 : lowSum = temSum;
152 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
153 0 : yBest = i;
154 : }
155 : }
156 0 : ref += src_stride_raw;
157 : }
158 0 : break;
159 :
160 0 : case 8:
161 0 : for (i = 0; i < search_area_height; i++) {
162 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
163 0 : pSrc = src;
164 0 : pRef = ref + j;
165 0 : s3 = s4 = _mm_setzero_si128();
166 0 : for (k = 0; k < block_height; k += 2) {
167 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
168 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
169 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
170 0 : s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
171 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
172 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
173 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
174 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
175 0 : pSrc += src_stride << 1;
176 0 : pRef += ref_stride << 1;
177 : }
178 0 : s3 = _mm_adds_epu16(s3, s4);
179 0 : s3 = _mm_minpos_epu16(s3);
180 0 : temSum = _mm_extract_epi16(s3, 0);
181 0 : if (temSum < lowSum) {
182 0 : lowSum = temSum;
183 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
184 0 : yBest = i;
185 : }
186 : }
187 :
188 0 : if (leftover) {
189 0 : pSrc = src;
190 0 : pRef = ref + j;
191 0 : s3 = s4 = _mm_setzero_si128();
192 0 : for (k = 0; k < block_height; k += 2) {
193 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
194 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
195 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
196 0 : s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
197 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
198 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
199 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
200 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
201 0 : pSrc += src_stride << 1;
202 0 : pRef += ref_stride << 1;
203 : }
204 0 : s3 = _mm_adds_epu16(s3, s4);
205 0 : s3 = _mm_or_si128(s3, s8);
206 0 : s3 = _mm_minpos_epu16(s3);
207 0 : temSum = _mm_extract_epi16(s3, 0);
208 0 : if (temSum < lowSum) {
209 0 : lowSum = temSum;
210 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
211 0 : yBest = i;
212 : }
213 : }
214 0 : ref += src_stride_raw;
215 : }
216 0 : break;
217 :
218 0 : case 16:
219 0 : if (block_height <= 16) {
220 0 : for (i = 0; i < search_area_height; i++) {
221 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
222 0 : pSrc = src;
223 0 : pRef = ref + j;
224 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
225 0 : for (k = 0; k < block_height; k++) {
226 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
227 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
228 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
229 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
230 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
231 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
232 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
233 0 : pSrc += src_stride;
234 0 : pRef += ref_stride;
235 : }
236 0 : s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
237 0 : s3 = _mm_minpos_epu16(s3);
238 0 : temSum = _mm_extract_epi16(s3, 0);
239 0 : if (temSum < lowSum) {
240 0 : lowSum = temSum;
241 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
242 0 : yBest = i;
243 : }
244 : }
245 :
246 0 : if (leftover) {
247 0 : pSrc = src;
248 0 : pRef = ref + j;
249 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
250 0 : for (k = 0; k < block_height; k++) {
251 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
252 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
253 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
254 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
255 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
256 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
257 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
258 0 : pSrc += src_stride;
259 0 : pRef += ref_stride;
260 : }
261 0 : s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
262 0 : s3 = _mm_or_si128(s3, s8);
263 0 : s3 = _mm_minpos_epu16(s3);
264 0 : temSum = _mm_extract_epi16(s3, 0);
265 0 : if (temSum < lowSum) {
266 0 : lowSum = temSum;
267 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
268 0 : yBest = i;
269 : }
270 : }
271 0 : ref += src_stride_raw;
272 : }
273 : }
274 0 : else if (block_height <= 32) {
275 0 : for (i = 0; i < search_area_height; i++) {
276 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
277 0 : pSrc = src;
278 0 : pRef = ref + j;
279 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
280 0 : for (k = 0; k < block_height; k++) {
281 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
282 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
283 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
284 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
285 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
286 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
287 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
288 0 : pSrc += src_stride;
289 0 : pRef += ref_stride;
290 : }
291 0 : s3 = _mm_adds_epu16(s3, s4);
292 0 : s5 = _mm_adds_epu16(s5, s6);
293 0 : s4 = _mm_minpos_epu16(s3);
294 0 : s6 = _mm_minpos_epu16(s5);
295 0 : s4 = _mm_unpacklo_epi16(s4, s4);
296 0 : s4 = _mm_unpacklo_epi32(s4, s4);
297 0 : s4 = _mm_unpacklo_epi64(s4, s4);
298 0 : s6 = _mm_unpacklo_epi16(s6, s6);
299 0 : s6 = _mm_unpacklo_epi32(s6, s6);
300 0 : s6 = _mm_unpacklo_epi64(s6, s6);
301 0 : s3 = _mm_sub_epi16(s3, s4);
302 0 : s5 = _mm_adds_epu16(s5, s3);
303 0 : s5 = _mm_sub_epi16(s5, s6);
304 0 : s5 = _mm_minpos_epu16(s5);
305 0 : temSum = _mm_extract_epi16(s5, 0);
306 0 : temSum += _mm_extract_epi16(s4, 0);
307 0 : temSum += _mm_extract_epi16(s6, 0);
308 0 : if (temSum < lowSum) {
309 0 : lowSum = temSum;
310 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
311 0 : yBest = i;
312 : }
313 : }
314 :
315 0 : if (leftover) {
316 0 : pSrc = src;
317 0 : pRef = ref + j;
318 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
319 0 : for (k = 0; k < block_height; k++) {
320 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
321 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
322 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
323 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
324 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
325 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
326 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
327 0 : pSrc += src_stride;
328 0 : pRef += ref_stride;
329 : }
330 0 : s3 = _mm_adds_epu16(s3, s4);
331 0 : s5 = _mm_adds_epu16(s5, s6);
332 0 : s3 = _mm_or_si128(s3, s8);
333 0 : s5 = _mm_or_si128(s5, s8);
334 0 : s4 = _mm_minpos_epu16(s3);
335 0 : s6 = _mm_minpos_epu16(s5);
336 0 : s4 = _mm_unpacklo_epi16(s4, s4);
337 0 : s4 = _mm_unpacklo_epi32(s4, s4);
338 0 : s4 = _mm_unpacklo_epi64(s4, s4);
339 0 : s6 = _mm_unpacklo_epi16(s6, s6);
340 0 : s6 = _mm_unpacklo_epi32(s6, s6);
341 0 : s6 = _mm_unpacklo_epi64(s6, s6);
342 0 : s3 = _mm_sub_epi16(s3, s4);
343 0 : s5 = _mm_adds_epu16(s5, s3);
344 0 : s5 = _mm_sub_epi16(s5, s6);
345 0 : s5 = _mm_minpos_epu16(s5);
346 0 : temSum = _mm_extract_epi16(s5, 0);
347 0 : temSum += _mm_extract_epi16(s4, 0);
348 0 : temSum += _mm_extract_epi16(s6, 0);
349 0 : if (temSum < lowSum) {
350 0 : lowSum = temSum;
351 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
352 0 : yBest = i;
353 : }
354 : }
355 0 : ref += src_stride_raw;
356 : }
357 : }
358 : else {
359 0 : for (i = 0; i < search_area_height; i++) {
360 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
361 0 : pSrc = src;
362 0 : pRef = ref + j;
363 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
364 0 : for (k = 0; k < block_height; k++) {
365 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
366 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
367 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
368 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
369 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
370 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
371 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
372 0 : pSrc += src_stride;
373 0 : pRef += ref_stride;
374 : }
375 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
376 0 : s0 = _mm_minpos_epu16(s0);
377 0 : temSum = _mm_extract_epi16(s0, 0);
378 0 : if (temSum < lowSum) {
379 0 : if (temSum != 0xFFFF) { // no overflow
380 0 : lowSum = temSum;
381 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
382 0 : yBest = i;
383 : }
384 : else {
385 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
386 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
387 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
388 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
389 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
390 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
391 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
392 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
393 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
394 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
395 0 : UPDATE_BEST(s0, 0, 0);
396 0 : UPDATE_BEST(s0, 1, 0);
397 0 : UPDATE_BEST(s0, 2, 0);
398 0 : UPDATE_BEST(s0, 3, 0);
399 0 : UPDATE_BEST(s3, 0, 4);
400 0 : UPDATE_BEST(s3, 1, 4);
401 0 : UPDATE_BEST(s3, 2, 4);
402 0 : UPDATE_BEST(s3, 3, 4);
403 : }
404 : }
405 : }
406 :
407 0 : if (leftover) {
408 0 : pSrc = src;
409 0 : pRef = ref + j;
410 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
411 0 : for (k = 0; k < block_height; k++) {
412 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
413 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
414 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
415 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
416 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
417 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
418 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
419 0 : pSrc += src_stride;
420 0 : pRef += ref_stride;
421 : }
422 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
423 0 : s0 = _mm_or_si128(s0, s8);
424 0 : s0 = _mm_minpos_epu16(s0);
425 0 : temSum = _mm_extract_epi16(s0, 0);
426 0 : if (temSum < lowSum) {
427 0 : if (temSum != 0xFFFF) { // no overflow
428 0 : lowSum = temSum;
429 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
430 0 : yBest = i;
431 : }
432 : else {
433 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
434 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
435 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
436 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
437 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
438 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
439 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
440 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
441 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
442 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
443 0 : k = leftover;
444 0 : while (k > 0) {
445 0 : for (l = 0; l < 4 && k; l++, k--) {
446 0 : temSum = _mm_extract_epi32(s0, 0);
447 0 : s0 = _mm_srli_si128(s0, 4);
448 0 : if (temSum < lowSum) {
449 0 : lowSum = temSum;
450 0 : xBest = (int16_t)(j + leftover - k);
451 0 : yBest = i;
452 : }
453 : }
454 0 : s0 = s3;
455 : }
456 : }
457 : }
458 : }
459 0 : ref += src_stride_raw;
460 : }
461 : }
462 0 : break;
463 :
464 0 : case 24:
465 0 : if (block_height <= 16) {
466 0 : for (i = 0; i < search_area_height; i++) {
467 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
468 0 : pSrc = src;
469 0 : pRef = ref + j;
470 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
471 0 : for (k = 0; k < block_height; k++) {
472 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
473 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
474 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
475 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
476 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
477 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
478 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
479 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
480 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
481 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
482 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
483 0 : pSrc += src_stride;
484 0 : pRef += ref_stride;
485 : }
486 0 : s3 = _mm_adds_epu16(s3, s4);
487 0 : s5 = _mm_adds_epu16(s5, s6);
488 0 : s4 = _mm_minpos_epu16(s3);
489 0 : s6 = _mm_minpos_epu16(s5);
490 0 : s4 = _mm_unpacklo_epi16(s4, s4);
491 0 : s4 = _mm_unpacklo_epi32(s4, s4);
492 0 : s4 = _mm_unpacklo_epi64(s4, s4);
493 0 : s6 = _mm_unpacklo_epi16(s6, s6);
494 0 : s6 = _mm_unpacklo_epi32(s6, s6);
495 0 : s6 = _mm_unpacklo_epi64(s6, s6);
496 0 : s3 = _mm_sub_epi16(s3, s4);
497 0 : s5 = _mm_adds_epu16(s5, s3);
498 0 : s5 = _mm_sub_epi16(s5, s6);
499 0 : s5 = _mm_minpos_epu16(s5);
500 0 : temSum = _mm_extract_epi16(s5, 0);
501 0 : temSum += _mm_extract_epi16(s4, 0);
502 0 : temSum += _mm_extract_epi16(s6, 0);
503 0 : if (temSum < lowSum) {
504 0 : lowSum = temSum;
505 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
506 0 : yBest = i;
507 : }
508 : }
509 :
510 0 : if (leftover) {
511 0 : pSrc = src;
512 0 : pRef = ref + j;
513 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
514 0 : for (k = 0; k < block_height; k++) {
515 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
516 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
517 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
518 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
519 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
520 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
521 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
522 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
523 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
524 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
525 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
526 0 : pSrc += src_stride;
527 0 : pRef += ref_stride;
528 : }
529 0 : s3 = _mm_adds_epu16(s3, s4);
530 0 : s5 = _mm_adds_epu16(s5, s6);
531 0 : s3 = _mm_or_si128(s3, s8);
532 0 : s5 = _mm_or_si128(s5, s8);
533 0 : s4 = _mm_minpos_epu16(s3);
534 0 : s6 = _mm_minpos_epu16(s5);
535 0 : s4 = _mm_unpacklo_epi16(s4, s4);
536 0 : s4 = _mm_unpacklo_epi32(s4, s4);
537 0 : s4 = _mm_unpacklo_epi64(s4, s4);
538 0 : s6 = _mm_unpacklo_epi16(s6, s6);
539 0 : s6 = _mm_unpacklo_epi32(s6, s6);
540 0 : s6 = _mm_unpacklo_epi64(s6, s6);
541 0 : s3 = _mm_sub_epi16(s3, s4);
542 0 : s5 = _mm_adds_epu16(s5, s3);
543 0 : s5 = _mm_sub_epi16(s5, s6);
544 0 : s5 = _mm_minpos_epu16(s5);
545 0 : temSum = _mm_extract_epi16(s5, 0);
546 0 : temSum += _mm_extract_epi16(s4, 0);
547 0 : temSum += _mm_extract_epi16(s6, 0);
548 0 : if (temSum < lowSum) {
549 0 : lowSum = temSum;
550 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
551 0 : yBest = i;
552 : }
553 : }
554 0 : ref += src_stride_raw;
555 : }
556 : }
557 : else {
558 0 : for (i = 0; i < search_area_height; i++) {
559 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
560 0 : pSrc = src;
561 0 : pRef = ref + j;
562 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
563 0 : for (k = 0; k < block_height; k++) {
564 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
565 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
566 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
567 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
568 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
569 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
570 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
571 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
572 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
573 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
574 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
575 0 : pSrc += src_stride;
576 0 : pRef += ref_stride;
577 : }
578 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
579 0 : s0 = _mm_minpos_epu16(s0);
580 0 : temSum = _mm_extract_epi16(s0, 0);
581 0 : if (temSum < lowSum) {
582 0 : if (temSum != 0xFFFF) { // no overflow
583 0 : lowSum = temSum;
584 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
585 0 : yBest = i;
586 : }
587 : else {
588 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
589 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
590 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
591 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
592 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
593 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
594 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
595 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
596 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
597 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
598 0 : UPDATE_BEST(s0, 0, 0);
599 0 : UPDATE_BEST(s0, 1, 0);
600 0 : UPDATE_BEST(s0, 2, 0);
601 0 : UPDATE_BEST(s0, 3, 0);
602 0 : UPDATE_BEST(s3, 0, 4);
603 0 : UPDATE_BEST(s3, 1, 4);
604 0 : UPDATE_BEST(s3, 2, 4);
605 0 : UPDATE_BEST(s3, 3, 4);
606 : }
607 : }
608 : }
609 :
610 0 : if (leftover) {
611 0 : pSrc = src;
612 0 : pRef = ref + j;
613 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
614 0 : for (k = 0; k < block_height; k++) {
615 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
616 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
617 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
618 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
619 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
620 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
621 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
622 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
623 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
624 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
625 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
626 0 : pSrc += src_stride;
627 0 : pRef += ref_stride;
628 : }
629 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
630 0 : s0 = _mm_or_si128(s0, s8);
631 0 : s0 = _mm_minpos_epu16(s0);
632 0 : temSum = _mm_extract_epi16(s0, 0);
633 0 : if (temSum < lowSum) {
634 0 : if (temSum != 0xFFFF) { // no overflow
635 0 : lowSum = temSum;
636 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
637 0 : yBest = i;
638 : }
639 : else {
640 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
641 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
642 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
643 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
644 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
645 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
646 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
647 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
648 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
649 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
650 0 : k = leftover;
651 0 : while (k > 0) {
652 0 : for (l = 0; l < 4 && k; l++, k--) {
653 0 : temSum = _mm_extract_epi32(s0, 0);
654 0 : s0 = _mm_srli_si128(s0, 4);
655 0 : if (temSum < lowSum) {
656 0 : lowSum = temSum;
657 0 : xBest = (int16_t)(j + leftover - k);
658 0 : yBest = i;
659 : }
660 : }
661 0 : s0 = s3;
662 : }
663 : }
664 : }
665 : }
666 0 : ref += src_stride_raw;
667 : }
668 : }
669 0 : break;
670 :
671 0 : case 32:
672 0 : if (block_height <= 32) {
673 0 : for (i = 0; i < search_area_height; i++) {
674 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
675 0 : pSrc = src;
676 0 : pRef = ref + j;
677 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
678 0 : for (k = 0; k < block_height; k++) {
679 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
680 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
681 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
682 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
683 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
684 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
685 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
686 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
687 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
688 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
689 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
690 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
691 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
692 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
693 0 : pSrc += src_stride;
694 0 : pRef += ref_stride;
695 : }
696 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
697 0 : s0 = _mm_minpos_epu16(s0);
698 0 : temSum = _mm_extract_epi16(s0, 0);
699 0 : temSum &= 0x0000FFFF;
700 0 : if (temSum < lowSum) {
701 0 : if (temSum != 0xFFFF) { // no overflow
702 0 : lowSum = temSum;
703 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
704 0 : yBest = i;
705 : }
706 : else {
707 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
708 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
709 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
710 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
711 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
712 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
713 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
714 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
715 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
716 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
717 0 : UPDATE_BEST(s0, 0, 0);
718 0 : UPDATE_BEST(s0, 1, 0);
719 0 : UPDATE_BEST(s0, 2, 0);
720 0 : UPDATE_BEST(s0, 3, 0);
721 0 : UPDATE_BEST(s3, 0, 4);
722 0 : UPDATE_BEST(s3, 1, 4);
723 0 : UPDATE_BEST(s3, 2, 4);
724 0 : UPDATE_BEST(s3, 3, 4);
725 : }
726 : }
727 : }
728 :
729 0 : if (leftover) {
730 0 : pSrc = src;
731 0 : pRef = ref + j;
732 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
733 0 : for (k = 0; k < block_height; k++) {
734 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
735 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
736 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
737 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
738 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
739 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
740 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
741 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
742 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
743 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
744 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
745 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
746 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
747 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
748 0 : pSrc += src_stride;
749 0 : pRef += ref_stride;
750 : }
751 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
752 0 : s0 = _mm_or_si128(s0, s8);
753 0 : s0 = _mm_minpos_epu16(s0);
754 0 : temSum = _mm_extract_epi16(s0, 0);
755 0 : temSum &= 0x0000FFFF;
756 0 : if (temSum < lowSum) {
757 0 : if (temSum != 0xFFFF) { // no overflow
758 0 : lowSum = temSum;
759 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
760 0 : yBest = i;
761 : }
762 : else {
763 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
764 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
765 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
766 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
767 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
768 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
769 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
770 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
771 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
772 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
773 0 : k = leftover;
774 0 : while (k > 0) {
775 0 : for (l = 0; l < 4 && k; l++, k--) {
776 0 : temSum = _mm_extract_epi32(s0, 0);
777 0 : s0 = _mm_srli_si128(s0, 4);
778 0 : if (temSum < lowSum) {
779 0 : lowSum = temSum;
780 0 : xBest = (int16_t)(j + leftover - k);
781 0 : yBest = i;
782 : }
783 : }
784 0 : s0 = s3;
785 : }
786 : }
787 : }
788 : }
789 0 : ref += src_stride_raw;
790 : }
791 : }
792 : else {
793 : __m128i s9, s10, s11, s12;
794 0 : for (i = 0; i < search_area_height; i++) {
795 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
796 0 : pSrc = src;
797 0 : pRef = ref + j;
798 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
799 0 : for (k = 0; k < block_height >> 1; k++) {
800 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
801 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
802 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
803 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
804 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
805 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
806 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
807 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
808 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
809 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
810 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
811 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
812 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
813 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
814 0 : pSrc += src_stride;
815 0 : pRef += ref_stride;
816 : }
817 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
818 0 : for (; k < block_height; k++) {
819 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
820 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
821 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
822 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
823 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
824 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
825 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
826 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
827 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
828 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
829 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
830 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
831 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
832 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
833 0 : pSrc += src_stride;
834 0 : pRef += ref_stride;
835 : }
836 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
837 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
838 0 : s0 = _mm_minpos_epu16(s0);
839 0 : temSum = _mm_extract_epi16(s0, 0);
840 0 : temSum &= 0x0000FFFF;
841 0 : if (temSum < lowSum) {
842 0 : if (temSum != 0xFFFF) { // no overflow
843 0 : lowSum = temSum;
844 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
845 0 : yBest = i;
846 : }
847 : else {
848 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
849 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
850 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
851 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
852 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
853 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
854 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
855 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
856 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
857 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
858 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
859 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
860 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
861 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
862 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
863 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
864 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
865 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
866 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
867 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
868 0 : UPDATE_BEST(s0, 0, 0);
869 0 : UPDATE_BEST(s0, 1, 0);
870 0 : UPDATE_BEST(s0, 2, 0);
871 0 : UPDATE_BEST(s0, 3, 0);
872 0 : UPDATE_BEST(s3, 0, 4);
873 0 : UPDATE_BEST(s3, 1, 4);
874 0 : UPDATE_BEST(s3, 2, 4);
875 0 : UPDATE_BEST(s3, 3, 4);
876 : }
877 : }
878 : }
879 :
880 0 : if (leftover) {
881 0 : pSrc = src;
882 0 : pRef = ref + j;
883 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
884 0 : for (k = 0; k < block_height >> 1; k++) {
885 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
886 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
887 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
888 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
889 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
890 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
891 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
892 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
893 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
894 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
895 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
896 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
897 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
898 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
899 0 : pSrc += src_stride;
900 0 : pRef += ref_stride;
901 : }
902 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
903 0 : for (; k < block_height; k++) {
904 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
905 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
906 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
907 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
908 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
909 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
910 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
911 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
912 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
913 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
914 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
915 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
916 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
917 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
918 0 : pSrc += src_stride;
919 0 : pRef += ref_stride;
920 : }
921 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
922 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
923 0 : s0 = _mm_or_si128(s0, s8);
924 0 : s0 = _mm_minpos_epu16(s0);
925 0 : temSum = _mm_extract_epi16(s0, 0);
926 0 : temSum &= 0x0000FFFF;
927 0 : if (temSum < lowSum) {
928 0 : if (temSum != 0xFFFF) { // no overflow
929 0 : lowSum = temSum;
930 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
931 0 : yBest = i;
932 : }
933 : else {
934 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
935 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
936 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
937 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
938 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
939 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
940 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
941 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
942 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
943 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
944 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
945 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
946 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
947 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
948 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
949 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
950 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
951 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
952 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
953 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
954 0 : k = leftover;
955 0 : while (k > 0) {
956 0 : for (l = 0; l < 4 && k; l++, k--) {
957 0 : temSum = _mm_extract_epi32(s0, 0);
958 0 : s0 = _mm_srli_si128(s0, 4);
959 0 : if (temSum < lowSum) {
960 0 : lowSum = temSum;
961 0 : xBest = (int16_t)(j + leftover - k);
962 0 : yBest = i;
963 : }
964 : }
965 0 : s0 = s3;
966 : }
967 : }
968 : }
969 : }
970 0 : ref += src_stride_raw;
971 : }
972 : }
973 0 : break;
974 :
975 0 : case 48:
976 0 : if (block_height <= 32) {
977 : __m128i s9, s10, s11, s12;
978 0 : for (i = 0; i < search_area_height; i++) {
979 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
980 0 : pSrc = src;
981 0 : pRef = ref + j;
982 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
983 0 : for (k = 0; k < block_height >> 1; k++) {
984 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
985 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
986 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
987 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
988 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
989 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
990 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
991 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
992 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
993 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
994 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
995 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
996 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
997 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
998 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
999 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1000 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1001 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1002 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1003 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1004 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1005 0 : pSrc += src_stride;
1006 0 : pRef += ref_stride;
1007 : }
1008 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
1009 0 : for (; k < block_height; k++) {
1010 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1011 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1012 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1013 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1014 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1015 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1016 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1017 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1018 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1019 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1020 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1021 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1022 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1023 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1024 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1025 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1026 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1027 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1028 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1029 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1030 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1031 0 : pSrc += src_stride;
1032 0 : pRef += ref_stride;
1033 : }
1034 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1035 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
1036 0 : s0 = _mm_minpos_epu16(s0);
1037 0 : temSum = _mm_extract_epi16(s0, 0);
1038 0 : temSum &= 0x0000FFFF;
1039 0 : if (temSum < lowSum) {
1040 0 : if (temSum != 0xFFFF) { // no overflow
1041 0 : lowSum = temSum;
1042 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1043 0 : yBest = i;
1044 : }
1045 : else {
1046 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1047 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1048 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1049 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1050 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1051 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1052 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1053 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1054 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
1055 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
1056 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
1057 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
1058 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
1059 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
1060 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
1061 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
1062 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
1063 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
1064 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
1065 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
1066 0 : UPDATE_BEST(s0, 0, 0);
1067 0 : UPDATE_BEST(s0, 1, 0);
1068 0 : UPDATE_BEST(s0, 2, 0);
1069 0 : UPDATE_BEST(s0, 3, 0);
1070 0 : UPDATE_BEST(s3, 0, 4);
1071 0 : UPDATE_BEST(s3, 1, 4);
1072 0 : UPDATE_BEST(s3, 2, 4);
1073 0 : UPDATE_BEST(s3, 3, 4);
1074 : }
1075 : }
1076 : }
1077 :
1078 0 : if (leftover) {
1079 0 : pSrc = src;
1080 0 : pRef = ref + j;
1081 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1082 0 : for (k = 0; k < block_height >> 1; k++) {
1083 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1084 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1085 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1086 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1087 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1088 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1089 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1090 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1091 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1092 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1093 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1094 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1095 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1096 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1097 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1098 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1099 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1100 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1101 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1102 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1103 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1104 0 : pSrc += src_stride;
1105 0 : pRef += ref_stride;
1106 : }
1107 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
1108 0 : for (; k < block_height; k++) {
1109 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1110 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1111 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1112 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1113 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1114 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1115 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1116 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1117 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1118 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1119 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1120 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1121 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1122 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1123 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1124 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1125 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1126 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1127 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1128 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1129 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1130 0 : pSrc += src_stride;
1131 0 : pRef += ref_stride;
1132 : }
1133 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1134 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
1135 0 : s0 = _mm_or_si128(s0, s8);
1136 0 : s0 = _mm_minpos_epu16(s0);
1137 0 : temSum = _mm_extract_epi16(s0, 0);
1138 0 : temSum &= 0x0000FFFF;
1139 0 : if (temSum < lowSum) {
1140 0 : if (temSum != 0xFFFF) { // no overflow
1141 0 : lowSum = temSum;
1142 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1143 0 : yBest = i;
1144 : }
1145 : else {
1146 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1147 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1148 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1149 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1150 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1151 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1152 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1153 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1154 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
1155 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
1156 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
1157 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
1158 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
1159 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
1160 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
1161 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
1162 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
1163 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
1164 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
1165 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
1166 0 : k = leftover;
1167 0 : while (k > 0) {
1168 0 : for (l = 0; l < 4 && k; l++, k--) {
1169 0 : temSum = _mm_extract_epi32(s0, 0);
1170 0 : s0 = _mm_srli_si128(s0, 4);
1171 0 : if (temSum < lowSum) {
1172 0 : lowSum = temSum;
1173 0 : xBest = (int16_t)(j + leftover - k);
1174 0 : yBest = i;
1175 : }
1176 : }
1177 0 : s0 = s3;
1178 : }
1179 : }
1180 : }
1181 : }
1182 0 : ref += src_stride_raw;
1183 : }
1184 : }
1185 : else {
1186 : __m128i s9, s10;
1187 0 : for (i = 0; i < search_area_height; i++) {
1188 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
1189 0 : pSrc = src;
1190 0 : pRef = ref + j;
1191 0 : s9 = s10 = _mm_setzero_si128();
1192 0 : k = 0;
1193 0 : while (k < block_height) {
1194 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1195 0 : for (l = 0; l < 21 && k < block_height; k++, l++) {
1196 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1197 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1198 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1199 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1200 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1201 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1202 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1203 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1204 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1205 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1206 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1207 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1208 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1209 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1210 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1211 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1212 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1213 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1214 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1215 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1216 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1217 0 : pSrc += src_stride;
1218 0 : pRef += ref_stride;
1219 : }
1220 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1221 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1222 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1223 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1224 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1225 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1226 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1227 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1228 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
1229 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
1230 : }
1231 0 : s0 = _mm_packus_epi32(s9, s10);
1232 0 : s0 = _mm_minpos_epu16(s0);
1233 0 : temSum = _mm_extract_epi16(s0, 0);
1234 0 : temSum &= 0x0000FFFF;
1235 0 : if (temSum < lowSum) {
1236 0 : if (temSum != 0xFFFF) { // no overflow
1237 0 : lowSum = temSum;
1238 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1239 0 : yBest = i;
1240 : }
1241 : else {
1242 0 : UPDATE_BEST(s9, 0, 0);
1243 0 : UPDATE_BEST(s9, 1, 0);
1244 0 : UPDATE_BEST(s9, 2, 0);
1245 0 : UPDATE_BEST(s9, 3, 0);
1246 0 : UPDATE_BEST(s10, 0, 4);
1247 0 : UPDATE_BEST(s10, 1, 4);
1248 0 : UPDATE_BEST(s10, 2, 4);
1249 0 : UPDATE_BEST(s10, 3, 4);
1250 : }
1251 : }
1252 : }
1253 :
1254 0 : if (leftover) {
1255 0 : pSrc = src;
1256 0 : pRef = ref + j;
1257 0 : s9 = s10 = _mm_setzero_si128();
1258 0 : k = 0;
1259 0 : while (k < block_height) {
1260 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1261 0 : for (l = 0; l < 21 && k < block_height; k++, l++) {
1262 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1263 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1264 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1265 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1266 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1267 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1268 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1269 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1270 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1271 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1272 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1273 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1274 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1275 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1276 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1277 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1278 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1279 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1280 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1281 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1282 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1283 0 : pSrc += src_stride;
1284 0 : pRef += ref_stride;
1285 : }
1286 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1287 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1288 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1289 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1290 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1291 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1292 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1293 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1294 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
1295 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
1296 : }
1297 0 : s0 = _mm_packus_epi32(s9, s10);
1298 0 : s0 = _mm_or_si128(s0, s8);
1299 0 : s0 = _mm_minpos_epu16(s0);
1300 0 : temSum = _mm_extract_epi16(s0, 0);
1301 0 : temSum &= 0x0000FFFF;
1302 0 : if (temSum < lowSum) {
1303 0 : if (temSum != 0xFFFF) { // no overflow
1304 0 : lowSum = temSum;
1305 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1306 0 : yBest = i;
1307 : }
1308 : else {
1309 0 : k = leftover;
1310 0 : while (k > 0) {
1311 0 : for (l = 0; l < 4 && k; l++, k--) {
1312 0 : temSum = _mm_extract_epi32(s9, 0);
1313 0 : s9 = _mm_srli_si128(s9, 4);
1314 0 : if (temSum < lowSum) {
1315 0 : lowSum = temSum;
1316 0 : xBest = (int16_t)(j + leftover - k);
1317 0 : yBest = i;
1318 : }
1319 : }
1320 0 : s9 = s10;
1321 : }
1322 : }
1323 : }
1324 : }
1325 0 : ref += src_stride_raw;
1326 : }
1327 : }
1328 0 : break;
1329 :
1330 0 : case 64:
1331 0 : if (block_height <= 32) {
1332 : __m128i s9, s10, s11, s12;
1333 0 : for (i = 0; i < search_area_height; i++) {
1334 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
1335 0 : pSrc = src;
1336 0 : pRef = ref + j;
1337 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1338 0 : for (k = 0; k < block_height >> 1; k++) {
1339 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1340 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1341 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1342 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1343 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1344 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1345 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1346 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1347 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1348 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1349 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1350 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1351 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1352 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1353 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1354 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1355 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1356 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1357 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1358 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1359 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1360 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1361 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1362 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1363 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1364 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1365 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1366 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1367 0 : pSrc += src_stride;
1368 0 : pRef += ref_stride;
1369 : }
1370 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
1371 0 : for (; k < block_height; k++) {
1372 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1373 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1374 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1375 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1376 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1377 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1378 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1379 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1380 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1381 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1382 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1383 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1384 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1385 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1386 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1387 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1388 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1389 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1390 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1391 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1392 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1393 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1394 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1395 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1396 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1397 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1398 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1399 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1400 0 : pSrc += src_stride;
1401 0 : pRef += ref_stride;
1402 : }
1403 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1404 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
1405 0 : s0 = _mm_minpos_epu16(s0);
1406 0 : temSum = _mm_extract_epi16(s0, 0);
1407 0 : temSum &= 0x0000FFFF;
1408 0 : if (temSum < lowSum) {
1409 0 : if (temSum != 0xFFFF) { // no overflow
1410 0 : lowSum = temSum;
1411 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1412 0 : yBest = i;
1413 : }
1414 : else {
1415 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1416 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1417 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1418 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1419 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1420 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1421 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1422 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1423 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
1424 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
1425 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
1426 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
1427 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
1428 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
1429 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
1430 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
1431 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
1432 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
1433 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
1434 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
1435 0 : UPDATE_BEST(s0, 0, 0);
1436 0 : UPDATE_BEST(s0, 1, 0);
1437 0 : UPDATE_BEST(s0, 2, 0);
1438 0 : UPDATE_BEST(s0, 3, 0);
1439 0 : UPDATE_BEST(s3, 0, 4);
1440 0 : UPDATE_BEST(s3, 1, 4);
1441 0 : UPDATE_BEST(s3, 2, 4);
1442 0 : UPDATE_BEST(s3, 3, 4);
1443 : }
1444 : }
1445 : }
1446 :
1447 0 : if (leftover) {
1448 0 : pSrc = src;
1449 0 : pRef = ref + j;
1450 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1451 0 : for (k = 0; k < block_height >> 1; k++) {
1452 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1453 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1454 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1455 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1456 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1457 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1458 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1459 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1460 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1461 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1462 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1463 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1464 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1465 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1466 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1467 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1468 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1469 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1470 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1471 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1472 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1473 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1474 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1475 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1476 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1477 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1478 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1479 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1480 0 : pSrc += src_stride;
1481 0 : pRef += ref_stride;
1482 : }
1483 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
1484 0 : for (; k < block_height; k++) {
1485 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1486 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1487 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1488 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1489 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1490 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1491 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1492 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1493 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1494 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1495 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1496 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1497 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1498 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1499 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1500 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1501 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1502 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1503 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1504 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1505 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1506 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1507 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1508 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1509 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
1510 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
1511 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
1512 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
1513 0 : pSrc += src_stride;
1514 0 : pRef += ref_stride;
1515 : }
1516 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1517 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
1518 0 : s0 = _mm_or_si128(s0, s8);
1519 0 : s0 = _mm_minpos_epu16(s0);
1520 0 : temSum = _mm_extract_epi16(s0, 0);
1521 0 : temSum &= 0x0000FFFF;
1522 0 : if (temSum < lowSum) {
1523 0 : if (temSum != 0xFFFF) { // no overflow
1524 0 : lowSum = temSum;
1525 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1526 0 : yBest = i;
1527 : }
1528 : else {
1529 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1530 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1531 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1532 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1533 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1534 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1535 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1536 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1537 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
1538 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
1539 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
1540 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
1541 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
1542 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
1543 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
1544 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
1545 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
1546 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
1547 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
1548 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
1549 0 : k = leftover;
1550 0 : while (k > 0) {
1551 0 : for (l = 0; l < 4 && k; l++, k--) {
1552 0 : temSum = _mm_extract_epi32(s0, 0);
1553 0 : s0 = _mm_srli_si128(s0, 4);
1554 0 : if (temSum < lowSum) {
1555 0 : lowSum = temSum;
1556 0 : xBest = (int16_t)(j + leftover - k);
1557 0 : yBest = i;
1558 : }
1559 : }
1560 0 : s0 = s3;
1561 : }
1562 : }
1563 : }
1564 : }
1565 0 : ref += src_stride_raw;
1566 : }
1567 : }
1568 : else {
1569 : __m128i s9, s10;
1570 0 : for (i = 0; i < search_area_height; i++) {
1571 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
1572 0 : pSrc = src;
1573 0 : pRef = ref + j;
1574 0 : s9 = s10 = _mm_setzero_si128();
1575 0 : k = 0;
1576 0 : while (k < block_height) {
1577 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1578 0 : for (l = 0; l < 16 && k < block_height; k++, l++) {
1579 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1580 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1581 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1582 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1583 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1584 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1585 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1586 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1587 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1588 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1589 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1590 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1591 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1592 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1593 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1594 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1595 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1596 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1597 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1598 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1599 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1600 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1601 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1602 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1603 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1604 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1605 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1606 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1607 0 : pSrc += src_stride;
1608 0 : pRef += ref_stride;
1609 : }
1610 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1611 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1612 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1613 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1614 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1615 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1616 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1617 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1618 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
1619 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
1620 : }
1621 0 : s0 = _mm_packus_epi32(s9, s10);
1622 0 : s0 = _mm_minpos_epu16(s0);
1623 0 : temSum = _mm_extract_epi16(s0, 0);
1624 0 : temSum &= 0x0000FFFF;
1625 0 : if (temSum < lowSum) {
1626 0 : if (temSum != 0xFFFF) { // no overflow
1627 0 : lowSum = temSum;
1628 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1629 0 : yBest = i;
1630 : }
1631 : else {
1632 0 : UPDATE_BEST(s9, 0, 0);
1633 0 : UPDATE_BEST(s9, 1, 0);
1634 0 : UPDATE_BEST(s9, 2, 0);
1635 0 : UPDATE_BEST(s9, 3, 0);
1636 0 : UPDATE_BEST(s10, 0, 4);
1637 0 : UPDATE_BEST(s10, 1, 4);
1638 0 : UPDATE_BEST(s10, 2, 4);
1639 0 : UPDATE_BEST(s10, 3, 4);
1640 : }
1641 : }
1642 : }
1643 :
1644 0 : if (leftover) {
1645 0 : pSrc = src;
1646 0 : pRef = ref + j;
1647 0 : s9 = s10 = _mm_setzero_si128();
1648 0 : k = 0;
1649 0 : while (k < block_height) {
1650 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1651 0 : for (l = 0; l < 16 && k < block_height; k++, l++) {
1652 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1653 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1654 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1655 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1656 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1657 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1658 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1659 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
1660 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
1661 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
1662 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1663 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1664 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1665 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1666 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1667 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1668 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1669 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1670 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1671 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1672 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1673 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
1674 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
1675 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
1676 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1677 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1678 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1679 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1680 0 : pSrc += src_stride;
1681 0 : pRef += ref_stride;
1682 : }
1683 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
1684 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
1685 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
1686 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
1687 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
1688 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
1689 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
1690 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
1691 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
1692 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
1693 : }
1694 0 : s0 = _mm_packus_epi32(s9, s10);
1695 0 : s0 = _mm_or_si128(s0, s8);
1696 0 : s0 = _mm_minpos_epu16(s0);
1697 0 : temSum = _mm_extract_epi16(s0, 0);
1698 0 : temSum &= 0x0000FFFF;
1699 0 : if (temSum < lowSum) {
1700 0 : if (temSum != 0xFFFF) { // no overflow
1701 0 : lowSum = temSum;
1702 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
1703 0 : yBest = i;
1704 : }
1705 : else {
1706 0 : k = leftover;
1707 0 : while (k > 0) {
1708 0 : for (l = 0; l < 4 && k; l++, k--) {
1709 0 : temSum = _mm_extract_epi32(s9, 0);
1710 0 : s9 = _mm_srli_si128(s9, 4);
1711 0 : if (temSum < lowSum) {
1712 0 : lowSum = temSum;
1713 0 : xBest = (int16_t)(j + leftover - k);
1714 0 : yBest = i;
1715 : }
1716 : }
1717 0 : s9 = s10;
1718 : }
1719 : }
1720 : }
1721 : }
1722 0 : ref += src_stride_raw;
1723 : }
1724 : }
1725 0 : break;
1726 :
1727 0 : default:
1728 : assert(0);
1729 0 : break;
1730 : }
1731 :
1732 0 : *best_sad = lowSum;
1733 0 : *x_search_center = xBest;
1734 0 : *y_search_center = yBest;
1735 0 : }
1736 :
1737 0 : void sad_loop_kernel_sparse_sse4_1_intrin(
1738 : uint8_t *src, // input parameter, source samples Ptr
1739 : uint32_t src_stride, // input parameter, source stride
1740 : uint8_t *ref, // input parameter, reference samples Ptr
1741 : uint32_t ref_stride, // input parameter, reference stride
1742 : uint32_t block_height, // input parameter, block height (M)
1743 : uint32_t block_width, // input parameter, block width (N)
1744 : uint64_t *best_sad,
1745 : int16_t *x_search_center,
1746 : int16_t *y_search_center,
1747 : uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
1748 : int16_t search_area_width,
1749 : int16_t search_area_height)
1750 : {
1751 0 : int16_t xBest = *x_search_center, yBest = *y_search_center;
1752 0 : uint32_t lowSum = 0xffffff;
1753 0 : uint32_t temSum = 0;
1754 : int16_t i, j;
1755 : uint32_t k, l;
1756 0 : uint32_t leftover = search_area_width & 7;
1757 : const uint8_t *pRef, *pSrc;
1758 0 : __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8 = _mm_set1_epi32(-1);
1759 :
1760 0 : if (leftover) {
1761 0 : for (k = 0; k < leftover; k++)
1762 0 : s8 = _mm_slli_si128(s8, 2);
1763 : }
1764 :
1765 0 : switch (block_width) {
1766 0 : case 4:
1767 0 : for (i = 0; i < search_area_height; i++) {
1768 0 : uint32_t startW = (i & 1) << 3;
1769 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
1770 0 : pSrc = src;
1771 0 : pRef = ref + j;
1772 0 : s3 = _mm_setzero_si128();
1773 0 : for (k = 0; k < block_height; k += 2) {
1774 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1775 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
1776 0 : s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
1777 0 : s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
1778 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1779 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
1780 0 : pSrc += src_stride << 1;
1781 0 : pRef += ref_stride << 1;
1782 : }
1783 0 : s3 = _mm_minpos_epu16(s3);
1784 0 : temSum = _mm_extract_epi16(s3, 0);
1785 0 : if (temSum < lowSum) {
1786 0 : lowSum = temSum;
1787 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1788 0 : yBest = i;
1789 : }
1790 : }
1791 :
1792 0 : if (leftover && j < search_area_width ) {
1793 0 : pSrc = src;
1794 0 : pRef = ref + j;
1795 0 : s3 = _mm_setzero_si128();
1796 0 : for (k = 0; k < block_height; k += 2) {
1797 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1798 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
1799 0 : s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
1800 0 : s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
1801 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1802 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
1803 0 : pSrc += src_stride << 1;
1804 0 : pRef += ref_stride << 1;
1805 : }
1806 0 : s3 = _mm_or_si128(s3, s8);
1807 0 : s3 = _mm_minpos_epu16(s3);
1808 0 : temSum = _mm_extract_epi16(s3, 0);
1809 0 : if (temSum < lowSum) {
1810 0 : lowSum = temSum;
1811 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1812 0 : yBest = i;
1813 : }
1814 : }
1815 0 : ref += src_stride_raw;
1816 : }
1817 0 : break;
1818 :
1819 0 : case 8:
1820 0 : for (i = 0; i < search_area_height; i++) {
1821 0 : uint32_t startW = (i & 1) << 3;
1822 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
1823 0 : pSrc = src;
1824 0 : pRef = ref + j;
1825 0 : s3 = s4 = _mm_setzero_si128();
1826 0 : for (k = 0; k < block_height; k += 2) {
1827 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1828 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
1829 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
1830 0 : s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
1831 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1832 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1833 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
1834 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
1835 0 : pSrc += src_stride << 1;
1836 0 : pRef += ref_stride << 1;
1837 : }
1838 0 : s3 = _mm_adds_epu16(s3, s4);
1839 0 : s3 = _mm_minpos_epu16(s3);
1840 0 : temSum = _mm_extract_epi16(s3, 0);
1841 0 : if (temSum < lowSum) {
1842 0 : lowSum = temSum;
1843 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1844 0 : yBest = i;
1845 : }
1846 : }
1847 :
1848 0 : if (leftover && j < search_area_width ) {
1849 0 : pSrc = src;
1850 0 : pRef = ref + j;
1851 0 : s3 = s4 = _mm_setzero_si128();
1852 0 : for (k = 0; k < block_height; k += 2) {
1853 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1854 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
1855 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
1856 0 : s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
1857 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1858 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1859 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
1860 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
1861 0 : pSrc += src_stride << 1;
1862 0 : pRef += ref_stride << 1;
1863 : }
1864 0 : s3 = _mm_adds_epu16(s3, s4);
1865 0 : s3 = _mm_or_si128(s3, s8);
1866 0 : s3 = _mm_minpos_epu16(s3);
1867 0 : temSum = _mm_extract_epi16(s3, 0);
1868 0 : if (temSum < lowSum) {
1869 0 : lowSum = temSum;
1870 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1871 0 : yBest = i;
1872 : }
1873 : }
1874 0 : ref += src_stride_raw;
1875 : }
1876 0 : break;
1877 :
1878 0 : case 16:
1879 0 : if (block_height <= 16) {
1880 0 : for (i = 0; i < search_area_height; i++) {
1881 0 : uint32_t startW = (i & 1) << 3;
1882 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
1883 0 : pSrc = src;
1884 0 : pRef = ref + j;
1885 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1886 0 : for (k = 0; k < block_height; k++) {
1887 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1888 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1889 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1890 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1891 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1892 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1893 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1894 0 : pSrc += src_stride;
1895 0 : pRef += ref_stride;
1896 : }
1897 0 : s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1898 0 : s3 = _mm_minpos_epu16(s3);
1899 0 : temSum = _mm_extract_epi16(s3, 0);
1900 0 : if (temSum < lowSum) {
1901 0 : lowSum = temSum;
1902 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1903 0 : yBest = i;
1904 : }
1905 : }
1906 :
1907 0 : if (leftover && j < search_area_width ) {
1908 0 : pSrc = src;
1909 0 : pRef = ref + j;
1910 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1911 0 : for (k = 0; k < block_height; k++) {
1912 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1913 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1914 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1915 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1916 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1917 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1918 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1919 0 : pSrc += src_stride;
1920 0 : pRef += ref_stride;
1921 : }
1922 0 : s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1923 0 : s3 = _mm_or_si128(s3, s8);
1924 0 : s3 = _mm_minpos_epu16(s3);
1925 0 : temSum = _mm_extract_epi16(s3, 0);
1926 0 : if (temSum < lowSum) {
1927 0 : lowSum = temSum;
1928 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
1929 0 : yBest = i;
1930 : }
1931 : }
1932 0 : ref += src_stride_raw;
1933 : }
1934 : }
1935 0 : else if (block_height <= 32) {
1936 0 : for (i = 0; i < search_area_height; i++) {
1937 0 : uint32_t startW = (i & 1) << 3;
1938 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
1939 0 : pSrc = src;
1940 0 : pRef = ref + j;
1941 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1942 0 : for (k = 0; k < block_height; k++) {
1943 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1944 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1945 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1946 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1947 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1948 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1949 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1950 0 : pSrc += src_stride;
1951 0 : pRef += ref_stride;
1952 : }
1953 0 : s3 = _mm_adds_epu16(s3, s4);
1954 0 : s5 = _mm_adds_epu16(s5, s6);
1955 0 : s4 = _mm_minpos_epu16(s3);
1956 0 : s6 = _mm_minpos_epu16(s5);
1957 0 : s4 = _mm_unpacklo_epi16(s4, s4);
1958 0 : s4 = _mm_unpacklo_epi32(s4, s4);
1959 0 : s4 = _mm_unpacklo_epi64(s4, s4);
1960 0 : s6 = _mm_unpacklo_epi16(s6, s6);
1961 0 : s6 = _mm_unpacklo_epi32(s6, s6);
1962 0 : s6 = _mm_unpacklo_epi64(s6, s6);
1963 0 : s3 = _mm_sub_epi16(s3, s4);
1964 0 : s5 = _mm_adds_epu16(s5, s3);
1965 0 : s5 = _mm_sub_epi16(s5, s6);
1966 0 : s5 = _mm_minpos_epu16(s5);
1967 0 : temSum = _mm_extract_epi16(s5, 0);
1968 0 : temSum += _mm_extract_epi16(s4, 0);
1969 0 : temSum += _mm_extract_epi16(s6, 0);
1970 0 : if (temSum < lowSum) {
1971 0 : lowSum = temSum;
1972 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
1973 0 : yBest = i;
1974 : }
1975 : }
1976 :
1977 0 : if (leftover && j < search_area_width ) {
1978 0 : pSrc = src;
1979 0 : pRef = ref + j;
1980 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
1981 0 : for (k = 0; k < block_height; k++) {
1982 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
1983 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
1984 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
1985 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1986 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1987 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1988 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1989 0 : pSrc += src_stride;
1990 0 : pRef += ref_stride;
1991 : }
1992 0 : s3 = _mm_adds_epu16(s3, s4);
1993 0 : s5 = _mm_adds_epu16(s5, s6);
1994 0 : s3 = _mm_or_si128(s3, s8);
1995 0 : s5 = _mm_or_si128(s5, s8);
1996 0 : s4 = _mm_minpos_epu16(s3);
1997 0 : s6 = _mm_minpos_epu16(s5);
1998 0 : s4 = _mm_unpacklo_epi16(s4, s4);
1999 0 : s4 = _mm_unpacklo_epi32(s4, s4);
2000 0 : s4 = _mm_unpacklo_epi64(s4, s4);
2001 0 : s6 = _mm_unpacklo_epi16(s6, s6);
2002 0 : s6 = _mm_unpacklo_epi32(s6, s6);
2003 0 : s6 = _mm_unpacklo_epi64(s6, s6);
2004 0 : s3 = _mm_sub_epi16(s3, s4);
2005 0 : s5 = _mm_adds_epu16(s5, s3);
2006 0 : s5 = _mm_sub_epi16(s5, s6);
2007 0 : s5 = _mm_minpos_epu16(s5);
2008 0 : temSum = _mm_extract_epi16(s5, 0);
2009 0 : temSum += _mm_extract_epi16(s4, 0);
2010 0 : temSum += _mm_extract_epi16(s6, 0);
2011 0 : if (temSum < lowSum) {
2012 0 : lowSum = temSum;
2013 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
2014 0 : yBest = i;
2015 : }
2016 : }
2017 0 : ref += src_stride_raw;
2018 : }
2019 : }
2020 : else {
2021 0 : for (i = 0; i < search_area_height; i++) {
2022 0 : uint32_t startW = (i & 1) << 3;
2023 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2024 0 : pSrc = src;
2025 0 : pRef = ref + j;
2026 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2027 0 : for (k = 0; k < block_height; k++) {
2028 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2029 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2030 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2031 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2032 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2033 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2034 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2035 0 : pSrc += src_stride;
2036 0 : pRef += ref_stride;
2037 : }
2038 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2039 0 : s0 = _mm_minpos_epu16(s0);
2040 0 : temSum = _mm_extract_epi16(s0, 0);
2041 0 : if (temSum < lowSum) {
2042 0 : if (temSum != 0xFFFF) { // no overflow
2043 0 : lowSum = temSum;
2044 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2045 0 : yBest = i;
2046 : }
2047 : else {
2048 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2049 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2050 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2051 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2052 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2053 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2054 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2055 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2056 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2057 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2058 0 : UPDATE_BEST(s0, 0, 0);
2059 0 : UPDATE_BEST(s0, 1, 0);
2060 0 : UPDATE_BEST(s0, 2, 0);
2061 0 : UPDATE_BEST(s0, 3, 0);
2062 0 : UPDATE_BEST(s3, 0, 4);
2063 0 : UPDATE_BEST(s3, 1, 4);
2064 0 : UPDATE_BEST(s3, 2, 4);
2065 0 : UPDATE_BEST(s3, 3, 4);
2066 : }
2067 : }
2068 : }
2069 :
2070 0 : if (leftover && j < search_area_width ) {
2071 0 : pSrc = src;
2072 0 : pRef = ref + j;
2073 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2074 0 : for (k = 0; k < block_height; k++) {
2075 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2076 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2077 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2078 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2079 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2080 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2081 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2082 0 : pSrc += src_stride;
2083 0 : pRef += ref_stride;
2084 : }
2085 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2086 0 : s0 = _mm_or_si128(s0, s8);
2087 0 : s0 = _mm_minpos_epu16(s0);
2088 0 : temSum = _mm_extract_epi16(s0, 0);
2089 0 : if (temSum < lowSum) {
2090 0 : if (temSum != 0xFFFF) { // no overflow
2091 0 : lowSum = temSum;
2092 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2093 0 : yBest = i;
2094 : }
2095 : else {
2096 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2097 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2098 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2099 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2100 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2101 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2102 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2103 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2104 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2105 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2106 0 : k = leftover;
2107 0 : while (k > 0) {
2108 0 : for (l = 0; l < 4 && k; l++, k--) {
2109 0 : temSum = _mm_extract_epi32(s0, 0);
2110 0 : s0 = _mm_srli_si128(s0, 4);
2111 0 : if (temSum < lowSum) {
2112 0 : lowSum = temSum;
2113 0 : xBest = (int16_t)(j + leftover - k);
2114 0 : yBest = i;
2115 : }
2116 : }
2117 0 : s0 = s3;
2118 : }
2119 : }
2120 : }
2121 : }
2122 0 : ref += src_stride_raw;
2123 : }
2124 : }
2125 0 : break;
2126 :
2127 0 : case 24:
2128 0 : if (block_height <= 16) {
2129 0 : for (i = 0; i < search_area_height; i++) {
2130 0 : uint32_t startW = (i & 1) << 3;
2131 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2132 0 : pSrc = src;
2133 0 : pRef = ref + j;
2134 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2135 0 : for (k = 0; k < block_height; k++) {
2136 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2137 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2138 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2139 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2140 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2141 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2142 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2143 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2144 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
2145 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2146 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2147 0 : pSrc += src_stride;
2148 0 : pRef += ref_stride;
2149 : }
2150 0 : s3 = _mm_adds_epu16(s3, s4);
2151 0 : s5 = _mm_adds_epu16(s5, s6);
2152 0 : s4 = _mm_minpos_epu16(s3);
2153 0 : s6 = _mm_minpos_epu16(s5);
2154 0 : s4 = _mm_unpacklo_epi16(s4, s4);
2155 0 : s4 = _mm_unpacklo_epi32(s4, s4);
2156 0 : s4 = _mm_unpacklo_epi64(s4, s4);
2157 0 : s6 = _mm_unpacklo_epi16(s6, s6);
2158 0 : s6 = _mm_unpacklo_epi32(s6, s6);
2159 0 : s6 = _mm_unpacklo_epi64(s6, s6);
2160 0 : s3 = _mm_sub_epi16(s3, s4);
2161 0 : s5 = _mm_adds_epu16(s5, s3);
2162 0 : s5 = _mm_sub_epi16(s5, s6);
2163 0 : s5 = _mm_minpos_epu16(s5);
2164 0 : temSum = _mm_extract_epi16(s5, 0);
2165 0 : temSum += _mm_extract_epi16(s4, 0);
2166 0 : temSum += _mm_extract_epi16(s6, 0);
2167 0 : if (temSum < lowSum) {
2168 0 : lowSum = temSum;
2169 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
2170 0 : yBest = i;
2171 : }
2172 : }
2173 :
2174 0 : if (leftover && j < search_area_width ) {
2175 0 : pSrc = src;
2176 0 : pRef = ref + j;
2177 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2178 0 : for (k = 0; k < block_height; k++) {
2179 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2180 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2181 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2182 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2183 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2184 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2185 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2186 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2187 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
2188 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2189 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2190 0 : pSrc += src_stride;
2191 0 : pRef += ref_stride;
2192 : }
2193 0 : s3 = _mm_adds_epu16(s3, s4);
2194 0 : s5 = _mm_adds_epu16(s5, s6);
2195 0 : s3 = _mm_or_si128(s3, s8);
2196 0 : s5 = _mm_or_si128(s5, s8);
2197 0 : s4 = _mm_minpos_epu16(s3);
2198 0 : s6 = _mm_minpos_epu16(s5);
2199 0 : s4 = _mm_unpacklo_epi16(s4, s4);
2200 0 : s4 = _mm_unpacklo_epi32(s4, s4);
2201 0 : s4 = _mm_unpacklo_epi64(s4, s4);
2202 0 : s6 = _mm_unpacklo_epi16(s6, s6);
2203 0 : s6 = _mm_unpacklo_epi32(s6, s6);
2204 0 : s6 = _mm_unpacklo_epi64(s6, s6);
2205 0 : s3 = _mm_sub_epi16(s3, s4);
2206 0 : s5 = _mm_adds_epu16(s5, s3);
2207 0 : s5 = _mm_sub_epi16(s5, s6);
2208 0 : s5 = _mm_minpos_epu16(s5);
2209 0 : temSum = _mm_extract_epi16(s5, 0);
2210 0 : temSum += _mm_extract_epi16(s4, 0);
2211 0 : temSum += _mm_extract_epi16(s6, 0);
2212 0 : if (temSum < lowSum) {
2213 0 : lowSum = temSum;
2214 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
2215 0 : yBest = i;
2216 : }
2217 : }
2218 0 : ref += src_stride_raw;
2219 : }
2220 : }
2221 : else {
2222 0 : for (i = 0; i < search_area_height; i++) {
2223 0 : uint32_t startW = (i & 1) << 3;
2224 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2225 0 : pSrc = src;
2226 0 : pRef = ref + j;
2227 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2228 0 : for (k = 0; k < block_height; k++) {
2229 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2230 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2231 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2232 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2233 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2234 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2235 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2236 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2237 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
2238 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2239 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2240 0 : pSrc += src_stride;
2241 0 : pRef += ref_stride;
2242 : }
2243 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2244 0 : s0 = _mm_minpos_epu16(s0);
2245 0 : temSum = _mm_extract_epi16(s0, 0);
2246 0 : if (temSum < lowSum) {
2247 0 : if (temSum != 0xFFFF) { // no overflow
2248 0 : lowSum = temSum;
2249 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2250 0 : yBest = i;
2251 : }
2252 : else {
2253 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2254 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2255 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2256 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2257 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2258 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2259 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2260 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2261 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2262 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2263 0 : UPDATE_BEST(s0, 0, 0);
2264 0 : UPDATE_BEST(s0, 1, 0);
2265 0 : UPDATE_BEST(s0, 2, 0);
2266 0 : UPDATE_BEST(s0, 3, 0);
2267 0 : UPDATE_BEST(s3, 0, 4);
2268 0 : UPDATE_BEST(s3, 1, 4);
2269 0 : UPDATE_BEST(s3, 2, 4);
2270 0 : UPDATE_BEST(s3, 3, 4);
2271 : }
2272 : }
2273 : }
2274 :
2275 0 : if (leftover && j < search_area_width ) {
2276 0 : pSrc = src;
2277 0 : pRef = ref + j;
2278 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2279 0 : for (k = 0; k < block_height; k++) {
2280 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2281 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2282 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2283 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2284 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2285 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2286 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2287 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2288 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
2289 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2290 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2291 0 : pSrc += src_stride;
2292 0 : pRef += ref_stride;
2293 : }
2294 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2295 0 : s0 = _mm_or_si128(s0, s8);
2296 0 : s0 = _mm_minpos_epu16(s0);
2297 0 : temSum = _mm_extract_epi16(s0, 0);
2298 0 : if (temSum < lowSum) {
2299 0 : if (temSum != 0xFFFF) { // no overflow
2300 0 : lowSum = temSum;
2301 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2302 0 : yBest = i;
2303 : }
2304 : else {
2305 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2306 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2307 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2308 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2309 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2310 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2311 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2312 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2313 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2314 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2315 0 : k = leftover;
2316 0 : while (k > 0) {
2317 0 : for (l = 0; l < 4 && k; l++, k--) {
2318 0 : temSum = _mm_extract_epi32(s0, 0);
2319 0 : s0 = _mm_srli_si128(s0, 4);
2320 0 : if (temSum < lowSum) {
2321 0 : lowSum = temSum;
2322 0 : xBest = (int16_t)(j + leftover - k);
2323 0 : yBest = i;
2324 : }
2325 : }
2326 0 : s0 = s3;
2327 : }
2328 : }
2329 : }
2330 : }
2331 0 : ref += src_stride_raw;
2332 : }
2333 : }
2334 0 : break;
2335 :
2336 0 : case 32:
2337 0 : if (block_height <= 32) {
2338 0 : for (i = 0; i < search_area_height; i++) {
2339 0 : uint32_t startW = (i & 1) << 3;
2340 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2341 0 : pSrc = src;
2342 0 : pRef = ref + j;
2343 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2344 0 : for (k = 0; k < block_height; k++) {
2345 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2346 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2347 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2348 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2349 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2350 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2351 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2352 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2353 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2354 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2355 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2356 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2357 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2358 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2359 0 : pSrc += src_stride;
2360 0 : pRef += ref_stride;
2361 : }
2362 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2363 0 : s0 = _mm_minpos_epu16(s0);
2364 0 : temSum = _mm_extract_epi16(s0, 0);
2365 0 : temSum &= 0x0000FFFF;
2366 0 : if (temSum < lowSum) {
2367 0 : if (temSum != 0xFFFF) { // no overflow
2368 0 : lowSum = temSum;
2369 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2370 0 : yBest = i;
2371 : }
2372 : else {
2373 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2374 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2375 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2376 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2377 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2378 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2379 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2380 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2381 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2382 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2383 0 : UPDATE_BEST(s0, 0, 0);
2384 0 : UPDATE_BEST(s0, 1, 0);
2385 0 : UPDATE_BEST(s0, 2, 0);
2386 0 : UPDATE_BEST(s0, 3, 0);
2387 0 : UPDATE_BEST(s3, 0, 4);
2388 0 : UPDATE_BEST(s3, 1, 4);
2389 0 : UPDATE_BEST(s3, 2, 4);
2390 0 : UPDATE_BEST(s3, 3, 4);
2391 : }
2392 : }
2393 : }
2394 :
2395 0 : if (leftover && j < search_area_width ) {
2396 0 : pSrc = src;
2397 0 : pRef = ref + j;
2398 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2399 0 : for (k = 0; k < block_height; k++) {
2400 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2401 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2402 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2403 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2404 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2405 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2406 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2407 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2408 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2409 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2410 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2411 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2412 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2413 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2414 0 : pSrc += src_stride;
2415 0 : pRef += ref_stride;
2416 : }
2417 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2418 0 : s0 = _mm_or_si128(s0, s8);
2419 0 : s0 = _mm_minpos_epu16(s0);
2420 0 : temSum = _mm_extract_epi16(s0, 0);
2421 0 : temSum &= 0x0000FFFF;
2422 0 : if (temSum < lowSum) {
2423 0 : if (temSum != 0xFFFF) { // no overflow
2424 0 : lowSum = temSum;
2425 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2426 0 : yBest = i;
2427 : }
2428 : else {
2429 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2430 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2431 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2432 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2433 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2434 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2435 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2436 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2437 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2438 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2439 0 : k = leftover;
2440 0 : while (k > 0) {
2441 0 : for (l = 0; l < 4 && k; l++, k--) {
2442 0 : temSum = _mm_extract_epi32(s0, 0);
2443 0 : s0 = _mm_srli_si128(s0, 4);
2444 0 : if (temSum < lowSum) {
2445 0 : lowSum = temSum;
2446 0 : xBest = (int16_t)(j + leftover - k);
2447 0 : yBest = i;
2448 : }
2449 : }
2450 0 : s0 = s3;
2451 : }
2452 : }
2453 : }
2454 : }
2455 0 : ref += src_stride_raw;
2456 : }
2457 : }
2458 : else {
2459 : __m128i s9, s10, s11, s12;
2460 0 : for (i = 0; i < search_area_height; i++) {
2461 0 : uint32_t startW = (i & 1) << 3;
2462 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2463 0 : pSrc = src;
2464 0 : pRef = ref + j;
2465 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2466 0 : for (k = 0; k < block_height >> 1; k++) {
2467 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2468 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2469 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2470 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2471 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2472 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2473 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2474 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2475 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2476 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2477 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2478 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2479 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2480 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2481 0 : pSrc += src_stride;
2482 0 : pRef += ref_stride;
2483 : }
2484 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
2485 0 : for (; k < block_height; k++) {
2486 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2487 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2488 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2489 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2490 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2491 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2492 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2493 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2494 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2495 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2496 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2497 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2498 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2499 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2500 0 : pSrc += src_stride;
2501 0 : pRef += ref_stride;
2502 : }
2503 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2504 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
2505 0 : s0 = _mm_minpos_epu16(s0);
2506 0 : temSum = _mm_extract_epi16(s0, 0);
2507 0 : temSum &= 0x0000FFFF;
2508 0 : if (temSum < lowSum) {
2509 0 : if (temSum != 0xFFFF) { // no overflow
2510 0 : lowSum = temSum;
2511 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2512 0 : yBest = i;
2513 : }
2514 : else {
2515 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2516 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2517 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2518 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2519 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2520 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2521 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2522 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2523 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2524 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2525 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
2526 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
2527 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
2528 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
2529 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
2530 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
2531 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
2532 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
2533 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
2534 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
2535 0 : UPDATE_BEST(s0, 0, 0);
2536 0 : UPDATE_BEST(s0, 1, 0);
2537 0 : UPDATE_BEST(s0, 2, 0);
2538 0 : UPDATE_BEST(s0, 3, 0);
2539 0 : UPDATE_BEST(s3, 0, 4);
2540 0 : UPDATE_BEST(s3, 1, 4);
2541 0 : UPDATE_BEST(s3, 2, 4);
2542 0 : UPDATE_BEST(s3, 3, 4);
2543 : }
2544 : }
2545 : }
2546 :
2547 0 : if (leftover && j < search_area_width ) {
2548 0 : pSrc = src;
2549 0 : pRef = ref + j;
2550 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2551 0 : for (k = 0; k < block_height >> 1; k++) {
2552 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2553 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2554 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2555 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2556 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2557 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2558 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2559 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2560 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2561 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2562 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2563 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2564 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2565 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2566 0 : pSrc += src_stride;
2567 0 : pRef += ref_stride;
2568 : }
2569 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
2570 0 : for (; k < block_height; k++) {
2571 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2572 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2573 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2574 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2575 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2576 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2577 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2578 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2579 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2580 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2581 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2582 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2583 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2584 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2585 0 : pSrc += src_stride;
2586 0 : pRef += ref_stride;
2587 : }
2588 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2589 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
2590 0 : s0 = _mm_or_si128(s0, s8);
2591 0 : s0 = _mm_minpos_epu16(s0);
2592 0 : temSum = _mm_extract_epi16(s0, 0);
2593 0 : temSum &= 0x0000FFFF;
2594 0 : if (temSum < lowSum) {
2595 0 : if (temSum != 0xFFFF) { // no overflow
2596 0 : lowSum = temSum;
2597 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2598 0 : yBest = i;
2599 : }
2600 : else {
2601 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2602 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2603 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2604 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2605 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2606 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2607 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2608 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2609 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2610 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2611 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
2612 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
2613 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
2614 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
2615 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
2616 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
2617 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
2618 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
2619 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
2620 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
2621 0 : k = leftover;
2622 0 : while (k > 0) {
2623 0 : for (l = 0; l < 4 && k; l++, k--) {
2624 0 : temSum = _mm_extract_epi32(s0, 0);
2625 0 : s0 = _mm_srli_si128(s0, 4);
2626 0 : if (temSum < lowSum) {
2627 0 : lowSum = temSum;
2628 0 : xBest = (int16_t)(j + leftover - k);
2629 0 : yBest = i;
2630 : }
2631 : }
2632 0 : s0 = s3;
2633 : }
2634 : }
2635 : }
2636 : }
2637 0 : ref += src_stride_raw;
2638 : }
2639 : }
2640 0 : break;
2641 :
2642 0 : case 48:
2643 0 : if (block_height <= 32) {
2644 : __m128i s9, s10, s11, s12;
2645 0 : for (i = 0; i < search_area_height; i++) {
2646 0 : uint32_t startW = (i & 1) << 3;
2647 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2648 0 : pSrc = src;
2649 0 : pRef = ref + j;
2650 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2651 0 : for (k = 0; k < block_height >> 1; k++) {
2652 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2653 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2654 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2655 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2656 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2657 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2658 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2659 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2660 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2661 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2662 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2663 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2664 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2665 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2666 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2667 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2668 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2669 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2670 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2671 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2672 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2673 0 : pSrc += src_stride;
2674 0 : pRef += ref_stride;
2675 : }
2676 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
2677 0 : for (; k < block_height; k++) {
2678 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2679 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2680 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2681 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2682 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2683 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2684 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2685 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2686 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2687 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2688 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2689 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2690 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2691 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2692 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2693 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2694 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2695 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2696 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2697 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2698 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2699 0 : pSrc += src_stride;
2700 0 : pRef += ref_stride;
2701 : }
2702 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2703 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
2704 0 : s0 = _mm_minpos_epu16(s0);
2705 0 : temSum = _mm_extract_epi16(s0, 0);
2706 0 : temSum &= 0x0000FFFF;
2707 0 : if (temSum < lowSum) {
2708 0 : if (temSum != 0xFFFF) { // no overflow
2709 0 : lowSum = temSum;
2710 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2711 0 : yBest = i;
2712 : }
2713 : else {
2714 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2715 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2716 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2717 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2718 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2719 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2720 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2721 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2722 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2723 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2724 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
2725 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
2726 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
2727 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
2728 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
2729 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
2730 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
2731 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
2732 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
2733 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
2734 0 : UPDATE_BEST(s0, 0, 0);
2735 0 : UPDATE_BEST(s0, 1, 0);
2736 0 : UPDATE_BEST(s0, 2, 0);
2737 0 : UPDATE_BEST(s0, 3, 0);
2738 0 : UPDATE_BEST(s3, 0, 4);
2739 0 : UPDATE_BEST(s3, 1, 4);
2740 0 : UPDATE_BEST(s3, 2, 4);
2741 0 : UPDATE_BEST(s3, 3, 4);
2742 : }
2743 : }
2744 : }
2745 :
2746 0 : if (leftover && j < search_area_width ) {
2747 0 : pSrc = src;
2748 0 : pRef = ref + j;
2749 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2750 0 : for (k = 0; k < block_height >> 1; k++) {
2751 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2752 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2753 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2754 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2755 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2756 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2757 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2758 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2759 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2760 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2761 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2762 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2763 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2764 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2765 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2766 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2767 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2768 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2769 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2770 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2771 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2772 0 : pSrc += src_stride;
2773 0 : pRef += ref_stride;
2774 : }
2775 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
2776 0 : for (; k < block_height; k++) {
2777 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2778 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2779 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2780 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2781 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2782 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2783 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2784 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2785 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2786 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2787 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2788 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2789 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2790 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2791 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2792 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2793 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2794 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
2795 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
2796 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
2797 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
2798 0 : pSrc += src_stride;
2799 0 : pRef += ref_stride;
2800 : }
2801 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
2802 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
2803 0 : s0 = _mm_or_si128(s0, s8);
2804 0 : s0 = _mm_minpos_epu16(s0);
2805 0 : temSum = _mm_extract_epi16(s0, 0);
2806 0 : temSum &= 0x0000FFFF;
2807 0 : if (temSum < lowSum) {
2808 0 : if (temSum != 0xFFFF) { // no overflow
2809 0 : lowSum = temSum;
2810 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2811 0 : yBest = i;
2812 : }
2813 : else {
2814 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2815 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2816 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2817 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2818 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2819 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2820 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2821 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2822 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
2823 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
2824 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
2825 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
2826 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
2827 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
2828 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
2829 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
2830 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
2831 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
2832 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
2833 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
2834 0 : k = leftover;
2835 0 : while (k > 0) {
2836 0 : for (l = 0; l < 4 && k; l++, k--) {
2837 0 : temSum = _mm_extract_epi32(s0, 0);
2838 0 : s0 = _mm_srli_si128(s0, 4);
2839 0 : if (temSum < lowSum) {
2840 0 : lowSum = temSum;
2841 0 : xBest = (int16_t)(j + leftover - k);
2842 0 : yBest = i;
2843 : }
2844 : }
2845 0 : s0 = s3;
2846 : }
2847 : }
2848 : }
2849 : }
2850 0 : ref += src_stride_raw;
2851 : }
2852 : }
2853 : else {
2854 : __m128i s9, s10;
2855 0 : for (i = 0; i < search_area_height; i++) {
2856 0 : uint32_t startW = (i & 1) << 3;
2857 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
2858 0 : pSrc = src;
2859 0 : pRef = ref + j;
2860 0 : s9 = s10 = _mm_setzero_si128();
2861 0 : k = 0;
2862 0 : while (k < block_height) {
2863 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2864 0 : for (l = 0; l < 21 && k < block_height; k++, l++) {
2865 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2866 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2867 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2868 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2869 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2870 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2871 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2872 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2873 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2874 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2875 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2876 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2877 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2878 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2879 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2880 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2881 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2882 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2883 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2884 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2885 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2886 0 : pSrc += src_stride;
2887 0 : pRef += ref_stride;
2888 : }
2889 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2890 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2891 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2892 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2893 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2894 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2895 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2896 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2897 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
2898 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
2899 : }
2900 0 : s0 = _mm_packus_epi32(s9, s10);
2901 0 : s0 = _mm_minpos_epu16(s0);
2902 0 : temSum = _mm_extract_epi16(s0, 0);
2903 0 : temSum &= 0x0000FFFF;
2904 0 : if (temSum < lowSum) {
2905 0 : if (temSum != 0xFFFF) { // no overflow
2906 0 : lowSum = temSum;
2907 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2908 0 : yBest = i;
2909 : }
2910 : else {
2911 0 : UPDATE_BEST(s9, 0, 0);
2912 0 : UPDATE_BEST(s9, 1, 0);
2913 0 : UPDATE_BEST(s9, 2, 0);
2914 0 : UPDATE_BEST(s9, 3, 0);
2915 0 : UPDATE_BEST(s10, 0, 4);
2916 0 : UPDATE_BEST(s10, 1, 4);
2917 0 : UPDATE_BEST(s10, 2, 4);
2918 0 : UPDATE_BEST(s10, 3, 4);
2919 : }
2920 : }
2921 : }
2922 :
2923 0 : if (leftover && j < search_area_width ) {
2924 0 : pSrc = src;
2925 0 : pRef = ref + j;
2926 0 : s9 = s10 = _mm_setzero_si128();
2927 0 : k = 0;
2928 0 : while (k < block_height) {
2929 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
2930 0 : for (l = 0; l < 21 && k < block_height; k++, l++) {
2931 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
2932 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
2933 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
2934 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2935 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2936 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2937 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2938 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
2939 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
2940 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
2941 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2942 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2943 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2944 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2945 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
2946 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
2947 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
2948 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
2949 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
2950 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
2951 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
2952 0 : pSrc += src_stride;
2953 0 : pRef += ref_stride;
2954 : }
2955 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2956 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2957 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
2958 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
2959 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
2960 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
2961 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
2962 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
2963 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
2964 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
2965 : }
2966 0 : s0 = _mm_packus_epi32(s9, s10);
2967 0 : s0 = _mm_or_si128(s0, s8);
2968 0 : s0 = _mm_minpos_epu16(s0);
2969 0 : temSum = _mm_extract_epi16(s0, 0);
2970 0 : temSum &= 0x0000FFFF;
2971 0 : if (temSum < lowSum) {
2972 0 : if (temSum != 0xFFFF) { // no overflow
2973 0 : lowSum = temSum;
2974 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
2975 0 : yBest = i;
2976 : }
2977 : else {
2978 0 : k = leftover;
2979 0 : while (k > 0) {
2980 0 : for (l = 0; l < 4 && k; l++, k--) {
2981 0 : temSum = _mm_extract_epi32(s9, 0);
2982 0 : s9 = _mm_srli_si128(s9, 4);
2983 0 : if (temSum < lowSum) {
2984 0 : lowSum = temSum;
2985 0 : xBest = (int16_t)(j + leftover - k);
2986 0 : yBest = i;
2987 : }
2988 : }
2989 0 : s9 = s10;
2990 : }
2991 : }
2992 : }
2993 : }
2994 0 : ref += src_stride_raw;
2995 : }
2996 : }
2997 0 : break;
2998 :
2999 0 : case 64:
3000 0 : if (block_height <= 32) {
3001 : __m128i s9, s10, s11, s12;
3002 0 : for (i = 0; i < search_area_height; i++) {
3003 0 : uint32_t startW = (i & 1) << 3;
3004 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
3005 0 : pSrc = src;
3006 0 : pRef = ref + j;
3007 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3008 0 : for (k = 0; k < block_height >> 1; k++) {
3009 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3010 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3011 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3012 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3013 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3014 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3015 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3016 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3017 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3018 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3019 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3020 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3021 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3022 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3023 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3024 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3025 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3026 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3027 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3028 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3029 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3030 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3031 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3032 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3033 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3034 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3035 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3036 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3037 0 : pSrc += src_stride;
3038 0 : pRef += ref_stride;
3039 : }
3040 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
3041 0 : for (; k < block_height; k++) {
3042 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3043 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3044 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3045 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3046 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3047 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3048 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3049 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3050 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3051 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3052 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3053 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3054 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3055 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3056 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3057 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3058 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3059 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3060 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3061 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3062 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3063 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3064 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3065 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3066 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3067 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3068 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3069 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3070 0 : pSrc += src_stride;
3071 0 : pRef += ref_stride;
3072 : }
3073 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3074 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
3075 0 : s0 = _mm_minpos_epu16(s0);
3076 0 : temSum = _mm_extract_epi16(s0, 0);
3077 0 : temSum &= 0x0000FFFF;
3078 0 : if (temSum < lowSum) {
3079 0 : if (temSum != 0xFFFF) { // no overflow
3080 0 : lowSum = temSum;
3081 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3082 0 : yBest = i;
3083 : }
3084 : else {
3085 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3086 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3087 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3088 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3089 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3090 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3091 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3092 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3093 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3094 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3095 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
3096 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
3097 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
3098 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
3099 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
3100 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
3101 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
3102 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
3103 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
3104 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
3105 0 : UPDATE_BEST(s0, 0, 0);
3106 0 : UPDATE_BEST(s0, 1, 0);
3107 0 : UPDATE_BEST(s0, 2, 0);
3108 0 : UPDATE_BEST(s0, 3, 0);
3109 0 : UPDATE_BEST(s3, 0, 4);
3110 0 : UPDATE_BEST(s3, 1, 4);
3111 0 : UPDATE_BEST(s3, 2, 4);
3112 0 : UPDATE_BEST(s3, 3, 4);
3113 : }
3114 : }
3115 : }
3116 :
3117 0 : if (leftover && j < search_area_width ) {
3118 0 : pSrc = src;
3119 0 : pRef = ref + j;
3120 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3121 0 : for (k = 0; k < block_height >> 1; k++) {
3122 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3123 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3124 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3125 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3126 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3127 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3128 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3129 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3130 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3131 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3132 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3133 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3134 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3135 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3136 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3137 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3138 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3139 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3140 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3141 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3142 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3143 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3144 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3145 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3146 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3147 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3148 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3149 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3150 0 : pSrc += src_stride;
3151 0 : pRef += ref_stride;
3152 : }
3153 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
3154 0 : for (; k < block_height; k++) {
3155 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3156 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3157 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3158 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3159 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3160 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3161 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3162 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3163 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3164 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3165 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3166 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3167 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3168 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3169 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3170 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3171 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3172 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3173 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3174 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3175 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3176 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3177 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3178 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3179 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3180 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3181 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3182 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3183 0 : pSrc += src_stride;
3184 0 : pRef += ref_stride;
3185 : }
3186 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3187 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
3188 0 : s0 = _mm_or_si128(s0, s8);
3189 0 : s0 = _mm_minpos_epu16(s0);
3190 0 : temSum = _mm_extract_epi16(s0, 0);
3191 0 : temSum &= 0x0000FFFF;
3192 0 : if (temSum < lowSum) {
3193 0 : if (temSum != 0xFFFF) { // no overflow
3194 0 : lowSum = temSum;
3195 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3196 0 : yBest = i;
3197 : }
3198 : else {
3199 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3200 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3201 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3202 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3203 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3204 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3205 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3206 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3207 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3208 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3209 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
3210 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
3211 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
3212 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
3213 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
3214 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
3215 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
3216 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
3217 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
3218 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
3219 0 : k = leftover;
3220 0 : while (k > 0) {
3221 0 : for (l = 0; l < 4 && k; l++, k--) {
3222 0 : temSum = _mm_extract_epi32(s0, 0);
3223 0 : s0 = _mm_srli_si128(s0, 4);
3224 0 : if (temSum < lowSum) {
3225 0 : lowSum = temSum;
3226 0 : xBest = (int16_t)(j + leftover - k);
3227 0 : yBest = i;
3228 : }
3229 : }
3230 0 : s0 = s3;
3231 : }
3232 : }
3233 : }
3234 : }
3235 0 : ref += src_stride_raw;
3236 : }
3237 : }
3238 : else {
3239 : __m128i s9, s10;
3240 0 : for (i = 0; i < search_area_height; i++) {
3241 0 : uint32_t startW = (i & 1) << 3;
3242 0 : for (j = startW; j <= search_area_width - 8; j += 16) {
3243 0 : pSrc = src;
3244 0 : pRef = ref + j;
3245 0 : s9 = s10 = _mm_setzero_si128();
3246 0 : k = 0;
3247 0 : while (k < block_height) {
3248 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3249 0 : for (l = 0; l < 16 && k < block_height; k++, l++) {
3250 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3251 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3252 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3253 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3254 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3255 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3256 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3257 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3258 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3259 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3260 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3261 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3262 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3263 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3264 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3265 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3266 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3267 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3268 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3269 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3270 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3271 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3272 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3273 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3274 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3275 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3276 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3277 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3278 0 : pSrc += src_stride;
3279 0 : pRef += ref_stride;
3280 : }
3281 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3282 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3283 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3284 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3285 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3286 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3287 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3288 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3289 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
3290 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
3291 : }
3292 0 : s0 = _mm_packus_epi32(s9, s10);
3293 0 : s0 = _mm_minpos_epu16(s0);
3294 0 : temSum = _mm_extract_epi16(s0, 0);
3295 0 : temSum &= 0x0000FFFF;
3296 0 : if (temSum < lowSum) {
3297 0 : if (temSum != 0xFFFF) { // no overflow
3298 0 : lowSum = temSum;
3299 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3300 0 : yBest = i;
3301 : }
3302 : else {
3303 0 : UPDATE_BEST(s9, 0, 0);
3304 0 : UPDATE_BEST(s9, 1, 0);
3305 0 : UPDATE_BEST(s9, 2, 0);
3306 0 : UPDATE_BEST(s9, 3, 0);
3307 0 : UPDATE_BEST(s10, 0, 4);
3308 0 : UPDATE_BEST(s10, 1, 4);
3309 0 : UPDATE_BEST(s10, 2, 4);
3310 0 : UPDATE_BEST(s10, 3, 4);
3311 : }
3312 : }
3313 : }
3314 :
3315 0 : if (leftover && j < search_area_width ) {
3316 0 : pSrc = src;
3317 0 : pRef = ref + j;
3318 0 : s9 = s10 = _mm_setzero_si128();
3319 0 : k = 0;
3320 0 : while (k < block_height) {
3321 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3322 0 : for (l = 0; l < 16 && k < block_height; k++, l++) {
3323 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3324 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3325 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3326 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3327 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3328 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3329 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3330 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3331 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3332 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3333 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3334 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3335 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3336 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3337 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3338 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3339 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3340 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3341 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3342 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3343 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3344 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
3345 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
3346 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
3347 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3348 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3349 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3350 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3351 0 : pSrc += src_stride;
3352 0 : pRef += ref_stride;
3353 : }
3354 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3355 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3356 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3357 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3358 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3359 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3360 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3361 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3362 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
3363 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
3364 : }
3365 0 : s0 = _mm_packus_epi32(s9, s10);
3366 0 : s0 = _mm_or_si128(s0, s8);
3367 0 : s0 = _mm_minpos_epu16(s0);
3368 0 : temSum = _mm_extract_epi16(s0, 0);
3369 0 : temSum &= 0x0000FFFF;
3370 0 : if (temSum < lowSum) {
3371 0 : if (temSum != 0xFFFF) { // no overflow
3372 0 : lowSum = temSum;
3373 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3374 0 : yBest = i;
3375 : }
3376 : else {
3377 0 : k = leftover;
3378 0 : while (k > 0) {
3379 0 : for (l = 0; l < 4 && k; l++, k--) {
3380 0 : temSum = _mm_extract_epi32(s9, 0);
3381 0 : s9 = _mm_srli_si128(s9, 4);
3382 0 : if (temSum < lowSum) {
3383 0 : lowSum = temSum;
3384 0 : xBest = (int16_t)(j + leftover - k);
3385 0 : yBest = i;
3386 : }
3387 : }
3388 0 : s9 = s10;
3389 : }
3390 : }
3391 : }
3392 : }
3393 0 : ref += src_stride_raw;
3394 : }
3395 : }
3396 0 : break;
3397 :
3398 0 : default:
3399 : assert(0);
3400 0 : break;
3401 : }
3402 :
3403 0 : *best_sad = lowSum;
3404 0 : *x_search_center = xBest;
3405 0 : *y_search_center = yBest;
3406 0 : }
3407 :
3408 : /*******************************************************************************
3409 : * Requirement: block_width = 4, 8, 16, 24, 32, 48 or 64
3410 : * Requirement: block_height <= 64
3411 : * Requirement: block_height % 2 = 0 when block_width = 4 or 8
3412 : *******************************************************************************/
3413 0 : void sad_loop_kernel_sse4_1_hme_l0_intrin(
3414 : uint8_t *src, // input parameter, source samples Ptr
3415 : uint32_t src_stride, // input parameter, source stride
3416 : uint8_t *ref, // input parameter, reference samples Ptr
3417 : uint32_t ref_stride, // input parameter, reference stride
3418 : uint32_t block_height, // input parameter, block height (M)
3419 : uint32_t block_width, // input parameter, block width (N)
3420 : uint64_t *best_sad,
3421 : int16_t *x_search_center,
3422 : int16_t *y_search_center,
3423 : uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
3424 : int16_t search_area_width,
3425 : int16_t search_area_height)
3426 : {
3427 0 : int16_t xBest = *x_search_center, yBest = *y_search_center;
3428 0 : uint32_t lowSum = 0xffffff;
3429 0 : uint32_t temSum = 0;
3430 : int16_t i, j;
3431 : uint32_t k, l;
3432 : const uint8_t *pRef, *pSrc;
3433 : __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11;
3434 :
3435 0 : switch (block_width) {
3436 0 : case 4:
3437 0 : for (i = 0; i < search_area_height; i++) {
3438 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3439 0 : pSrc = src;
3440 0 : pRef = ref + j;
3441 0 : s3 = _mm_setzero_si128();
3442 0 : for (k = 0; k < block_height; k += 2) {
3443 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3444 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
3445 0 : s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
3446 0 : s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
3447 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3448 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
3449 0 : pSrc += src_stride << 1;
3450 0 : pRef += ref_stride << 1;
3451 : }
3452 0 : s3 = _mm_minpos_epu16(s3);
3453 0 : temSum = _mm_extract_epi16(s3, 0);
3454 0 : if (temSum < lowSum) {
3455 0 : lowSum = temSum;
3456 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
3457 0 : yBest = i;
3458 : }
3459 : }
3460 0 : ref += src_stride_raw;
3461 : }
3462 0 : break;
3463 :
3464 0 : case 8:
3465 0 : for (i = 0; i < search_area_height; i++) {
3466 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3467 0 : pSrc = src;
3468 0 : pRef = ref + j;
3469 0 : s3 = s4 = _mm_setzero_si128();
3470 0 : for (k = 0; k < block_height; k += 2) {
3471 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3472 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
3473 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
3474 0 : s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
3475 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3476 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3477 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
3478 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
3479 0 : pSrc += src_stride << 1;
3480 0 : pRef += ref_stride << 1;
3481 : }
3482 0 : s3 = _mm_adds_epu16(s3, s4);
3483 0 : s3 = _mm_minpos_epu16(s3);
3484 0 : temSum = _mm_extract_epi16(s3, 0);
3485 0 : if (temSum < lowSum) {
3486 0 : lowSum = temSum;
3487 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
3488 0 : yBest = i;
3489 : }
3490 : }
3491 :
3492 0 : ref += src_stride_raw;
3493 : }
3494 0 : break;
3495 :
3496 0 : case 16:
3497 0 : if (block_height <= 16) {
3498 0 : for (i = 0; i < search_area_height; i++) {
3499 0 : for (j = 0; j <= search_area_width - 16; j += 16) {
3500 0 : pSrc = src;
3501 0 : pRef = ref + j;
3502 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3503 0 : s7 = s9 = s10 = s11 = _mm_setzero_si128();
3504 0 : for (k = 0; k < block_height; k++) {
3505 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3506 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3507 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3508 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3509 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3510 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3511 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3512 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3513 0 : s7 = _mm_adds_epu16(s7, _mm_mpsadbw_epu8(s1, s2, 0));
3514 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 5));
3515 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 2));
3516 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 7));
3517 0 : pSrc += src_stride;
3518 0 : pRef += ref_stride;
3519 : }
3520 0 : s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3521 0 : s3 = _mm_minpos_epu16(s3);
3522 0 : temSum = _mm_extract_epi16(s3, 0);
3523 0 : if (temSum < lowSum) {
3524 0 : lowSum = temSum;
3525 0 : xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
3526 0 : yBest = i;
3527 : }
3528 :
3529 0 : s7 = _mm_adds_epu16(_mm_adds_epu16(s7, s11), _mm_adds_epu16(s9, s10));
3530 0 : s7 = _mm_minpos_epu16(s7);
3531 0 : temSum = _mm_extract_epi16(s7, 0);
3532 0 : if (temSum < lowSum) {
3533 0 : lowSum = temSum;
3534 0 : xBest = (int16_t)(j + 8 + _mm_extract_epi16(s7, 1));
3535 0 : yBest = i;
3536 : }
3537 : }
3538 :
3539 0 : ref += src_stride_raw;
3540 : }
3541 : }
3542 0 : else if (block_height <= 32) {
3543 0 : for (i = 0; i < search_area_height; i++) {
3544 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3545 0 : pSrc = src;
3546 0 : pRef = ref + j;
3547 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3548 0 : for (k = 0; k < block_height; k++) {
3549 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3550 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3551 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3552 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3553 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3554 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3555 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3556 0 : pSrc += src_stride;
3557 0 : pRef += ref_stride;
3558 : }
3559 0 : s3 = _mm_adds_epu16(s3, s4);
3560 0 : s5 = _mm_adds_epu16(s5, s6);
3561 0 : s4 = _mm_minpos_epu16(s3);
3562 0 : s6 = _mm_minpos_epu16(s5);
3563 0 : s4 = _mm_unpacklo_epi16(s4, s4);
3564 0 : s4 = _mm_unpacklo_epi32(s4, s4);
3565 0 : s4 = _mm_unpacklo_epi64(s4, s4);
3566 0 : s6 = _mm_unpacklo_epi16(s6, s6);
3567 0 : s6 = _mm_unpacklo_epi32(s6, s6);
3568 0 : s6 = _mm_unpacklo_epi64(s6, s6);
3569 0 : s3 = _mm_sub_epi16(s3, s4);
3570 0 : s5 = _mm_adds_epu16(s5, s3);
3571 0 : s5 = _mm_sub_epi16(s5, s6);
3572 0 : s5 = _mm_minpos_epu16(s5);
3573 0 : temSum = _mm_extract_epi16(s5, 0);
3574 0 : temSum += _mm_extract_epi16(s4, 0);
3575 0 : temSum += _mm_extract_epi16(s6, 0);
3576 0 : if (temSum < lowSum) {
3577 0 : lowSum = temSum;
3578 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
3579 0 : yBest = i;
3580 : }
3581 : }
3582 :
3583 0 : ref += src_stride_raw;
3584 : }
3585 : }
3586 : else {
3587 0 : for (i = 0; i < search_area_height; i++) {
3588 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3589 0 : pSrc = src;
3590 0 : pRef = ref + j;
3591 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3592 0 : for (k = 0; k < block_height; k++) {
3593 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3594 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3595 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3596 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3597 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3598 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3599 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3600 0 : pSrc += src_stride;
3601 0 : pRef += ref_stride;
3602 : }
3603 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3604 0 : s0 = _mm_minpos_epu16(s0);
3605 0 : temSum = _mm_extract_epi16(s0, 0);
3606 0 : if (temSum < lowSum) {
3607 0 : if (temSum != 0xFFFF) { // no overflow
3608 0 : lowSum = temSum;
3609 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3610 0 : yBest = i;
3611 : }
3612 : else {
3613 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3614 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3615 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3616 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3617 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3618 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3619 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3620 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3621 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3622 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3623 0 : UPDATE_BEST(s0, 0, 0);
3624 0 : UPDATE_BEST(s0, 1, 0);
3625 0 : UPDATE_BEST(s0, 2, 0);
3626 0 : UPDATE_BEST(s0, 3, 0);
3627 0 : UPDATE_BEST(s3, 0, 4);
3628 0 : UPDATE_BEST(s3, 1, 4);
3629 0 : UPDATE_BEST(s3, 2, 4);
3630 0 : UPDATE_BEST(s3, 3, 4);
3631 : }
3632 : }
3633 : }
3634 0 : ref += src_stride_raw;
3635 : }
3636 : }
3637 0 : break;
3638 :
3639 0 : case 24:
3640 0 : if (block_height <= 16) {
3641 0 : for (i = 0; i < search_area_height; i++) {
3642 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3643 0 : pSrc = src;
3644 0 : pRef = ref + j;
3645 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3646 0 : for (k = 0; k < block_height; k++) {
3647 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3648 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3649 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3650 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3651 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3652 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3653 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3654 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3655 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
3656 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3657 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3658 0 : pSrc += src_stride;
3659 0 : pRef += ref_stride;
3660 : }
3661 0 : s3 = _mm_adds_epu16(s3, s4);
3662 0 : s5 = _mm_adds_epu16(s5, s6);
3663 0 : s4 = _mm_minpos_epu16(s3);
3664 0 : s6 = _mm_minpos_epu16(s5);
3665 0 : s4 = _mm_unpacklo_epi16(s4, s4);
3666 0 : s4 = _mm_unpacklo_epi32(s4, s4);
3667 0 : s4 = _mm_unpacklo_epi64(s4, s4);
3668 0 : s6 = _mm_unpacklo_epi16(s6, s6);
3669 0 : s6 = _mm_unpacklo_epi32(s6, s6);
3670 0 : s6 = _mm_unpacklo_epi64(s6, s6);
3671 0 : s3 = _mm_sub_epi16(s3, s4);
3672 0 : s5 = _mm_adds_epu16(s5, s3);
3673 0 : s5 = _mm_sub_epi16(s5, s6);
3674 0 : s5 = _mm_minpos_epu16(s5);
3675 0 : temSum = _mm_extract_epi16(s5, 0);
3676 0 : temSum += _mm_extract_epi16(s4, 0);
3677 0 : temSum += _mm_extract_epi16(s6, 0);
3678 0 : if (temSum < lowSum) {
3679 0 : lowSum = temSum;
3680 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
3681 0 : yBest = i;
3682 : }
3683 : }
3684 0 : ref += src_stride_raw;
3685 : }
3686 : }
3687 : else {
3688 0 : for (i = 0; i < search_area_height; i++) {
3689 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3690 0 : pSrc = src;
3691 0 : pRef = ref + j;
3692 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3693 0 : for (k = 0; k < block_height; k++) {
3694 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3695 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3696 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3697 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3698 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3699 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3700 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3701 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3702 0 : s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
3703 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3704 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3705 0 : pSrc += src_stride;
3706 0 : pRef += ref_stride;
3707 : }
3708 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3709 0 : s0 = _mm_minpos_epu16(s0);
3710 0 : temSum = _mm_extract_epi16(s0, 0);
3711 0 : if (temSum < lowSum) {
3712 0 : if (temSum != 0xFFFF) { // no overflow
3713 0 : lowSum = temSum;
3714 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3715 0 : yBest = i;
3716 : }
3717 : else {
3718 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3719 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3720 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3721 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3722 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3723 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3724 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3725 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3726 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3727 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3728 0 : UPDATE_BEST(s0, 0, 0);
3729 0 : UPDATE_BEST(s0, 1, 0);
3730 0 : UPDATE_BEST(s0, 2, 0);
3731 0 : UPDATE_BEST(s0, 3, 0);
3732 0 : UPDATE_BEST(s3, 0, 4);
3733 0 : UPDATE_BEST(s3, 1, 4);
3734 0 : UPDATE_BEST(s3, 2, 4);
3735 0 : UPDATE_BEST(s3, 3, 4);
3736 : }
3737 : }
3738 : }
3739 :
3740 0 : ref += src_stride_raw;
3741 : }
3742 : }
3743 0 : break;
3744 :
3745 0 : case 32:
3746 0 : if (block_height < 16) {
3747 0 : for (i = 0; i < search_area_height; i++) {
3748 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3749 0 : pSrc = src;
3750 0 : pRef = ref + j;
3751 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3752 0 : for (k = 0; k < block_height; k++) {
3753 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3754 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3755 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3756 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3757 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3758 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3759 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3760 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3761 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3762 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3763 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3764 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3765 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3766 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3767 0 : pSrc += src_stride;
3768 0 : pRef += ref_stride;
3769 : }
3770 0 : s3 = _mm_adds_epu16(s3, s4);
3771 0 : s5 = _mm_adds_epu16(s5, s6);
3772 0 : s4 = _mm_minpos_epu16(s3);
3773 0 : s6 = _mm_minpos_epu16(s5);
3774 0 : s4 = _mm_unpacklo_epi16(s4, s4);
3775 0 : s4 = _mm_unpacklo_epi32(s4, s4);
3776 0 : s4 = _mm_unpacklo_epi64(s4, s4);
3777 0 : s6 = _mm_unpacklo_epi16(s6, s6);
3778 0 : s6 = _mm_unpacklo_epi32(s6, s6);
3779 0 : s6 = _mm_unpacklo_epi64(s6, s6);
3780 0 : s3 = _mm_sub_epi16(s3, s4);
3781 0 : s5 = _mm_adds_epu16(s5, s3);
3782 0 : s5 = _mm_sub_epi16(s5, s6);
3783 0 : s5 = _mm_minpos_epu16(s5);
3784 0 : temSum = _mm_extract_epi16(s5, 0);
3785 0 : temSum += _mm_extract_epi16(s4, 0);
3786 0 : temSum += _mm_extract_epi16(s6, 0);
3787 0 : temSum &= 0x0000FFFF;
3788 0 : if (temSum < lowSum) {
3789 0 : lowSum = temSum;
3790 0 : xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
3791 0 : yBest = i;
3792 : }
3793 : }
3794 :
3795 0 : ref += src_stride_raw;
3796 : }
3797 : }
3798 0 : else if (block_height <= 32) {
3799 0 : for (i = 0; i < search_area_height; i++) {
3800 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3801 0 : pSrc = src;
3802 0 : pRef = ref + j;
3803 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3804 0 : for (k = 0; k < block_height; k++) {
3805 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3806 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3807 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3808 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3809 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3810 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3811 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3812 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3813 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3814 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3815 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3816 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3817 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3818 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3819 0 : pSrc += src_stride;
3820 0 : pRef += ref_stride;
3821 : }
3822 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3823 0 : s0 = _mm_minpos_epu16(s0);
3824 0 : temSum = _mm_extract_epi16(s0, 0);
3825 0 : temSum &= 0x0000FFFF;
3826 0 : if (temSum < lowSum) {
3827 0 : if (temSum != 0xFFFF) { // no overflow
3828 0 : lowSum = temSum;
3829 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3830 0 : yBest = i;
3831 : }
3832 : else {
3833 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3834 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3835 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3836 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3837 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3838 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3839 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3840 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3841 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3842 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3843 0 : UPDATE_BEST(s0, 0, 0);
3844 0 : UPDATE_BEST(s0, 1, 0);
3845 0 : UPDATE_BEST(s0, 2, 0);
3846 0 : UPDATE_BEST(s0, 3, 0);
3847 0 : UPDATE_BEST(s3, 0, 4);
3848 0 : UPDATE_BEST(s3, 1, 4);
3849 0 : UPDATE_BEST(s3, 2, 4);
3850 0 : UPDATE_BEST(s3, 3, 4);
3851 : }
3852 : }
3853 : }
3854 0 : ref += src_stride_raw;
3855 : }
3856 : }
3857 : else {
3858 : __m128i s9, s10, s11, s12;
3859 0 : for (i = 0; i < search_area_height; i++) {
3860 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3861 0 : pSrc = src;
3862 0 : pRef = ref + j;
3863 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3864 0 : for (k = 0; k < block_height >> 1; k++) {
3865 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3866 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3867 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3868 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3869 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3870 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3871 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3872 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3873 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3874 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3875 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3876 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3877 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3878 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3879 0 : pSrc += src_stride;
3880 0 : pRef += ref_stride;
3881 : }
3882 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
3883 0 : for (; k < block_height; k++) {
3884 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3885 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3886 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3887 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3888 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3889 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3890 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3891 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3892 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3893 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3894 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3895 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3896 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3897 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3898 0 : pSrc += src_stride;
3899 0 : pRef += ref_stride;
3900 : }
3901 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
3902 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
3903 0 : s0 = _mm_minpos_epu16(s0);
3904 0 : temSum = _mm_extract_epi16(s0, 0);
3905 0 : temSum &= 0x0000FFFF;
3906 0 : if (temSum < lowSum) {
3907 0 : if (temSum != 0xFFFF) { // no overflow
3908 0 : lowSum = temSum;
3909 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
3910 0 : yBest = i;
3911 : }
3912 : else {
3913 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
3914 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
3915 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
3916 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
3917 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
3918 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
3919 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
3920 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
3921 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
3922 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
3923 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
3924 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
3925 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
3926 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
3927 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
3928 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
3929 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
3930 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
3931 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
3932 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
3933 0 : UPDATE_BEST(s0, 0, 0);
3934 0 : UPDATE_BEST(s0, 1, 0);
3935 0 : UPDATE_BEST(s0, 2, 0);
3936 0 : UPDATE_BEST(s0, 3, 0);
3937 0 : UPDATE_BEST(s3, 0, 4);
3938 0 : UPDATE_BEST(s3, 1, 4);
3939 0 : UPDATE_BEST(s3, 2, 4);
3940 0 : UPDATE_BEST(s3, 3, 4);
3941 : }
3942 : }
3943 : }
3944 :
3945 0 : ref += src_stride_raw;
3946 : }
3947 : }
3948 0 : break;
3949 :
3950 0 : case 48:
3951 0 : if (block_height <= 32) {
3952 : __m128i s9, s10, s11, s12;
3953 0 : for (i = 0; i < search_area_height; i++) {
3954 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
3955 0 : pSrc = src;
3956 0 : pRef = ref + j;
3957 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
3958 0 : for (k = 0; k < block_height >> 1; k++) {
3959 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3960 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3961 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3962 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3963 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3964 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3965 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3966 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3967 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3968 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3969 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3970 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3971 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3972 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3973 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
3974 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
3975 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
3976 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
3977 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
3978 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
3979 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
3980 0 : pSrc += src_stride;
3981 0 : pRef += ref_stride;
3982 : }
3983 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
3984 0 : for (; k < block_height; k++) {
3985 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
3986 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
3987 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
3988 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3989 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3990 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3991 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3992 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
3993 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
3994 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
3995 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
3996 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
3997 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
3998 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
3999 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
4000 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
4001 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
4002 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
4003 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
4004 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
4005 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
4006 0 : pSrc += src_stride;
4007 0 : pRef += ref_stride;
4008 : }
4009 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
4010 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
4011 0 : s0 = _mm_minpos_epu16(s0);
4012 0 : temSum = _mm_extract_epi16(s0, 0);
4013 0 : temSum &= 0x0000FFFF;
4014 0 : if (temSum < lowSum) {
4015 0 : if (temSum != 0xFFFF) { // no overflow
4016 0 : lowSum = temSum;
4017 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
4018 0 : yBest = i;
4019 : }
4020 : else {
4021 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
4022 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
4023 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
4024 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
4025 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
4026 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
4027 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
4028 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
4029 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
4030 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
4031 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
4032 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
4033 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
4034 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
4035 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
4036 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
4037 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
4038 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
4039 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
4040 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
4041 0 : UPDATE_BEST(s0, 0, 0);
4042 0 : UPDATE_BEST(s0, 1, 0);
4043 0 : UPDATE_BEST(s0, 2, 0);
4044 0 : UPDATE_BEST(s0, 3, 0);
4045 0 : UPDATE_BEST(s3, 0, 4);
4046 0 : UPDATE_BEST(s3, 1, 4);
4047 0 : UPDATE_BEST(s3, 2, 4);
4048 0 : UPDATE_BEST(s3, 3, 4);
4049 : }
4050 : }
4051 : }
4052 :
4053 0 : ref += src_stride_raw;
4054 : }
4055 : }
4056 : else {
4057 : __m128i s9, s10;
4058 0 : for (i = 0; i < search_area_height; i++) {
4059 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
4060 0 : pSrc = src;
4061 0 : pRef = ref + j;
4062 0 : s9 = s10 = _mm_setzero_si128();
4063 0 : k = 0;
4064 0 : while (k < block_height) {
4065 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
4066 0 : for (l = 0; l < 21 && k < block_height; k++, l++) {
4067 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4068 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
4069 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
4070 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4071 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4072 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4073 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4074 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
4075 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
4076 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
4077 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4078 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4079 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4080 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4081 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
4082 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
4083 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
4084 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4085 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4086 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4087 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4088 0 : pSrc += src_stride;
4089 0 : pRef += ref_stride;
4090 : }
4091 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
4092 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
4093 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
4094 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
4095 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
4096 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
4097 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
4098 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
4099 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
4100 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
4101 : }
4102 0 : s0 = _mm_packus_epi32(s9, s10);
4103 0 : s0 = _mm_minpos_epu16(s0);
4104 0 : temSum = _mm_extract_epi16(s0, 0);
4105 0 : temSum &= 0x0000FFFF;
4106 0 : if (temSum < lowSum) {
4107 0 : if (temSum != 0xFFFF) { // no overflow
4108 0 : lowSum = temSum;
4109 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
4110 0 : yBest = i;
4111 : }
4112 : else {
4113 0 : UPDATE_BEST(s9, 0, 0);
4114 0 : UPDATE_BEST(s9, 1, 0);
4115 0 : UPDATE_BEST(s9, 2, 0);
4116 0 : UPDATE_BEST(s9, 3, 0);
4117 0 : UPDATE_BEST(s10, 0, 4);
4118 0 : UPDATE_BEST(s10, 1, 4);
4119 0 : UPDATE_BEST(s10, 2, 4);
4120 0 : UPDATE_BEST(s10, 3, 4);
4121 : }
4122 : }
4123 : }
4124 :
4125 0 : ref += src_stride_raw;
4126 : }
4127 : }
4128 0 : break;
4129 :
4130 0 : case 64:
4131 0 : if (block_height <= 32) {
4132 : __m128i s9, s10, s11, s12;
4133 0 : for (i = 0; i < search_area_height; i++) {
4134 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
4135 0 : pSrc = src;
4136 0 : pRef = ref + j;
4137 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
4138 0 : for (k = 0; k < block_height >> 1; k++) {
4139 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4140 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
4141 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
4142 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4143 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4144 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4145 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4146 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
4147 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
4148 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
4149 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4150 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4151 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4152 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4153 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
4154 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
4155 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
4156 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4157 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4158 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4159 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4160 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
4161 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
4162 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
4163 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4164 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4165 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4166 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4167 0 : pSrc += src_stride;
4168 0 : pRef += ref_stride;
4169 : }
4170 0 : s9 = s10 = s11 = s12 = _mm_setzero_si128();
4171 0 : for (; k < block_height; k++) {
4172 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4173 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
4174 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
4175 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
4176 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
4177 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
4178 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
4179 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
4180 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
4181 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
4182 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
4183 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
4184 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
4185 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
4186 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
4187 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
4188 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
4189 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
4190 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
4191 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
4192 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
4193 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
4194 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
4195 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
4196 0 : s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
4197 0 : s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
4198 0 : s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
4199 0 : s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
4200 0 : pSrc += src_stride;
4201 0 : pRef += ref_stride;
4202 : }
4203 0 : s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
4204 0 : s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
4205 0 : s0 = _mm_minpos_epu16(s0);
4206 0 : temSum = _mm_extract_epi16(s0, 0);
4207 0 : temSum &= 0x0000FFFF;
4208 0 : if (temSum < lowSum) {
4209 0 : if (temSum != 0xFFFF) { // no overflow
4210 0 : lowSum = temSum;
4211 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
4212 0 : yBest = i;
4213 : }
4214 : else {
4215 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
4216 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
4217 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
4218 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
4219 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
4220 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
4221 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
4222 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
4223 0 : s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
4224 0 : s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
4225 0 : s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
4226 0 : s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
4227 0 : s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
4228 0 : s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
4229 0 : s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
4230 0 : s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
4231 0 : s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
4232 0 : s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
4233 0 : s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
4234 0 : s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
4235 0 : UPDATE_BEST(s0, 0, 0);
4236 0 : UPDATE_BEST(s0, 1, 0);
4237 0 : UPDATE_BEST(s0, 2, 0);
4238 0 : UPDATE_BEST(s0, 3, 0);
4239 0 : UPDATE_BEST(s3, 0, 4);
4240 0 : UPDATE_BEST(s3, 1, 4);
4241 0 : UPDATE_BEST(s3, 2, 4);
4242 0 : UPDATE_BEST(s3, 3, 4);
4243 : }
4244 : }
4245 : }
4246 :
4247 0 : ref += src_stride_raw;
4248 : }
4249 : }
4250 : else {
4251 : __m128i s9, s10;
4252 0 : for (i = 0; i < search_area_height; i++) {
4253 0 : for (j = 0; j <= search_area_width - 8; j += 8) {
4254 0 : pSrc = src;
4255 0 : pRef = ref + j;
4256 0 : s9 = s10 = _mm_setzero_si128();
4257 0 : k = 0;
4258 0 : while (k < block_height) {
4259 0 : s3 = s4 = s5 = s6 = _mm_setzero_si128();
4260 0 : for (l = 0; l < 16 && k < block_height; k++, l++) {
4261 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4262 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
4263 0 : s2 = _mm_loadu_si128((__m128i*)pSrc);
4264 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4265 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4266 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4267 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4268 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
4269 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
4270 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
4271 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4272 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4273 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4274 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4275 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
4276 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
4277 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
4278 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4279 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4280 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4281 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4282 0 : s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
4283 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
4284 0 : s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
4285 0 : s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
4286 0 : s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
4287 0 : s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
4288 0 : s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
4289 0 : pSrc += src_stride;
4290 0 : pRef += ref_stride;
4291 : }
4292 0 : s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
4293 0 : s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
4294 0 : s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
4295 0 : s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
4296 0 : s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
4297 0 : s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
4298 0 : s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
4299 0 : s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
4300 0 : s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
4301 0 : s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
4302 : }
4303 0 : s0 = _mm_packus_epi32(s9, s10);
4304 0 : s0 = _mm_minpos_epu16(s0);
4305 0 : temSum = _mm_extract_epi16(s0, 0);
4306 0 : temSum &= 0x0000FFFF;
4307 0 : if (temSum < lowSum) {
4308 0 : if (temSum != 0xFFFF) { // no overflow
4309 0 : lowSum = temSum;
4310 0 : xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
4311 0 : yBest = i;
4312 : }
4313 : else {
4314 0 : UPDATE_BEST(s9, 0, 0);
4315 0 : UPDATE_BEST(s9, 1, 0);
4316 0 : UPDATE_BEST(s9, 2, 0);
4317 0 : UPDATE_BEST(s9, 3, 0);
4318 0 : UPDATE_BEST(s10, 0, 4);
4319 0 : UPDATE_BEST(s10, 1, 4);
4320 0 : UPDATE_BEST(s10, 2, 4);
4321 0 : UPDATE_BEST(s10, 3, 4);
4322 : }
4323 : }
4324 : }
4325 :
4326 0 : ref += src_stride_raw;
4327 : }
4328 : }
4329 0 : break;
4330 :
4331 0 : default:
4332 : assert(0);
4333 0 : break;
4334 : }
4335 :
4336 0 : *best_sad = lowSum;
4337 0 : *x_search_center = xBest;
4338 0 : *y_search_center = yBest;
4339 0 : }
4340 :
4341 0 : static INLINE void sad_eight_8x4_sse41_intrin(const uint8_t *src,
4342 : const uint32_t src_stride, const uint8_t *ref, const uint32_t ref_stride,
4343 : __m128i *sad)
4344 : {
4345 0 : const uint8_t *pSrc = src;
4346 0 : const uint8_t *pRef = ref;
4347 : __m128i s0, s1, s2, s3;
4348 :
4349 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4350 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride * 2));
4351 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
4352 0 : s3 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride * 2));
4353 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 0));
4354 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 5));
4355 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 0));
4356 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 5));
4357 :
4358 0 : pSrc += src_stride * 4;
4359 0 : pRef += ref_stride * 4;
4360 :
4361 0 : s0 = _mm_loadu_si128((__m128i*)pRef);
4362 0 : s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride * 2));
4363 0 : s2 = _mm_loadl_epi64((__m128i*)pSrc);
4364 0 : s3 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride * 2));
4365 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 0));
4366 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 5));
4367 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 0));
4368 0 : *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 5));
4369 0 : }
4370 :
4371 0 : void get_eight_horizontal_search_point_results_8x8_16x16_pu_sse41_intrin(
4372 : uint8_t *src,
4373 : uint32_t src_stride,
4374 : uint8_t *ref,
4375 : uint32_t ref_stride,
4376 : uint32_t *p_best_sad8x8,
4377 : uint32_t *p_best_mv8x8,
4378 : uint32_t *p_best_sad16x16,
4379 : uint32_t *p_best_mv16x16,
4380 : uint32_t mv,
4381 : uint16_t *p_sad16x16,
4382 : EbBool sub_sad)
4383 : {
4384 : int16_t x_mv, y_mv;
4385 : __m128i s0, s1, s3;
4386 : __m128i sad[4];
4387 : uint32_t temSum;
4388 :
4389 0 : sad[0] = sad[1] = sad[2] = sad[3] = _mm_setzero_si128();
4390 :
4391 : /*
4392 : ------------------------------------- -----------------------------------
4393 : | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 | 8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
4394 : ------------------------------------- -----------------------------------
4395 : | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 | 8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
4396 : ----------------------- ----------- ---------------------- ----------
4397 : | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 | 8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
4398 : ---------------------- ----------- --------------------- ----------
4399 : | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 | 8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
4400 : ------------------------------------- -----------------------------------
4401 :
4402 : ------------------------------------- -----------------------------------
4403 : | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 | 8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
4404 : ------------------------------------- -----------------------------------
4405 : | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 | 8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
4406 : ----------------------- ----------- ---------------------- ----------
4407 : | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 | 8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
4408 : ---------------------- ----------- --------------------- ----------
4409 : | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 | 8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
4410 : ------------------------------------- -----------------------------------
4411 : */
4412 :
4413 : /*
4414 : ---------------------- ----------------------
4415 : | 16x16_0 | 16x16_1 | 16x16_4 | 16x16_5 |
4416 : ---------------------- ----------------------
4417 : | 16x16_2 | 16x16_3 | 16x16_6 | 16x16_7 |
4418 : ----------------------- -----------------------
4419 : | 16x16_8 | 16x16_9 | 16x16_12 | 16x16_13 |
4420 : ---------------------- ----------------------
4421 : | 16x16_10 | 16x16_11 | 16x16_14 | 16x16_15 |
4422 : ----------------------- -----------------------
4423 : */
4424 :
4425 : //8x8_0
4426 0 : sad_eight_8x4_sse41_intrin(src + 0 * src_stride + 0, src_stride, ref + 0 * ref_stride + 0, ref_stride, &sad[0]);
4427 0 : sad_eight_8x4_sse41_intrin(src + 0 * src_stride + 8, src_stride, ref + 0 * ref_stride + 8, ref_stride, &sad[1]);
4428 0 : sad_eight_8x4_sse41_intrin(src + 8 * src_stride + 0, src_stride, ref + 8 * ref_stride + 0, ref_stride, &sad[2]);
4429 0 : sad_eight_8x4_sse41_intrin(src + 8 * src_stride + 8, src_stride, ref + 8 * ref_stride + 8, ref_stride, &sad[3]);
4430 :
4431 0 : if (sub_sad) {
4432 0 : sad[0] = _mm_slli_epi16(sad[0], 1);
4433 0 : sad[1] = _mm_slli_epi16(sad[1], 1);
4434 0 : sad[2] = _mm_slli_epi16(sad[2], 1);
4435 0 : sad[3] = _mm_slli_epi16(sad[3], 1);
4436 : }
4437 : else {
4438 0 : sad_eight_8x4_sse41_intrin(src + 1 * src_stride + 0, src_stride, ref + 1 * ref_stride + 0, ref_stride, &sad[0]);
4439 0 : sad_eight_8x4_sse41_intrin(src + 1 * src_stride + 8, src_stride, ref + 1 * ref_stride + 8, ref_stride, &sad[1]);
4440 0 : sad_eight_8x4_sse41_intrin(src + 9 * src_stride + 0, src_stride, ref + 9 * ref_stride + 0, ref_stride, &sad[2]);
4441 0 : sad_eight_8x4_sse41_intrin(src + 9 * src_stride + 8, src_stride, ref + 9 * ref_stride + 8, ref_stride, &sad[3]);
4442 : }
4443 :
4444 : //find the best for 8x8_0
4445 0 : s3 = _mm_minpos_epu16(sad[0]);
4446 0 : temSum = _mm_extract_epi16(s3, 0);
4447 0 : if (temSum < p_best_sad8x8[0]) {
4448 0 : p_best_sad8x8[0] = temSum;
4449 0 : x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
4450 0 : y_mv = _MVYT(mv);
4451 0 : p_best_mv8x8[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
4452 : }
4453 :
4454 : //find the best for 8x8_1
4455 0 : s3 = _mm_minpos_epu16(sad[1]);
4456 0 : temSum = _mm_extract_epi16(s3, 0);
4457 0 : if (temSum < p_best_sad8x8[1]) {
4458 0 : p_best_sad8x8[1] = temSum;
4459 0 : x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
4460 0 : y_mv = _MVYT(mv);
4461 0 : p_best_mv8x8[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
4462 : }
4463 :
4464 : //find the best for 8x8_2
4465 0 : s3 = _mm_minpos_epu16(sad[2]);
4466 0 : temSum = _mm_extract_epi16(s3, 0);
4467 0 : if (temSum < p_best_sad8x8[2]) {
4468 0 : p_best_sad8x8[2] = temSum;
4469 0 : x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
4470 0 : y_mv = _MVYT(mv);
4471 0 : p_best_mv8x8[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
4472 : }
4473 :
4474 : //find the best for 8x8_3
4475 0 : s3 = _mm_minpos_epu16(sad[3]);
4476 0 : temSum = _mm_extract_epi16(s3, 0);
4477 0 : if (temSum < p_best_sad8x8[3]) {
4478 0 : p_best_sad8x8[3] = temSum;
4479 0 : x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
4480 0 : y_mv = _MVYT(mv);
4481 0 : p_best_mv8x8[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
4482 : }
4483 :
4484 : //16x16
4485 : {
4486 0 : s0 = _mm_adds_epu16(sad[0], sad[1]);
4487 0 : s1 = _mm_adds_epu16(sad[2], sad[3]);
4488 0 : s3 = _mm_adds_epu16(s0, s1);
4489 : //store the 8 SADs (16x16 SADs)
4490 : _mm_store_si128((__m128i*)p_sad16x16, s3);
4491 : //find the best for 16x16
4492 0 : s3 = _mm_minpos_epu16(s3);
4493 0 : temSum = _mm_extract_epi16(s3, 0);
4494 0 : if (temSum < p_best_sad16x16[0]) {
4495 0 : p_best_sad16x16[0] = temSum;
4496 0 : x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
4497 0 : y_mv = _MVYT(mv);
4498 0 : p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
4499 : }
4500 : }
4501 0 : }
4502 :
/*******************************************
 * Helper: widen four consecutive rows of eight 16-bit 16x16 SADs to
 * 32 bits and add them together.
 *   rows   - first of four rows, 8 uint16_t per row (one per search point)
 *   sum_lo - receives the four sums for search points 0..3
 *   sum_hi - receives the four sums for search points 4..7
 *******************************************/
static void sum_four_sad16x16_rows_sse41(
    const uint16_t *rows,
    __m128i        *sum_lo,
    __m128i        *sum_hi)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i       lo   = zero;
    __m128i       hi   = zero;
    int           row;

    for (row = 0; row < 4; row++) {
        const __m128i sad16 = _mm_loadu_si128((const __m128i *)(rows + row * 8));
        // Interleaving with zero zero-extends each 16-bit SAD to 32 bits.
        lo = _mm_add_epi32(lo, _mm_unpacklo_epi16(sad16, zero));
        hi = _mm_add_epi32(hi, _mm_unpackhi_epi16(sad16, zero));
    }

    *sum_lo = lo;
    *sum_hi = hi;
}

/*******************************************
 * Helper: compare eight candidate SADs (search points 0..7, in that
 * order) against *best_sad and keep the smallest together with its MV.
 * A candidate only wins on a strict '<', so on ties the earliest search
 * point (or the previously stored best) is kept.
 *   sad_lo   - SADs of search points 0..3 (32-bit lanes)
 *   sad_hi   - SADs of search points 4..7 (32-bit lanes)
 *   best_sad - in/out: best SAD so far
 *   best_mv  - in/out: MV of the best SAD, packed as (y << 16) | x
 *   mv       - packed MV of search point 0; search point k adds 4 * k
 *              to the x component (presumably quarter-pel units -
 *              TODO confirm against the caller)
 *******************************************/
static void update_best_sad_mv_eight_points_sse41(
    __m128i   sad_lo,
    __m128i   sad_hi,
    uint32_t *best_sad,
    uint32_t *best_mv,
    uint32_t  mv)
{
    uint32_t sads[8];
    uint32_t point;
    // Widen to uint32_t BEFORE shifting: shifting a uint16_t promotes it to
    // signed int, and moving a set bit 15 into the sign bit is undefined.
    const uint32_t y_bits = (uint32_t)(uint16_t)_MVYT(mv) << 16;

    _mm_storeu_si128((__m128i *)&sads[0], sad_lo);
    _mm_storeu_si128((__m128i *)&sads[4], sad_hi);

    for (point = 0; point < 8; point++) {
        if (sads[point] < *best_sad) {
            // Only the low 16 bits of the x component are kept.
            const uint16_t x_mv = (uint16_t)(_MVXT(mv) + (int)(point * 4));

            *best_sad = sads[point];
            *best_mv  = y_bits | (uint32_t)x_mv;
        }
    }
}

/*******************************************
 * Calculate SAD for 32x32 and 64x64 from the 16x16 SADs of eight
 * horizontally adjacent search points and, where a search point
 * improves on the stored best, keep the new best SAD + MV.
 *
 *   p_sad16x16      - 16 rows x 8 uint16_t: row n holds the SADs of
 *                     block 16x16_n for search points 0..7
 *   p_best_sad32x32 - in/out: best SAD of each of the four 32x32 blocks
 *   p_best_sad64x64 - in/out: best SAD of the 64x64 block (element 0)
 *   p_best_mv32x32  - in/out: best MV of each 32x32 block, (y << 16) | x
 *   p_best_mv64x64  - in/out: best MV of the 64x64 block (element 0)
 *   mv              - packed MV of search point 0
 *
 *   32x32 layout:         rows of p_sad16x16 used:
 *   --------------------
 *   | 32x32_0 | 32x32_1   32x32_q <- 16x16_(4q) .. 16x16_(4q+3)
 *   --------------------
 *   | 32x32_2 | 32x32_3
 *   --------------------
 *******************************************/
void get_eight_horizontal_search_point_results_32x32_64x64_pu_sse41_intrin(
    uint16_t  *p_sad16x16,
    uint32_t  *p_best_sad32x32,
    uint32_t  *p_best_sad64x64,
    uint32_t  *p_best_mv32x32,
    uint32_t  *p_best_mv64x64,
    uint32_t   mv)
{
    // Running 64x64 SADs: the sum of the four 32x32 SADs per search point.
    __m128i sad64_lo = _mm_setzero_si128();
    __m128i sad64_hi = _mm_setzero_si128();
    int     quad;

    for (quad = 0; quad < 4; quad++) {
        __m128i sad32_lo, sad32_hi;

        // Each 32x32 block is made of four consecutive 16x16 rows.
        sum_four_sad16x16_rows_sse41(p_sad16x16 + quad * 4 * 8, &sad32_lo, &sad32_hi);

        update_best_sad_mv_eight_points_sse41(
            sad32_lo, sad32_hi, &p_best_sad32x32[quad], &p_best_mv32x32[quad], mv);

        sad64_lo = _mm_add_epi32(sad64_lo, sad32_lo);
        sad64_hi = _mm_add_epi32(sad64_hi, sad32_hi);
    }

    update_best_sad_mv_eight_points_sse41(
        sad64_lo, sad64_hi, &p_best_sad64x64[0], &p_best_mv64x64[0], mv);
}
|