LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - EbComputeSAD_Intrinsic_SSE4_1.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 4015 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 7 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include <assert.h>
       7             : 
       8             : #include "EbComputeSAD_SSE4_1.h"
       9             : #include "EbDefinitions.h"
      10             : #include "smmintrin.h"
      11             : 
      12             : #define UPDATE_BEST(s, k, offset) \
      13             :   temSum = _mm_extract_epi32(s, k); \
      14             :   if (temSum < lowSum) { \
      15             :     lowSum = temSum; \
      16             :     xBest = j + offset + k; \
      17             :     yBest = i; \
      18             :   }
      19             : 
      20           0 : void ext_sad_calculation_32x32_64x64_sse4_intrin(
      21             :     uint32_t  *p_sad16x16,
      22             :     uint32_t  *p_best_sad32x32,
      23             :     uint32_t  *p_best_sad64x64,
      24             :     uint32_t  *p_best_mv32x32,
      25             :     uint32_t  *p_best_mv64x64,
      26             :     uint32_t   mv,
      27             :     uint32_t  *p_sad32x32)
      28             : {
      29             :     __m128i xmm_N1, sad32x32_greater_than_bitmask, sad32x32_less_than_or_eq_bitmask, BestSad32x32, BestMV32x32, xmm_mv;
      30             :     __m128i Sad16x16_0_7_lo, Sad16x16_0_7_hi, Sad16x16_8_15_lo, Sad16x16_8_15_hi, xmm_sad64x64, xmm_sad64x64_total, xmm_pBestSad32x32, xmm_pBestMV32x32;
      31             : 
      32           0 :     Sad16x16_0_7_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
      33           0 :     Sad16x16_0_7_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
      34           0 :     Sad16x16_8_15_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
      35           0 :     Sad16x16_8_15_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
      36             : 
      37           0 :     xmm_sad64x64 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo), _mm_unpackhi_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo)),
      38             :         _mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi), _mm_unpackhi_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi)));
      39             : 
      40           0 :     p_sad32x32[0] = _mm_extract_epi32(xmm_sad64x64, 0);
      41           0 :     p_sad32x32[1] = _mm_extract_epi32(xmm_sad64x64, 1);
      42           0 :     p_sad32x32[2] = _mm_extract_epi32(xmm_sad64x64, 2);
      43           0 :     p_sad32x32[3] = _mm_extract_epi32(xmm_sad64x64, 3);
      44             : 
      45           0 :     xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64, 8), xmm_sad64x64);
      46             : 
      47           0 :     xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64_total, 4), xmm_sad64x64_total);
      48             : 
      49           0 :     xmm_mv = _mm_cvtsi32_si128(mv);
      50           0 :     xmm_mv = _mm_unpacklo_epi32(xmm_mv, xmm_mv);
      51           0 :     xmm_mv = _mm_unpacklo_epi64(xmm_mv, xmm_mv);
      52             : 
      53           0 :     xmm_pBestSad32x32 = _mm_loadu_si128((__m128i*)p_best_sad32x32);
      54           0 :     xmm_pBestMV32x32 = _mm_loadu_si128((__m128i*)p_best_mv32x32);
      55             : 
      56           0 :     sad32x32_greater_than_bitmask = _mm_cmpgt_epi32(xmm_pBestSad32x32, xmm_sad64x64);// _mm_cmplt_epi32(xmm_pBestSad32x32, xmm_sad64x64);
      57             : 
      58           0 :     xmm_N1 = _mm_cmpeq_epi8(xmm_mv, xmm_mv); // anything compared to itself is equal (get 0xFFFFFFFF)
      59           0 :     sad32x32_less_than_or_eq_bitmask = _mm_sub_epi32(xmm_N1, sad32x32_greater_than_bitmask);
      60             : 
      61           0 :     BestSad32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestSad32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_sad64x64, sad32x32_greater_than_bitmask));
      62           0 :     BestMV32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestMV32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_mv, sad32x32_greater_than_bitmask));
      63             : 
      64             :     _mm_storeu_si128((__m128i*)p_best_sad32x32, BestSad32x32);
      65             :     _mm_storeu_si128((__m128i*)p_best_mv32x32, BestMV32x32);
      66             : 
      67           0 :     uint32_t sad64x64 = _mm_cvtsi128_si32(xmm_sad64x64_total);
      68           0 :     if (sad64x64 < p_best_sad64x64[0]) {
      69           0 :         p_best_sad64x64[0] = sad64x64;
      70           0 :         p_best_mv64x64[0] = _mm_cvtsi128_si32(xmm_mv);
      71             :     }
      72           0 : }
      73             : 
      74             : /*******************************************************************************
      75             :  * Requirement: width   = 4, 8, 16, 24, 32, 48 or 64
      76             :  * Requirement: block_height <= 64
      77             :  * Requirement: block_height % 2 = 0 when width = 4 or 8
      78             : *******************************************************************************/
      79           0 : void sad_loop_kernel_sse4_1_intrin(
      80             :     uint8_t  *src,                            // input parameter, source samples Ptr
      81             :     uint32_t  src_stride,                      // input parameter, source stride
      82             :     uint8_t  *ref,                            // input parameter, reference samples Ptr
      83             :     uint32_t  ref_stride,                      // input parameter, reference stride
      84             :     uint32_t  block_height,                   // input parameter, block height (M)
      85             :     uint32_t  block_width,                    // input parameter, block width (N)
      86             :     uint64_t *best_sad,
      87             :     int16_t *x_search_center,
      88             :     int16_t *y_search_center,
      89             :     uint32_t  src_stride_raw,                   // input parameter, source stride (no line skipping)
      90             :     int16_t search_area_width,
      91             :     int16_t search_area_height)
      92             : {
      93           0 :     int16_t xBest = *x_search_center, yBest = *y_search_center;
      94           0 :     uint32_t lowSum = 0xffffff;
      95           0 :     uint32_t temSum = 0;
      96             :     int16_t i, j;
      97             :     uint32_t k, l;
      98           0 :     uint32_t leftover = search_area_width & 7;
      99             :     const uint8_t *pRef, *pSrc;
     100           0 :     __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8 = _mm_set1_epi32(-1);
     101             : 
     102           0 :     if (leftover) {
     103           0 :         for (k = 0; k < leftover; k++)
     104           0 :             s8 = _mm_slli_si128(s8, 2);
     105             :     }
     106             : 
     107           0 :     switch (block_width) {
     108           0 :     case 4:
     109           0 :         for (i = 0; i < search_area_height; i++) {
     110           0 :             for (j = 0; j <= search_area_width - 8; j += 8) {
     111           0 :                 pSrc = src;
     112           0 :                 pRef = ref + j;
     113           0 :                 s3 = _mm_setzero_si128();
     114           0 :                 for (k = 0; k < block_height; k += 2) {
     115           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
     116           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
     117           0 :                     s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
     118           0 :                     s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
     119           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     120           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
     121           0 :                     pSrc += src_stride << 1;
     122           0 :                     pRef += ref_stride << 1;
     123             :                 }
     124           0 :                 s3 = _mm_minpos_epu16(s3);
     125           0 :                 temSum = _mm_extract_epi16(s3, 0);
     126           0 :                 if (temSum < lowSum) {
     127           0 :                     lowSum = temSum;
     128           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     129           0 :                     yBest = i;
     130             :                 }
     131             :             }
     132             : 
     133           0 :             if (leftover) {
     134           0 :                 pSrc = src;
     135           0 :                 pRef = ref + j;
     136           0 :                 s3 = _mm_setzero_si128();
     137           0 :                 for (k = 0; k < block_height; k += 2) {
     138           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
     139           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
     140           0 :                     s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
     141           0 :                     s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
     142           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     143           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
     144           0 :                     pSrc += src_stride << 1;
     145           0 :                     pRef += ref_stride << 1;
     146             :                 }
     147           0 :                 s3 = _mm_or_si128(s3, s8);
     148           0 :                 s3 = _mm_minpos_epu16(s3);
     149           0 :                 temSum = _mm_extract_epi16(s3, 0);
     150           0 :                 if (temSum < lowSum) {
     151           0 :                     lowSum = temSum;
     152           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     153           0 :                     yBest = i;
     154             :                 }
     155             :             }
     156           0 :             ref += src_stride_raw;
     157             :         }
     158           0 :         break;
     159             : 
     160           0 :     case 8:
     161           0 :         for (i = 0; i < search_area_height; i++) {
     162           0 :             for (j = 0; j <= search_area_width - 8; j += 8) {
     163           0 :                 pSrc = src;
     164           0 :                 pRef = ref + j;
     165           0 :                 s3 = s4 = _mm_setzero_si128();
     166           0 :                 for (k = 0; k < block_height; k += 2) {
     167           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
     168           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
     169           0 :                     s2 = _mm_loadl_epi64((__m128i*)pSrc);
     170           0 :                     s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
     171           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     172           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     173           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
     174           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
     175           0 :                     pSrc += src_stride << 1;
     176           0 :                     pRef += ref_stride << 1;
     177             :                 }
     178           0 :                 s3 = _mm_adds_epu16(s3, s4);
     179           0 :                 s3 = _mm_minpos_epu16(s3);
     180           0 :                 temSum = _mm_extract_epi16(s3, 0);
     181           0 :                 if (temSum < lowSum) {
     182           0 :                     lowSum = temSum;
     183           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     184           0 :                     yBest = i;
     185             :                 }
     186             :             }
     187             : 
     188           0 :             if (leftover) {
     189           0 :                 pSrc = src;
     190           0 :                 pRef = ref + j;
     191           0 :                 s3 = s4 = _mm_setzero_si128();
     192           0 :                 for (k = 0; k < block_height; k += 2) {
     193           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
     194           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
     195           0 :                     s2 = _mm_loadl_epi64((__m128i*)pSrc);
     196           0 :                     s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
     197           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     198           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     199           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
     200           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
     201           0 :                     pSrc += src_stride << 1;
     202           0 :                     pRef += ref_stride << 1;
     203             :                 }
     204           0 :                 s3 = _mm_adds_epu16(s3, s4);
     205           0 :                 s3 = _mm_or_si128(s3, s8);
     206           0 :                 s3 = _mm_minpos_epu16(s3);
     207           0 :                 temSum = _mm_extract_epi16(s3, 0);
     208           0 :                 if (temSum < lowSum) {
     209           0 :                     lowSum = temSum;
     210           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     211           0 :                     yBest = i;
     212             :                 }
     213             :             }
     214           0 :             ref += src_stride_raw;
     215             :         }
     216           0 :         break;
     217             : 
     218           0 :     case 16:
     219           0 :         if (block_height <= 16) {
     220           0 :             for (i = 0; i < search_area_height; i++) {
     221           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     222           0 :                     pSrc = src;
     223           0 :                     pRef = ref + j;
     224           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     225           0 :                     for (k = 0; k < block_height; k++) {
     226           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     227           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     228           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     229           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     230           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     231           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     232           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     233           0 :                         pSrc += src_stride;
     234           0 :                         pRef += ref_stride;
     235             :                     }
     236           0 :                     s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     237           0 :                     s3 = _mm_minpos_epu16(s3);
     238           0 :                     temSum = _mm_extract_epi16(s3, 0);
     239           0 :                     if (temSum < lowSum) {
     240           0 :                         lowSum = temSum;
     241           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     242           0 :                         yBest = i;
     243             :                     }
     244             :                 }
     245             : 
     246           0 :                 if (leftover) {
     247           0 :                     pSrc = src;
     248           0 :                     pRef = ref + j;
     249           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     250           0 :                     for (k = 0; k < block_height; k++) {
     251           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     252           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     253           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     254           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     255           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     256           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     257           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     258           0 :                         pSrc += src_stride;
     259           0 :                         pRef += ref_stride;
     260             :                     }
     261           0 :                     s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     262           0 :                     s3 = _mm_or_si128(s3, s8);
     263           0 :                     s3 = _mm_minpos_epu16(s3);
     264           0 :                     temSum = _mm_extract_epi16(s3, 0);
     265           0 :                     if (temSum < lowSum) {
     266           0 :                         lowSum = temSum;
     267           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
     268           0 :                         yBest = i;
     269             :                     }
     270             :                 }
     271           0 :                 ref += src_stride_raw;
     272             :             }
     273             :         }
     274           0 :         else if (block_height <= 32) {
     275           0 :             for (i = 0; i < search_area_height; i++) {
     276           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     277           0 :                     pSrc = src;
     278           0 :                     pRef = ref + j;
     279           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     280           0 :                     for (k = 0; k < block_height; k++) {
     281           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     282           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     283           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     284           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     285           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     286           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     287           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     288           0 :                         pSrc += src_stride;
     289           0 :                         pRef += ref_stride;
     290             :                     }
     291           0 :                     s3 = _mm_adds_epu16(s3, s4);
     292           0 :                     s5 = _mm_adds_epu16(s5, s6);
     293           0 :                     s4 = _mm_minpos_epu16(s3);
     294           0 :                     s6 = _mm_minpos_epu16(s5);
     295           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
     296           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
     297           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
     298           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
     299           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
     300           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
     301           0 :                     s3 = _mm_sub_epi16(s3, s4);
     302           0 :                     s5 = _mm_adds_epu16(s5, s3);
     303           0 :                     s5 = _mm_sub_epi16(s5, s6);
     304           0 :                     s5 = _mm_minpos_epu16(s5);
     305           0 :                     temSum = _mm_extract_epi16(s5, 0);
     306           0 :                     temSum += _mm_extract_epi16(s4, 0);
     307           0 :                     temSum += _mm_extract_epi16(s6, 0);
     308           0 :                     if (temSum < lowSum) {
     309           0 :                         lowSum = temSum;
     310           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
     311           0 :                         yBest = i;
     312             :                     }
     313             :                 }
     314             : 
     315           0 :                 if (leftover) {
     316           0 :                     pSrc = src;
     317           0 :                     pRef = ref + j;
     318           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     319           0 :                     for (k = 0; k < block_height; k++) {
     320           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     321           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     322           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     323           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     324           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     325           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     326           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     327           0 :                         pSrc += src_stride;
     328           0 :                         pRef += ref_stride;
     329             :                     }
     330           0 :                     s3 = _mm_adds_epu16(s3, s4);
     331           0 :                     s5 = _mm_adds_epu16(s5, s6);
     332           0 :                     s3 = _mm_or_si128(s3, s8);
     333           0 :                     s5 = _mm_or_si128(s5, s8);
     334           0 :                     s4 = _mm_minpos_epu16(s3);
     335           0 :                     s6 = _mm_minpos_epu16(s5);
     336           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
     337           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
     338           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
     339           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
     340           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
     341           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
     342           0 :                     s3 = _mm_sub_epi16(s3, s4);
     343           0 :                     s5 = _mm_adds_epu16(s5, s3);
     344           0 :                     s5 = _mm_sub_epi16(s5, s6);
     345           0 :                     s5 = _mm_minpos_epu16(s5);
     346           0 :                     temSum = _mm_extract_epi16(s5, 0);
     347           0 :                     temSum += _mm_extract_epi16(s4, 0);
     348           0 :                     temSum += _mm_extract_epi16(s6, 0);
     349           0 :                     if (temSum < lowSum) {
     350           0 :                         lowSum = temSum;
     351           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
     352           0 :                         yBest = i;
     353             :                     }
     354             :                 }
     355           0 :                 ref += src_stride_raw;
     356             :             }
     357             :         }
     358             :         else {
     359           0 :             for (i = 0; i < search_area_height; i++) {
     360           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     361           0 :                     pSrc = src;
     362           0 :                     pRef = ref + j;
     363           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     364           0 :                     for (k = 0; k < block_height; k++) {
     365           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     366           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     367           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     368           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     369           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     370           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     371           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     372           0 :                         pSrc += src_stride;
     373           0 :                         pRef += ref_stride;
     374             :                     }
     375           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     376           0 :                     s0 = _mm_minpos_epu16(s0);
     377           0 :                     temSum = _mm_extract_epi16(s0, 0);
     378           0 :                     if (temSum < lowSum) {
     379           0 :                         if (temSum != 0xFFFF) { // no overflow
     380           0 :                             lowSum = temSum;
     381           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     382           0 :                             yBest = i;
     383             :                         }
     384             :                         else {
     385           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     386           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     387           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     388           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     389           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     390           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     391           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     392           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     393           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     394           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     395           0 :                             UPDATE_BEST(s0, 0, 0);
     396           0 :                             UPDATE_BEST(s0, 1, 0);
     397           0 :                             UPDATE_BEST(s0, 2, 0);
     398           0 :                             UPDATE_BEST(s0, 3, 0);
     399           0 :                             UPDATE_BEST(s3, 0, 4);
     400           0 :                             UPDATE_BEST(s3, 1, 4);
     401           0 :                             UPDATE_BEST(s3, 2, 4);
     402           0 :                             UPDATE_BEST(s3, 3, 4);
     403             :                         }
     404             :                     }
     405             :                 }
     406             : 
     407           0 :                 if (leftover) {
     408           0 :                     pSrc = src;
     409           0 :                     pRef = ref + j;
     410           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     411           0 :                     for (k = 0; k < block_height; k++) {
     412           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     413           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     414           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     415           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     416           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     417           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     418           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     419           0 :                         pSrc += src_stride;
     420           0 :                         pRef += ref_stride;
     421             :                     }
     422           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     423           0 :                     s0 = _mm_or_si128(s0, s8);
     424           0 :                     s0 = _mm_minpos_epu16(s0);
     425           0 :                     temSum = _mm_extract_epi16(s0, 0);
     426           0 :                     if (temSum < lowSum) {
     427           0 :                         if (temSum != 0xFFFF) { // no overflow
     428           0 :                             lowSum = temSum;
     429           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     430           0 :                             yBest = i;
     431             :                         }
     432             :                         else {
     433           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     434           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     435           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     436           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     437           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     438           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     439           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     440           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     441           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     442           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     443           0 :                             k = leftover;
     444           0 :                             while (k > 0) {
     445           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
     446           0 :                                     temSum = _mm_extract_epi32(s0, 0);
     447           0 :                                     s0 = _mm_srli_si128(s0, 4);
     448           0 :                                     if (temSum < lowSum) {
     449           0 :                                         lowSum = temSum;
     450           0 :                                         xBest = (int16_t)(j + leftover - k);
     451           0 :                                         yBest = i;
     452             :                                     }
     453             :                                 }
     454           0 :                                 s0 = s3;
     455             :                             }
     456             :                         }
     457             :                     }
     458             :                 }
     459           0 :                 ref += src_stride_raw;
     460             :             }
     461             :         }
     462           0 :         break;
     463             : 
     464           0 :     case 24:
     465           0 :         if (block_height <= 16) {
     466           0 :             for (i = 0; i < search_area_height; i++) {
     467           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     468           0 :                     pSrc = src;
     469           0 :                     pRef = ref + j;
     470           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     471           0 :                     for (k = 0; k < block_height; k++) {
     472           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     473           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     474           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     475           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     476           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     477           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     478           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     479           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     480           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
     481           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     482           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     483           0 :                         pSrc += src_stride;
     484           0 :                         pRef += ref_stride;
     485             :                     }
     486           0 :                     s3 = _mm_adds_epu16(s3, s4);
     487           0 :                     s5 = _mm_adds_epu16(s5, s6);
     488           0 :                     s4 = _mm_minpos_epu16(s3);
     489           0 :                     s6 = _mm_minpos_epu16(s5);
     490           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
     491           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
     492           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
     493           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
     494           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
     495           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
     496           0 :                     s3 = _mm_sub_epi16(s3, s4);
     497           0 :                     s5 = _mm_adds_epu16(s5, s3);
     498           0 :                     s5 = _mm_sub_epi16(s5, s6);
     499           0 :                     s5 = _mm_minpos_epu16(s5);
     500           0 :                     temSum = _mm_extract_epi16(s5, 0);
     501           0 :                     temSum += _mm_extract_epi16(s4, 0);
     502           0 :                     temSum += _mm_extract_epi16(s6, 0);
     503           0 :                     if (temSum < lowSum) {
     504           0 :                         lowSum = temSum;
     505           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
     506           0 :                         yBest = i;
     507             :                     }
     508             :                 }
     509             : 
     510           0 :                 if (leftover) {
     511           0 :                     pSrc = src;
     512           0 :                     pRef = ref + j;
     513           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     514           0 :                     for (k = 0; k < block_height; k++) {
     515           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     516           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     517           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     518           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     519           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     520           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     521           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     522           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     523           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
     524           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     525           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     526           0 :                         pSrc += src_stride;
     527           0 :                         pRef += ref_stride;
     528             :                     }
     529           0 :                     s3 = _mm_adds_epu16(s3, s4);
     530           0 :                     s5 = _mm_adds_epu16(s5, s6);
     531           0 :                     s3 = _mm_or_si128(s3, s8);
     532           0 :                     s5 = _mm_or_si128(s5, s8);
     533           0 :                     s4 = _mm_minpos_epu16(s3);
     534           0 :                     s6 = _mm_minpos_epu16(s5);
     535           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
     536           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
     537           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
     538           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
     539           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
     540           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
     541           0 :                     s3 = _mm_sub_epi16(s3, s4);
     542           0 :                     s5 = _mm_adds_epu16(s5, s3);
     543           0 :                     s5 = _mm_sub_epi16(s5, s6);
     544           0 :                     s5 = _mm_minpos_epu16(s5);
     545           0 :                     temSum = _mm_extract_epi16(s5, 0);
     546           0 :                     temSum += _mm_extract_epi16(s4, 0);
     547           0 :                     temSum += _mm_extract_epi16(s6, 0);
     548           0 :                     if (temSum < lowSum) {
     549           0 :                         lowSum = temSum;
     550           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
     551           0 :                         yBest = i;
     552             :                     }
     553             :                 }
     554           0 :                 ref += src_stride_raw;
     555             :             }
     556             :         }
     557             :         else {
     558           0 :             for (i = 0; i < search_area_height; i++) {
     559           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     560           0 :                     pSrc = src;
     561           0 :                     pRef = ref + j;
     562           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     563           0 :                     for (k = 0; k < block_height; k++) {
     564           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     565           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     566           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     567           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     568           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     569           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     570           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     571           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     572           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
     573           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     574           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     575           0 :                         pSrc += src_stride;
     576           0 :                         pRef += ref_stride;
     577             :                     }
     578           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     579           0 :                     s0 = _mm_minpos_epu16(s0);
     580           0 :                     temSum = _mm_extract_epi16(s0, 0);
     581           0 :                     if (temSum < lowSum) {
     582           0 :                         if (temSum != 0xFFFF) { // no overflow
     583           0 :                             lowSum = temSum;
     584           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     585           0 :                             yBest = i;
     586             :                         }
     587             :                         else {
     588           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     589           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     590           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     591           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     592           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     593           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     594           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     595           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     596           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     597           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     598           0 :                             UPDATE_BEST(s0, 0, 0);
     599           0 :                             UPDATE_BEST(s0, 1, 0);
     600           0 :                             UPDATE_BEST(s0, 2, 0);
     601           0 :                             UPDATE_BEST(s0, 3, 0);
     602           0 :                             UPDATE_BEST(s3, 0, 4);
     603           0 :                             UPDATE_BEST(s3, 1, 4);
     604           0 :                             UPDATE_BEST(s3, 2, 4);
     605           0 :                             UPDATE_BEST(s3, 3, 4);
     606             :                         }
     607             :                     }
     608             :                 }
     609             : 
     610           0 :                 if (leftover) {
     611           0 :                     pSrc = src;
     612           0 :                     pRef = ref + j;
     613           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     614           0 :                     for (k = 0; k < block_height; k++) {
     615           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     616           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     617           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     618           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     619           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     620           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     621           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     622           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     623           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
     624           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     625           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     626           0 :                         pSrc += src_stride;
     627           0 :                         pRef += ref_stride;
     628             :                     }
     629           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     630           0 :                     s0 = _mm_or_si128(s0, s8);
     631           0 :                     s0 = _mm_minpos_epu16(s0);
     632           0 :                     temSum = _mm_extract_epi16(s0, 0);
     633           0 :                     if (temSum < lowSum) {
     634           0 :                         if (temSum != 0xFFFF) { // no overflow
     635           0 :                             lowSum = temSum;
     636           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     637           0 :                             yBest = i;
     638             :                         }
     639             :                         else {
     640           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     641           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     642           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     643           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     644           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     645           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     646           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     647           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     648           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     649           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     650           0 :                             k = leftover;
     651           0 :                             while (k > 0) {
     652           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
     653           0 :                                     temSum = _mm_extract_epi32(s0, 0);
     654           0 :                                     s0 = _mm_srli_si128(s0, 4);
     655           0 :                                     if (temSum < lowSum) {
     656           0 :                                         lowSum = temSum;
     657           0 :                                         xBest = (int16_t)(j + leftover - k);
     658           0 :                                         yBest = i;
     659             :                                     }
     660             :                                 }
     661           0 :                                 s0 = s3;
     662             :                             }
     663             :                         }
     664             :                     }
     665             :                 }
     666           0 :                 ref += src_stride_raw;
     667             :             }
     668             :         }
     669           0 :         break;
     670             : 
     671           0 :     case 32:
     672           0 :         if (block_height <= 32) {
     673           0 :             for (i = 0; i < search_area_height; i++) {
     674           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     675           0 :                     pSrc = src;
     676           0 :                     pRef = ref + j;
     677           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     678           0 :                     for (k = 0; k < block_height; k++) {
     679           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     680           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     681           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     682           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     683           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     684           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     685           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     686           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     687           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     688           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     689           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     690           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     691           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     692           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     693           0 :                         pSrc += src_stride;
     694           0 :                         pRef += ref_stride;
     695             :                     }
     696           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     697           0 :                     s0 = _mm_minpos_epu16(s0);
     698           0 :                     temSum = _mm_extract_epi16(s0, 0);
     699           0 :                     temSum &= 0x0000FFFF;
     700           0 :                     if (temSum < lowSum) {
     701           0 :                         if (temSum != 0xFFFF) { // no overflow
     702           0 :                             lowSum = temSum;
     703           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     704           0 :                             yBest = i;
     705             :                         }
     706             :                         else {
     707           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     708           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     709           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     710           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     711           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     712           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     713           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     714           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     715           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     716           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     717           0 :                             UPDATE_BEST(s0, 0, 0);
     718           0 :                             UPDATE_BEST(s0, 1, 0);
     719           0 :                             UPDATE_BEST(s0, 2, 0);
     720           0 :                             UPDATE_BEST(s0, 3, 0);
     721           0 :                             UPDATE_BEST(s3, 0, 4);
     722           0 :                             UPDATE_BEST(s3, 1, 4);
     723           0 :                             UPDATE_BEST(s3, 2, 4);
     724           0 :                             UPDATE_BEST(s3, 3, 4);
     725             :                         }
     726             :                     }
     727             :                 }
     728             : 
     729           0 :                 if (leftover) {
     730           0 :                     pSrc = src;
     731           0 :                     pRef = ref + j;
     732           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     733           0 :                     for (k = 0; k < block_height; k++) {
     734           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     735           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     736           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     737           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     738           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     739           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     740           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     741           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     742           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     743           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     744           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     745           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     746           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     747           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     748           0 :                         pSrc += src_stride;
     749           0 :                         pRef += ref_stride;
     750             :                     }
     751           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     752           0 :                     s0 = _mm_or_si128(s0, s8);
     753           0 :                     s0 = _mm_minpos_epu16(s0);
     754           0 :                     temSum = _mm_extract_epi16(s0, 0);
     755           0 :                     temSum &= 0x0000FFFF;
     756           0 :                     if (temSum < lowSum) {
     757           0 :                         if (temSum != 0xFFFF) { // no overflow
     758           0 :                             lowSum = temSum;
     759           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     760           0 :                             yBest = i;
     761             :                         }
     762             :                         else {
     763           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     764           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     765           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     766           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     767           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     768           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     769           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     770           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     771           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     772           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     773           0 :                             k = leftover;
     774           0 :                             while (k > 0) {
     775           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
     776           0 :                                     temSum = _mm_extract_epi32(s0, 0);
     777           0 :                                     s0 = _mm_srli_si128(s0, 4);
     778           0 :                                     if (temSum < lowSum) {
     779           0 :                                         lowSum = temSum;
     780           0 :                                         xBest = (int16_t)(j + leftover - k);
     781           0 :                                         yBest = i;
     782             :                                     }
     783             :                                 }
     784           0 :                                 s0 = s3;
     785             :                             }
     786             :                         }
     787             :                     }
     788             :                 }
     789           0 :                 ref += src_stride_raw;
     790             :             }
     791             :         }
     792             :         else {
     793             :             __m128i s9, s10, s11, s12;
     794           0 :             for (i = 0; i < search_area_height; i++) {
     795           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     796           0 :                     pSrc = src;
     797           0 :                     pRef = ref + j;
     798           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     799           0 :                     for (k = 0; k < block_height >> 1; k++) {
     800           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     801           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     802           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     803           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     804           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     805           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     806           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     807           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     808           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     809           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     810           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     811           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     812           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     813           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     814           0 :                         pSrc += src_stride;
     815           0 :                         pRef += ref_stride;
     816             :                     }
     817           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
     818           0 :                     for (; k < block_height; k++) {
     819           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     820           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     821           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     822           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
     823           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
     824           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
     825           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
     826           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     827           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     828           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     829           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
     830           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
     831           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
     832           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
     833           0 :                         pSrc += src_stride;
     834           0 :                         pRef += ref_stride;
     835             :                     }
     836           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     837           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
     838           0 :                     s0 = _mm_minpos_epu16(s0);
     839           0 :                     temSum = _mm_extract_epi16(s0, 0);
     840           0 :                     temSum &= 0x0000FFFF;
     841           0 :                     if (temSum < lowSum) {
     842           0 :                         if (temSum != 0xFFFF) { // no overflow
     843           0 :                             lowSum = temSum;
     844           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     845           0 :                             yBest = i;
     846             :                         }
     847             :                         else {
     848           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     849           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     850           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     851           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     852           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     853           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     854           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     855           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     856           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     857           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     858           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     859           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     860           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
     861           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
     862           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
     863           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
     864           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
     865           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
     866           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
     867           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
     868           0 :                             UPDATE_BEST(s0, 0, 0);
     869           0 :                             UPDATE_BEST(s0, 1, 0);
     870           0 :                             UPDATE_BEST(s0, 2, 0);
     871           0 :                             UPDATE_BEST(s0, 3, 0);
     872           0 :                             UPDATE_BEST(s3, 0, 4);
     873           0 :                             UPDATE_BEST(s3, 1, 4);
     874           0 :                             UPDATE_BEST(s3, 2, 4);
     875           0 :                             UPDATE_BEST(s3, 3, 4);
     876             :                         }
     877             :                     }
     878             :                 }
     879             : 
     880           0 :                 if (leftover) {
     881           0 :                     pSrc = src;
     882           0 :                     pRef = ref + j;
     883           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     884           0 :                     for (k = 0; k < block_height >> 1; k++) {
     885           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     886           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     887           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     888           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     889           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     890           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     891           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     892           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     893           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     894           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     895           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     896           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     897           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     898           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     899           0 :                         pSrc += src_stride;
     900           0 :                         pRef += ref_stride;
     901             :                     }
     902           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
     903           0 :                     for (; k < block_height; k++) {
     904           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     905           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     906           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     907           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
     908           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
     909           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
     910           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
     911           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     912           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     913           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     914           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
     915           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
     916           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
     917           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
     918           0 :                         pSrc += src_stride;
     919           0 :                         pRef += ref_stride;
     920             :                     }
     921           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
     922           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
     923           0 :                     s0 = _mm_or_si128(s0, s8);
     924           0 :                     s0 = _mm_minpos_epu16(s0);
     925           0 :                     temSum = _mm_extract_epi16(s0, 0);
     926           0 :                     temSum &= 0x0000FFFF;
     927           0 :                     if (temSum < lowSum) {
     928           0 :                         if (temSum != 0xFFFF) { // no overflow
     929           0 :                             lowSum = temSum;
     930           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
     931           0 :                             yBest = i;
     932             :                         }
     933             :                         else {
     934           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
     935           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
     936           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
     937           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
     938           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
     939           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
     940           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
     941           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
     942           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
     943           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
     944           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     945           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     946           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
     947           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
     948           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
     949           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
     950           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
     951           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
     952           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
     953           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
     954           0 :                             k = leftover;
     955           0 :                             while (k > 0) {
     956           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
     957           0 :                                     temSum = _mm_extract_epi32(s0, 0);
     958           0 :                                     s0 = _mm_srli_si128(s0, 4);
     959           0 :                                     if (temSum < lowSum) {
     960           0 :                                         lowSum = temSum;
     961           0 :                                         xBest = (int16_t)(j + leftover - k);
     962           0 :                                         yBest = i;
     963             :                                     }
     964             :                                 }
     965           0 :                                 s0 = s3;
     966             :                             }
     967             :                         }
     968             :                     }
     969             :                 }
     970           0 :                 ref += src_stride_raw;
     971             :             }
     972             :         }
     973           0 :         break;
     974             : 
     975           0 :     case 48:
     976           0 :         if (block_height <= 32) {
     977             :             __m128i s9, s10, s11, s12;
     978           0 :             for (i = 0; i < search_area_height; i++) {
     979           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
     980           0 :                     pSrc = src;
     981           0 :                     pRef = ref + j;
     982           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
     983           0 :                     for (k = 0; k < block_height >> 1; k++) {
     984           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
     985           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
     986           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
     987           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     988           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     989           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     990           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     991           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
     992           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
     993           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
     994           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
     995           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
     996           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
     997           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
     998           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
     999           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1000           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1001           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1002           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1003           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1004           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1005           0 :                         pSrc += src_stride;
    1006           0 :                         pRef += ref_stride;
    1007             :                     }
    1008           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    1009           0 :                     for (; k < block_height; k++) {
    1010           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1011           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1012           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1013           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1014           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1015           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1016           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1017           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1018           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1019           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1020           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1021           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1022           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1023           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1024           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1025           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1026           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1027           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1028           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1029           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1030           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1031           0 :                         pSrc += src_stride;
    1032           0 :                         pRef += ref_stride;
    1033             :                     }
    1034           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1035           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    1036           0 :                     s0 = _mm_minpos_epu16(s0);
    1037           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1038           0 :                     temSum &= 0x0000FFFF;
    1039           0 :                     if (temSum < lowSum) {
    1040           0 :                         if (temSum != 0xFFFF) { // no overflow
    1041           0 :                             lowSum = temSum;
    1042           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1043           0 :                             yBest = i;
    1044             :                         }
    1045             :                         else {
    1046           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1047           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1048           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1049           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1050           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1051           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1052           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1053           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1054           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    1055           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    1056           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    1057           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    1058           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    1059           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    1060           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    1061           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    1062           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    1063           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    1064           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    1065           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    1066           0 :                             UPDATE_BEST(s0, 0, 0);
    1067           0 :                             UPDATE_BEST(s0, 1, 0);
    1068           0 :                             UPDATE_BEST(s0, 2, 0);
    1069           0 :                             UPDATE_BEST(s0, 3, 0);
    1070           0 :                             UPDATE_BEST(s3, 0, 4);
    1071           0 :                             UPDATE_BEST(s3, 1, 4);
    1072           0 :                             UPDATE_BEST(s3, 2, 4);
    1073           0 :                             UPDATE_BEST(s3, 3, 4);
    1074             :                         }
    1075             :                     }
    1076             :                 }
    1077             : 
    1078           0 :                 if (leftover) {
    1079           0 :                     pSrc = src;
    1080           0 :                     pRef = ref + j;
    1081           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1082           0 :                     for (k = 0; k < block_height >> 1; k++) {
    1083           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1084           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1085           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1086           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1087           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1088           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1089           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1090           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1091           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1092           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1093           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1094           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1095           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1096           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1097           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1098           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1099           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1100           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1101           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1102           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1103           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1104           0 :                         pSrc += src_stride;
    1105           0 :                         pRef += ref_stride;
    1106             :                     }
    1107           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    1108           0 :                     for (; k < block_height; k++) {
    1109           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1110           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1111           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1112           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1113           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1114           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1115           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1116           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1117           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1118           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1119           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1120           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1121           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1122           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1123           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1124           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1125           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1126           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1127           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1128           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1129           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1130           0 :                         pSrc += src_stride;
    1131           0 :                         pRef += ref_stride;
    1132             :                     }
    1133           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1134           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    1135           0 :                     s0 = _mm_or_si128(s0, s8);
    1136           0 :                     s0 = _mm_minpos_epu16(s0);
    1137           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1138           0 :                     temSum &= 0x0000FFFF;
    1139           0 :                     if (temSum < lowSum) {
    1140           0 :                         if (temSum != 0xFFFF) { // no overflow
    1141           0 :                             lowSum = temSum;
    1142           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1143           0 :                             yBest = i;
    1144             :                         }
    1145             :                         else {
    1146           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1147           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1148           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1149           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1150           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1151           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1152           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1153           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1154           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    1155           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    1156           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    1157           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    1158           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    1159           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    1160           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    1161           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    1162           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    1163           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    1164           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    1165           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    1166           0 :                             k = leftover;
    1167           0 :                             while (k > 0) {
    1168           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    1169           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    1170           0 :                                     s0 = _mm_srli_si128(s0, 4);
    1171           0 :                                     if (temSum < lowSum) {
    1172           0 :                                         lowSum = temSum;
    1173           0 :                                         xBest = (int16_t)(j + leftover - k);
    1174           0 :                                         yBest = i;
    1175             :                                     }
    1176             :                                 }
    1177           0 :                                 s0 = s3;
    1178             :                             }
    1179             :                         }
    1180             :                     }
    1181             :                 }
    1182           0 :                 ref += src_stride_raw;
    1183             :             }
    1184             :         }
    1185             :         else {
    1186             :             __m128i s9, s10;
    1187           0 :             for (i = 0; i < search_area_height; i++) {
    1188           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    1189           0 :                     pSrc = src;
    1190           0 :                     pRef = ref + j;
    1191           0 :                     s9 = s10 = _mm_setzero_si128();
    1192           0 :                     k = 0;
    1193           0 :                     while (k < block_height) {
    1194           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1195           0 :                         for (l = 0; l < 21 && k < block_height; k++, l++) {
    1196           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    1197           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1198           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    1199           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1200           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1201           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1202           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1203           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1204           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1205           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1206           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1207           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1208           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1209           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1210           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1211           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1212           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1213           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1214           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1215           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1216           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1217           0 :                             pSrc += src_stride;
    1218           0 :                             pRef += ref_stride;
    1219             :                         }
    1220           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1221           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1222           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1223           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1224           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1225           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1226           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1227           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1228           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    1229           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    1230             :                     }
    1231           0 :                     s0 = _mm_packus_epi32(s9, s10);
    1232           0 :                     s0 = _mm_minpos_epu16(s0);
    1233           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1234           0 :                     temSum &= 0x0000FFFF;
    1235           0 :                     if (temSum < lowSum) {
    1236           0 :                         if (temSum != 0xFFFF) { // no overflow
    1237           0 :                             lowSum = temSum;
    1238           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1239           0 :                             yBest = i;
    1240             :                         }
    1241             :                         else {
    1242           0 :                             UPDATE_BEST(s9, 0, 0);
    1243           0 :                             UPDATE_BEST(s9, 1, 0);
    1244           0 :                             UPDATE_BEST(s9, 2, 0);
    1245           0 :                             UPDATE_BEST(s9, 3, 0);
    1246           0 :                             UPDATE_BEST(s10, 0, 4);
    1247           0 :                             UPDATE_BEST(s10, 1, 4);
    1248           0 :                             UPDATE_BEST(s10, 2, 4);
    1249           0 :                             UPDATE_BEST(s10, 3, 4);
    1250             :                         }
    1251             :                     }
    1252             :                 }
    1253             : 
    1254           0 :                 if (leftover) {
    1255           0 :                     pSrc = src;
    1256           0 :                     pRef = ref + j;
    1257           0 :                     s9 = s10 = _mm_setzero_si128();
    1258           0 :                     k = 0;
    1259           0 :                     while (k < block_height) {
    1260           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1261           0 :                         for (l = 0; l < 21 && k < block_height; k++, l++) {
    1262           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    1263           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1264           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    1265           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1266           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1267           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1268           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1269           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1270           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1271           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1272           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1273           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1274           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1275           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1276           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1277           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1278           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1279           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1280           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1281           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1282           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1283           0 :                             pSrc += src_stride;
    1284           0 :                             pRef += ref_stride;
    1285             :                         }
    1286           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1287           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1288           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1289           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1290           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1291           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1292           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1293           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1294           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    1295           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    1296             :                     }
    1297           0 :                     s0 = _mm_packus_epi32(s9, s10);
    1298           0 :                     s0 = _mm_or_si128(s0, s8);
    1299           0 :                     s0 = _mm_minpos_epu16(s0);
    1300           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1301           0 :                     temSum &= 0x0000FFFF;
    1302           0 :                     if (temSum < lowSum) {
    1303           0 :                         if (temSum != 0xFFFF) { // no overflow
    1304           0 :                             lowSum = temSum;
    1305           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1306           0 :                             yBest = i;
    1307             :                         }
    1308             :                         else {
    1309           0 :                             k = leftover;
    1310           0 :                             while (k > 0) {
    1311           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    1312           0 :                                     temSum = _mm_extract_epi32(s9, 0);
    1313           0 :                                     s9 = _mm_srli_si128(s9, 4);
    1314           0 :                                     if (temSum < lowSum) {
    1315           0 :                                         lowSum = temSum;
    1316           0 :                                         xBest = (int16_t)(j + leftover - k);
    1317           0 :                                         yBest = i;
    1318             :                                     }
    1319             :                                 }
    1320           0 :                                 s9 = s10;
    1321             :                             }
    1322             :                         }
    1323             :                     }
    1324             :                 }
    1325           0 :                 ref += src_stride_raw;
    1326             :             }
    1327             :         }
    1328           0 :         break;
    1329             : 
    1330           0 :     case 64:
    1331           0 :         if (block_height <= 32) {
    1332             :             __m128i s9, s10, s11, s12;
    1333           0 :             for (i = 0; i < search_area_height; i++) {
    1334           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    1335           0 :                     pSrc = src;
    1336           0 :                     pRef = ref + j;
    1337           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1338           0 :                     for (k = 0; k < block_height >> 1; k++) {
    1339           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1340           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1341           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1342           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1343           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1344           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1345           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1346           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1347           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1348           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1349           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1350           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1351           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1352           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1353           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1354           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1355           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1356           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1357           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1358           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1359           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1360           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1361           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1362           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1363           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1364           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1365           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1366           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1367           0 :                         pSrc += src_stride;
    1368           0 :                         pRef += ref_stride;
    1369             :                     }
    1370           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    1371           0 :                     for (; k < block_height; k++) {
    1372           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1373           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1374           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1375           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1376           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1377           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1378           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1379           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1380           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1381           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1382           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1383           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1384           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1385           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1386           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1387           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1388           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1389           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1390           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1391           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1392           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1393           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1394           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1395           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1396           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1397           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1398           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1399           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1400           0 :                         pSrc += src_stride;
    1401           0 :                         pRef += ref_stride;
    1402             :                     }
    1403           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1404           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    1405           0 :                     s0 = _mm_minpos_epu16(s0);
    1406           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1407           0 :                     temSum &= 0x0000FFFF;
    1408           0 :                     if (temSum < lowSum) {
    1409           0 :                         if (temSum != 0xFFFF) { // no overflow
    1410           0 :                             lowSum = temSum;
    1411           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1412           0 :                             yBest = i;
    1413             :                         }
    1414             :                         else {
    1415           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1416           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1417           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1418           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1419           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1420           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1421           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1422           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1423           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    1424           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    1425           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    1426           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    1427           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    1428           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    1429           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    1430           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    1431           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    1432           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    1433           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    1434           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    1435           0 :                             UPDATE_BEST(s0, 0, 0);
    1436           0 :                             UPDATE_BEST(s0, 1, 0);
    1437           0 :                             UPDATE_BEST(s0, 2, 0);
    1438           0 :                             UPDATE_BEST(s0, 3, 0);
    1439           0 :                             UPDATE_BEST(s3, 0, 4);
    1440           0 :                             UPDATE_BEST(s3, 1, 4);
    1441           0 :                             UPDATE_BEST(s3, 2, 4);
    1442           0 :                             UPDATE_BEST(s3, 3, 4);
    1443             :                         }
    1444             :                     }
    1445             :                 }
    1446             : 
    1447           0 :                 if (leftover) {
    1448           0 :                     pSrc = src;
    1449           0 :                     pRef = ref + j;
    1450           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1451           0 :                     for (k = 0; k < block_height >> 1; k++) {
    1452           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1453           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1454           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1455           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1456           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1457           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1458           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1459           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1460           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1461           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1462           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1463           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1464           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1465           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1466           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1467           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1468           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1469           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1470           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1471           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1472           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1473           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1474           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1475           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1476           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1477           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1478           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1479           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1480           0 :                         pSrc += src_stride;
    1481           0 :                         pRef += ref_stride;
    1482             :                     }
    1483           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    1484           0 :                     for (; k < block_height; k++) {
    1485           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1486           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1487           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1488           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1489           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1490           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1491           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1492           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1493           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1494           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1495           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1496           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1497           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1498           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1499           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1500           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1501           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1502           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1503           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1504           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1505           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1506           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1507           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1508           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1509           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    1510           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    1511           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    1512           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    1513           0 :                         pSrc += src_stride;
    1514           0 :                         pRef += ref_stride;
    1515             :                     }
    1516           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1517           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    1518           0 :                     s0 = _mm_or_si128(s0, s8);
    1519           0 :                     s0 = _mm_minpos_epu16(s0);
    1520           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1521           0 :                     temSum &= 0x0000FFFF;
    1522           0 :                     if (temSum < lowSum) {
    1523           0 :                         if (temSum != 0xFFFF) { // no overflow
    1524           0 :                             lowSum = temSum;
    1525           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1526           0 :                             yBest = i;
    1527             :                         }
    1528             :                         else {
    1529           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1530           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1531           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1532           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1533           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1534           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1535           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1536           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1537           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    1538           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    1539           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    1540           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    1541           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    1542           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    1543           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    1544           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    1545           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    1546           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    1547           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    1548           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    1549           0 :                             k = leftover;
    1550           0 :                             while (k > 0) {
    1551           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    1552           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    1553           0 :                                     s0 = _mm_srli_si128(s0, 4);
    1554           0 :                                     if (temSum < lowSum) {
    1555           0 :                                         lowSum = temSum;
    1556           0 :                                         xBest = (int16_t)(j + leftover - k);
    1557           0 :                                         yBest = i;
    1558             :                                     }
    1559             :                                 }
    1560           0 :                                 s0 = s3;
    1561             :                             }
    1562             :                         }
    1563             :                     }
    1564             :                 }
    1565           0 :                 ref += src_stride_raw;
    1566             :             }
    1567             :         }
    1568             :         else {
    1569             :             __m128i s9, s10;
    1570           0 :             for (i = 0; i < search_area_height; i++) {
    1571           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    1572           0 :                     pSrc = src;
    1573           0 :                     pRef = ref + j;
    1574           0 :                     s9 = s10 = _mm_setzero_si128();
    1575           0 :                     k = 0;
    1576           0 :                     while (k < block_height) {
    1577           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1578           0 :                         for (l = 0; l < 16 && k < block_height; k++, l++) {
    1579           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    1580           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1581           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    1582           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1583           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1584           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1585           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1586           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1587           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1588           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1589           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1590           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1591           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1592           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1593           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1594           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1595           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1596           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1597           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1598           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1599           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1600           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1601           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1602           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1603           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1604           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1605           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1606           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1607           0 :                             pSrc += src_stride;
    1608           0 :                             pRef += ref_stride;
    1609             :                         }
    1610           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1611           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1612           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1613           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1614           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1615           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1616           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1617           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1618           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    1619           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    1620             :                     }
    1621           0 :                     s0 = _mm_packus_epi32(s9, s10);
    1622           0 :                     s0 = _mm_minpos_epu16(s0);
    1623           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1624           0 :                     temSum &= 0x0000FFFF;
    1625           0 :                     if (temSum < lowSum) {
    1626           0 :                         if (temSum != 0xFFFF) { // no overflow
    1627           0 :                             lowSum = temSum;
    1628           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1629           0 :                             yBest = i;
    1630             :                         }
    1631             :                         else {
    1632           0 :                             UPDATE_BEST(s9, 0, 0);
    1633           0 :                             UPDATE_BEST(s9, 1, 0);
    1634           0 :                             UPDATE_BEST(s9, 2, 0);
    1635           0 :                             UPDATE_BEST(s9, 3, 0);
    1636           0 :                             UPDATE_BEST(s10, 0, 4);
    1637           0 :                             UPDATE_BEST(s10, 1, 4);
    1638           0 :                             UPDATE_BEST(s10, 2, 4);
    1639           0 :                             UPDATE_BEST(s10, 3, 4);
    1640             :                         }
    1641             :                     }
    1642             :                 }
    1643             : 
    1644           0 :                 if (leftover) {
    1645           0 :                     pSrc = src;
    1646           0 :                     pRef = ref + j;
    1647           0 :                     s9 = s10 = _mm_setzero_si128();
    1648           0 :                     k = 0;
    1649           0 :                     while (k < block_height) {
    1650           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1651           0 :                         for (l = 0; l < 16 && k < block_height; k++, l++) {
    1652           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    1653           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1654           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    1655           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1656           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1657           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1658           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1659           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    1660           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    1661           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    1662           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1663           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1664           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1665           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1666           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    1667           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    1668           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    1669           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1670           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1671           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1672           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1673           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    1674           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    1675           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    1676           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1677           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1678           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1679           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1680           0 :                             pSrc += src_stride;
    1681           0 :                             pRef += ref_stride;
    1682             :                         }
    1683           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    1684           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    1685           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    1686           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    1687           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    1688           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    1689           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    1690           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    1691           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    1692           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    1693             :                     }
    1694           0 :                     s0 = _mm_packus_epi32(s9, s10);
    1695           0 :                     s0 = _mm_or_si128(s0, s8);
    1696           0 :                     s0 = _mm_minpos_epu16(s0);
    1697           0 :                     temSum = _mm_extract_epi16(s0, 0);
    1698           0 :                     temSum &= 0x0000FFFF;
    1699           0 :                     if (temSum < lowSum) {
    1700           0 :                         if (temSum != 0xFFFF) { // no overflow
    1701           0 :                             lowSum = temSum;
    1702           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    1703           0 :                             yBest = i;
    1704             :                         }
    1705             :                         else {
    1706           0 :                             k = leftover;
    1707           0 :                             while (k > 0) {
    1708           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    1709           0 :                                     temSum = _mm_extract_epi32(s9, 0);
    1710           0 :                                     s9 = _mm_srli_si128(s9, 4);
    1711           0 :                                     if (temSum < lowSum) {
    1712           0 :                                         lowSum = temSum;
    1713           0 :                                         xBest = (int16_t)(j + leftover - k);
    1714           0 :                                         yBest = i;
    1715             :                                     }
    1716             :                                 }
    1717           0 :                                 s9 = s10;
    1718             :                             }
    1719             :                         }
    1720             :                     }
    1721             :                 }
    1722           0 :                 ref += src_stride_raw;
    1723             :             }
    1724             :         }
    1725           0 :         break;
    1726             : 
    1727           0 :     default:
    1728             :         assert(0);
    1729           0 :         break;
    1730             :     }
    1731             : 
    1732           0 :     *best_sad = lowSum;
    1733           0 :     *x_search_center = xBest;
    1734           0 :     *y_search_center = yBest;
    1735           0 : }
    1736             : 
    1737           0 : void sad_loop_kernel_sparse_sse4_1_intrin(
    1738             :     uint8_t  *src,                            // input parameter, source samples Ptr
    1739             :     uint32_t  src_stride,                      // input parameter, source stride
    1740             :     uint8_t  *ref,                            // input parameter, reference samples Ptr
    1741             :     uint32_t  ref_stride,                      // input parameter, reference stride
    1742             :     uint32_t  block_height,                   // input parameter, block height (M)
    1743             :     uint32_t  block_width,                    // input parameter, block width (N)
    1744             :     uint64_t *best_sad,
    1745             :     int16_t *x_search_center,
    1746             :     int16_t *y_search_center,
    1747             :     uint32_t  src_stride_raw,                   // input parameter, source stride (no line skipping)
    1748             :     int16_t search_area_width,
    1749             :     int16_t search_area_height)
    1750             : {
    1751           0 :     int16_t xBest = *x_search_center, yBest = *y_search_center;
    1752           0 :     uint32_t lowSum = 0xffffff;
    1753           0 :     uint32_t temSum = 0;
    1754             :     int16_t i, j;
    1755             :     uint32_t k, l;
    1756           0 :     uint32_t leftover = search_area_width & 7;
    1757             :     const uint8_t *pRef, *pSrc;
    1758           0 :     __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8 = _mm_set1_epi32(-1);
    1759             : 
    1760           0 :     if (leftover) {
    1761           0 :         for (k = 0; k < leftover; k++)
    1762           0 :             s8 = _mm_slli_si128(s8, 2);
    1763             :     }
    1764             : 
    1765           0 :     switch (block_width) {
    1766           0 :     case 4:
    1767           0 :         for (i = 0; i < search_area_height; i++) {
    1768           0 :             uint32_t startW = (i & 1) << 3;
    1769           0 :             for (j = startW; j <= search_area_width - 8; j += 16) {
    1770           0 :                 pSrc = src;
    1771           0 :                 pRef = ref + j;
    1772           0 :                 s3 = _mm_setzero_si128();
    1773           0 :                 for (k = 0; k < block_height; k += 2) {
    1774           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    1775           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    1776           0 :                     s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
    1777           0 :                     s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
    1778           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1779           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    1780           0 :                     pSrc += src_stride << 1;
    1781           0 :                     pRef += ref_stride << 1;
    1782             :                 }
    1783           0 :                 s3 = _mm_minpos_epu16(s3);
    1784           0 :                 temSum = _mm_extract_epi16(s3, 0);
    1785           0 :                 if (temSum < lowSum) {
    1786           0 :                     lowSum = temSum;
    1787           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1788           0 :                     yBest = i;
    1789             :                 }
    1790             :             }
    1791             : 
    1792           0 :             if (leftover && j < search_area_width ) {
    1793           0 :                 pSrc = src;
    1794           0 :                 pRef = ref + j;
    1795           0 :                 s3 = _mm_setzero_si128();
    1796           0 :                 for (k = 0; k < block_height; k += 2) {
    1797           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    1798           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    1799           0 :                     s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
    1800           0 :                     s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
    1801           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1802           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    1803           0 :                     pSrc += src_stride << 1;
    1804           0 :                     pRef += ref_stride << 1;
    1805             :                 }
    1806           0 :                 s3 = _mm_or_si128(s3, s8);
    1807           0 :                 s3 = _mm_minpos_epu16(s3);
    1808           0 :                 temSum = _mm_extract_epi16(s3, 0);
    1809           0 :                 if (temSum < lowSum) {
    1810           0 :                     lowSum = temSum;
    1811           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1812           0 :                     yBest = i;
    1813             :                 }
    1814             :             }
    1815           0 :             ref += src_stride_raw;
    1816             :         }
    1817           0 :         break;
    1818             : 
    1819           0 :     case 8:
    1820           0 :         for (i = 0; i < search_area_height; i++) {
    1821           0 :             uint32_t startW = (i & 1) << 3;
    1822           0 :             for (j = startW; j <= search_area_width - 8; j += 16) {
    1823           0 :                 pSrc = src;
    1824           0 :                 pRef = ref + j;
    1825           0 :                 s3 = s4 = _mm_setzero_si128();
    1826           0 :                 for (k = 0; k < block_height; k += 2) {
    1827           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    1828           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    1829           0 :                     s2 = _mm_loadl_epi64((__m128i*)pSrc);
    1830           0 :                     s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
    1831           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1832           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1833           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    1834           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
    1835           0 :                     pSrc += src_stride << 1;
    1836           0 :                     pRef += ref_stride << 1;
    1837             :                 }
    1838           0 :                 s3 = _mm_adds_epu16(s3, s4);
    1839           0 :                 s3 = _mm_minpos_epu16(s3);
    1840           0 :                 temSum = _mm_extract_epi16(s3, 0);
    1841           0 :                 if (temSum < lowSum) {
    1842           0 :                     lowSum = temSum;
    1843           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1844           0 :                     yBest = i;
    1845             :                 }
    1846             :             }
    1847             : 
    1848           0 :             if (leftover && j < search_area_width ) {
    1849           0 :                 pSrc = src;
    1850           0 :                 pRef = ref + j;
    1851           0 :                 s3 = s4 = _mm_setzero_si128();
    1852           0 :                 for (k = 0; k < block_height; k += 2) {
    1853           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    1854           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    1855           0 :                     s2 = _mm_loadl_epi64((__m128i*)pSrc);
    1856           0 :                     s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
    1857           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1858           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1859           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    1860           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
    1861           0 :                     pSrc += src_stride << 1;
    1862           0 :                     pRef += ref_stride << 1;
    1863             :                 }
    1864           0 :                 s3 = _mm_adds_epu16(s3, s4);
    1865           0 :                 s3 = _mm_or_si128(s3, s8);
    1866           0 :                 s3 = _mm_minpos_epu16(s3);
    1867           0 :                 temSum = _mm_extract_epi16(s3, 0);
    1868           0 :                 if (temSum < lowSum) {
    1869           0 :                     lowSum = temSum;
    1870           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1871           0 :                     yBest = i;
    1872             :                 }
    1873             :             }
    1874           0 :             ref += src_stride_raw;
    1875             :         }
    1876           0 :         break;
    1877             : 
    1878           0 :     case 16:
    1879           0 :         if (block_height <= 16) {
    1880           0 :             for (i = 0; i < search_area_height; i++) {
    1881           0 :                 uint32_t startW = (i & 1) << 3;
    1882           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    1883           0 :                     pSrc = src;
    1884           0 :                     pRef = ref + j;
    1885           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1886           0 :                     for (k = 0; k < block_height; k++) {
    1887           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1888           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1889           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1890           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1891           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1892           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1893           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1894           0 :                         pSrc += src_stride;
    1895           0 :                         pRef += ref_stride;
    1896             :                     }
    1897           0 :                     s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1898           0 :                     s3 = _mm_minpos_epu16(s3);
    1899           0 :                     temSum = _mm_extract_epi16(s3, 0);
    1900           0 :                     if (temSum < lowSum) {
    1901           0 :                         lowSum = temSum;
    1902           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1903           0 :                         yBest = i;
    1904             :                     }
    1905             :                 }
    1906             : 
    1907           0 :                 if (leftover && j < search_area_width ) {
    1908           0 :                     pSrc = src;
    1909           0 :                     pRef = ref + j;
    1910           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1911           0 :                     for (k = 0; k < block_height; k++) {
    1912           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1913           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1914           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1915           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1916           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1917           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1918           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1919           0 :                         pSrc += src_stride;
    1920           0 :                         pRef += ref_stride;
    1921             :                     }
    1922           0 :                     s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    1923           0 :                     s3 = _mm_or_si128(s3, s8);
    1924           0 :                     s3 = _mm_minpos_epu16(s3);
    1925           0 :                     temSum = _mm_extract_epi16(s3, 0);
    1926           0 :                     if (temSum < lowSum) {
    1927           0 :                         lowSum = temSum;
    1928           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    1929           0 :                         yBest = i;
    1930             :                     }
    1931             :                 }
    1932           0 :                 ref += src_stride_raw;
    1933             :             }
    1934             :         }
    1935           0 :         else if (block_height <= 32) {
    1936           0 :             for (i = 0; i < search_area_height; i++) {
    1937           0 :                 uint32_t startW = (i & 1) << 3;
    1938           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    1939           0 :                     pSrc = src;
    1940           0 :                     pRef = ref + j;
    1941           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1942           0 :                     for (k = 0; k < block_height; k++) {
    1943           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1944           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1945           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1946           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1947           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1948           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1949           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1950           0 :                         pSrc += src_stride;
    1951           0 :                         pRef += ref_stride;
    1952             :                     }
    1953           0 :                     s3 = _mm_adds_epu16(s3, s4);
    1954           0 :                     s5 = _mm_adds_epu16(s5, s6);
    1955           0 :                     s4 = _mm_minpos_epu16(s3);
    1956           0 :                     s6 = _mm_minpos_epu16(s5);
    1957           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    1958           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    1959           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    1960           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    1961           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    1962           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    1963           0 :                     s3 = _mm_sub_epi16(s3, s4);
    1964           0 :                     s5 = _mm_adds_epu16(s5, s3);
    1965           0 :                     s5 = _mm_sub_epi16(s5, s6);
    1966           0 :                     s5 = _mm_minpos_epu16(s5);
    1967           0 :                     temSum = _mm_extract_epi16(s5, 0);
    1968           0 :                     temSum += _mm_extract_epi16(s4, 0);
    1969           0 :                     temSum += _mm_extract_epi16(s6, 0);
    1970           0 :                     if (temSum < lowSum) {
    1971           0 :                         lowSum = temSum;
    1972           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    1973           0 :                         yBest = i;
    1974             :                     }
    1975             :                 }
    1976             : 
    1977           0 :                 if (leftover && j < search_area_width ) {
    1978           0 :                     pSrc = src;
    1979           0 :                     pRef = ref + j;
    1980           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    1981           0 :                     for (k = 0; k < block_height; k++) {
    1982           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    1983           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    1984           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    1985           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    1986           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    1987           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    1988           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    1989           0 :                         pSrc += src_stride;
    1990           0 :                         pRef += ref_stride;
    1991             :                     }
    1992           0 :                     s3 = _mm_adds_epu16(s3, s4);
    1993           0 :                     s5 = _mm_adds_epu16(s5, s6);
    1994           0 :                     s3 = _mm_or_si128(s3, s8);
    1995           0 :                     s5 = _mm_or_si128(s5, s8);
    1996           0 :                     s4 = _mm_minpos_epu16(s3);
    1997           0 :                     s6 = _mm_minpos_epu16(s5);
    1998           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    1999           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    2000           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    2001           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    2002           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    2003           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    2004           0 :                     s3 = _mm_sub_epi16(s3, s4);
    2005           0 :                     s5 = _mm_adds_epu16(s5, s3);
    2006           0 :                     s5 = _mm_sub_epi16(s5, s6);
    2007           0 :                     s5 = _mm_minpos_epu16(s5);
    2008           0 :                     temSum = _mm_extract_epi16(s5, 0);
    2009           0 :                     temSum += _mm_extract_epi16(s4, 0);
    2010           0 :                     temSum += _mm_extract_epi16(s6, 0);
    2011           0 :                     if (temSum < lowSum) {
    2012           0 :                         lowSum = temSum;
    2013           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    2014           0 :                         yBest = i;
    2015             :                     }
    2016             :                 }
    2017           0 :                 ref += src_stride_raw;
    2018             :             }
    2019             :         }
    2020             :         else {
    2021           0 :             for (i = 0; i < search_area_height; i++) {
    2022           0 :                 uint32_t startW = (i & 1) << 3;
    2023           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2024           0 :                     pSrc = src;
    2025           0 :                     pRef = ref + j;
    2026           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2027           0 :                     for (k = 0; k < block_height; k++) {
    2028           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2029           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2030           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2031           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2032           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2033           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2034           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2035           0 :                         pSrc += src_stride;
    2036           0 :                         pRef += ref_stride;
    2037             :                     }
    2038           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2039           0 :                     s0 = _mm_minpos_epu16(s0);
    2040           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2041           0 :                     if (temSum < lowSum) {
    2042           0 :                         if (temSum != 0xFFFF) { // no overflow
    2043           0 :                             lowSum = temSum;
    2044           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2045           0 :                             yBest = i;
    2046             :                         }
    2047             :                         else {
    2048           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2049           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2050           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2051           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2052           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2053           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2054           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2055           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2056           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2057           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2058           0 :                             UPDATE_BEST(s0, 0, 0);
    2059           0 :                             UPDATE_BEST(s0, 1, 0);
    2060           0 :                             UPDATE_BEST(s0, 2, 0);
    2061           0 :                             UPDATE_BEST(s0, 3, 0);
    2062           0 :                             UPDATE_BEST(s3, 0, 4);
    2063           0 :                             UPDATE_BEST(s3, 1, 4);
    2064           0 :                             UPDATE_BEST(s3, 2, 4);
    2065           0 :                             UPDATE_BEST(s3, 3, 4);
    2066             :                         }
    2067             :                     }
    2068             :                 }
    2069             : 
    2070           0 :                 if (leftover && j < search_area_width ) {
    2071           0 :                     pSrc = src;
    2072           0 :                     pRef = ref + j;
    2073           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2074           0 :                     for (k = 0; k < block_height; k++) {
    2075           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2076           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2077           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2078           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2079           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2080           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2081           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2082           0 :                         pSrc += src_stride;
    2083           0 :                         pRef += ref_stride;
    2084             :                     }
    2085           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2086           0 :                     s0 = _mm_or_si128(s0, s8);
    2087           0 :                     s0 = _mm_minpos_epu16(s0);
    2088           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2089           0 :                     if (temSum < lowSum) {
    2090           0 :                         if (temSum != 0xFFFF) { // no overflow
    2091           0 :                             lowSum = temSum;
    2092           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2093           0 :                             yBest = i;
    2094             :                         }
    2095             :                         else {
    2096           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2097           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2098           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2099           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2100           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2101           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2102           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2103           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2104           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2105           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2106           0 :                             k = leftover;
    2107           0 :                             while (k > 0) {
    2108           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2109           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    2110           0 :                                     s0 = _mm_srli_si128(s0, 4);
    2111           0 :                                     if (temSum < lowSum) {
    2112           0 :                                         lowSum = temSum;
    2113           0 :                                         xBest = (int16_t)(j + leftover - k);
    2114           0 :                                         yBest = i;
    2115             :                                     }
    2116             :                                 }
    2117           0 :                                 s0 = s3;
    2118             :                             }
    2119             :                         }
    2120             :                     }
    2121             :                 }
    2122           0 :                 ref += src_stride_raw;
    2123             :             }
    2124             :         }
    2125           0 :         break;
    2126             : 
    2127           0 :     case 24:
    2128           0 :         if (block_height <= 16) {
    2129           0 :             for (i = 0; i < search_area_height; i++) {
    2130           0 :                 uint32_t startW = (i & 1) << 3;
    2131           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2132           0 :                     pSrc = src;
    2133           0 :                     pRef = ref + j;
    2134           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2135           0 :                     for (k = 0; k < block_height; k++) {
    2136           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2137           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2138           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2139           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2140           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2141           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2142           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2143           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2144           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    2145           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2146           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2147           0 :                         pSrc += src_stride;
    2148           0 :                         pRef += ref_stride;
    2149             :                     }
    2150           0 :                     s3 = _mm_adds_epu16(s3, s4);
    2151           0 :                     s5 = _mm_adds_epu16(s5, s6);
    2152           0 :                     s4 = _mm_minpos_epu16(s3);
    2153           0 :                     s6 = _mm_minpos_epu16(s5);
    2154           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    2155           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    2156           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    2157           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    2158           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    2159           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    2160           0 :                     s3 = _mm_sub_epi16(s3, s4);
    2161           0 :                     s5 = _mm_adds_epu16(s5, s3);
    2162           0 :                     s5 = _mm_sub_epi16(s5, s6);
    2163           0 :                     s5 = _mm_minpos_epu16(s5);
    2164           0 :                     temSum = _mm_extract_epi16(s5, 0);
    2165           0 :                     temSum += _mm_extract_epi16(s4, 0);
    2166           0 :                     temSum += _mm_extract_epi16(s6, 0);
    2167           0 :                     if (temSum < lowSum) {
    2168           0 :                         lowSum = temSum;
    2169           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    2170           0 :                         yBest = i;
    2171             :                     }
    2172             :                 }
    2173             : 
    2174           0 :                 if (leftover && j < search_area_width ) {
    2175           0 :                     pSrc = src;
    2176           0 :                     pRef = ref + j;
    2177           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2178           0 :                     for (k = 0; k < block_height; k++) {
    2179           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2180           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2181           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2182           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2183           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2184           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2185           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2186           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2187           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    2188           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2189           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2190           0 :                         pSrc += src_stride;
    2191           0 :                         pRef += ref_stride;
    2192             :                     }
    2193           0 :                     s3 = _mm_adds_epu16(s3, s4);
    2194           0 :                     s5 = _mm_adds_epu16(s5, s6);
    2195           0 :                     s3 = _mm_or_si128(s3, s8);
    2196           0 :                     s5 = _mm_or_si128(s5, s8);
    2197           0 :                     s4 = _mm_minpos_epu16(s3);
    2198           0 :                     s6 = _mm_minpos_epu16(s5);
    2199           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    2200           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    2201           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    2202           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    2203           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    2204           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    2205           0 :                     s3 = _mm_sub_epi16(s3, s4);
    2206           0 :                     s5 = _mm_adds_epu16(s5, s3);
    2207           0 :                     s5 = _mm_sub_epi16(s5, s6);
    2208           0 :                     s5 = _mm_minpos_epu16(s5);
    2209           0 :                     temSum = _mm_extract_epi16(s5, 0);
    2210           0 :                     temSum += _mm_extract_epi16(s4, 0);
    2211           0 :                     temSum += _mm_extract_epi16(s6, 0);
    2212           0 :                     if (temSum < lowSum) {
    2213           0 :                         lowSum = temSum;
    2214           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    2215           0 :                         yBest = i;
    2216             :                     }
    2217             :                 }
    2218           0 :                 ref += src_stride_raw;
    2219             :             }
    2220             :         }
    2221             :         else {
    2222           0 :             for (i = 0; i < search_area_height; i++) {
    2223           0 :                 uint32_t startW = (i & 1) << 3;
    2224           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2225           0 :                     pSrc = src;
    2226           0 :                     pRef = ref + j;
    2227           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2228           0 :                     for (k = 0; k < block_height; k++) {
    2229           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2230           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2231           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2232           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2233           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2234           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2235           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2236           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2237           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    2238           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2239           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2240           0 :                         pSrc += src_stride;
    2241           0 :                         pRef += ref_stride;
    2242             :                     }
    2243           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2244           0 :                     s0 = _mm_minpos_epu16(s0);
    2245           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2246           0 :                     if (temSum < lowSum) {
    2247           0 :                         if (temSum != 0xFFFF) { // no overflow
    2248           0 :                             lowSum = temSum;
    2249           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2250           0 :                             yBest = i;
    2251             :                         }
    2252             :                         else {
    2253           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2254           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2255           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2256           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2257           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2258           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2259           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2260           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2261           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2262           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2263           0 :                             UPDATE_BEST(s0, 0, 0);
    2264           0 :                             UPDATE_BEST(s0, 1, 0);
    2265           0 :                             UPDATE_BEST(s0, 2, 0);
    2266           0 :                             UPDATE_BEST(s0, 3, 0);
    2267           0 :                             UPDATE_BEST(s3, 0, 4);
    2268           0 :                             UPDATE_BEST(s3, 1, 4);
    2269           0 :                             UPDATE_BEST(s3, 2, 4);
    2270           0 :                             UPDATE_BEST(s3, 3, 4);
    2271             :                         }
    2272             :                     }
    2273             :                 }
    2274             : 
    2275           0 :                 if (leftover && j < search_area_width ) {
    2276           0 :                     pSrc = src;
    2277           0 :                     pRef = ref + j;
    2278           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2279           0 :                     for (k = 0; k < block_height; k++) {
    2280           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2281           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2282           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2283           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2284           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2285           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2286           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2287           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2288           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    2289           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2290           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2291           0 :                         pSrc += src_stride;
    2292           0 :                         pRef += ref_stride;
    2293             :                     }
    2294           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2295           0 :                     s0 = _mm_or_si128(s0, s8);
    2296           0 :                     s0 = _mm_minpos_epu16(s0);
    2297           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2298           0 :                     if (temSum < lowSum) {
    2299           0 :                         if (temSum != 0xFFFF) { // no overflow
    2300           0 :                             lowSum = temSum;
    2301           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2302           0 :                             yBest = i;
    2303             :                         }
    2304             :                         else {
    2305           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2306           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2307           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2308           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2309           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2310           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2311           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2312           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2313           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2314           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2315           0 :                             k = leftover;
    2316           0 :                             while (k > 0) {
    2317           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2318           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    2319           0 :                                     s0 = _mm_srli_si128(s0, 4);
    2320           0 :                                     if (temSum < lowSum) {
    2321           0 :                                         lowSum = temSum;
    2322           0 :                                         xBest = (int16_t)(j + leftover - k);
    2323           0 :                                         yBest = i;
    2324             :                                     }
    2325             :                                 }
    2326           0 :                                 s0 = s3;
    2327             :                             }
    2328             :                         }
    2329             :                     }
    2330             :                 }
    2331           0 :                 ref += src_stride_raw;
    2332             :             }
    2333             :         }
    2334           0 :         break;
    2335             : 
    2336           0 :     case 32:
    2337           0 :         if (block_height <= 32) {
    2338           0 :             for (i = 0; i < search_area_height; i++) {
    2339           0 :                 uint32_t startW = (i & 1) << 3;
    2340           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2341           0 :                     pSrc = src;
    2342           0 :                     pRef = ref + j;
    2343           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2344           0 :                     for (k = 0; k < block_height; k++) {
    2345           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2346           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2347           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2348           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2349           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2350           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2351           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2352           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2353           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2354           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2355           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2356           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2357           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2358           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2359           0 :                         pSrc += src_stride;
    2360           0 :                         pRef += ref_stride;
    2361             :                     }
    2362           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2363           0 :                     s0 = _mm_minpos_epu16(s0);
    2364           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2365           0 :                     temSum &= 0x0000FFFF;
    2366           0 :                     if (temSum < lowSum) {
    2367           0 :                         if (temSum != 0xFFFF) { // no overflow
    2368           0 :                             lowSum = temSum;
    2369           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2370           0 :                             yBest = i;
    2371             :                         }
    2372             :                         else {
    2373           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2374           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2375           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2376           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2377           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2378           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2379           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2380           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2381           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2382           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2383           0 :                             UPDATE_BEST(s0, 0, 0);
    2384           0 :                             UPDATE_BEST(s0, 1, 0);
    2385           0 :                             UPDATE_BEST(s0, 2, 0);
    2386           0 :                             UPDATE_BEST(s0, 3, 0);
    2387           0 :                             UPDATE_BEST(s3, 0, 4);
    2388           0 :                             UPDATE_BEST(s3, 1, 4);
    2389           0 :                             UPDATE_BEST(s3, 2, 4);
    2390           0 :                             UPDATE_BEST(s3, 3, 4);
    2391             :                         }
    2392             :                     }
    2393             :                 }
    2394             : 
    2395           0 :                 if (leftover && j < search_area_width ) {
    2396           0 :                     pSrc = src;
    2397           0 :                     pRef = ref + j;
    2398           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2399           0 :                     for (k = 0; k < block_height; k++) {
    2400           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2401           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2402           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2403           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2404           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2405           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2406           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2407           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2408           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2409           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2410           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2411           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2412           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2413           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2414           0 :                         pSrc += src_stride;
    2415           0 :                         pRef += ref_stride;
    2416             :                     }
    2417           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2418           0 :                     s0 = _mm_or_si128(s0, s8);
    2419           0 :                     s0 = _mm_minpos_epu16(s0);
    2420           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2421           0 :                     temSum &= 0x0000FFFF;
    2422           0 :                     if (temSum < lowSum) {
    2423           0 :                         if (temSum != 0xFFFF) { // no overflow
    2424           0 :                             lowSum = temSum;
    2425           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2426           0 :                             yBest = i;
    2427             :                         }
    2428             :                         else {
    2429           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2430           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2431           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2432           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2433           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2434           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2435           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2436           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2437           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2438           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2439           0 :                             k = leftover;
    2440           0 :                             while (k > 0) {
    2441           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2442           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    2443           0 :                                     s0 = _mm_srli_si128(s0, 4);
    2444           0 :                                     if (temSum < lowSum) {
    2445           0 :                                         lowSum = temSum;
    2446           0 :                                         xBest = (int16_t)(j + leftover - k);
    2447           0 :                                         yBest = i;
    2448             :                                     }
    2449             :                                 }
    2450           0 :                                 s0 = s3;
    2451             :                             }
    2452             :                         }
    2453             :                     }
    2454             :                 }
    2455           0 :                 ref += src_stride_raw;
    2456             :             }
    2457             :         }
    2458             :         else {
    2459             :             __m128i s9, s10, s11, s12;
    2460           0 :             for (i = 0; i < search_area_height; i++) {
    2461           0 :                 uint32_t startW = (i & 1) << 3;
    2462           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2463           0 :                     pSrc = src;
    2464           0 :                     pRef = ref + j;
    2465           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2466           0 :                     for (k = 0; k < block_height >> 1; k++) {
    2467           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2468           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2469           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2470           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2471           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2472           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2473           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2474           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2475           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2476           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2477           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2478           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2479           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2480           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2481           0 :                         pSrc += src_stride;
    2482           0 :                         pRef += ref_stride;
    2483             :                     }
    2484           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    2485           0 :                     for (; k < block_height; k++) {
    2486           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2487           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2488           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2489           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2490           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2491           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2492           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2493           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2494           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2495           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2496           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2497           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2498           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2499           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2500           0 :                         pSrc += src_stride;
    2501           0 :                         pRef += ref_stride;
    2502             :                     }
    2503           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2504           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    2505           0 :                     s0 = _mm_minpos_epu16(s0);
    2506           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2507           0 :                     temSum &= 0x0000FFFF;
    2508           0 :                     if (temSum < lowSum) {
    2509           0 :                         if (temSum != 0xFFFF) { // no overflow
    2510           0 :                             lowSum = temSum;
    2511           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2512           0 :                             yBest = i;
    2513             :                         }
    2514             :                         else {
    2515           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2516           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2517           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2518           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2519           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2520           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2521           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2522           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2523           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2524           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2525           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    2526           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    2527           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    2528           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    2529           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    2530           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    2531           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    2532           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    2533           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    2534           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    2535           0 :                             UPDATE_BEST(s0, 0, 0);
    2536           0 :                             UPDATE_BEST(s0, 1, 0);
    2537           0 :                             UPDATE_BEST(s0, 2, 0);
    2538           0 :                             UPDATE_BEST(s0, 3, 0);
    2539           0 :                             UPDATE_BEST(s3, 0, 4);
    2540           0 :                             UPDATE_BEST(s3, 1, 4);
    2541           0 :                             UPDATE_BEST(s3, 2, 4);
    2542           0 :                             UPDATE_BEST(s3, 3, 4);
    2543             :                         }
    2544             :                     }
    2545             :                 }
    2546             : 
    2547           0 :                 if (leftover && j < search_area_width ) {
    2548           0 :                     pSrc = src;
    2549           0 :                     pRef = ref + j;
    2550           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2551           0 :                     for (k = 0; k < block_height >> 1; k++) {
    2552           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2553           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2554           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2555           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2556           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2557           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2558           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2559           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2560           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2561           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2562           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2563           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2564           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2565           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2566           0 :                         pSrc += src_stride;
    2567           0 :                         pRef += ref_stride;
    2568             :                     }
    2569           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    2570           0 :                     for (; k < block_height; k++) {
    2571           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2572           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2573           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2574           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2575           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2576           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2577           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2578           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2579           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2580           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2581           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2582           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2583           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2584           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2585           0 :                         pSrc += src_stride;
    2586           0 :                         pRef += ref_stride;
    2587             :                     }
    2588           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2589           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    2590           0 :                     s0 = _mm_or_si128(s0, s8);
    2591           0 :                     s0 = _mm_minpos_epu16(s0);
    2592           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2593           0 :                     temSum &= 0x0000FFFF;
    2594           0 :                     if (temSum < lowSum) {
    2595           0 :                         if (temSum != 0xFFFF) { // no overflow
    2596           0 :                             lowSum = temSum;
    2597           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2598           0 :                             yBest = i;
    2599             :                         }
    2600             :                         else {
    2601           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2602           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2603           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2604           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2605           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2606           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2607           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2608           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2609           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2610           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2611           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    2612           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    2613           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    2614           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    2615           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    2616           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    2617           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    2618           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    2619           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    2620           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    2621           0 :                             k = leftover;
    2622           0 :                             while (k > 0) {
    2623           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2624           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    2625           0 :                                     s0 = _mm_srli_si128(s0, 4);
    2626           0 :                                     if (temSum < lowSum) {
    2627           0 :                                         lowSum = temSum;
    2628           0 :                                         xBest = (int16_t)(j + leftover - k);
    2629           0 :                                         yBest = i;
    2630             :                                     }
    2631             :                                 }
    2632           0 :                                 s0 = s3;
    2633             :                             }
    2634             :                         }
    2635             :                     }
    2636             :                 }
    2637           0 :                 ref += src_stride_raw;
    2638             :             }
    2639             :         }
    2640           0 :         break;
    2641             : 
    2642           0 :     case 48:
    2643           0 :         if (block_height <= 32) {
    2644             :             __m128i s9, s10, s11, s12;
    2645           0 :             for (i = 0; i < search_area_height; i++) {
    2646           0 :                 uint32_t startW = (i & 1) << 3;
    2647           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2648           0 :                     pSrc = src;
    2649           0 :                     pRef = ref + j;
    2650           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2651           0 :                     for (k = 0; k < block_height >> 1; k++) {
    2652           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2653           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2654           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2655           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2656           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2657           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2658           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2659           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2660           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2661           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2662           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2663           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2664           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2665           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2666           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2667           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2668           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2669           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2670           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2671           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2672           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2673           0 :                         pSrc += src_stride;
    2674           0 :                         pRef += ref_stride;
    2675             :                     }
    2676           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    2677           0 :                     for (; k < block_height; k++) {
    2678           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2679           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2680           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2681           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2682           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2683           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2684           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2685           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2686           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2687           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2688           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2689           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2690           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2691           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2692           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2693           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2694           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2695           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2696           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2697           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2698           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2699           0 :                         pSrc += src_stride;
    2700           0 :                         pRef += ref_stride;
    2701             :                     }
    2702           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2703           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    2704           0 :                     s0 = _mm_minpos_epu16(s0);
    2705           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2706           0 :                     temSum &= 0x0000FFFF;
    2707           0 :                     if (temSum < lowSum) {
    2708           0 :                         if (temSum != 0xFFFF) { // no overflow
    2709           0 :                             lowSum = temSum;
    2710           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2711           0 :                             yBest = i;
    2712             :                         }
    2713             :                         else {
    2714           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2715           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2716           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2717           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2718           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2719           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2720           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2721           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2722           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2723           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2724           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    2725           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    2726           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    2727           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    2728           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    2729           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    2730           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    2731           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    2732           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    2733           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    2734           0 :                             UPDATE_BEST(s0, 0, 0);
    2735           0 :                             UPDATE_BEST(s0, 1, 0);
    2736           0 :                             UPDATE_BEST(s0, 2, 0);
    2737           0 :                             UPDATE_BEST(s0, 3, 0);
    2738           0 :                             UPDATE_BEST(s3, 0, 4);
    2739           0 :                             UPDATE_BEST(s3, 1, 4);
    2740           0 :                             UPDATE_BEST(s3, 2, 4);
    2741           0 :                             UPDATE_BEST(s3, 3, 4);
    2742             :                         }
    2743             :                     }
    2744             :                 }
    2745             : 
    2746           0 :                 if (leftover && j < search_area_width ) {
    2747           0 :                     pSrc = src;
    2748           0 :                     pRef = ref + j;
    2749           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2750           0 :                     for (k = 0; k < block_height >> 1; k++) {
    2751           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2752           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2753           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2754           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2755           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2756           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2757           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2758           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2759           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2760           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2761           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2762           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2763           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2764           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2765           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2766           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2767           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2768           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2769           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2770           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2771           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2772           0 :                         pSrc += src_stride;
    2773           0 :                         pRef += ref_stride;
    2774             :                     }
    2775           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    2776           0 :                     for (; k < block_height; k++) {
    2777           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    2778           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2779           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    2780           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2781           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2782           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2783           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2784           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2785           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2786           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2787           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2788           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2789           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2790           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2791           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2792           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2793           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2794           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    2795           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    2796           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    2797           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    2798           0 :                         pSrc += src_stride;
    2799           0 :                         pRef += ref_stride;
    2800             :                     }
    2801           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    2802           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    2803           0 :                     s0 = _mm_or_si128(s0, s8);
    2804           0 :                     s0 = _mm_minpos_epu16(s0);
    2805           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2806           0 :                     temSum &= 0x0000FFFF;
    2807           0 :                     if (temSum < lowSum) {
    2808           0 :                         if (temSum != 0xFFFF) { // no overflow
    2809           0 :                             lowSum = temSum;
    2810           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2811           0 :                             yBest = i;
    2812             :                         }
    2813             :                         else {
    2814           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2815           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2816           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2817           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2818           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2819           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2820           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2821           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2822           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    2823           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    2824           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    2825           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    2826           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    2827           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    2828           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    2829           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    2830           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    2831           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    2832           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    2833           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    2834           0 :                             k = leftover;
    2835           0 :                             while (k > 0) {
    2836           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2837           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    2838           0 :                                     s0 = _mm_srli_si128(s0, 4);
    2839           0 :                                     if (temSum < lowSum) {
    2840           0 :                                         lowSum = temSum;
    2841           0 :                                         xBest = (int16_t)(j + leftover - k);
    2842           0 :                                         yBest = i;
    2843             :                                     }
    2844             :                                 }
    2845           0 :                                 s0 = s3;
    2846             :                             }
    2847             :                         }
    2848             :                     }
    2849             :                 }
    2850           0 :                 ref += src_stride_raw;
    2851             :             }
    2852             :         }
    2853             :         else {
    2854             :             __m128i s9, s10;
    2855           0 :             for (i = 0; i < search_area_height; i++) {
    2856           0 :                 uint32_t startW = (i & 1) << 3;
    2857           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    2858           0 :                     pSrc = src;
    2859           0 :                     pRef = ref + j;
    2860           0 :                     s9 = s10 = _mm_setzero_si128();
    2861           0 :                     k = 0;
    2862           0 :                     while (k < block_height) {
    2863           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2864           0 :                         for (l = 0; l < 21 && k < block_height; k++, l++) {
    2865           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    2866           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2867           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    2868           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2869           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2870           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2871           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2872           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2873           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2874           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2875           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2876           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2877           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2878           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2879           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2880           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2881           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2882           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2883           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2884           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2885           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2886           0 :                             pSrc += src_stride;
    2887           0 :                             pRef += ref_stride;
    2888             :                         }
    2889           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2890           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2891           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2892           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2893           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2894           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2895           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2896           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2897           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    2898           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    2899             :                     }
    2900           0 :                     s0 = _mm_packus_epi32(s9, s10);
    2901           0 :                     s0 = _mm_minpos_epu16(s0);
    2902           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2903           0 :                     temSum &= 0x0000FFFF;
    2904           0 :                     if (temSum < lowSum) {
    2905           0 :                         if (temSum != 0xFFFF) { // no overflow
    2906           0 :                             lowSum = temSum;
    2907           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2908           0 :                             yBest = i;
    2909             :                         }
    2910             :                         else {
    2911           0 :                             UPDATE_BEST(s9, 0, 0);
    2912           0 :                             UPDATE_BEST(s9, 1, 0);
    2913           0 :                             UPDATE_BEST(s9, 2, 0);
    2914           0 :                             UPDATE_BEST(s9, 3, 0);
    2915           0 :                             UPDATE_BEST(s10, 0, 4);
    2916           0 :                             UPDATE_BEST(s10, 1, 4);
    2917           0 :                             UPDATE_BEST(s10, 2, 4);
    2918           0 :                             UPDATE_BEST(s10, 3, 4);
    2919             :                         }
    2920             :                     }
    2921             :                 }
    2922             : 
    2923           0 :                 if (leftover && j < search_area_width ) {
    2924           0 :                     pSrc = src;
    2925           0 :                     pRef = ref + j;
    2926           0 :                     s9 = s10 = _mm_setzero_si128();
    2927           0 :                     k = 0;
    2928           0 :                     while (k < block_height) {
    2929           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    2930           0 :                         for (l = 0; l < 21 && k < block_height; k++, l++) {
    2931           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    2932           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    2933           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    2934           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2935           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2936           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2937           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2938           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    2939           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    2940           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    2941           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2942           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2943           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2944           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2945           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    2946           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    2947           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    2948           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    2949           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    2950           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    2951           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    2952           0 :                             pSrc += src_stride;
    2953           0 :                             pRef += ref_stride;
    2954             :                         }
    2955           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    2956           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    2957           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    2958           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    2959           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    2960           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    2961           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    2962           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    2963           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    2964           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    2965             :                     }
    2966           0 :                     s0 = _mm_packus_epi32(s9, s10);
    2967           0 :                     s0 = _mm_or_si128(s0, s8);
    2968           0 :                     s0 = _mm_minpos_epu16(s0);
    2969           0 :                     temSum = _mm_extract_epi16(s0, 0);
    2970           0 :                     temSum &= 0x0000FFFF;
    2971           0 :                     if (temSum < lowSum) {
    2972           0 :                         if (temSum != 0xFFFF) { // no overflow
    2973           0 :                             lowSum = temSum;
    2974           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    2975           0 :                             yBest = i;
    2976             :                         }
    2977             :                         else {
    2978           0 :                             k = leftover;
    2979           0 :                             while (k > 0) {
    2980           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    2981           0 :                                     temSum = _mm_extract_epi32(s9, 0);
    2982           0 :                                     s9 = _mm_srli_si128(s9, 4);
    2983           0 :                                     if (temSum < lowSum) {
    2984           0 :                                         lowSum = temSum;
    2985           0 :                                         xBest = (int16_t)(j + leftover - k);
    2986           0 :                                         yBest = i;
    2987             :                                     }
    2988             :                                 }
    2989           0 :                                 s9 = s10;
    2990             :                             }
    2991             :                         }
    2992             :                     }
    2993             :                 }
    2994           0 :                 ref += src_stride_raw;
    2995             :             }
    2996             :         }
    2997           0 :         break;
    2998             : 
    2999           0 :     case 64:
    3000           0 :         if (block_height <= 32) {
    3001             :             __m128i s9, s10, s11, s12;
    3002           0 :             for (i = 0; i < search_area_height; i++) {
    3003           0 :                 uint32_t startW = (i & 1) << 3;
    3004           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    3005           0 :                     pSrc = src;
    3006           0 :                     pRef = ref + j;
    3007           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3008           0 :                     for (k = 0; k < block_height >> 1; k++) {
    3009           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3010           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3011           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3012           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3013           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3014           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3015           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3016           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3017           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3018           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3019           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3020           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3021           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3022           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3023           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3024           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3025           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3026           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3027           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3028           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3029           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3030           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3031           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3032           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3033           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3034           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3035           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3036           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3037           0 :                         pSrc += src_stride;
    3038           0 :                         pRef += ref_stride;
    3039             :                     }
    3040           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    3041           0 :                     for (; k < block_height; k++) {
    3042           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3043           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3044           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3045           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3046           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3047           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3048           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3049           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3050           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3051           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3052           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3053           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3054           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3055           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3056           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3057           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3058           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3059           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3060           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3061           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3062           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3063           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3064           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3065           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3066           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3067           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3068           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3069           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3070           0 :                         pSrc += src_stride;
    3071           0 :                         pRef += ref_stride;
    3072             :                     }
    3073           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3074           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    3075           0 :                     s0 = _mm_minpos_epu16(s0);
    3076           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3077           0 :                     temSum &= 0x0000FFFF;
    3078           0 :                     if (temSum < lowSum) {
    3079           0 :                         if (temSum != 0xFFFF) { // no overflow
    3080           0 :                             lowSum = temSum;
    3081           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3082           0 :                             yBest = i;
    3083             :                         }
    3084             :                         else {
    3085           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3086           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3087           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3088           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3089           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3090           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3091           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3092           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3093           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3094           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3095           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    3096           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    3097           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    3098           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    3099           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    3100           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    3101           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    3102           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    3103           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    3104           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    3105           0 :                             UPDATE_BEST(s0, 0, 0);
    3106           0 :                             UPDATE_BEST(s0, 1, 0);
    3107           0 :                             UPDATE_BEST(s0, 2, 0);
    3108           0 :                             UPDATE_BEST(s0, 3, 0);
    3109           0 :                             UPDATE_BEST(s3, 0, 4);
    3110           0 :                             UPDATE_BEST(s3, 1, 4);
    3111           0 :                             UPDATE_BEST(s3, 2, 4);
    3112           0 :                             UPDATE_BEST(s3, 3, 4);
    3113             :                         }
    3114             :                     }
    3115             :                 }
    3116             : 
    3117           0 :                 if (leftover && j < search_area_width ) {
    3118           0 :                     pSrc = src;
    3119           0 :                     pRef = ref + j;
    3120           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3121           0 :                     for (k = 0; k < block_height >> 1; k++) {
    3122           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3123           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3124           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3125           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3126           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3127           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3128           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3129           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3130           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3131           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3132           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3133           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3134           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3135           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3136           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3137           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3138           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3139           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3140           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3141           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3142           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3143           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3144           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3145           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3146           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3147           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3148           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3149           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3150           0 :                         pSrc += src_stride;
    3151           0 :                         pRef += ref_stride;
    3152             :                     }
    3153           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    3154           0 :                     for (; k < block_height; k++) {
    3155           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3156           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3157           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3158           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3159           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3160           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3161           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3162           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3163           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3164           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3165           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3166           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3167           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3168           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3169           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3170           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3171           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3172           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3173           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3174           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3175           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3176           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3177           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3178           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3179           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3180           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3181           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3182           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3183           0 :                         pSrc += src_stride;
    3184           0 :                         pRef += ref_stride;
    3185             :                     }
    3186           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3187           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    3188           0 :                     s0 = _mm_or_si128(s0, s8);
    3189           0 :                     s0 = _mm_minpos_epu16(s0);
    3190           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3191           0 :                     temSum &= 0x0000FFFF;
    3192           0 :                     if (temSum < lowSum) {
    3193           0 :                         if (temSum != 0xFFFF) { // no overflow
    3194           0 :                             lowSum = temSum;
    3195           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3196           0 :                             yBest = i;
    3197             :                         }
    3198             :                         else {
    3199           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3200           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3201           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3202           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3203           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3204           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3205           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3206           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3207           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3208           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3209           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    3210           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    3211           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    3212           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    3213           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    3214           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    3215           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    3216           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    3217           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    3218           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    3219           0 :                             k = leftover;
    3220           0 :                             while (k > 0) {
    3221           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    3222           0 :                                     temSum = _mm_extract_epi32(s0, 0);
    3223           0 :                                     s0 = _mm_srli_si128(s0, 4);
    3224           0 :                                     if (temSum < lowSum) {
    3225           0 :                                         lowSum = temSum;
    3226           0 :                                         xBest = (int16_t)(j + leftover - k);
    3227           0 :                                         yBest = i;
    3228             :                                     }
    3229             :                                 }
    3230           0 :                                 s0 = s3;
    3231             :                             }
    3232             :                         }
    3233             :                     }
    3234             :                 }
    3235           0 :                 ref += src_stride_raw;
    3236             :             }
    3237             :         }
    3238             :         else {
    3239             :             __m128i s9, s10;
    3240           0 :             for (i = 0; i < search_area_height; i++) {
    3241           0 :                 uint32_t startW = (i & 1) << 3;
    3242           0 :                 for (j = startW; j <= search_area_width - 8; j += 16) {
    3243           0 :                     pSrc = src;
    3244           0 :                     pRef = ref + j;
    3245           0 :                     s9 = s10 = _mm_setzero_si128();
    3246           0 :                     k = 0;
    3247           0 :                     while (k < block_height) {
    3248           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3249           0 :                         for (l = 0; l < 16 && k < block_height; k++, l++) {
    3250           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    3251           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3252           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    3253           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3254           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3255           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3256           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3257           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3258           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3259           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3260           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3261           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3262           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3263           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3264           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3265           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3266           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3267           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3268           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3269           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3270           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3271           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3272           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3273           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3274           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3275           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3276           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3277           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3278           0 :                             pSrc += src_stride;
    3279           0 :                             pRef += ref_stride;
    3280             :                         }
    3281           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3282           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3283           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3284           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3285           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3286           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3287           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3288           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3289           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    3290           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    3291             :                     }
    3292           0 :                     s0 = _mm_packus_epi32(s9, s10);
    3293           0 :                     s0 = _mm_minpos_epu16(s0);
    3294           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3295           0 :                     temSum &= 0x0000FFFF;
    3296           0 :                     if (temSum < lowSum) {
    3297           0 :                         if (temSum != 0xFFFF) { // no overflow
    3298           0 :                             lowSum = temSum;
    3299           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3300           0 :                             yBest = i;
    3301             :                         }
    3302             :                         else {
    3303           0 :                             UPDATE_BEST(s9, 0, 0);
    3304           0 :                             UPDATE_BEST(s9, 1, 0);
    3305           0 :                             UPDATE_BEST(s9, 2, 0);
    3306           0 :                             UPDATE_BEST(s9, 3, 0);
    3307           0 :                             UPDATE_BEST(s10, 0, 4);
    3308           0 :                             UPDATE_BEST(s10, 1, 4);
    3309           0 :                             UPDATE_BEST(s10, 2, 4);
    3310           0 :                             UPDATE_BEST(s10, 3, 4);
    3311             :                         }
    3312             :                     }
    3313             :                 }
    3314             : 
    3315           0 :                 if (leftover && j < search_area_width ) {
    3316           0 :                     pSrc = src;
    3317           0 :                     pRef = ref + j;
    3318           0 :                     s9 = s10 = _mm_setzero_si128();
    3319           0 :                     k = 0;
    3320           0 :                     while (k < block_height) {
    3321           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3322           0 :                         for (l = 0; l < 16 && k < block_height; k++, l++) {
    3323           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    3324           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3325           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    3326           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3327           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3328           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3329           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3330           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3331           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3332           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3333           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3334           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3335           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3336           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3337           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3338           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3339           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3340           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3341           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3342           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3343           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3344           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    3345           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    3346           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    3347           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3348           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3349           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3350           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3351           0 :                             pSrc += src_stride;
    3352           0 :                             pRef += ref_stride;
    3353             :                         }
    3354           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3355           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3356           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3357           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3358           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3359           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3360           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3361           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3362           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    3363           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    3364             :                     }
    3365           0 :                     s0 = _mm_packus_epi32(s9, s10);
    3366           0 :                     s0 = _mm_or_si128(s0, s8);
    3367           0 :                     s0 = _mm_minpos_epu16(s0);
    3368           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3369           0 :                     temSum &= 0x0000FFFF;
    3370           0 :                     if (temSum < lowSum) {
    3371           0 :                         if (temSum != 0xFFFF) { // no overflow
    3372           0 :                             lowSum = temSum;
    3373           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3374           0 :                             yBest = i;
    3375             :                         }
    3376             :                         else {
    3377           0 :                             k = leftover;
    3378           0 :                             while (k > 0) {
    3379           0 :                                 for (l = 0; l < 4 && k; l++, k--) {
    3380           0 :                                     temSum = _mm_extract_epi32(s9, 0);
    3381           0 :                                     s9 = _mm_srli_si128(s9, 4);
    3382           0 :                                     if (temSum < lowSum) {
    3383           0 :                                         lowSum = temSum;
    3384           0 :                                         xBest = (int16_t)(j + leftover - k);
    3385           0 :                                         yBest = i;
    3386             :                                     }
    3387             :                                 }
    3388           0 :                                 s9 = s10;
    3389             :                             }
    3390             :                         }
    3391             :                     }
    3392             :                 }
    3393           0 :                 ref += src_stride_raw;
    3394             :             }
    3395             :         }
    3396           0 :         break;
    3397             : 
    3398           0 :     default:
    3399             :         assert(0);
    3400           0 :         break;
    3401             :     }
    3402             : 
    3403           0 :     *best_sad = lowSum;
    3404           0 :     *x_search_center = xBest;
    3405           0 :     *y_search_center = yBest;
    3406           0 : }
    3407             : 
    3408             : /*******************************************************************************
    3409             : * Requirement: block_width = 4, 8, 16, 24, 32, 48 or 64
    3410             : * Requirement: block_height <= 64
    3411             : * Requirement: block_height % 2 = 0 when block_width = 4 or 8
    3412             : *******************************************************************************/
    3413           0 : void sad_loop_kernel_sse4_1_hme_l0_intrin(
    3414             :     uint8_t  *src,                            // input parameter, source samples Ptr
    3415             :     uint32_t  src_stride,                      // input parameter, source stride
    3416             :     uint8_t  *ref,                            // input parameter, reference samples Ptr
    3417             :     uint32_t  ref_stride,                      // input parameter, reference stride
    3418             :     uint32_t  block_height,                   // input parameter, block height (M)
    3419             :     uint32_t  block_width,                    // input parameter, block width (N)
    3420             :     uint64_t *best_sad,
    3421             :     int16_t *x_search_center,
    3422             :     int16_t *y_search_center,
    3423             :     uint32_t  src_stride_raw,                   // input parameter, source stride (no line skipping)
    3424             :     int16_t search_area_width,
    3425             :     int16_t search_area_height)
    3426             : {
    3427           0 :     int16_t xBest = *x_search_center, yBest = *y_search_center;
    3428           0 :     uint32_t lowSum = 0xffffff;
    3429           0 :     uint32_t temSum = 0;
    3430             :     int16_t i, j;
    3431             :     uint32_t k, l;
    3432             :     const uint8_t *pRef, *pSrc;
    3433             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11;
    3434             : 
    3435           0 :     switch (block_width) {
    3436           0 :     case 4:
    3437           0 :         for (i = 0; i < search_area_height; i++) {
    3438           0 :             for (j = 0; j <= search_area_width - 8; j += 8) {
    3439           0 :                 pSrc = src;
    3440           0 :                 pRef = ref + j;
    3441           0 :                 s3 = _mm_setzero_si128();
    3442           0 :                 for (k = 0; k < block_height; k += 2) {
    3443           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    3444           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    3445           0 :                     s2 = _mm_cvtsi32_si128(*(uint32_t *)pSrc);
    3446           0 :                     s5 = _mm_cvtsi32_si128(*(uint32_t *)(pSrc + src_stride));
    3447           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3448           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    3449           0 :                     pSrc += src_stride << 1;
    3450           0 :                     pRef += ref_stride << 1;
    3451             :                 }
    3452           0 :                 s3 = _mm_minpos_epu16(s3);
    3453           0 :                 temSum = _mm_extract_epi16(s3, 0);
    3454           0 :                 if (temSum < lowSum) {
    3455           0 :                     lowSum = temSum;
    3456           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    3457           0 :                     yBest = i;
    3458             :                 }
    3459             :             }
    3460           0 :             ref += src_stride_raw;
    3461             :         }
    3462           0 :         break;
    3463             : 
    3464           0 :     case 8:
    3465           0 :         for (i = 0; i < search_area_height; i++) {
    3466           0 :             for (j = 0; j <= search_area_width - 8; j += 8) {
    3467           0 :                 pSrc = src;
    3468           0 :                 pRef = ref + j;
    3469           0 :                 s3 = s4 = _mm_setzero_si128();
    3470           0 :                 for (k = 0; k < block_height; k += 2) {
    3471           0 :                     s0 = _mm_loadu_si128((__m128i*)pRef);
    3472           0 :                     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride));
    3473           0 :                     s2 = _mm_loadl_epi64((__m128i*)pSrc);
    3474           0 :                     s5 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride));
    3475           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3476           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3477           0 :                     s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
    3478           0 :                     s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
    3479           0 :                     pSrc += src_stride << 1;
    3480           0 :                     pRef += ref_stride << 1;
    3481             :                 }
    3482           0 :                 s3 = _mm_adds_epu16(s3, s4);
    3483           0 :                 s3 = _mm_minpos_epu16(s3);
    3484           0 :                 temSum = _mm_extract_epi16(s3, 0);
    3485           0 :                 if (temSum < lowSum) {
    3486           0 :                     lowSum = temSum;
    3487           0 :                     xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    3488           0 :                     yBest = i;
    3489             :                 }
    3490             :             }
    3491             : 
    3492           0 :             ref += src_stride_raw;
    3493             :         }
    3494           0 :         break;
    3495             : 
    3496           0 :     case 16:
    3497           0 :         if (block_height <= 16) {
    3498           0 :             for (i = 0; i < search_area_height; i++) {
    3499           0 :                 for (j = 0; j <= search_area_width - 16; j += 16) {
    3500           0 :                     pSrc = src;
    3501           0 :                     pRef = ref + j;
    3502           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3503           0 :                     s7 = s9 = s10 = s11 = _mm_setzero_si128();
    3504           0 :                     for (k = 0; k < block_height; k++) {
    3505           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3506           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3507           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3508           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3509           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3510           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3511           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3512           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3513           0 :                         s7 = _mm_adds_epu16(s7, _mm_mpsadbw_epu8(s1, s2, 0));
    3514           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 5));
    3515           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 2));
    3516           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 7));
    3517           0 :                         pSrc += src_stride;
    3518           0 :                         pRef += ref_stride;
    3519             :                     }
    3520           0 :                     s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3521           0 :                     s3 = _mm_minpos_epu16(s3);
    3522           0 :                     temSum = _mm_extract_epi16(s3, 0);
    3523           0 :                     if (temSum < lowSum) {
    3524           0 :                         lowSum = temSum;
    3525           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s3, 1));
    3526           0 :                         yBest = i;
    3527             :                     }
    3528             : 
    3529           0 :                     s7 = _mm_adds_epu16(_mm_adds_epu16(s7, s11), _mm_adds_epu16(s9, s10));
    3530           0 :                     s7 = _mm_minpos_epu16(s7);
    3531           0 :                     temSum = _mm_extract_epi16(s7, 0);
    3532           0 :                     if (temSum < lowSum) {
    3533           0 :                         lowSum = temSum;
    3534           0 :                         xBest = (int16_t)(j + 8 + _mm_extract_epi16(s7, 1));
    3535           0 :                         yBest = i;
    3536             :                     }
    3537             :                 }
    3538             : 
    3539           0 :                 ref += src_stride_raw;
    3540             :             }
    3541             :         }
    3542           0 :         else if (block_height <= 32) {
    3543           0 :             for (i = 0; i < search_area_height; i++) {
    3544           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3545           0 :                     pSrc = src;
    3546           0 :                     pRef = ref + j;
    3547           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3548           0 :                     for (k = 0; k < block_height; k++) {
    3549           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3550           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3551           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3552           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3553           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3554           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3555           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3556           0 :                         pSrc += src_stride;
    3557           0 :                         pRef += ref_stride;
    3558             :                     }
    3559           0 :                     s3 = _mm_adds_epu16(s3, s4);
    3560           0 :                     s5 = _mm_adds_epu16(s5, s6);
    3561           0 :                     s4 = _mm_minpos_epu16(s3);
    3562           0 :                     s6 = _mm_minpos_epu16(s5);
    3563           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    3564           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    3565           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    3566           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    3567           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    3568           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    3569           0 :                     s3 = _mm_sub_epi16(s3, s4);
    3570           0 :                     s5 = _mm_adds_epu16(s5, s3);
    3571           0 :                     s5 = _mm_sub_epi16(s5, s6);
    3572           0 :                     s5 = _mm_minpos_epu16(s5);
    3573           0 :                     temSum = _mm_extract_epi16(s5, 0);
    3574           0 :                     temSum += _mm_extract_epi16(s4, 0);
    3575           0 :                     temSum += _mm_extract_epi16(s6, 0);
    3576           0 :                     if (temSum < lowSum) {
    3577           0 :                         lowSum = temSum;
    3578           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    3579           0 :                         yBest = i;
    3580             :                     }
    3581             :                 }
    3582             : 
    3583           0 :                 ref += src_stride_raw;
    3584             :             }
    3585             :         }
    3586             :         else {
    3587           0 :             for (i = 0; i < search_area_height; i++) {
    3588           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3589           0 :                     pSrc = src;
    3590           0 :                     pRef = ref + j;
    3591           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3592           0 :                     for (k = 0; k < block_height; k++) {
    3593           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3594           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3595           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3596           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3597           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3598           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3599           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3600           0 :                         pSrc += src_stride;
    3601           0 :                         pRef += ref_stride;
    3602             :                     }
    3603           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3604           0 :                     s0 = _mm_minpos_epu16(s0);
    3605           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3606           0 :                     if (temSum < lowSum) {
    3607           0 :                         if (temSum != 0xFFFF) { // no overflow
    3608           0 :                             lowSum = temSum;
    3609           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3610           0 :                             yBest = i;
    3611             :                         }
    3612             :                         else {
    3613           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3614           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3615           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3616           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3617           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3618           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3619           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3620           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3621           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3622           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3623           0 :                             UPDATE_BEST(s0, 0, 0);
    3624           0 :                             UPDATE_BEST(s0, 1, 0);
    3625           0 :                             UPDATE_BEST(s0, 2, 0);
    3626           0 :                             UPDATE_BEST(s0, 3, 0);
    3627           0 :                             UPDATE_BEST(s3, 0, 4);
    3628           0 :                             UPDATE_BEST(s3, 1, 4);
    3629           0 :                             UPDATE_BEST(s3, 2, 4);
    3630           0 :                             UPDATE_BEST(s3, 3, 4);
    3631             :                         }
    3632             :                     }
    3633             :                 }
    3634           0 :                 ref += src_stride_raw;
    3635             :             }
    3636             :         }
    3637           0 :         break;
    3638             : 
    3639           0 :     case 24:
    3640           0 :         if (block_height <= 16) {
    3641           0 :             for (i = 0; i < search_area_height; i++) {
    3642           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3643           0 :                     pSrc = src;
    3644           0 :                     pRef = ref + j;
    3645           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3646           0 :                     for (k = 0; k < block_height; k++) {
    3647           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3648           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3649           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3650           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3651           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3652           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3653           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3654           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3655           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    3656           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3657           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3658           0 :                         pSrc += src_stride;
    3659           0 :                         pRef += ref_stride;
    3660             :                     }
    3661           0 :                     s3 = _mm_adds_epu16(s3, s4);
    3662           0 :                     s5 = _mm_adds_epu16(s5, s6);
    3663           0 :                     s4 = _mm_minpos_epu16(s3);
    3664           0 :                     s6 = _mm_minpos_epu16(s5);
    3665           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    3666           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    3667           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    3668           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    3669           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    3670           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    3671           0 :                     s3 = _mm_sub_epi16(s3, s4);
    3672           0 :                     s5 = _mm_adds_epu16(s5, s3);
    3673           0 :                     s5 = _mm_sub_epi16(s5, s6);
    3674           0 :                     s5 = _mm_minpos_epu16(s5);
    3675           0 :                     temSum = _mm_extract_epi16(s5, 0);
    3676           0 :                     temSum += _mm_extract_epi16(s4, 0);
    3677           0 :                     temSum += _mm_extract_epi16(s6, 0);
    3678           0 :                     if (temSum < lowSum) {
    3679           0 :                         lowSum = temSum;
    3680           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    3681           0 :                         yBest = i;
    3682             :                     }
    3683             :                 }
    3684           0 :                 ref += src_stride_raw;
    3685             :             }
    3686             :         }
    3687             :         else {
    3688           0 :             for (i = 0; i < search_area_height; i++) {
    3689           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3690           0 :                     pSrc = src;
    3691           0 :                     pRef = ref + j;
    3692           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3693           0 :                     for (k = 0; k < block_height; k++) {
    3694           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3695           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3696           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3697           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3698           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3699           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3700           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3701           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3702           0 :                         s2 = _mm_loadl_epi64((__m128i*)(pSrc + 16));
    3703           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3704           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3705           0 :                         pSrc += src_stride;
    3706           0 :                         pRef += ref_stride;
    3707             :                     }
    3708           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3709           0 :                     s0 = _mm_minpos_epu16(s0);
    3710           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3711           0 :                     if (temSum < lowSum) {
    3712           0 :                         if (temSum != 0xFFFF) { // no overflow
    3713           0 :                             lowSum = temSum;
    3714           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3715           0 :                             yBest = i;
    3716             :                         }
    3717             :                         else {
    3718           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3719           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3720           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3721           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3722           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3723           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3724           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3725           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3726           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3727           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3728           0 :                             UPDATE_BEST(s0, 0, 0);
    3729           0 :                             UPDATE_BEST(s0, 1, 0);
    3730           0 :                             UPDATE_BEST(s0, 2, 0);
    3731           0 :                             UPDATE_BEST(s0, 3, 0);
    3732           0 :                             UPDATE_BEST(s3, 0, 4);
    3733           0 :                             UPDATE_BEST(s3, 1, 4);
    3734           0 :                             UPDATE_BEST(s3, 2, 4);
    3735           0 :                             UPDATE_BEST(s3, 3, 4);
    3736             :                         }
    3737             :                     }
    3738             :                 }
    3739             : 
    3740           0 :                 ref += src_stride_raw;
    3741             :             }
    3742             :         }
    3743           0 :         break;
    3744             : 
    3745           0 :     case 32:
    3746           0 :         if (block_height < 16) {
    3747           0 :             for (i = 0; i < search_area_height; i++) {
    3748           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3749           0 :                     pSrc = src;
    3750           0 :                     pRef = ref + j;
    3751           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3752           0 :                     for (k = 0; k < block_height; k++) {
    3753           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3754           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3755           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3756           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3757           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3758           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3759           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3760           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3761           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3762           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3763           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3764           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3765           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3766           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3767           0 :                         pSrc += src_stride;
    3768           0 :                         pRef += ref_stride;
    3769             :                     }
    3770           0 :                     s3 = _mm_adds_epu16(s3, s4);
    3771           0 :                     s5 = _mm_adds_epu16(s5, s6);
    3772           0 :                     s4 = _mm_minpos_epu16(s3);
    3773           0 :                     s6 = _mm_minpos_epu16(s5);
    3774           0 :                     s4 = _mm_unpacklo_epi16(s4, s4);
    3775           0 :                     s4 = _mm_unpacklo_epi32(s4, s4);
    3776           0 :                     s4 = _mm_unpacklo_epi64(s4, s4);
    3777           0 :                     s6 = _mm_unpacklo_epi16(s6, s6);
    3778           0 :                     s6 = _mm_unpacklo_epi32(s6, s6);
    3779           0 :                     s6 = _mm_unpacklo_epi64(s6, s6);
    3780           0 :                     s3 = _mm_sub_epi16(s3, s4);
    3781           0 :                     s5 = _mm_adds_epu16(s5, s3);
    3782           0 :                     s5 = _mm_sub_epi16(s5, s6);
    3783           0 :                     s5 = _mm_minpos_epu16(s5);
    3784           0 :                     temSum = _mm_extract_epi16(s5, 0);
    3785           0 :                     temSum += _mm_extract_epi16(s4, 0);
    3786           0 :                     temSum += _mm_extract_epi16(s6, 0);
    3787           0 :                     temSum &= 0x0000FFFF;
    3788           0 :                     if (temSum < lowSum) {
    3789           0 :                         lowSum = temSum;
    3790           0 :                         xBest = (int16_t)(j + _mm_extract_epi16(s5, 1));
    3791           0 :                         yBest = i;
    3792             :                     }
    3793             :                 }
    3794             : 
    3795           0 :                 ref += src_stride_raw;
    3796             :             }
    3797             :         }
    3798           0 :         else if (block_height <= 32) {
    3799           0 :             for (i = 0; i < search_area_height; i++) {
    3800           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3801           0 :                     pSrc = src;
    3802           0 :                     pRef = ref + j;
    3803           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3804           0 :                     for (k = 0; k < block_height; k++) {
    3805           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3806           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3807           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3808           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3809           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3810           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3811           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3812           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3813           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3814           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3815           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3816           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3817           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3818           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3819           0 :                         pSrc += src_stride;
    3820           0 :                         pRef += ref_stride;
    3821             :                     }
    3822           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3823           0 :                     s0 = _mm_minpos_epu16(s0);
    3824           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3825           0 :                     temSum &= 0x0000FFFF;
    3826           0 :                     if (temSum < lowSum) {
    3827           0 :                         if (temSum != 0xFFFF) { // no overflow
    3828           0 :                             lowSum = temSum;
    3829           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3830           0 :                             yBest = i;
    3831             :                         }
    3832             :                         else {
    3833           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3834           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3835           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3836           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3837           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3838           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3839           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3840           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3841           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3842           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3843           0 :                             UPDATE_BEST(s0, 0, 0);
    3844           0 :                             UPDATE_BEST(s0, 1, 0);
    3845           0 :                             UPDATE_BEST(s0, 2, 0);
    3846           0 :                             UPDATE_BEST(s0, 3, 0);
    3847           0 :                             UPDATE_BEST(s3, 0, 4);
    3848           0 :                             UPDATE_BEST(s3, 1, 4);
    3849           0 :                             UPDATE_BEST(s3, 2, 4);
    3850           0 :                             UPDATE_BEST(s3, 3, 4);
    3851             :                         }
    3852             :                     }
    3853             :                 }
    3854           0 :                 ref += src_stride_raw;
    3855             :             }
    3856             :         }
    3857             :         else {
    3858             :             __m128i s9, s10, s11, s12;
    3859           0 :             for (i = 0; i < search_area_height; i++) {
    3860           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3861           0 :                     pSrc = src;
    3862           0 :                     pRef = ref + j;
    3863           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3864           0 :                     for (k = 0; k < block_height >> 1; k++) {
    3865           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3866           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3867           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3868           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3869           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3870           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3871           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3872           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3873           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3874           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3875           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3876           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3877           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3878           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3879           0 :                         pSrc += src_stride;
    3880           0 :                         pRef += ref_stride;
    3881             :                     }
    3882           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    3883           0 :                     for (; k < block_height; k++) {
    3884           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3885           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3886           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3887           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3888           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3889           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3890           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3891           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3892           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3893           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3894           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3895           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3896           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3897           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3898           0 :                         pSrc += src_stride;
    3899           0 :                         pRef += ref_stride;
    3900             :                     }
    3901           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    3902           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    3903           0 :                     s0 = _mm_minpos_epu16(s0);
    3904           0 :                     temSum = _mm_extract_epi16(s0, 0);
    3905           0 :                     temSum &= 0x0000FFFF;
    3906           0 :                     if (temSum < lowSum) {
    3907           0 :                         if (temSum != 0xFFFF) { // no overflow
    3908           0 :                             lowSum = temSum;
    3909           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    3910           0 :                             yBest = i;
    3911             :                         }
    3912             :                         else {
    3913           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    3914           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    3915           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    3916           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    3917           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    3918           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    3919           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    3920           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    3921           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    3922           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    3923           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    3924           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    3925           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    3926           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    3927           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    3928           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    3929           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    3930           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    3931           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    3932           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    3933           0 :                             UPDATE_BEST(s0, 0, 0);
    3934           0 :                             UPDATE_BEST(s0, 1, 0);
    3935           0 :                             UPDATE_BEST(s0, 2, 0);
    3936           0 :                             UPDATE_BEST(s0, 3, 0);
    3937           0 :                             UPDATE_BEST(s3, 0, 4);
    3938           0 :                             UPDATE_BEST(s3, 1, 4);
    3939           0 :                             UPDATE_BEST(s3, 2, 4);
    3940           0 :                             UPDATE_BEST(s3, 3, 4);
    3941             :                         }
    3942             :                     }
    3943             :                 }
    3944             : 
    3945           0 :                 ref += src_stride_raw;
    3946             :             }
    3947             :         }
    3948           0 :         break;
    3949             : 
    3950           0 :     case 48:
    3951           0 :         if (block_height <= 32) {
    3952             :             __m128i s9, s10, s11, s12;
    3953           0 :             for (i = 0; i < search_area_height; i++) {
    3954           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    3955           0 :                     pSrc = src;
    3956           0 :                     pRef = ref + j;
    3957           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    3958           0 :                     for (k = 0; k < block_height >> 1; k++) {
    3959           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3960           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3961           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3962           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3963           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3964           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3965           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3966           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3967           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3968           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3969           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3970           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3971           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3972           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3973           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    3974           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    3975           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    3976           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    3977           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    3978           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    3979           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    3980           0 :                         pSrc += src_stride;
    3981           0 :                         pRef += ref_stride;
    3982             :                     }
    3983           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    3984           0 :                     for (; k < block_height; k++) {
    3985           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    3986           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    3987           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    3988           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3989           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3990           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3991           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3992           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    3993           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    3994           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    3995           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    3996           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    3997           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    3998           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    3999           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    4000           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    4001           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    4002           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    4003           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    4004           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    4005           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    4006           0 :                         pSrc += src_stride;
    4007           0 :                         pRef += ref_stride;
    4008             :                     }
    4009           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    4010           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    4011           0 :                     s0 = _mm_minpos_epu16(s0);
    4012           0 :                     temSum = _mm_extract_epi16(s0, 0);
    4013           0 :                     temSum &= 0x0000FFFF;
    4014           0 :                     if (temSum < lowSum) {
    4015           0 :                         if (temSum != 0xFFFF) { // no overflow
    4016           0 :                             lowSum = temSum;
    4017           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    4018           0 :                             yBest = i;
    4019             :                         }
    4020             :                         else {
    4021           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    4022           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    4023           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    4024           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    4025           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    4026           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    4027           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    4028           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    4029           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    4030           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    4031           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    4032           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    4033           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    4034           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    4035           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    4036           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    4037           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    4038           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    4039           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    4040           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    4041           0 :                             UPDATE_BEST(s0, 0, 0);
    4042           0 :                             UPDATE_BEST(s0, 1, 0);
    4043           0 :                             UPDATE_BEST(s0, 2, 0);
    4044           0 :                             UPDATE_BEST(s0, 3, 0);
    4045           0 :                             UPDATE_BEST(s3, 0, 4);
    4046           0 :                             UPDATE_BEST(s3, 1, 4);
    4047           0 :                             UPDATE_BEST(s3, 2, 4);
    4048           0 :                             UPDATE_BEST(s3, 3, 4);
    4049             :                         }
    4050             :                     }
    4051             :                 }
    4052             : 
    4053           0 :                 ref += src_stride_raw;
    4054             :             }
    4055             :         }
    4056             :         else {
    4057             :             __m128i s9, s10;
    4058           0 :             for (i = 0; i < search_area_height; i++) {
    4059           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    4060           0 :                     pSrc = src;
    4061           0 :                     pRef = ref + j;
    4062           0 :                     s9 = s10 = _mm_setzero_si128();
    4063           0 :                     k = 0;
    4064           0 :                     while (k < block_height) {
    4065           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    4066           0 :                         for (l = 0; l < 21 && k < block_height; k++, l++) {
    4067           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    4068           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    4069           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    4070           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4071           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4072           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4073           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4074           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    4075           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    4076           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    4077           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4078           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4079           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4080           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4081           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    4082           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    4083           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    4084           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4085           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4086           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4087           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4088           0 :                             pSrc += src_stride;
    4089           0 :                             pRef += ref_stride;
    4090             :                         }
    4091           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    4092           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    4093           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    4094           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    4095           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    4096           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    4097           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    4098           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    4099           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    4100           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    4101             :                     }
    4102           0 :                     s0 = _mm_packus_epi32(s9, s10);
    4103           0 :                     s0 = _mm_minpos_epu16(s0);
    4104           0 :                     temSum = _mm_extract_epi16(s0, 0);
    4105           0 :                     temSum &= 0x0000FFFF;
    4106           0 :                     if (temSum < lowSum) {
    4107           0 :                         if (temSum != 0xFFFF) { // no overflow
    4108           0 :                             lowSum = temSum;
    4109           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    4110           0 :                             yBest = i;
    4111             :                         }
    4112             :                         else {
    4113           0 :                             UPDATE_BEST(s9, 0, 0);
    4114           0 :                             UPDATE_BEST(s9, 1, 0);
    4115           0 :                             UPDATE_BEST(s9, 2, 0);
    4116           0 :                             UPDATE_BEST(s9, 3, 0);
    4117           0 :                             UPDATE_BEST(s10, 0, 4);
    4118           0 :                             UPDATE_BEST(s10, 1, 4);
    4119           0 :                             UPDATE_BEST(s10, 2, 4);
    4120           0 :                             UPDATE_BEST(s10, 3, 4);
    4121             :                         }
    4122             :                     }
    4123             :                 }
    4124             : 
    4125           0 :                 ref += src_stride_raw;
    4126             :             }
    4127             :         }
    4128           0 :         break;
    4129             : 
    4130           0 :     case 64:
    4131           0 :         if (block_height <= 32) {
    4132             :             __m128i s9, s10, s11, s12;
    4133           0 :             for (i = 0; i < search_area_height; i++) {
    4134           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    4135           0 :                     pSrc = src;
    4136           0 :                     pRef = ref + j;
    4137           0 :                     s3 = s4 = s5 = s6 = _mm_setzero_si128();
    4138           0 :                     for (k = 0; k < block_height >> 1; k++) {
    4139           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    4140           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    4141           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    4142           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4143           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4144           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4145           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4146           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    4147           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    4148           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    4149           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4150           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4151           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4152           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4153           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    4154           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    4155           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    4156           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4157           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4158           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4159           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4160           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    4161           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    4162           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    4163           0 :                         s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4164           0 :                         s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4165           0 :                         s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4166           0 :                         s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4167           0 :                         pSrc += src_stride;
    4168           0 :                         pRef += ref_stride;
    4169             :                     }
    4170           0 :                     s9 = s10 = s11 = s12 = _mm_setzero_si128();
    4171           0 :                     for (; k < block_height; k++) {
    4172           0 :                         s0 = _mm_loadu_si128((__m128i*)pRef);
    4173           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    4174           0 :                         s2 = _mm_loadu_si128((__m128i*)pSrc);
    4175           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    4176           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    4177           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    4178           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    4179           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    4180           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    4181           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    4182           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    4183           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    4184           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    4185           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    4186           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    4187           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    4188           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    4189           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    4190           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    4191           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    4192           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    4193           0 :                         s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    4194           0 :                         s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    4195           0 :                         s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    4196           0 :                         s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
    4197           0 :                         s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
    4198           0 :                         s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
    4199           0 :                         s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
    4200           0 :                         pSrc += src_stride;
    4201           0 :                         pRef += ref_stride;
    4202             :                     }
    4203           0 :                     s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
    4204           0 :                     s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
    4205           0 :                     s0 = _mm_minpos_epu16(s0);
    4206           0 :                     temSum = _mm_extract_epi16(s0, 0);
    4207           0 :                     temSum &= 0x0000FFFF;
    4208           0 :                     if (temSum < lowSum) {
    4209           0 :                         if (temSum != 0xFFFF) { // no overflow
    4210           0 :                             lowSum = temSum;
    4211           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    4212           0 :                             yBest = i;
    4213             :                         }
    4214             :                         else {
    4215           0 :                             s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    4216           0 :                             s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    4217           0 :                             s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    4218           0 :                             s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    4219           0 :                             s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    4220           0 :                             s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    4221           0 :                             s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    4222           0 :                             s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    4223           0 :                             s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
    4224           0 :                             s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
    4225           0 :                             s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
    4226           0 :                             s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
    4227           0 :                             s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
    4228           0 :                             s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
    4229           0 :                             s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
    4230           0 :                             s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
    4231           0 :                             s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
    4232           0 :                             s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
    4233           0 :                             s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
    4234           0 :                             s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
    4235           0 :                             UPDATE_BEST(s0, 0, 0);
    4236           0 :                             UPDATE_BEST(s0, 1, 0);
    4237           0 :                             UPDATE_BEST(s0, 2, 0);
    4238           0 :                             UPDATE_BEST(s0, 3, 0);
    4239           0 :                             UPDATE_BEST(s3, 0, 4);
    4240           0 :                             UPDATE_BEST(s3, 1, 4);
    4241           0 :                             UPDATE_BEST(s3, 2, 4);
    4242           0 :                             UPDATE_BEST(s3, 3, 4);
    4243             :                         }
    4244             :                     }
    4245             :                 }
    4246             : 
    4247           0 :                 ref += src_stride_raw;
    4248             :             }
    4249             :         }
    4250             :         else {
    4251             :             __m128i s9, s10;
    4252           0 :             for (i = 0; i < search_area_height; i++) {
    4253           0 :                 for (j = 0; j <= search_area_width - 8; j += 8) {
    4254           0 :                     pSrc = src;
    4255           0 :                     pRef = ref + j;
    4256           0 :                     s9 = s10 = _mm_setzero_si128();
    4257           0 :                     k = 0;
    4258           0 :                     while (k < block_height) {
    4259           0 :                         s3 = s4 = s5 = s6 = _mm_setzero_si128();
    4260           0 :                         for (l = 0; l < 16 && k < block_height; k++, l++) {
    4261           0 :                             s0 = _mm_loadu_si128((__m128i*)pRef);
    4262           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 8));
    4263           0 :                             s2 = _mm_loadu_si128((__m128i*)pSrc);
    4264           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4265           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4266           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4267           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4268           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 16));
    4269           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 24));
    4270           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    4271           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4272           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4273           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4274           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4275           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
    4276           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
    4277           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
    4278           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4279           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4280           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4281           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4282           0 :                             s0 = _mm_loadu_si128((__m128i*)(pRef + 48));
    4283           0 :                             s1 = _mm_loadu_si128((__m128i*)(pRef + 56));
    4284           0 :                             s2 = _mm_loadu_si128((__m128i*)(pSrc + 48));
    4285           0 :                             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
    4286           0 :                             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
    4287           0 :                             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
    4288           0 :                             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
    4289           0 :                             pSrc += src_stride;
    4290           0 :                             pRef += ref_stride;
    4291             :                         }
    4292           0 :                         s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
    4293           0 :                         s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
    4294           0 :                         s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
    4295           0 :                         s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
    4296           0 :                         s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
    4297           0 :                         s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
    4298           0 :                         s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
    4299           0 :                         s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
    4300           0 :                         s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
    4301           0 :                         s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
    4302             :                     }
    4303           0 :                     s0 = _mm_packus_epi32(s9, s10);
    4304           0 :                     s0 = _mm_minpos_epu16(s0);
    4305           0 :                     temSum = _mm_extract_epi16(s0, 0);
    4306           0 :                     temSum &= 0x0000FFFF;
    4307           0 :                     if (temSum < lowSum) {
    4308           0 :                         if (temSum != 0xFFFF) { // no overflow
    4309           0 :                             lowSum = temSum;
    4310           0 :                             xBest = (int16_t)(j + _mm_extract_epi16(s0, 1));
    4311           0 :                             yBest = i;
    4312             :                         }
    4313             :                         else {
    4314           0 :                             UPDATE_BEST(s9, 0, 0);
    4315           0 :                             UPDATE_BEST(s9, 1, 0);
    4316           0 :                             UPDATE_BEST(s9, 2, 0);
    4317           0 :                             UPDATE_BEST(s9, 3, 0);
    4318           0 :                             UPDATE_BEST(s10, 0, 4);
    4319           0 :                             UPDATE_BEST(s10, 1, 4);
    4320           0 :                             UPDATE_BEST(s10, 2, 4);
    4321           0 :                             UPDATE_BEST(s10, 3, 4);
    4322             :                         }
    4323             :                     }
    4324             :                 }
    4325             : 
    4326           0 :                 ref += src_stride_raw;
    4327             :             }
    4328             :         }
    4329           0 :         break;
    4330             : 
    4331           0 :     default:
    4332             :         assert(0);
    4333           0 :         break;
    4334             :     }
    4335             : 
    4336           0 :     *best_sad = lowSum;
    4337           0 :     *x_search_center = xBest;
    4338           0 :     *y_search_center = yBest;
    4339           0 : }
    4340             : 
    4341           0 : static INLINE void sad_eight_8x4_sse41_intrin(const uint8_t *src, /* src: top-left of the 8-byte-wide source block */
    4342             :     const uint32_t src_stride, const uint8_t *ref, const uint32_t ref_stride, /* ref: start of the reference rows; 16 bytes are loaded from each row used */
    4343             :     __m128i *sad) /* sad: in/out, eight 16-bit accumulators, lane i belonging to ref byte offset i */
    4344             : { /* Adds to *sad the SAD of source rows 0, 2, 4 and 6 (8 bytes each) against the same rows of ref at eight consecutive byte offsets; odd rows are not read here. */
    4345           0 :     const uint8_t *pSrc = src;
    4346           0 :     const uint8_t *pRef = ref;
    4347             :     __m128i s0, s1, s2, s3;
    4348             : /* Rows 0 and 2 (relative to src / ref). */
    4349           0 :     s0 = _mm_loadu_si128((__m128i*)pRef); /* 16 ref bytes of row 0 */
    4350           0 :     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride * 2)); /* 16 ref bytes of row 2 */
    4351           0 :     s2 = _mm_loadl_epi64((__m128i*)pSrc); /* 8 src bytes of row 0 in the low half */
    4352           0 :     s3 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride * 2)); /* 8 src bytes of row 2 in the low half */
    4353           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 0)); /* imm 0: src bytes 0..3 vs ref bytes i..i+3, i = 0..7 */
    4354           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 5)); /* imm 5: src bytes 4..7 vs ref bytes 4+i..7+i; with the line above this covers all 8 columns */
    4355           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 0)); /* same two steps for row 2 */
    4356           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 5));
    4357             : /* Move both pointers four rows down so the second half handles rows 4 and 6. */
    4358           0 :     pSrc += src_stride * 4;
    4359           0 :     pRef += ref_stride * 4;
    4360             : /* Rows 4 and 6: identical sequence to the first half. */
    4361           0 :     s0 = _mm_loadu_si128((__m128i*)pRef);
    4362           0 :     s1 = _mm_loadu_si128((__m128i*)(pRef + ref_stride * 2));
    4363           0 :     s2 = _mm_loadl_epi64((__m128i*)pSrc);
    4364           0 :     s3 = _mm_loadl_epi64((__m128i*)(pSrc + src_stride * 2));
    4365           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 0)); /* accumulation uses unsigned saturating 16-bit adds throughout */
    4366           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s0, s2, 5));
    4367           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 0));
    4368           0 :     *sad = _mm_adds_epu16(*sad, _mm_mpsadbw_epu8(s1, s3, 5));
    4369           0 : }
    4370             : 
    4371           0 : void get_eight_horizontal_search_point_results_8x8_16x16_pu_sse41_intrin(
    4372             :     uint8_t   *src,
    4373             :     uint32_t   src_stride,
    4374             :     uint8_t   *ref,
    4375             :     uint32_t   ref_stride,
    4376             :     uint32_t  *p_best_sad8x8,
    4377             :     uint32_t  *p_best_mv8x8,
    4378             :     uint32_t  *p_best_sad16x16,
    4379             :     uint32_t  *p_best_mv16x16,
    4380             :     uint32_t   mv,
    4381             :     uint16_t  *p_sad16x16,
    4382             :     EbBool     sub_sad)
    4383             : {
    4384             :     int16_t x_mv, y_mv;
    4385             :     __m128i s0, s1, s3;
    4386             :     __m128i sad[4];
    4387             :     uint32_t temSum;
    4388             : 
    4389           0 :     sad[0] = sad[1] = sad[2] = sad[3] = _mm_setzero_si128();
    4390             : 
    4391             :     /*
    4392             :    -------------------------------------   -----------------------------------
    4393             :    | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 |   8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
    4394             :    -------------------------------------   -----------------------------------
    4395             :    | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 |   8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
    4396             :    -----------------------   -----------   ----------------------   ----------
    4397             :    | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 |   8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
    4398             :    ----------------------    -----------   ---------------------    ----------
    4399             :    | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 |   8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
    4400             :    -------------------------------------   -----------------------------------
    4401             : 
    4402             :    -------------------------------------   -----------------------------------
    4403             :    | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 |   8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
    4404             :    -------------------------------------   -----------------------------------
    4405             :    | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 |   8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
    4406             :    -----------------------   -----------   ----------------------   ----------
    4407             :    | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 |   8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
    4408             :    ----------------------    -----------   ---------------------    ----------
    4409             :    | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 |   8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
    4410             :    -------------------------------------   -----------------------------------
    4411             :    */
    4412             : 
    4413             :    /*
    4414             :    ----------------------    ----------------------
    4415             :    |  16x16_0  |  16x16_1  |  16x16_4  |  16x16_5  |
    4416             :    ----------------------    ----------------------
    4417             :    |  16x16_2  |  16x16_3  |  16x16_6  |  16x16_7  |
    4418             :    -----------------------   -----------------------
    4419             :    |  16x16_8  |  16x16_9  |  16x16_12 |  16x16_13 |
    4420             :    ----------------------    ----------------------
    4421             :    |  16x16_10 |  16x16_11 |  16x16_14 |  16x16_15 |
    4422             :    -----------------------   -----------------------
    4423             :    */
    4424             : 
    4425             :    //8x8_0
    4426           0 :     sad_eight_8x4_sse41_intrin(src + 0 * src_stride + 0, src_stride, ref + 0 * ref_stride + 0, ref_stride, &sad[0]);
    4427           0 :     sad_eight_8x4_sse41_intrin(src + 0 * src_stride + 8, src_stride, ref + 0 * ref_stride + 8, ref_stride, &sad[1]);
    4428           0 :     sad_eight_8x4_sse41_intrin(src + 8 * src_stride + 0, src_stride, ref + 8 * ref_stride + 0, ref_stride, &sad[2]);
    4429           0 :     sad_eight_8x4_sse41_intrin(src + 8 * src_stride + 8, src_stride, ref + 8 * ref_stride + 8, ref_stride, &sad[3]);
    4430             : 
    4431           0 :     if (sub_sad) {
    4432           0 :         sad[0] = _mm_slli_epi16(sad[0], 1);
    4433           0 :         sad[1] = _mm_slli_epi16(sad[1], 1);
    4434           0 :         sad[2] = _mm_slli_epi16(sad[2], 1);
    4435           0 :         sad[3] = _mm_slli_epi16(sad[3], 1);
    4436             :     }
    4437             :     else {
    4438           0 :         sad_eight_8x4_sse41_intrin(src + 1 * src_stride + 0, src_stride, ref + 1 * ref_stride + 0, ref_stride, &sad[0]);
    4439           0 :         sad_eight_8x4_sse41_intrin(src + 1 * src_stride + 8, src_stride, ref + 1 * ref_stride + 8, ref_stride, &sad[1]);
    4440           0 :         sad_eight_8x4_sse41_intrin(src + 9 * src_stride + 0, src_stride, ref + 9 * ref_stride + 0, ref_stride, &sad[2]);
    4441           0 :         sad_eight_8x4_sse41_intrin(src + 9 * src_stride + 8, src_stride, ref + 9 * ref_stride + 8, ref_stride, &sad[3]);
    4442             :     }
    4443             : 
    4444             :     //find the best for 8x8_0
    4445           0 :     s3 = _mm_minpos_epu16(sad[0]);
    4446           0 :     temSum = _mm_extract_epi16(s3, 0);
    4447           0 :     if (temSum < p_best_sad8x8[0]) {
    4448           0 :         p_best_sad8x8[0] = temSum;
    4449           0 :         x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
    4450           0 :         y_mv = _MVYT(mv);
    4451           0 :         p_best_mv8x8[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4452             :     }
    4453             : 
    4454             :     //find the best for 8x8_1
    4455           0 :     s3 = _mm_minpos_epu16(sad[1]);
    4456           0 :     temSum = _mm_extract_epi16(s3, 0);
    4457           0 :     if (temSum < p_best_sad8x8[1]) {
    4458           0 :         p_best_sad8x8[1] = temSum;
    4459           0 :         x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
    4460           0 :         y_mv = _MVYT(mv);
    4461           0 :         p_best_mv8x8[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4462             :     }
    4463             : 
    4464             :     //find the best for 8x8_2
    4465           0 :     s3 = _mm_minpos_epu16(sad[2]);
    4466           0 :     temSum = _mm_extract_epi16(s3, 0);
    4467           0 :     if (temSum < p_best_sad8x8[2]) {
    4468           0 :         p_best_sad8x8[2] = temSum;
    4469           0 :         x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
    4470           0 :         y_mv = _MVYT(mv);
    4471           0 :         p_best_mv8x8[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4472             :     }
    4473             : 
    4474             :     //find the best for 8x8_3
    4475           0 :     s3 = _mm_minpos_epu16(sad[3]);
    4476           0 :     temSum = _mm_extract_epi16(s3, 0);
    4477           0 :     if (temSum < p_best_sad8x8[3]) {
    4478           0 :         p_best_sad8x8[3] = temSum;
    4479           0 :         x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
    4480           0 :         y_mv = _MVYT(mv);
    4481           0 :         p_best_mv8x8[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4482             :     }
    4483             : 
    4484             :     //16x16
    4485             :     {
    4486           0 :         s0 = _mm_adds_epu16(sad[0], sad[1]);
    4487           0 :         s1 = _mm_adds_epu16(sad[2], sad[3]);
    4488           0 :         s3 = _mm_adds_epu16(s0, s1);
    4489             :         //store the 8 SADs (16x16 SADs)
    4490             :         _mm_store_si128((__m128i*)p_sad16x16, s3);
    4491             :         //find the best for 16x16
    4492           0 :         s3 = _mm_minpos_epu16(s3);
    4493           0 :         temSum = _mm_extract_epi16(s3, 0);
    4494           0 :         if (temSum < p_best_sad16x16[0]) {
    4495           0 :             p_best_sad16x16[0] = temSum;
    4496           0 :             x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
    4497           0 :             y_mv = _MVYT(mv);
    4498           0 :             p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4499             :         }
    4500             :     }
    4501           0 : }
    4502             : 
    4503             : /*******************************************
    4504             :  Calcualte SAD for 32x32,64x64 from 16x16
    4505             :  and check if there is improvement, if yes keep
    4506             :  the best SAD+MV
    4507             :  *******************************************/
    4508           0 : void get_eight_horizontal_search_point_results_32x32_64x64_pu_sse41_intrin(
    4509             :     uint16_t  *p_sad16x16,
    4510             :     uint32_t  *p_best_sad32x32,
    4511             :     uint32_t  *p_best_sad64x64,
    4512             :     uint32_t  *p_best_mv32x32,
    4513             :     uint32_t  *p_best_mv64x64,
    4514             :     uint32_t   mv)
    4515             : {
    4516             :     int16_t x_mv, y_mv;
    4517             : 
    4518             :     uint32_t temSum;
    4519             :     __m128i s0, s1, s2, s3, s4, s5, sad_0, sad_1, s6, s7;
    4520             :     __m128i sad_00, sad_01, sad_10, sad_11, sad_20, sad_21, sad_30, sad_31;
    4521           0 :     __m128i Zero = _mm_setzero_si128();
    4522             : 
    4523             :     /*--------------------
    4524             :     |  32x32_0  |  32x32_1
    4525             :     ----------------------
    4526             :     |  32x32_2  |  32x32_3
    4527             :     ----------------------*/
    4528             : 
    4529             :     /*  data ordering in p_sad16x16 buffer
    4530             : 
    4531             :                   Search    Search            Search
    4532             :                   Point 0   Point 1           Point 7
    4533             :                 ---------------------------------------
    4534             :      16x16_0    |    x    |    x    | ...... |    x    |
    4535             :                 ---------------------------------------
    4536             :      16x16_1    |    x    |    x    | ...... |    x    |
    4537             : 
    4538             :      16x16_n    |    x    |    x    | ...... |    x    |
    4539             : 
    4540             :                 ---------------------------------------
    4541             :      16x16_15   |    x    |    x    | ...... |    x    |
    4542             :                 ---------------------------------------
    4543             :     */
    4544             : 
    4545             :     //32x32_0
    4546           0 :     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 0 * 8));
    4547           0 :     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 1 * 8));
    4548           0 :     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 2 * 8));
    4549           0 :     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 3 * 8));
    4550             : 
    4551           0 :     s4 = _mm_unpackhi_epi16(s0, Zero);
    4552           0 :     s5 = _mm_unpacklo_epi16(s0, Zero);
    4553           0 :     s6 = _mm_unpackhi_epi16(s1, Zero);
    4554           0 :     s7 = _mm_unpacklo_epi16(s1, Zero);
    4555           0 :     s0 = _mm_add_epi32(s4, s6);
    4556           0 :     s1 = _mm_add_epi32(s5, s7);
    4557             : 
    4558           0 :     s4 = _mm_unpackhi_epi16(s2, Zero);
    4559           0 :     s5 = _mm_unpacklo_epi16(s2, Zero);
    4560           0 :     s6 = _mm_unpackhi_epi16(s3, Zero);
    4561           0 :     s7 = _mm_unpacklo_epi16(s3, Zero);
    4562           0 :     s2 = _mm_add_epi32(s4, s6);
    4563           0 :     s3 = _mm_add_epi32(s5, s7);
    4564             : 
    4565           0 :     sad_01 = _mm_add_epi32(s0, s2);
    4566           0 :     sad_00 = _mm_add_epi32(s1, s3);
    4567             : 
    4568             :     //sad_00
    4569           0 :     temSum = _mm_extract_epi32(sad_00, 0);
    4570           0 :     if (temSum < p_best_sad32x32[0]) {
    4571           0 :         p_best_sad32x32[0] = temSum;
    4572           0 :         x_mv = _MVXT(mv) + (0 + 0) * 4;   y_mv = _MVYT(mv);
    4573           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4574             :     }
    4575           0 :     temSum = _mm_extract_epi32(sad_00, 1);
    4576           0 :     if (temSum < p_best_sad32x32[0]) {
    4577           0 :         p_best_sad32x32[0] = temSum;
    4578           0 :         x_mv = _MVXT(mv) + (0 + 1) * 4;  y_mv = _MVYT(mv);
    4579           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4580             :     }
    4581           0 :     temSum = _mm_extract_epi32(sad_00, 2);
    4582           0 :     if (temSum < p_best_sad32x32[0]) {
    4583           0 :         p_best_sad32x32[0] = temSum;
    4584           0 :         x_mv = _MVXT(mv) + (0 + 2) * 4;  y_mv = _MVYT(mv);
    4585           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4586             :     }
    4587           0 :     temSum = _mm_extract_epi32(sad_00, 3);
    4588           0 :     if (temSum < p_best_sad32x32[0]) {
    4589           0 :         p_best_sad32x32[0] = temSum;
    4590           0 :         x_mv = _MVXT(mv) + (0 + 3) * 4;  y_mv = _MVYT(mv);
    4591           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4592             :     }
    4593             : 
    4594             :     //sad_01
    4595           0 :     temSum = _mm_extract_epi32(sad_01, 0);
    4596           0 :     if (temSum < p_best_sad32x32[0]) {
    4597           0 :         p_best_sad32x32[0] = temSum;
    4598           0 :         x_mv = _MVXT(mv) + (4 + 0) * 4;   y_mv = _MVYT(mv);
    4599           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4600             :     }
    4601           0 :     temSum = _mm_extract_epi32(sad_01, 1);
    4602           0 :     if (temSum < p_best_sad32x32[0]) {
    4603           0 :         p_best_sad32x32[0] = temSum;
    4604           0 :         x_mv = _MVXT(mv) + (4 + 1) * 4;  y_mv = _MVYT(mv);
    4605           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4606             :     }
    4607           0 :     temSum = _mm_extract_epi32(sad_01, 2);
    4608           0 :     if (temSum < p_best_sad32x32[0]) {
    4609           0 :         p_best_sad32x32[0] = temSum;
    4610           0 :         x_mv = _MVXT(mv) + (4 + 2) * 4;  y_mv = _MVYT(mv);
    4611           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4612             :     }
    4613           0 :     temSum = _mm_extract_epi32(sad_01, 3);
    4614           0 :     if (temSum < p_best_sad32x32[0]) {
    4615           0 :         p_best_sad32x32[0] = temSum;
    4616           0 :         x_mv = _MVXT(mv) + (4 + 3) * 4;  y_mv = _MVYT(mv);
    4617           0 :         p_best_mv32x32[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4618             :     }
    4619             : 
    4620             :     //32x32_1
    4621           0 :     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 4 * 8));
    4622           0 :     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 5 * 8));
    4623           0 :     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 6 * 8));
    4624           0 :     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 7 * 8));
    4625             : 
    4626           0 :     s4 = _mm_unpackhi_epi16(s0, Zero);
    4627           0 :     s5 = _mm_unpacklo_epi16(s0, Zero);
    4628           0 :     s6 = _mm_unpackhi_epi16(s1, Zero);
    4629           0 :     s7 = _mm_unpacklo_epi16(s1, Zero);
    4630           0 :     s0 = _mm_add_epi32(s4, s6);
    4631           0 :     s1 = _mm_add_epi32(s5, s7);
    4632             : 
    4633           0 :     s4 = _mm_unpackhi_epi16(s2, Zero);
    4634           0 :     s5 = _mm_unpacklo_epi16(s2, Zero);
    4635           0 :     s6 = _mm_unpackhi_epi16(s3, Zero);
    4636           0 :     s7 = _mm_unpacklo_epi16(s3, Zero);
    4637           0 :     s2 = _mm_add_epi32(s4, s6);
    4638           0 :     s3 = _mm_add_epi32(s5, s7);
    4639             : 
    4640           0 :     sad_11 = _mm_add_epi32(s0, s2);
    4641           0 :     sad_10 = _mm_add_epi32(s1, s3);
    4642             : 
    4643             :     //sad_10
    4644           0 :     temSum = _mm_extract_epi32(sad_10, 0);
    4645           0 :     if (temSum < p_best_sad32x32[1]) {
    4646           0 :         p_best_sad32x32[1] = temSum;
    4647           0 :         x_mv = _MVXT(mv) + (0 + 0) * 4;   y_mv = _MVYT(mv);
    4648           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4649             :     }
    4650           0 :     temSum = _mm_extract_epi32(sad_10, 1);
    4651           0 :     if (temSum < p_best_sad32x32[1]) {
    4652           0 :         p_best_sad32x32[1] = temSum;
    4653           0 :         x_mv = _MVXT(mv) + (0 + 1) * 4;  y_mv = _MVYT(mv);
    4654           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4655             :     }
    4656           0 :     temSum = _mm_extract_epi32(sad_10, 2);
    4657           0 :     if (temSum < p_best_sad32x32[1]) {
    4658           0 :         p_best_sad32x32[1] = temSum;
    4659           0 :         x_mv = _MVXT(mv) + (0 + 2) * 4;  y_mv = _MVYT(mv);
    4660           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4661             :     }
    4662           0 :     temSum = _mm_extract_epi32(sad_10, 3);
    4663           0 :     if (temSum < p_best_sad32x32[1]) {
    4664           0 :         p_best_sad32x32[1] = temSum;
    4665           0 :         x_mv = _MVXT(mv) + (0 + 3) * 4;  y_mv = _MVYT(mv);
    4666           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4667             :     }
    4668             : 
    4669             :     //sad_11
    4670           0 :     temSum = _mm_extract_epi32(sad_11, 0);
    4671           0 :     if (temSum < p_best_sad32x32[1]) {
    4672           0 :         p_best_sad32x32[1] = temSum;
    4673           0 :         x_mv = _MVXT(mv) + (4 + 0) * 4;   y_mv = _MVYT(mv);
    4674           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4675             :     }
    4676           0 :     temSum = _mm_extract_epi32(sad_11, 1);
    4677           0 :     if (temSum < p_best_sad32x32[1]) {
    4678           0 :         p_best_sad32x32[1] = temSum;
    4679           0 :         x_mv = _MVXT(mv) + (4 + 1) * 4;  y_mv = _MVYT(mv);
    4680           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4681             :     }
    4682           0 :     temSum = _mm_extract_epi32(sad_11, 2);
    4683           0 :     if (temSum < p_best_sad32x32[1]) {
    4684           0 :         p_best_sad32x32[1] = temSum;
    4685           0 :         x_mv = _MVXT(mv) + (4 + 2) * 4;  y_mv = _MVYT(mv);
    4686           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4687             :     }
    4688           0 :     temSum = _mm_extract_epi32(sad_11, 3);
    4689           0 :     if (temSum < p_best_sad32x32[1]) {
    4690           0 :         p_best_sad32x32[1] = temSum;
    4691           0 :         x_mv = _MVXT(mv) + (4 + 3) * 4;  y_mv = _MVYT(mv);
    4692           0 :         p_best_mv32x32[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4693             :     }
    4694             : 
    4695             :     //32x32_2
    4696           0 :     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 8 * 8));
    4697           0 :     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 9 * 8));
    4698           0 :     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 10 * 8));
    4699           0 :     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 11 * 8));
    4700             : 
    4701           0 :     s4 = _mm_unpackhi_epi16(s0, Zero);
    4702           0 :     s5 = _mm_unpacklo_epi16(s0, Zero);
    4703           0 :     s6 = _mm_unpackhi_epi16(s1, Zero);
    4704           0 :     s7 = _mm_unpacklo_epi16(s1, Zero);
    4705           0 :     s0 = _mm_add_epi32(s4, s6);
    4706           0 :     s1 = _mm_add_epi32(s5, s7);
    4707             : 
    4708           0 :     s4 = _mm_unpackhi_epi16(s2, Zero);
    4709           0 :     s5 = _mm_unpacklo_epi16(s2, Zero);
    4710           0 :     s6 = _mm_unpackhi_epi16(s3, Zero);
    4711           0 :     s7 = _mm_unpacklo_epi16(s3, Zero);
    4712           0 :     s2 = _mm_add_epi32(s4, s6);
    4713           0 :     s3 = _mm_add_epi32(s5, s7);
    4714             : 
    4715           0 :     sad_21 = _mm_add_epi32(s0, s2);
    4716           0 :     sad_20 = _mm_add_epi32(s1, s3);
    4717             : 
    4718             :     //sad_20
    4719           0 :     temSum = _mm_extract_epi32(sad_20, 0);
    4720           0 :     if (temSum < p_best_sad32x32[2]) {
    4721           0 :         p_best_sad32x32[2] = temSum;
    4722           0 :         x_mv = _MVXT(mv) + (0 + 0) * 4;   y_mv = _MVYT(mv);
    4723           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4724             :     }
    4725           0 :     temSum = _mm_extract_epi32(sad_20, 1);
    4726           0 :     if (temSum < p_best_sad32x32[2]) {
    4727           0 :         p_best_sad32x32[2] = temSum;
    4728           0 :         x_mv = _MVXT(mv) + (0 + 1) * 4;  y_mv = _MVYT(mv);
    4729           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4730             :     }
    4731           0 :     temSum = _mm_extract_epi32(sad_20, 2);
    4732           0 :     if (temSum < p_best_sad32x32[2]) {
    4733           0 :         p_best_sad32x32[2] = temSum;
    4734           0 :         x_mv = _MVXT(mv) + (0 + 2) * 4;  y_mv = _MVYT(mv);
    4735           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4736             :     }
    4737           0 :     temSum = _mm_extract_epi32(sad_20, 3);
    4738           0 :     if (temSum < p_best_sad32x32[2]) {
    4739           0 :         p_best_sad32x32[2] = temSum;
    4740           0 :         x_mv = _MVXT(mv) + (0 + 3) * 4;  y_mv = _MVYT(mv);
    4741           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4742             :     }
    4743             : 
    4744             :     //sad_21
    4745           0 :     temSum = _mm_extract_epi32(sad_21, 0);
    4746           0 :     if (temSum < p_best_sad32x32[2]) {
    4747           0 :         p_best_sad32x32[2] = temSum;
    4748           0 :         x_mv = _MVXT(mv) + (4 + 0) * 4;   y_mv = _MVYT(mv);
    4749           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4750             :     }
    4751           0 :     temSum = _mm_extract_epi32(sad_21, 1);
    4752           0 :     if (temSum < p_best_sad32x32[2]) {
    4753           0 :         p_best_sad32x32[2] = temSum;
    4754           0 :         x_mv = _MVXT(mv) + (4 + 1) * 4;  y_mv = _MVYT(mv);
    4755           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4756             :     }
    4757           0 :     temSum = _mm_extract_epi32(sad_21, 2);
    4758           0 :     if (temSum < p_best_sad32x32[2]) {
    4759           0 :         p_best_sad32x32[2] = temSum;
    4760           0 :         x_mv = _MVXT(mv) + (4 + 2) * 4;  y_mv = _MVYT(mv);
    4761           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4762             :     }
    4763           0 :     temSum = _mm_extract_epi32(sad_21, 3);
    4764           0 :     if (temSum < p_best_sad32x32[2]) {
    4765           0 :         p_best_sad32x32[2] = temSum;
    4766           0 :         x_mv = _MVXT(mv) + (4 + 3) * 4;  y_mv = _MVYT(mv);
    4767           0 :         p_best_mv32x32[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4768             :     }
    4769             : 
    4770             :     //32x32_3
    4771           0 :     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 12 * 8));
    4772           0 :     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 13 * 8));
    4773           0 :     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 14 * 8));
    4774           0 :     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 15 * 8));
    4775             : 
    4776           0 :     s4 = _mm_unpackhi_epi16(s0, Zero);
    4777           0 :     s5 = _mm_unpacklo_epi16(s0, Zero);
    4778           0 :     s6 = _mm_unpackhi_epi16(s1, Zero);
    4779           0 :     s7 = _mm_unpacklo_epi16(s1, Zero);
    4780           0 :     s0 = _mm_add_epi32(s4, s6);
    4781           0 :     s1 = _mm_add_epi32(s5, s7);
    4782             : 
    4783           0 :     s4 = _mm_unpackhi_epi16(s2, Zero);
    4784           0 :     s5 = _mm_unpacklo_epi16(s2, Zero);
    4785           0 :     s6 = _mm_unpackhi_epi16(s3, Zero);
    4786           0 :     s7 = _mm_unpacklo_epi16(s3, Zero);
    4787           0 :     s2 = _mm_add_epi32(s4, s6);
    4788           0 :     s3 = _mm_add_epi32(s5, s7);
    4789             : 
    4790           0 :     sad_31 = _mm_add_epi32(s0, s2);
    4791           0 :     sad_30 = _mm_add_epi32(s1, s3);
    4792             : 
    4793             :     //sad_30
    4794           0 :     temSum = _mm_extract_epi32(sad_30, 0);
    4795           0 :     if (temSum < p_best_sad32x32[3]) {
    4796           0 :         p_best_sad32x32[3] = temSum;
    4797           0 :         x_mv = _MVXT(mv) + (0 + 0) * 4;   y_mv = _MVYT(mv);
    4798           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4799             :     }
    4800           0 :     temSum = _mm_extract_epi32(sad_30, 1);
    4801           0 :     if (temSum < p_best_sad32x32[3]) {
    4802           0 :         p_best_sad32x32[3] = temSum;
    4803           0 :         x_mv = _MVXT(mv) + (0 + 1) * 4;  y_mv = _MVYT(mv);
    4804           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4805             :     }
    4806           0 :     temSum = _mm_extract_epi32(sad_30, 2);
    4807           0 :     if (temSum < p_best_sad32x32[3]) {
    4808           0 :         p_best_sad32x32[3] = temSum;
    4809           0 :         x_mv = _MVXT(mv) + (0 + 2) * 4;  y_mv = _MVYT(mv);
    4810           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4811             :     }
    4812           0 :     temSum = _mm_extract_epi32(sad_30, 3);
    4813           0 :     if (temSum < p_best_sad32x32[3]) {
    4814           0 :         p_best_sad32x32[3] = temSum;
    4815           0 :         x_mv = _MVXT(mv) + (0 + 3) * 4;  y_mv = _MVYT(mv);
    4816           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4817             :     }
    4818             : 
    4819             :     //sad_31
    4820           0 :     temSum = _mm_extract_epi32(sad_31, 0);
    4821           0 :     if (temSum < p_best_sad32x32[3]) {
    4822           0 :         p_best_sad32x32[3] = temSum;
    4823           0 :         x_mv = _MVXT(mv) + (4 + 0) * 4;   y_mv = _MVYT(mv);
    4824           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4825             :     }
    4826           0 :     temSum = _mm_extract_epi32(sad_31, 1);
    4827           0 :     if (temSum < p_best_sad32x32[3]) {
    4828           0 :         p_best_sad32x32[3] = temSum;
    4829           0 :         x_mv = _MVXT(mv) + (4 + 1) * 4;  y_mv = _MVYT(mv);
    4830           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4831             :     }
    4832           0 :     temSum = _mm_extract_epi32(sad_31, 2);
    4833           0 :     if (temSum < p_best_sad32x32[3]) {
    4834           0 :         p_best_sad32x32[3] = temSum;
    4835           0 :         x_mv = _MVXT(mv) + (4 + 2) * 4;  y_mv = _MVYT(mv);
    4836           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4837             :     }
    4838           0 :     temSum = _mm_extract_epi32(sad_31, 3);
    4839           0 :     if (temSum < p_best_sad32x32[3]) {
    4840           0 :         p_best_sad32x32[3] = temSum;
    4841           0 :         x_mv = _MVXT(mv) + (4 + 3) * 4;  y_mv = _MVYT(mv);
    4842           0 :         p_best_mv32x32[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4843             :     }
    4844             : 
    4845           0 :     sad_0 = _mm_add_epi32(_mm_add_epi32(sad_00, sad_10), _mm_add_epi32(sad_20, sad_30));
    4846           0 :     sad_1 = _mm_add_epi32(_mm_add_epi32(sad_01, sad_11), _mm_add_epi32(sad_21, sad_31));
    4847             : 
    4848             :     //sad_0
    4849           0 :     temSum = _mm_extract_epi32(sad_0, 0);
    4850           0 :     if (temSum < p_best_sad64x64[0]) {
    4851           0 :         p_best_sad64x64[0] = temSum;
    4852           0 :         x_mv = _MVXT(mv) + (0 + 0) * 4;   y_mv = _MVYT(mv);
    4853           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4854             :     }
    4855           0 :     temSum = _mm_extract_epi32(sad_0, 1);
    4856           0 :     if (temSum < p_best_sad64x64[0]) {
    4857           0 :         p_best_sad64x64[0] = temSum;
    4858           0 :         x_mv = _MVXT(mv) + (0 + 1) * 4;  y_mv = _MVYT(mv);
    4859           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4860             :     }
    4861           0 :     temSum = _mm_extract_epi32(sad_0, 2);
    4862           0 :     if (temSum < p_best_sad64x64[0]) {
    4863           0 :         p_best_sad64x64[0] = temSum;
    4864           0 :         x_mv = _MVXT(mv) + (0 + 2) * 4;  y_mv = _MVYT(mv);
    4865           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4866             :     }
    4867           0 :     temSum = _mm_extract_epi32(sad_0, 3);
    4868           0 :     if (temSum < p_best_sad64x64[0]) {
    4869           0 :         p_best_sad64x64[0] = temSum;
    4870           0 :         x_mv = _MVXT(mv) + (0 + 3) * 4;  y_mv = _MVYT(mv);
    4871           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4872             :     }
    4873             : 
    4874             :     //sad_1
    4875           0 :     temSum = _mm_extract_epi32(sad_1, 0);
    4876           0 :     if (temSum < p_best_sad64x64[0]) {
    4877           0 :         p_best_sad64x64[0] = temSum;
    4878           0 :         x_mv = _MVXT(mv) + (4 + 0) * 4;   y_mv = _MVYT(mv);
    4879           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4880             :     }
    4881           0 :     temSum = _mm_extract_epi32(sad_1, 1);
    4882           0 :     if (temSum < p_best_sad64x64[0]) {
    4883           0 :         p_best_sad64x64[0] = temSum;
    4884           0 :         x_mv = _MVXT(mv) + (4 + 1) * 4;  y_mv = _MVYT(mv);
    4885           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4886             :     }
    4887           0 :     temSum = _mm_extract_epi32(sad_1, 2);
    4888           0 :     if (temSum < p_best_sad64x64[0]) {
    4889           0 :         p_best_sad64x64[0] = temSum;
    4890           0 :         x_mv = _MVXT(mv) + (4 + 2) * 4;  y_mv = _MVYT(mv);
    4891           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4892             :     }
    4893           0 :     temSum = _mm_extract_epi32(sad_1, 3);
    4894           0 :     if (temSum < p_best_sad64x64[0]) {
    4895           0 :         p_best_sad64x64[0] = temSum;
    4896           0 :         x_mv = _MVXT(mv) + (4 + 3) * 4;  y_mv = _MVYT(mv);
    4897           0 :         p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    4898             :     }
    4899           0 : }

Generated by: LCOV version 1.14