LCOV - code coverage report
Current view: top level - ASM_SSE2 - EbPictureOperators_Intrinsic_SSE2.c (source / functions)
Test: coverage.info            Lines:     0 / 547  (0.0 %)
Date: 2019-11-25 17:38:06      Functions: 0 / 23   (0.0 %)

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbPictureOperators_SSE2.h"
       7             : #include <emmintrin.h>
       8             : #include "EbDefinitions.h"
       9             : 
      10             : /*******************************************************************************
      11             :                       PictureAdditionKernel_INTRIN
      12             : *******************************************************************************/
      13           0 : void picture_addition_kernel4x4_sse_intrin(
      14             :     uint8_t  *pred_ptr,
      15             :     uint32_t  pred_stride,
      16             :     int16_t *residual_ptr,
      17             :     uint32_t  residual_stride,
      18             :     uint8_t  *recon_ptr,
      19             :     uint32_t  recon_stride,
      20             :     uint32_t  width,
      21             :     uint32_t  height)
      22             : {
      23             :     uint32_t y;
      24             :     __m128i xmm0, recon_0_3;
      25           0 :     xmm0 = _mm_setzero_si128();
      26             : 
      27           0 :     for (y = 0; y < 4; ++y) {
      28           0 :         recon_0_3 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred_ptr), xmm0), _mm_loadl_epi64((__m128i *)residual_ptr)), xmm0);
      29             : 
      30           0 :         *(uint32_t *)recon_ptr = _mm_cvtsi128_si32(recon_0_3);
      31           0 :         pred_ptr += pred_stride;
      32           0 :         residual_ptr += residual_stride;
      33           0 :         recon_ptr += recon_stride;
      34             :     }
      35             :     (void)width;
      36             :     (void)height;
      37             : 
      38           0 :     return;
      39             : }
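
A minimal scalar sketch (mine, not part of this source) of what each of the addition kernels in this family computes per row: widen the 8-bit prediction to 16 bits, add the 16-bit residual, and clamp to [0, 255] exactly as _mm_packus_epi16 does when repacking to unsigned bytes:

    #include <stdint.h>

    static void picture_addition_row_ref(const uint8_t *pred,
                                         const int16_t *residual,
                                         uint8_t *recon, uint32_t width) {
        for (uint32_t x = 0; x < width; ++x) {
            int32_t sum = (int32_t)pred[x] + (int32_t)residual[x];
            /* unsigned-saturating pack: negative sums clamp to 0, >255 to 255 */
            recon[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
    }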
      40             : 
      41           0 : void picture_addition_kernel8x8_sse2_intrin(
      42             :     uint8_t  *pred_ptr,
      43             :     uint32_t  pred_stride,
      44             :     int16_t *residual_ptr,
      45             :     uint32_t  residual_stride,
      46             :     uint8_t  *recon_ptr,
      47             :     uint32_t  recon_stride,
      48             :     uint32_t  width,
      49             :     uint32_t  height)
      50             : {
      51             :     __m128i recon_0_7, xmm0;
      52             :     uint32_t y;
      53             : 
      54           0 :     xmm0 = _mm_setzero_si128();
      55             : 
      56           0 :     for (y = 0; y < 8; ++y) {
      57           0 :         recon_0_7 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred_ptr), xmm0), _mm_loadu_si128((__m128i *)residual_ptr)), xmm0);
      58             : 
      59           0 :         *(uint64_t *)recon_ptr = _mm_cvtsi128_si64(recon_0_7);
      60           0 :         pred_ptr += pred_stride;
      61           0 :         residual_ptr += residual_stride;
      62           0 :         recon_ptr += recon_stride;
      63             :     }
      64             :     (void)width;
      65             :     (void)height;
      66             : 
      67           0 :     return;
      68             : }
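
Note that _mm_cvtsi128_si64, used for the store above, is only defined for 64-bit targets. A portable SSE2 alternative with the same effect (an observation on my part, not a change to the source) writes the low 64 bits of the register directly:

    /* stores the low 8 reconstructed pixels without needing a 64-bit GPR */
    _mm_storel_epi64((__m128i *)recon_ptr, recon_0_7);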
      69             : 
      70           0 : void picture_addition_kernel16x16_sse2_intrin(
      71             :     uint8_t  *pred_ptr,
      72             :     uint32_t  pred_stride,
      73             :     int16_t *residual_ptr,
      74             :     uint32_t  residual_stride,
      75             :     uint8_t  *recon_ptr,
      76             :     uint32_t  recon_stride,
      77             :     uint32_t  width,
      78             :     uint32_t  height)
      79             : {
      80             :     __m128i xmm0, xmm_clip_U8, pred_0_15, recon_0_7, recon_8_15;
      81             :     uint32_t y;
      82             : 
      83           0 :     xmm0 = _mm_setzero_si128();
      84             : 
      85           0 :     for (y = 0; y < 16; ++y) {
      86           0 :         pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
      87           0 :         recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
      88           0 :         recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
      89           0 :         xmm_clip_U8 = _mm_packus_epi16(recon_0_7, recon_8_15);
      90             : 
      91             :         _mm_storeu_si128((__m128i*)recon_ptr, xmm_clip_U8);
      92             : 
      93           0 :         pred_ptr += pred_stride;
      94           0 :         residual_ptr += residual_stride;
      95           0 :         recon_ptr += recon_stride;
      96             :     }
      97             :     (void)width;
      98             :     (void)height;
      99             : 
     100           0 :     return;
     101             : }
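
The widening used throughout these kernels relies on interleaving bytes with a zero register: on little-endian x86 the interleaved zero byte becomes the high byte of each 16-bit lane, i.e. a zero extension. A small illustrative sketch, with v a hypothetical input vector:

    __m128i zero = _mm_setzero_si128();
    /* v holds bytes b0..b15 */
    __m128i lo = _mm_unpacklo_epi8(v, zero); /* 16-bit lanes b0..b7  */
    __m128i hi = _mm_unpackhi_epi8(v, zero); /* 16-bit lanes b8..b15 */
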
     102           0 : void picture_addition_kernel32x32_sse2_intrin(
     103             :     uint8_t  *pred_ptr,
     104             :     uint32_t  pred_stride,
     105             :     int16_t *residual_ptr,
     106             :     uint32_t  residual_stride,
     107             :     uint8_t  *recon_ptr,
     108             :     uint32_t  recon_stride,
     109             :     uint32_t  width,
     110             :     uint32_t  height)
     111             : {
     112             :     uint32_t y;
     113             :     __m128i xmm0, pred_0_15, pred_16_31, recon_0_15_clipped, recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_16_31_clipped;
     114           0 :     xmm0 = _mm_setzero_si128();
     115             : 
     116           0 :     for (y = 0; y < 32; ++y) {
     117           0 :         pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
     118           0 :         pred_16_31 = _mm_loadu_si128((__m128i *)(pred_ptr + 16));
     119             : 
     120           0 :         recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
     121           0 :         recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
     122           0 :         recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
     123           0 :         recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));
     124             : 
     125           0 :         recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
     126           0 :         recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
     127             : 
     128             :         _mm_storeu_si128((__m128i*)recon_ptr, recon_0_15_clipped);
     129           0 :         _mm_storeu_si128((__m128i*)(recon_ptr + 16), recon_16_31_clipped);
     130             : 
     131           0 :         pred_ptr += pred_stride;
     132           0 :         residual_ptr += residual_stride;
     133           0 :         recon_ptr += recon_stride;
     134             :     }
     135             :     (void)width;
     136             :     (void)height;
     137             : 
     138           0 :     return;
     139             : }
     140             : 
     141           0 : void picture_addition_kernel64x64_sse2_intrin(
     142             :     uint8_t  *pred_ptr,
     143             :     uint32_t  pred_stride,
     144             :     int16_t *residual_ptr,
     145             :     uint32_t  residual_stride,
     146             :     uint8_t  *recon_ptr,
     147             :     uint32_t  recon_stride,
     148             :     uint32_t  width,
     149             :     uint32_t  height)
     150             : {
     151             :     uint32_t y;
     152             : 
     153             :     __m128i xmm0, pred_0_15, pred_16_31, pred_32_47, pred_48_63;
     154             :     __m128i recon_0_15_clipped, recon_16_31_clipped, recon_32_47_clipped, recon_48_63_clipped;
     155             :     __m128i recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_32_39, recon_40_47, recon_48_55, recon_56_63;
     156             : 
     157           0 :     xmm0 = _mm_setzero_si128();
     158             : 
     159           0 :     for (y = 0; y < 64; ++y) {
     160           0 :         pred_0_15 = _mm_loadu_si128((__m128i *)pred_ptr);
     161           0 :         pred_16_31 = _mm_loadu_si128((__m128i *)(pred_ptr + 16));
     162           0 :         pred_32_47 = _mm_loadu_si128((__m128i *)(pred_ptr + 32));
     163           0 :         pred_48_63 = _mm_loadu_si128((__m128i *)(pred_ptr + 48));
     164             : 
     165           0 :         recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residual_ptr));
     166           0 :         recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 8)));
     167           0 :         recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 16)));
     168           0 :         recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 24)));
     169           0 :         recon_32_39 = _mm_add_epi16(_mm_unpacklo_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 32)));
     170           0 :         recon_40_47 = _mm_add_epi16(_mm_unpackhi_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 40)));
     171           0 :         recon_48_55 = _mm_add_epi16(_mm_unpacklo_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 48)));
     172           0 :         recon_56_63 = _mm_add_epi16(_mm_unpackhi_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residual_ptr + 56)));
     173             : 
     174           0 :         recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
     175           0 :         recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
     176           0 :         recon_32_47_clipped = _mm_packus_epi16(recon_32_39, recon_40_47);
     177           0 :         recon_48_63_clipped = _mm_packus_epi16(recon_48_55, recon_56_63);
     178             : 
     179             :         _mm_storeu_si128((__m128i*)recon_ptr, recon_0_15_clipped);
     180           0 :         _mm_storeu_si128((__m128i*)(recon_ptr + 16), recon_16_31_clipped);
     181           0 :         _mm_storeu_si128((__m128i*)(recon_ptr + 32), recon_32_47_clipped);
     182           0 :         _mm_storeu_si128((__m128i*)(recon_ptr + 48), recon_48_63_clipped);
     183             : 
     184           0 :         pred_ptr += pred_stride;
     185           0 :         residual_ptr += residual_stride;
     186           0 :         recon_ptr += recon_stride;
     187             :     }
     188             :     (void)width;
     189             :     (void)height;
     190             : 
     191           0 :     return;
     192             : }
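
Every kernel in this family ignores its width and height arguments because the block size is baked into the function name; the uniform signature exists so the kernels can share one function-pointer type. A hypothetical dispatch sketch (the table and indexing are mine, not the encoder's actual dispatcher):

    typedef void (*AdditionKernel)(uint8_t *, uint32_t, int16_t *, uint32_t,
                                   uint8_t *, uint32_t, uint32_t, uint32_t);

    /* indexed by log2(block_size) - 2: 4x4, 8x8, 16x16, 32x32, 64x64 */
    static const AdditionKernel addition_kernels[5] = {
        picture_addition_kernel4x4_sse_intrin,
        picture_addition_kernel8x8_sse2_intrin,
        picture_addition_kernel16x16_sse2_intrin,
        picture_addition_kernel32x32_sse2_intrin,
        picture_addition_kernel64x64_sse2_intrin,
    };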
     193             : 
     194             : /******************************************************************************************************
     195             : residual_kernel
     196             : ***********************************************************************************************************/
     197           0 : void residual_kernel_sub_sampled4x4_sse_intrin(
     198             :     uint8_t   *input,
     199             :     uint32_t   input_stride,
     200             :     uint8_t   *pred,
     201             :     uint32_t   pred_stride,
     202             :     int16_t  *residual,
     203             :     uint32_t   residual_stride,
     204             :     uint32_t   area_width,
     205             :     uint32_t   area_height,
     206             :     uint8_t    last_line)
     207             : {
     208           0 :     __m128i residual_0_3, xmm0 = _mm_setzero_si128();
     209             :     uint32_t y;
      210             :     //hard-coded subsampling dimensions, keep residual_stride
     211           0 :     area_height >>= 1;
     212           0 :     input_stride <<= 1;
     213           0 :     pred_stride <<= 1;
     214             : 
     215           0 :     for (y = 0; y < area_height; ++y) {
     216           0 :         residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)input), xmm0),
     217           0 :             _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred), xmm0));
     218             : 
     219           0 :         *(uint64_t *)residual = _mm_cvtsi128_si64(residual_0_3);
     220             : 
     221           0 :         residual += residual_stride;
     222           0 :         *(uint64_t *)residual = _mm_cvtsi128_si64(residual_0_3);
     223             : 
     224           0 :         input += input_stride;
     225           0 :         pred += pred_stride;
     226           0 :         residual += residual_stride;
     227             :     }
     228             :     (void)area_width;
     229             :     //compute the last line
     230             : 
     231           0 :     if (last_line) {
     232           0 :         input -= (input_stride) >> 1;
     233           0 :         pred -= (pred_stride) >> 1;
     234           0 :         residual -= residual_stride;
     235           0 :         residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)input), xmm0),
     236           0 :             _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)pred), xmm0));
     237             : 
     238           0 :         *(uint64_t *)residual = _mm_cvtsi128_si64(residual_0_3);
     239             :     }
     240             : 
     241           0 :     return;
     242             : }
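
The sub-sampled residual kernels all follow the same field scheme: the residual is computed from the even (top-field) input rows only, each result row is duplicated into the odd (bottom-field) row below it, and when last_line is set the final row is recomputed from the true last input row. A scalar sketch of that contract (mine, assuming strides in elements as above and an even area_height):

    #include <stdint.h>

    static void residual_sub_sampled_ref(const uint8_t *input, uint32_t input_stride,
                                         const uint8_t *pred, uint32_t pred_stride,
                                         int16_t *residual, uint32_t residual_stride,
                                         uint32_t area_width, uint32_t area_height,
                                         uint8_t last_line) {
        for (uint32_t y = 0; y < area_height; y += 2) {
            for (uint32_t x = 0; x < area_width; ++x) {
                int16_t r = (int16_t)input[x] - (int16_t)pred[x];
                residual[x] = r;                   /* top-field row           */
                residual[residual_stride + x] = r; /* duplicated bottom field */
            }
            input += 2 * input_stride;             /* skip the odd input row  */
            pred += 2 * pred_stride;
            residual += 2 * residual_stride;
        }
        if (last_line) { /* recompute the last row from the real input line */
            input -= input_stride;
            pred -= pred_stride;
            residual -= residual_stride;
            for (uint32_t x = 0; x < area_width; ++x)
                residual[x] = (int16_t)input[x] - (int16_t)pred[x];
        }
    }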
     243             : 
     244           0 : void residual_kernel_sub_sampled8x8_sse2_intrin(
     245             :     uint8_t   *input,
     246             :     uint32_t   input_stride,
     247             :     uint8_t   *pred,
     248             :     uint32_t   pred_stride,
     249             :     int16_t  *residual,
     250             :     uint32_t   residual_stride,
     251             :     uint32_t   area_width,
     252             :     uint32_t   area_height,
     253             :     uint8_t    last_line
     254             : )
     255             : {
     256             :     __m128i xmm0, residual_0_7;
     257             :     uint32_t y;
     258             : 
     259           0 :     xmm0 = _mm_setzero_si128();
      260             :     //hard-coded subsampling dimensions, keep residual_stride
     261           0 :     area_height >>= 1;
     262           0 :     input_stride <<= 1;
     263           0 :     pred_stride <<= 1;
     264             : 
     265           0 :     for (y = 0; y < area_height; ++y) {
     266           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
     267             : 
     268             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     269             : 
     270           0 :         residual += residual_stride;
     271             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     272             : 
     273           0 :         input += input_stride;
     274           0 :         pred += pred_stride;
     275           0 :         residual += residual_stride;
     276             :     }
     277             :     (void)area_width;
     278             :     //compute the last line
     279           0 :     if (last_line) {
     280           0 :         input -= (input_stride) >> 1;
     281           0 :         pred -= (pred_stride) >> 1;
     282           0 :         residual -= residual_stride;
     283             : 
     284           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
     285             : 
     286             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     287             :     }
     288             : 
     289           0 :     return;
     290             : }
     291             : 
     292           0 : void residual_kernel_sub_sampled16x16_sse2_intrin(
     293             :     uint8_t   *input,
     294             :     uint32_t   input_stride,
     295             :     uint8_t   *pred,
     296             :     uint32_t   pred_stride,
     297             :     int16_t  *residual,
     298             :     uint32_t   residual_stride,
     299             :     uint32_t   area_width,
     300             :     uint32_t   area_height,
     301             :     uint8_t    last_line
     302             : )
     303             : {
     304             :     __m128i xmm0, residual_0_7, residual_8_15;
     305             :     uint32_t y;
     306             : 
     307           0 :     xmm0 = _mm_setzero_si128();
      308             :     //hard-coded subsampling dimensions, keep residual_stride
     309           0 :     area_height >>= 1;
     310           0 :     input_stride <<= 1;
     311           0 :     pred_stride <<= 1;
     312             : 
     313           0 :     for (y = 0; y < area_height; ++y) {
     314           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     315           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     316             : 
     317             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     318           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     319             : 
     320           0 :         residual += residual_stride;
     321             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     322           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     323             : 
     324           0 :         input += input_stride;
     325           0 :         pred += pred_stride;
     326           0 :         residual += residual_stride;
     327             :     }
     328             :     (void)area_width;
     329             :     //compute the last line
     330             : 
     331           0 :     if (last_line) {
     332           0 :         input -= (input_stride) >> 1;
     333           0 :         pred -= (pred_stride) >> 1;
     334           0 :         residual -= residual_stride;
     335             : 
     336           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     337           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     338             : 
     339             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     340           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     341             :     }
     342           0 :     return;
     343             : }
     344             : 
     345           0 : void residual_kernel_sub_sampled32x32_sse2_intrin(
     346             :     uint8_t   *input,
     347             :     uint32_t   input_stride,
     348             :     uint8_t   *pred,
     349             :     uint32_t   pred_stride,
     350             :     int16_t  *residual,
     351             :     uint32_t   residual_stride,
     352             :     uint32_t   area_width,
     353             :     uint32_t   area_height,
     354             :     uint8_t    last_line)
     355             : {
     356             :     __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
     357             :     uint32_t y;
     358             : 
     359           0 :     xmm0 = _mm_setzero_si128();
     360             : 
      361             :     //hard-coded subsampling dimensions, keep residual_stride
     362           0 :     area_height >>= 1;
     363           0 :     input_stride <<= 1;
     364           0 :     pred_stride <<= 1;
     365             : 
     366           0 :     for (y = 0; y < area_height; ++y) {
     367           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     368           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     369           0 :         residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     370           0 :         residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     371             : 
     372             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     373           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     374           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     375           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
     376             : 
     377           0 :         residual += residual_stride;
     378             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     379           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     380           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     381           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
     382             : 
     383           0 :         input += input_stride;
     384           0 :         pred += pred_stride;
     385           0 :         residual += residual_stride;
     386             :     }
     387             :     (void)area_width;
     388             :     //compute the last line
     389             : 
     390           0 :     if (last_line) {
     391           0 :         input -= (input_stride) >> 1;
     392           0 :         pred -= (pred_stride) >> 1;
     393           0 :         residual -= residual_stride;
     394             : 
     395           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     396           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     397           0 :         residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     398           0 :         residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     399             : 
     400             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     401           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     402           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     403           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
     404             :     }
     405             : 
     406           0 :     return;
     407             : }
     408             : 
     409           0 : void residual_kernel_sub_sampled64x64_sse2_intrin(
     410             :     uint8_t   *input,
     411             :     uint32_t   input_stride,
     412             :     uint8_t   *pred,
     413             :     uint32_t   pred_stride,
     414             :     int16_t  *residual,
     415             :     uint32_t   residual_stride,
     416             :     uint32_t   area_width,
     417             :     uint32_t   area_height,
     418             :     uint8_t    last_line)
     419             : {
      420             :     __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, residual_32_39, residual_40_47, residual_48_55, residual_56_63;
     421             :     uint32_t y;
     422             : 
     423           0 :     xmm0 = _mm_setzero_si128();
     424             : 
      425             :     //hard-coded subsampling dimensions, keep residual_stride
     426           0 :     area_height >>= 1;
     427           0 :     input_stride <<= 1;
     428           0 :     pred_stride <<= 1;
     429             : 
     430           0 :     for (y = 0; y < area_height; ++y) {
     431           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     432           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     433           0 :         residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     434           0 :         residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
      435           0 :         residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
     436           0 :         residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
     437           0 :         residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
     438           0 :         residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
     439             : 
     440             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     441           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     442           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     443           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
      444           0 :         _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
     445           0 :         _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
     446           0 :         _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
     447           0 :         _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
     448             : 
     449             :         //duplicate top field residual to bottom field
     450           0 :         residual += residual_stride;
     451             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     452           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     453           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     454           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
      455           0 :         _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
     456           0 :         _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
     457           0 :         _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
     458           0 :         _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
     459             : 
     460           0 :         input += input_stride;
     461           0 :         pred += pred_stride;
     462           0 :         residual += residual_stride;
     463             :     }
     464             :     (void)area_width;
     465             :     //compute the last line
     466             : 
     467           0 :     if (last_line) {
     468           0 :         input -= (input_stride) >> 1;
     469           0 :         pred -= (pred_stride) >> 1;
     470           0 :         residual -= residual_stride;
     471             : 
     472           0 :         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     473           0 :         residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
     474           0 :         residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
     475           0 :         residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
      476           0 :         residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
     477           0 :         residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
     478           0 :         residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
     479           0 :         residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
     480             : 
     481             :         _mm_storeu_si128((__m128i*)residual, residual_0_7);
     482           0 :         _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
     483           0 :         _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
     484           0 :         _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
      485           0 :         _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
     486           0 :         _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
     487           0 :         _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
     488           0 :         _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
     489             :     }
     490             : 
     491           0 :     return;
     492             : }
     493             : /******************************************************************************************************
     494             :                                        residual_kernel16bit_sse2_intrin
     495             : ******************************************************************************************************/
     496           0 : void residual_kernel16bit_sse2_intrin(
     497             :     uint16_t   *input,
     498             :     uint32_t   input_stride,
     499             :     uint16_t   *pred,
     500             :     uint32_t   pred_stride,
     501             :     int16_t  *residual,
     502             :     uint32_t   residual_stride,
     503             :     uint32_t   area_width,
     504             :     uint32_t   area_height)
     505             : {
     506             :     uint32_t x, y;
     507             :     __m128i residual0, residual1;
     508             : 
     509           0 :     if (area_width == 4)
     510             :     {
     511           0 :         for (y = 0; y < area_height; y += 2) {
     512           0 :             residual0 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)input), _mm_loadl_epi64((__m128i*)pred));
     513           0 :             residual1 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)(input + input_stride)), _mm_loadl_epi64((__m128i*)(pred + pred_stride)));
     514             : 
     515           0 :             _mm_storel_epi64((__m128i*)residual, residual0);
     516           0 :             _mm_storel_epi64((__m128i*)(residual + residual_stride), residual1);
     517             : 
     518           0 :             input += input_stride << 1;
     519           0 :             pred += pred_stride << 1;
     520           0 :             residual += residual_stride << 1;
     521             :         }
     522             :     }
     523           0 :     else if (area_width == 8) {
     524           0 :         for (y = 0; y < area_height; y += 2) {
     525           0 :             residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
     526           0 :             residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride)));
     527             : 
     528             :             _mm_storeu_si128((__m128i*) residual, residual0);
     529           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride), residual1);
     530             : 
     531           0 :             input += input_stride << 1;
     532           0 :             pred += pred_stride << 1;
     533           0 :             residual += residual_stride << 1;
     534             :         }
     535             :     }
     536           0 :     else if (area_width == 16) {
     537             :         __m128i residual2, residual3;
     538             : 
     539           0 :         for (y = 0; y < area_height; y += 2) {
     540           0 :             residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
     541           0 :             residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8)));
     542           0 :             residual2 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride)));
     543           0 :             residual3 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 8)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 8)));
     544             : 
     545             :             _mm_storeu_si128((__m128i*)residual, residual0);
     546           0 :             _mm_storeu_si128((__m128i*)(residual + 8), residual1);
     547           0 :             _mm_storeu_si128((__m128i*)(residual + residual_stride), residual2);
     548           0 :             _mm_storeu_si128((__m128i*)(residual + residual_stride + 8), residual3);
     549             : 
     550           0 :             input += input_stride << 1;
     551           0 :             pred += pred_stride << 1;
     552           0 :             residual += residual_stride << 1;
     553             :         }
     554             :     }
     555           0 :     else if (area_width == 32) {
     556           0 :         for (y = 0; y < area_height; y += 2) {
     557             :             //residual[columnIndex] = ((int16_t)input[columnIndex]) - ((int16_t)pred[columnIndex]);
     558           0 :             _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
     559           0 :             _mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
     560           0 :             _mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
     561           0 :             _mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
     562             : 
     563           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride))));
     564           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 8)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 8))));
     565           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 16)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 16))));
     566           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 24)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 24))));
     567             : 
     568           0 :             input += input_stride << 1;
     569           0 :             pred += pred_stride << 1;
     570           0 :             residual += residual_stride << 1;
     571             :         }
     572             :     }
      573           0 :     else if (area_width == 64) { // Branch not tested because the encoder's max txb_size was 32
     574             : 
     575           0 :         for (y = 0; y < area_height; y += 2) {
      576             :             //residual[columnIndex] = ((int16_t)input[columnIndex]) - ((int16_t)pred[columnIndex]); 8 lanes per _mm_sub_epi16
     577           0 :             _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
     578           0 :             _mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
     579           0 :             _mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
     580           0 :             _mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
     581           0 :             _mm_storeu_si128((__m128i*) (residual + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 32)), _mm_loadu_si128((__m128i*)(pred + 32))));
     582           0 :             _mm_storeu_si128((__m128i*) (residual + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 40)), _mm_loadu_si128((__m128i*)(pred + 40))));
     583           0 :             _mm_storeu_si128((__m128i*) (residual + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 48)), _mm_loadu_si128((__m128i*)(pred + 48))));
     584           0 :             _mm_storeu_si128((__m128i*) (residual + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 56)), _mm_loadu_si128((__m128i*)(pred + 56))));
     585             : 
     586           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride))));
     587           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 8)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 8))));
     588           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 16)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 16))));
     589           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 24)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 24))));
     590           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 32)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 32))));
     591           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 40)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 40))));
     592           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 48)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 48))));
     593           0 :             _mm_storeu_si128((__m128i*) (residual + residual_stride + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride + 56)), _mm_loadu_si128((__m128i*)(pred + pred_stride + 56))));
     594             : 
     595           0 :             input += input_stride << 1;
     596           0 :             pred += pred_stride << 1;
     597           0 :             residual += residual_stride << 1;
     598             :         }
     599             :     }
     600             :     else {
     601           0 :         uint32_t inputStrideDiff = 2 * input_stride;
     602           0 :         uint32_t predStrideDiff = 2 * pred_stride;
     603           0 :         uint32_t residualStrideDiff = 2 * residual_stride;
     604           0 :         inputStrideDiff -= area_width;
     605           0 :         predStrideDiff -= area_width;
     606           0 :         residualStrideDiff -= area_width;
     607             : 
     608           0 :         if (!(area_width & 7)) {
     609           0 :             for (x = 0; x < area_height; x += 2) {
     610           0 :                 for (y = 0; y < area_width; y += 8) {
     611           0 :                     _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
     612           0 :                     _mm_storeu_si128((__m128i*) (residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride))));
     613             : 
     614           0 :                     input += 8;
     615           0 :                     pred += 8;
     616           0 :                     residual += 8;
     617             :                 }
     618           0 :                 input = input + inputStrideDiff;
     619           0 :                 pred = pred + predStrideDiff;
     620           0 :                 residual = residual + residualStrideDiff;
     621             :             }
     622             :         }
     623             :         else {
     624           0 :             for (x = 0; x < area_height; x += 2) {
     625           0 :                 for (y = 0; y < area_width; y += 4) {
     626           0 :                     _mm_storel_epi64((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
     627           0 :                     _mm_storel_epi64((__m128i*) (residual + residual_stride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + input_stride)), _mm_loadu_si128((__m128i*)(pred + pred_stride))));
     628             : 
     629           0 :                     input += 4;
     630           0 :                     pred += 4;
     631           0 :                     residual += 4;
     632             :                 }
     633           0 :                 input = input + inputStrideDiff;
     634           0 :                 pred = pred + predStrideDiff;
     635           0 :                 residual = residual + residualStrideDiff;
     636             :             }
     637             :         }
     638             :     }
     639           0 :     return;
     640             : }
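
Every branch above implements the same elementwise difference; the width-specific paths only change how many 128-bit operations cover a row. The generic tail handles widths that are a multiple of 4: 8-wide stores when width % 8 == 0, otherwise 4-wide _mm_storel_epi64 stores (note the 4-wide path still loads 16 bytes per _mm_loadu_si128, so only the low 4 lanes of each load are used). A scalar equivalent for reference (my sketch, not part of the source):

    static void residual_kernel16bit_ref(const uint16_t *input, uint32_t input_stride,
                                         const uint16_t *pred, uint32_t pred_stride,
                                         int16_t *residual, uint32_t residual_stride,
                                         uint32_t area_width, uint32_t area_height) {
        for (uint32_t y = 0; y < area_height; ++y) {
            for (uint32_t x = 0; x < area_width; ++x)
                residual[x] = (int16_t)input[x] - (int16_t)pred[x];
            input += input_stride;
            pred += pred_stride;
            residual += residual_stride;
        }
    }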
     641             : 
     642             : /******************************************************************************************************
     643             :                                    picture_addition_kernel16bit_sse2_intrin
     644             : ******************************************************************************************************/
     645             : 
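The 10-bit path below needs no widening: pred and residual are already 16-bit, so each row reduces to a saturating add followed by a clip to the 10-bit range [0, 1023] (the _mm_min_epi16/_mm_max_epi16 pair). A scalar sketch of one row (mine; for valid 10-bit inputs the 32-bit sum never overflows, so it matches the _mm_adds_epi16 path):

    static void picture_addition_row16bit_ref(const uint16_t *pred,
                                              const int16_t *residual,
                                              uint16_t *recon, uint32_t width) {
        for (uint32_t x = 0; x < width; ++x) {
            int32_t sum = (int32_t)pred[x] + (int32_t)residual[x];
            recon[x] = (uint16_t)(sum < 0 ? 0 : (sum > 1023 ? 1023 : sum));
        }
    }
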
     646           0 : void picture_addition_kernel16bit_sse2_intrin(
     647             :     uint16_t  *pred_ptr,
     648             :     uint32_t  pred_stride,
     649             :     int16_t *residual_ptr,
     650             :     uint32_t  residual_stride,
     651             :     uint16_t  *recon_ptr,
     652             :     uint32_t  recon_stride,
     653             :     uint32_t  width,
     654             :     uint32_t  height)
     655             : {
     656             :     __m128i xmm_0, xmm_Max10bit;
     657             : 
     658             :     uint32_t y, x;
     659             : 
     660           0 :     xmm_0 = _mm_setzero_si128();
     661           0 :     xmm_Max10bit = _mm_set1_epi16(1023);
     662             : 
     663           0 :     if (width == 4)
     664             :     {
     665             :         __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
     666           0 :         for (y = 0; y < height; y += 2) {
     667           0 :             xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)pred_ptr), _mm_loadl_epi64((__m128i*)residual_ptr));
     668           0 :             xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)(pred_ptr + pred_stride)), _mm_loadl_epi64((__m128i*)(residual_ptr + residual_stride)));
     669             : 
     670           0 :             xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
     671           0 :             xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);
     672             : 
     673           0 :             _mm_storel_epi64((__m128i*) recon_ptr, xmm_clip3_0_3);
     674           0 :             _mm_storel_epi64((__m128i*) (recon_ptr + recon_stride), xmm_clip3_s0_s3);
     675             : 
     676           0 :             pred_ptr += pred_stride << 1;
     677           0 :             residual_ptr += residual_stride << 1;
     678           0 :             recon_ptr += recon_stride << 1;
     679             :         }
     680             :     }
     681           0 :     else if (width == 8) {
     682             :         __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
     683             : 
     684           0 :         for (y = 0; y < height; y += 2) {
     685           0 :             xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)pred_ptr), _mm_loadu_si128((__m128i*)residual_ptr));
     686           0 :             xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride)));
     687             : 
     688           0 :             xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
     689           0 :             xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
     690             : 
     691             :             _mm_storeu_si128((__m128i*) recon_ptr, xmm_clip3_0_7);
     692           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride), xmm_clip3_s0_s7);
     693             : 
     694           0 :             pred_ptr += pred_stride << 1;
     695           0 :             residual_ptr += residual_stride << 1;
     696           0 :             recon_ptr += recon_stride << 1;
     697             :         }
     698             :     }
     699           0 :     else if (width == 16) {
     700             :         __m128i sum_0_7, sum_8_15, sum_s0_s7, sum_s8_s15, clip3_0_7, clip3_8_15, clip3_s0_s7, clip3_s8_s15;
     701             : 
     702           0 :         for (y = 0; y < height; y += 2) {
     703           0 :             sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)pred_ptr), _mm_loadu_si128((__m128i*)residual_ptr));
     704           0 :             sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 8)), _mm_loadu_si128((__m128i*)(residual_ptr + 8)));
     705           0 :             sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride)));
     706           0 :             sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride + 8)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride + 8)));
     707             : 
     708           0 :             clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
     709           0 :             clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
     710           0 :             clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
     711           0 :             clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
     712             : 
     713             :             _mm_storeu_si128((__m128i*) recon_ptr, clip3_0_7);
     714           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 8), clip3_8_15);
     715           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride), clip3_s0_s7);
     716           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride + 8), clip3_s8_s15);
     717             : 
     718           0 :             pred_ptr += pred_stride << 1;
     719           0 :             residual_ptr += residual_stride << 1;
     720           0 :             recon_ptr += recon_stride << 1;
     721             :         }
     722             :     }
     723           0 :     else if (width == 32) {
     724             :         __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_s0_s7, sum_s8_s15, sum_s16_s23, sum_s24_s31;
     725             :         __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_s0_s7, clip3_s8_s15, clip3_s16_s23, clip3_s24_s31;
     726             : 
     727           0 :         for (y = 0; y < height; y += 2) {
     728           0 :             sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)pred_ptr), _mm_loadu_si128((__m128i*)residual_ptr));
     729           0 :             sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 8)), _mm_loadu_si128((__m128i*)(residual_ptr + 8)));
     730           0 :             sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 16)), _mm_loadu_si128((__m128i*)(residual_ptr + 16)));
     731           0 :             sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 24)), _mm_loadu_si128((__m128i*)(residual_ptr + 24)));
     732             : 
     733           0 :             sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride)));
     734           0 :             sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride + 8)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride + 8)));
     735           0 :             sum_s16_s23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride + 16)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride + 16)));
     736           0 :             sum_s24_s31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride + 24)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride + 24)));
     737             : 
     738           0 :             clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
     739           0 :             clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
     740           0 :             clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
     741           0 :             clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
     742             : 
     743           0 :             clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
     744           0 :             clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
     745           0 :             clip3_s16_s23 = _mm_max_epi16(_mm_min_epi16(sum_s16_s23, xmm_Max10bit), xmm_0);
     746           0 :             clip3_s24_s31 = _mm_max_epi16(_mm_min_epi16(sum_s24_s31, xmm_Max10bit), xmm_0);
     747             : 
     748             :             _mm_storeu_si128((__m128i*) recon_ptr, clip3_0_7);
     749           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 8), clip3_8_15);
     750           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 16), clip3_16_23);
     751           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 24), clip3_24_31);
     752             : 
     753           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride), clip3_s0_s7);
     754           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride + 8), clip3_s8_s15);
     755           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride + 16), clip3_s16_s23);
     756           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride + 24), clip3_s24_s31);
     757             : 
     758           0 :             pred_ptr += pred_stride << 1;
     759           0 :             residual_ptr += residual_stride << 1;
     760           0 :             recon_ptr += recon_stride << 1;
     761             :         }
     762             :     }
      763           0 :     else if (width == 64) { // Branch not tested because the max TU size was 32 at the time of development
     764             : 
     765             :         __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_32_39, sum_40_47, sum_48_55, sum_56_63;
     766             :         __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_32_39, clip3_40_47, clip3_48_55, clip3_56_63;
     767             : 
     768           0 :         for (y = 0; y < height; ++y) {
     769           0 :             sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)pred_ptr), _mm_loadu_si128((__m128i*)residual_ptr));
     770           0 :             sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 8)), _mm_loadu_si128((__m128i*)(residual_ptr + 8)));
     771           0 :             sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 16)), _mm_loadu_si128((__m128i*)(residual_ptr + 16)));
     772           0 :             sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 24)), _mm_loadu_si128((__m128i*)(residual_ptr + 24)));
     773           0 :             sum_32_39 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 32)), _mm_loadu_si128((__m128i*)(residual_ptr + 32)));
     774           0 :             sum_40_47 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 40)), _mm_loadu_si128((__m128i*)(residual_ptr + 40)));
     775           0 :             sum_48_55 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 48)), _mm_loadu_si128((__m128i*)(residual_ptr + 48)));
     776           0 :             sum_56_63 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + 56)), _mm_loadu_si128((__m128i*)(residual_ptr + 56)));
     777             : 
     778           0 :             clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
     779           0 :             clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
     780           0 :             clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
     781           0 :             clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
     782           0 :             clip3_32_39 = _mm_max_epi16(_mm_min_epi16(sum_32_39, xmm_Max10bit), xmm_0);
     783           0 :             clip3_40_47 = _mm_max_epi16(_mm_min_epi16(sum_40_47, xmm_Max10bit), xmm_0);
     784           0 :             clip3_48_55 = _mm_max_epi16(_mm_min_epi16(sum_48_55, xmm_Max10bit), xmm_0);
     785           0 :             clip3_56_63 = _mm_max_epi16(_mm_min_epi16(sum_56_63, xmm_Max10bit), xmm_0);
     786             : 
     787             :             _mm_storeu_si128((__m128i*) recon_ptr, clip3_0_7);
     788           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 8), clip3_8_15);
     789           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 16), clip3_16_23);
     790           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 24), clip3_24_31);
     791           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 32), clip3_32_39);
     792           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 40), clip3_40_47);
     793           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 48), clip3_48_55);
     794           0 :             _mm_storeu_si128((__m128i*) (recon_ptr + 56), clip3_56_63);
     795             : 
     796           0 :             pred_ptr += pred_stride;
     797           0 :             residual_ptr += residual_stride;
     798           0 :             recon_ptr += recon_stride;
     799             :         }
     800             :     }
     801             :     else
     802             :     {
     803           0 :         uint32_t predStrideDiff = 2 * pred_stride;
     804           0 :         uint32_t residualStrideDiff = 2 * residual_stride;
     805           0 :         uint32_t reconStrideDiff = 2 * recon_stride;
     806           0 :         predStrideDiff -= width;
     807           0 :         residualStrideDiff -= width;
     808           0 :         reconStrideDiff -= width;
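                      :         // The inner loops below advance the pointers by `width`, so adding
                      :         // (2 * stride - width) afterwards lands each pointer at the start of
                      :         // the next pair of rows.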
     809             : 
     810           0 :         if (!(width & 7)) {
     811             :             __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
     812             : 
     813           0 :             for (x = 0; x < height; x += 2) {
     814           0 :                 for (y = 0; y < width; y += 8) {
     815           0 :                     xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)pred_ptr), _mm_loadu_si128((__m128i*)residual_ptr));
     816           0 :                     xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(pred_ptr + pred_stride)), _mm_loadu_si128((__m128i*)(residual_ptr + residual_stride)));
     817             : 
     818           0 :                     xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
     819           0 :                     xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
     820             : 
     821             :                     _mm_storeu_si128((__m128i*) recon_ptr, xmm_clip3_0_7);
     822           0 :                     _mm_storeu_si128((__m128i*) (recon_ptr + recon_stride), xmm_clip3_s0_s7);
     823             : 
     824           0 :                     pred_ptr += 8;
     825           0 :                     residual_ptr += 8;
     826           0 :                     recon_ptr += 8;
     827             :                 }
     828           0 :                 pred_ptr += predStrideDiff;
     829           0 :                 residual_ptr += residualStrideDiff;
     830           0 :                 recon_ptr += reconStrideDiff;
     831             :             }
     832             :         }
     833             :         else {
     834             :             __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
     835           0 :             for (x = 0; x < height; x += 2) {
     836           0 :                 for (y = 0; y < width; y += 4) {
                      :                     // Only 4 samples are stored per iteration, so 64-bit loads
                      :                     // avoid reading past the end of each row.
     837           0 :                     xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)pred_ptr), _mm_loadl_epi64((__m128i*)residual_ptr));
     838           0 :                     xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)(pred_ptr + pred_stride)), _mm_loadl_epi64((__m128i*)(residual_ptr + residual_stride)));
     839             : 
     840           0 :                     xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
     841           0 :                     xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);
     842             : 
     843           0 :                     _mm_storel_epi64((__m128i*) recon_ptr, xmm_clip3_0_3);
     844           0 :                     _mm_storel_epi64((__m128i*) (recon_ptr + recon_stride), xmm_clip3_s0_s3);
     845             : 
     846           0 :                     pred_ptr += 4;
     847           0 :                     residual_ptr += 4;
     848           0 :                     recon_ptr += 4;
     849             :                 }
     850           0 :                 pred_ptr += predStrideDiff;
     851           0 :                 residual_ptr += residualStrideDiff;
     852           0 :                 recon_ptr += reconStrideDiff;
     853             :             }
     854             :         }
     855             :     }
     856           0 :     return;
     857             : }
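                      : 
                      : /*
                      :  * Reference-only scalar sketch of the 10-bit addition kernel above
                      :  * (the name and the uint16_t pointer types are assumptions for
                      :  * illustration, not part of this file):
                      :  * recon = clip(pred + residual, 0, 1023) per sample.
                      :  */
                      : static void picture_addition_16bit_scalar_sketch(
                      :     uint16_t *pred_ptr, uint32_t pred_stride,
                      :     int16_t  *residual_ptr, uint32_t residual_stride,
                      :     uint16_t *recon_ptr, uint32_t recon_stride,
                      :     uint32_t  width, uint32_t height)
                      : {
                      :     uint32_t x, y;
                      :     for (y = 0; y < height; ++y) {
                      :         for (x = 0; x < width; ++x) {
                      :             const int32_t sum = (int32_t)pred_ptr[x] + residual_ptr[x];
                      :             recon_ptr[x] = (uint16_t)(sum < 0 ? 0 : (sum > 1023 ? 1023 : sum));
                      :         }
                      :         pred_ptr     += pred_stride;
                      :         residual_ptr += residual_stride;
                      :         recon_ptr    += recon_stride;
                      :     }
                      : }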
     858             : 
     859           0 : static INLINE __m128i Distortion_SSE2_INTRIN(const __m128i input,
     860             :     const __m128i recon, const __m128i sum) {
     861           0 :     const __m128i in = _mm_unpacklo_epi8(input, _mm_setzero_si128());
     862           0 :     const __m128i re = _mm_unpacklo_epi8(recon, _mm_setzero_si128());
     863           0 :     const __m128i diff = _mm_sub_epi16(in, re);
     864           0 :     const __m128i dist = _mm_madd_epi16(diff, diff);
     865           0 :     return _mm_add_epi32(sum, dist);
     866             : }
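                      : 
                      : /*
                      :  * Scalar view of Distortion_SSE2_INTRIN (reference only; the helper
                      :  * name is illustrative): squared error over the 8 bytes in the low
                      :  * half of each input register, which _mm_madd_epi16 accumulates
                      :  * pairwise into four 32-bit lanes.
                      :  */
                      : static uint64_t distortion_scalar_sketch(
                      :     const uint8_t *in, const uint8_t *re, int n)
                      : {
                      :     uint64_t sum = 0;
                      :     int i;
                      :     for (i = 0; i < n; ++i) {
                      :         const int d = (int)in[i] - (int)re[i];
                      :         sum += (uint64_t)(d * d);
                      :     }
                      :     return sum;
                      : }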
     867             : 
     868           0 : uint64_t spatial_full_distortion_kernel4x_n_sse2_intrin(
     869             :     uint8_t   *input,
     870             :     uint32_t   input_offset,
     871             :     uint32_t   input_stride,
     872             :     uint8_t   *recon,
     873             :     uint32_t   recon_offset,
     874             :     uint32_t   recon_stride,
     875             :     uint32_t   area_width,
     876             :     uint32_t   area_height)
     877             : {
     878           0 :     int32_t row_count = area_height;
     879           0 :     __m128i sum = _mm_setzero_si128();
     880           0 :     input += input_offset;
     881           0 :     recon += recon_offset;
     882             :     (void)area_width;
     883             : 
     884             :     do {
     885           0 :         const __m128i in = _mm_cvtsi32_si128(*(uint32_t *)input);
     886           0 :         const __m128i re = _mm_cvtsi32_si128(*(uint32_t *)recon);
     887           0 :         sum = Distortion_SSE2_INTRIN(in, re, sum);
     888           0 :         input += input_stride;
     889           0 :         recon += recon_stride;
     890           0 :     } while (--row_count);
     891             : 
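                      :     // Only 4 bytes are loaded per row, so after _mm_madd_epi16 just the two
                      :     // low 32-bit lanes of sum are nonzero; a single shift-and-add reduces it.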
     892           0 :     sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
     893             : 
     894           0 :     return _mm_cvtsi128_si32(sum);
     895             : }
     896             : 
     897           0 : uint64_t spatial_full_distortion_kernel8x_n_sse2_intrin(
     898             :     uint8_t   *input,
     899             :     uint32_t   input_offset,
     900             :     uint32_t   input_stride,
     901             :     uint8_t   *recon,
     902             :     uint32_t   recon_offset,
     903             :     uint32_t   recon_stride,
     904             :     uint32_t   area_width,
     905             :     uint32_t   area_height)
     906             : {
     907           0 :     int32_t row_count = area_height;
     908           0 :     __m128i sum = _mm_setzero_si128();
     909           0 :     input += input_offset;
     910           0 :     recon += recon_offset;
     911             :     (void)area_width;
     912             : 
     913             :     do {
     914           0 :         const __m128i in = _mm_loadl_epi64((__m128i *)input);
     915           0 :         const __m128i re = _mm_loadl_epi64((__m128i *)recon);
     916           0 :         sum = Distortion_SSE2_INTRIN(in, re, sum);
     917           0 :         input += input_stride;
     918           0 :         recon += recon_stride;
     919           0 :     } while (--row_count);
     920             : 
     921           0 :     return Hadd32_SSE2_INTRIN(sum);
     922             : }
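                      : 
                      : /*
                      :  * Hadd32_SSE2_INTRIN is defined elsewhere in this translation unit.
                      :  * Assuming it horizontally adds the four 32-bit lanes of its
                      :  * argument, a minimal sketch of such a reduction is:
                      :  */
                      : static INLINE uint32_t hadd32_sketch(const __m128i v)
                      : {
                      :     const __m128i t = _mm_add_epi32(v, _mm_srli_si128(v, 8)); /* {a+c, b+d, c, d} */
                      :     const __m128i u = _mm_add_epi32(t, _mm_srli_si128(t, 4)); /* lane 0 = a+b+c+d */
                      :     return (uint32_t)_mm_cvtsi128_si32(u);
                      : }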
     923             : 
     924           0 : static INLINE void SpatialFullDistortionKernel16_SSE2_INTRIN(
     925             :     const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
     926             : {
     927           0 :     const __m128i in = _mm_loadu_si128((__m128i *)input);
     928           0 :     const __m128i re = _mm_loadu_si128((__m128i *)recon);
     929           0 :     const __m128i max = _mm_max_epu8(in, re);
     930           0 :     const __m128i min = _mm_min_epu8(in, re);
     931           0 :     const __m128i diff = _mm_sub_epi8(max, min);
     932           0 :     const __m128i diff_L = _mm_unpacklo_epi8(diff, _mm_setzero_si128());
     933           0 :     const __m128i diff_H = _mm_unpackhi_epi8(diff, _mm_setzero_si128());
     934           0 :     const __m128i dist_L = _mm_madd_epi16(diff_L, diff_L);
     935           0 :     const __m128i dist_H = _mm_madd_epi16(diff_H, diff_H);
     936           0 :     const __m128i dist = _mm_add_epi32(dist_L, dist_H);
     937           0 :     *sum = _mm_add_epi32(*sum, dist);
     938           0 : }
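                      : 
                      : /*
                      :  * Note on the kernel above: for unsigned bytes, |in - re| equals
                      :  * max_epu8(in, re) - min_epu8(in, re), so the absolute difference is
                      :  * formed without widening first; squaring it equals squaring the
                      :  * signed difference.
                      :  */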
     939             : 
     940           0 : static INLINE void SpatialFullDistortionKernel32_SSE2_INTRIN(
     941             :     const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
     942             : {
     943           0 :     SpatialFullDistortionKernel16_SSE2_INTRIN(input + 0 * 16, recon + 0 * 16, sum);
     944           0 :     SpatialFullDistortionKernel16_SSE2_INTRIN(input + 1 * 16, recon + 1 * 16, sum);
     945           0 : }
     946             : 
     947           0 : static INLINE void SpatialFullDistortionKernel64_SSE2_INTRIN(
     948             :     const uint8_t *const input, const uint8_t *const recon, __m128i *const sum)
     949             : {
     950           0 :     SpatialFullDistortionKernel32_SSE2_INTRIN(input + 0 * 32, recon + 0 * 32, sum);
     951           0 :     SpatialFullDistortionKernel32_SSE2_INTRIN(input + 1 * 32, recon + 1 * 32, sum);
     952           0 : }
     953             : 
     954           0 : uint64_t spatial_full_distortion_kernel16x_n_sse2_intrin(
     955             :     uint8_t   *input,
     956             :     uint32_t   input_offset,
     957             :     uint32_t   input_stride,
     958             :     uint8_t   *recon,
     959             :     uint32_t   recon_offset,
     960             :     uint32_t   recon_stride,
     961             :     uint32_t   area_width,
     962             :     uint32_t   area_height)
     963             : {
     964           0 :     int32_t row_count = area_height;
     965           0 :     __m128i sum = _mm_setzero_si128();
     966           0 :     input += input_offset;
     967           0 :     recon += recon_offset;
     968             :     (void)area_width;
     969             : 
     970             :     do {
     971           0 :         SpatialFullDistortionKernel16_SSE2_INTRIN(input, recon, &sum);
     972           0 :         input += input_stride;
     973           0 :         recon += recon_stride;
     974           0 :     } while (--row_count);
     975             : 
     976           0 :     return Hadd32_SSE2_INTRIN(sum);
     977             : }
     978             : 
     979           0 : uint64_t spatial_full_distortion_kernel32x_n_sse2_intrin(
     980             :     uint8_t   *input,
     981             :     uint32_t   input_offset,
     982             :     uint32_t   input_stride,
     983             :     uint8_t   *recon,
     984             :     uint32_t   recon_offset,
     985             :     uint32_t   recon_stride,
     986             :     uint32_t   area_width,
     987             :     uint32_t   area_height)
     988             : {
     989           0 :     int32_t row_count = area_height;
     990           0 :     __m128i sum = _mm_setzero_si128();
     991           0 :     input += input_offset;
     992           0 :     recon += recon_offset;
     993             :     (void)area_width;
     994             : 
     995             :     do {
     996           0 :         SpatialFullDistortionKernel32_SSE2_INTRIN(input, recon, &sum);
     997           0 :         input += input_stride;
     998           0 :         recon += recon_stride;
     999           0 :     } while (--row_count);
    1000             : 
    1001           0 :     return Hadd32_SSE2_INTRIN(sum);
    1002             : }
    1003             : 
    1004           0 : uint64_t spatial_full_distortion_kernel64x_n_sse2_intrin(
    1005             :     uint8_t   *input,
    1006             :     uint32_t   input_offset,
    1007             :     uint32_t   input_stride,
    1008             :     uint8_t   *recon,
    1009             :     uint32_t   recon_offset,
    1010             :     uint32_t   recon_stride,
    1011             :     uint32_t   area_width,
    1012             :     uint32_t   area_height)
    1013             : {
    1014           0 :     int32_t row_count = area_height;
    1015           0 :     __m128i sum = _mm_setzero_si128();
    1016           0 :     input += input_offset;
    1017           0 :     recon += recon_offset;
    1018             :     (void)area_width;
    1019             : 
    1020             :     do {
    1021           0 :         SpatialFullDistortionKernel64_SSE2_INTRIN(input, recon, &sum);
    1022           0 :         input += input_stride;
    1023           0 :         recon += recon_stride;
    1024           0 :     } while (--row_count);
    1025             : 
    1026           0 :     return Hadd32_SSE2_INTRIN(sum);
    1027             : }
    1028             : 
    1029           0 : uint64_t spatial_full_distortion_kernel128x_n_sse2_intrin(
    1030             :     uint8_t   *input,
    1031             :     uint32_t   input_offset,
    1032             :     uint32_t   input_stride,
    1033             :     uint8_t   *recon,
    1034             :     uint32_t   recon_offset,
    1035             :     uint32_t   recon_stride,
    1036             :     uint32_t   area_width,
    1037             :     uint32_t   area_height)
    1038             : {
    1039           0 :     int32_t row_count = area_height;
    1040           0 :     __m128i sum = _mm_setzero_si128();
    1041           0 :     input += input_offset;
    1042           0 :     recon += recon_offset;
    1043             :     (void)area_width;
    1044             : 
    1045             :     do {
    1046           0 :         SpatialFullDistortionKernel64_SSE2_INTRIN(input + 0 * 64, recon + 0 * 64, &sum);
    1047           0 :         SpatialFullDistortionKernel64_SSE2_INTRIN(input + 1 * 64, recon + 1 * 64, &sum);
    1048           0 :         input += input_stride;
    1049           0 :         recon += recon_stride;
    1050           0 :     } while (--row_count);
    1051             : 
    1052           0 :     return Hadd32_SSE2_INTRIN(sum);
    1053             : }
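                      : 
                      : /*
                      :  * Headroom note: with 8-bit samples the per-pixel squared error is at
                      :  * most 255 * 255 = 65025, so even a full 128x128 block sums to under
                      :  * 2^31 and the 32-bit lane accumulation above cannot overflow.
                      :  */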
    1054             : 
    1055             : /*********************************
    1056             :  * x86 implementation of Picture Addition
    1057             :  *********************************/
    1058           0 : void picture_addition_sse2(
    1059             :     uint8_t  *pred_ptr,
    1060             :     uint32_t  pred_stride,
    1061             :     int16_t *residual_ptr,
    1062             :     uint32_t  residual_stride,
    1063             :     uint8_t  *recon_ptr,
    1064             :     uint32_t  recon_stride,
    1065             :     uint32_t  width,
    1066             :     uint32_t  height)
    1067             : {
    1068             : 
    1069           0 :     switch (width) {
    1070           0 :     case 4:
    1071           0 :         picture_addition_kernel4x4_sse_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    1072           0 :     case 8:
    1073           0 :         picture_addition_kernel8x8_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    1074           0 :     case 16:
    1075           0 :         picture_addition_kernel16x16_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    1076           0 :     case 32:
    1077           0 :         picture_addition_kernel32x32_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    1078           0 :     case 64:
    1079           0 :         picture_addition_kernel64x64_sse2_intrin(pred_ptr, pred_stride, residual_ptr, residual_stride, recon_ptr, recon_stride, width, height); break;
    1080           0 :     default:
    1081           0 :         break;
    1082             :     }
    1083             : 
    1084           0 :     return;
    1085             : }
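                      : 
                      : /*
                      :  * Illustrative call into the dispatcher above (buffer contents and
                      :  * the function name below are placeholders): adds an 8x8 residual
                      :  * block onto an 8x8 prediction.
                      :  */
                      : static void picture_addition_sse2_usage_sketch(void)
                      : {
                      :     uint8_t pred[8 * 8]  = { 0 };
                      :     int16_t resid[8 * 8] = { 0 };
                      :     uint8_t recon[8 * 8];
                      : 
                      :     picture_addition_sse2(pred, 8, resid, 8, recon, 8, 8, 8);
                      : }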

Generated by: LCOV version 1.14