LCOV - code coverage report
Current view: top level - ASM_SSSE3 - EbAvcStyleMcp_Intrinsic_SSSE3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 59 221 26.7 %
Date: 2019-11-25 17:38:06 Functions: 3 12 25.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include "EbAvcStyleMcp_SSSE3.h"
       7             : 
       8             : #include "EbMcp_SSE2.h"
       9             : #include "EbDefinitions.h"
      10             : #include "EbAvcStyleMcp_SSE2.h"
      11             : 
      12             : #include "emmintrin.h"
      13             : #include "tmmintrin.h"
      14             : 
      15             : EB_EXTERN EB_ALIGN(16) const int8_t AvcStyleLumaIFCoeff8_SSSE3[] = { // 4-tap filter coefficients stored as byte pairs, one 16-byte row per pair; the kernels in this file load the rows at byte offsets (frac_pos * 32 - 32) and (frac_pos * 32 - 16)
      16             :     -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, // frac_pos 1: taps 0 and 1
      17             :      9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1, // frac_pos 1: taps 2 and 3
      18             :     -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, // frac_pos 2: taps 0 and 1
      19             :     18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, // frac_pos 2: taps 2 and 3
      20             :     -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, // frac_pos 3: taps 0 and 1
      21             :     25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1 // frac_pos 3: taps 2 and 3 (each group of four taps sums to 32)
      22             : };
      23             : 
      24           0 : void avc_style_luma_interpolation_filter_pose_ssse3( // Position 'e': runs the horizontal and the vertical frac-2 filters at ref_pic and combines the two results into dst
      25             :     EbByte               ref_pic,
      26             :     uint32_t                src_stride,
      27             :     EbByte               dst,
      28             :     uint32_t                dst_stride,
      29             :     uint32_t                pu_width,
      30             :     uint32_t                pu_height,
      31             :     EbByte               temp_buf, // scratch space; two intermediate planes are written here
      32             :     EbBool               skip, // forwarded unchanged to both filter passes
      33             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
      34             : {
      35           0 :     uint32_t tempBufSize = pu_width * pu_height; // offset of the second intermediate plane inside temp_buf
      36             :     (void)frac_pos;
      37           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // horizontal pass -> temp_buf, row pitch pu_width
      38           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass -> temp_buf + tempBufSize
      39           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
      40           0 : }
      41             : 
      42           0 : void avc_style_luma_interpolation_filter_posf_ssse3( // Position 'f': horizontal frac-2 pass, then a vertical frac-2 pass over that output, then the two are combined into dst
      43             :     EbByte               ref_pic,
      44             :     uint32_t                src_stride,
      45             :     EbByte               dst,
      46             :     uint32_t                dst_stride,
      47             :     uint32_t                pu_width,
      48             :     uint32_t                pu_height,
      49             :     EbByte               temp_buf, // scratch; the horizontal rows go to temp_buf + tempBufSize, the vertical result to temp_buf
      50             :     EbBool               skip,
      51             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
      52             : {
      53           0 :     uint32_t tempBufSize = pu_width * pu_height;
      54             :     (void)frac_pos;
      55           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2); // starts one row above ref_pic and produces extra rows for the 4-tap vertical pass; skip is disabled so every row is present
      56           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + tempBufSize + pu_width, pu_width, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // input points at row 1 of the horizontal output; the vertical kernel itself steps back one row
      57           0 :     picture_average_kernel_sse2(temp_buf + tempBufSize + pu_width, skip ? 2 * pu_width : pu_width, temp_buf, pu_width, dst, dst_stride, pu_width, pu_height); // combines horizontal row 1 onward (the rows aligned with ref_pic) with the two-pass result
      58           0 : }
      59             : 
      60           0 : void avc_style_luma_interpolation_filter_posg_ssse3( // Position 'g': horizontal frac-2 filter at ref_pic combined with the vertical frac-2 filter one column to the right
      61             :     EbByte               ref_pic,
      62             :     uint32_t                src_stride,
      63             :     EbByte               dst,
      64             :     uint32_t                dst_stride,
      65             :     uint32_t                pu_width,
      66             :     uint32_t                pu_height,
      67             :     EbByte               temp_buf, // scratch space; two intermediate planes are written here
      68             :     EbBool               skip, // forwarded unchanged to both filter passes
      69             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
      70             : {
      71           0 :     uint32_t tempBufSize = pu_width * pu_height; // offset of the second intermediate plane inside temp_buf
      72             :     (void)frac_pos;
      73           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // horizontal pass -> temp_buf
      74           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass on the column at ref_pic + 1 -> temp_buf + tempBufSize
      75           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
      76           0 : }
      77             : 
      78           0 : void avc_style_luma_interpolation_filter_posi_ssse3( // Position 'i': vertical frac-2 filter at ref_pic combined with the position-'j' (horizontal then vertical) result
      79             :     EbByte               ref_pic,
      80             :     uint32_t                src_stride,
      81             :     EbByte               dst,
      82             :     uint32_t                dst_stride,
      83             :     uint32_t                pu_width,
      84             :     uint32_t                pu_height,
      85             :     EbByte               temp_buf, // scratch; two result planes plus the region from 2 * tempBufSize onward that posj uses as its own scratch
      86             :     EbBool               skip,
      87             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
      88             : {
      89           0 :     uint32_t tempBufSize = pu_width * pu_height; // size step between the regions of temp_buf used below
      90             :     (void)frac_pos;
      91           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass -> temp_buf
      92           0 :     avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, temp_buf + 2 * tempBufSize, skip, 2); // position 'j' -> temp_buf + tempBufSize, with temp_buf + 2 * tempBufSize as its scratch
      93           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
      94           0 : }
      95             : 
      96           0 : void avc_style_luma_interpolation_filter_posj_ssse3( // Position 'j': horizontal frac-2 pass into temp_buf followed by a vertical frac-2 pass over that output into dst
      97             :     EbByte               ref_pic,
      98             :     uint32_t                src_stride,
      99             :     EbByte               dst,
     100             :     uint32_t                dst_stride,
     101             :     uint32_t                pu_width,
     102             :     uint32_t                pu_height,
     103             :     EbByte               temp_buf, // receives pu_height + 3 horizontally filtered rows with row pitch pu_width
     104             :     EbBool               skip, // only the vertical pass honours skip; the horizontal pass always runs with EB_FALSE
     105             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
     106             : {
     107             :     (void)frac_pos;
     108           0 :     if (skip) // NOTE(review): both branches end up passing pu_height + 3 rows, so this test currently changes nothing
     109           0 :         avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf, pu_width, pu_width, (pu_height + 3), 0, EB_FALSE, 2); // starts one row above ref_pic
     110             :     else
     111           0 :         avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2); // skip is false here, so the ternary always yields pu_height + 3 (posf/posq pass 2 * pu_height + 3 when skip is set) -- confirm which row count is intended
     112             : 
     113           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + pu_width, pu_width, dst, dst_stride, pu_width, pu_height, 0, skip, 2); // input points at row 1 of the horizontal output; the vertical kernel itself steps back one row
     114           0 : }
     115             : 
     116           0 : void avc_style_luma_interpolation_filter_posk_ssse3( // Position 'k': vertical frac-2 filter one column to the right of ref_pic combined with the position-'j' result
     117             :     EbByte               ref_pic,
     118             :     uint32_t                src_stride,
     119             :     EbByte               dst,
     120             :     uint32_t                dst_stride,
     121             :     uint32_t                pu_width,
     122             :     uint32_t                pu_height,
     123             :     EbByte               temp_buf, // scratch; two result planes plus the region from 2 * tempBufSize onward that posj uses as its own scratch
     124             :     EbBool               skip,
     125             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
     126             : {
     127           0 :     uint32_t tempBufSize = pu_width * pu_height; // size step between the regions of temp_buf used below
     128             :     (void)frac_pos;
     129           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass on the column at ref_pic + 1 -> temp_buf
     130           0 :     avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, temp_buf + 2 * tempBufSize, skip, 2); // position 'j' -> temp_buf + tempBufSize, with temp_buf + 2 * tempBufSize as its scratch
     131           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
     132           0 : }
     133             : 
     134           0 : void avc_style_luma_interpolation_filter_posp_ssse3( // Position 'p': vertical frac-2 filter at ref_pic combined with the horizontal frac-2 filter one row below
     135             :     EbByte               ref_pic,
     136             :     uint32_t                src_stride,
     137             :     EbByte               dst,
     138             :     uint32_t                dst_stride,
     139             :     uint32_t                pu_width,
     140             :     uint32_t                pu_height,
     141             :     EbByte               temp_buf, // scratch space; two intermediate planes are written here
     142             :     EbBool               skip, // forwarded unchanged to both filter passes
     143             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
     144             : {
     145           0 :     uint32_t tempBufSize = pu_width * pu_height; // offset of the second intermediate plane inside temp_buf
     146             :     (void)frac_pos;
     147           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass -> temp_buf
     148           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic + src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2); // horizontal pass on the row below ref_pic -> temp_buf + tempBufSize
     149           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
     150           0 : }
     151             : 
     152           0 : void avc_style_luma_interpolation_filter_posq_ssse3( // Position 'q': same two passes as position 'f', but the horizontal rows fed to the final combine start one row lower
     153             :     EbByte               ref_pic,
     154             :     uint32_t                src_stride,
     155             :     EbByte               dst,
     156             :     uint32_t                dst_stride,
     157             :     uint32_t                pu_width,
     158             :     uint32_t                pu_height,
     159             :     EbByte               temp_buf, // scratch; the horizontal rows go to temp_buf + tempBufSize, the vertical result to temp_buf
     160             :     EbBool               skip,
     161             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
     162             : {
     163           0 :     uint32_t tempBufSize = pu_width * pu_height;
     164             :     (void)frac_pos;
     165           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2); // starts one row above ref_pic and produces extra rows for the 4-tap vertical pass; skip is disabled so every row is present
     166           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + tempBufSize + pu_width, pu_width, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // input points at row 1 of the horizontal output; the vertical kernel itself steps back one row
     167           0 :     picture_average_kernel_sse2(temp_buf + tempBufSize + 2 * pu_width, skip ? 2 * pu_width : pu_width, temp_buf, pu_width, dst, dst_stride, pu_width, pu_height); // combines horizontal row 2 onward (the rows aligned with ref_pic + src_stride) with the two-pass result
     168           0 : }
     169             : 
     170           0 : void avc_style_luma_interpolation_filter_posr_ssse3( // Position 'r': vertical frac-2 filter one column to the right combined with the horizontal frac-2 filter one row below
     171             :     EbByte               ref_pic,
     172             :     uint32_t                src_stride,
     173             :     EbByte               dst,
     174             :     uint32_t                dst_stride,
     175             :     uint32_t                pu_width,
     176             :     uint32_t                pu_height,
     177             :     EbByte               temp_buf, // scratch space; two intermediate planes are written here
     178             :     EbBool               skip, // forwarded unchanged to both filter passes
     179             :     uint32_t                frac_pos) // ignored; the calls below hard-code 2
     180             : {
     181           0 :     uint32_t tempBufSize = pu_width * pu_height; // offset of the second intermediate plane inside temp_buf
     182             :     (void)frac_pos;
     183           0 :     avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2); // vertical pass on the column at ref_pic + 1 -> temp_buf
     184           0 :     avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic + src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2); // horizontal pass on the row below ref_pic -> temp_buf + tempBufSize
     185           0 :     picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height); // combines both planes into dst; presumably a per-pixel average (kernel is defined outside this file)
     186           0 : }
     187             : 
     188       26580 : void avc_style_luma_interpolation_filter_horizontal_ssse3_intrin( // 4-tap horizontal filter: out[x] = clip_u8((c0*p[x-1] + c1*p[x] + c2*p[x+1] + c3*p[x+2] + 16) >> 5)
     189             :     EbByte ref_pic, // source pixels; reads start one byte to the left of this pointer
     190             :     uint32_t src_stride, // source row pitch in bytes (doubled internally when skip is set)
     191             :     EbByte dst, // output pixels
     192             :     uint32_t dst_stride, // output row pitch in bytes (doubled internally when skip is set)
     193             :     uint32_t pu_width, // a multiple of 16 takes the 16-pixel path; anything else is written 8 pixels at a time (assumes a multiple of 8 -- TODO confirm)
     194             :     uint32_t pu_height, // row count (halved internally when skip is set)
     195             :     EbByte temp_buf, // unused
     196             :     EbBool skip, // used directly as a shift count below, so it is expected to be 0 or 1
     197             :     uint32_t frac_pos) // selects the coefficient rows; values 1..3 stay inside the 96-byte table
     198             : {
     199             :     (void)temp_buf;
     200             :     __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
     201             :     uint32_t width_cnt, height_cnt;
     202       26580 :     uint32_t IFShift = 5; // each set of four taps in the table sums to 32
     203             : 
     204       26580 :     src_stride <<= skip; // with skip set, the main loop visits every other source row
     205       26580 :     dst_stride <<= skip; // ...and writes every other output row
     206       26580 :     pu_height >>= skip;
     207       26580 :     frac_pos <<= 5; // byte offset into the table: 32 coefficient bytes per fractional position
     208       26580 :     IFOffset = _mm_set1_epi16(0x0010); // rounding term (16) added before the shift by 5
     209       26580 :     IFCoeff_1_0 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 32)); // tap pair applied to p[x-1], p[x]; NOTE(review): frac_pos 0 would load from before the table
     210       26580 :     IFCoeff_3_2 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 16)); // tap pair applied to p[x+1], p[x+2]
     211             : 
     212       26580 :     if (!(pu_width & 15)) { // 16x
     213             :         __m128i ref0, ref1, ref2, ref3, ref01_lo, ref01_hi, ref23_lo, ref23_hi, sum_lo, sum_hi;
     214             : 
     215           0 :         for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
     216           0 :             for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) {
     217           0 :                 ref0 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt - 1)); // four overlapping 16-byte loads, one per tap position
     218           0 :                 ref1 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt));
     219           0 :                 ref2 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 1));
     220           0 :                 ref3 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 2));
     221             : 
     222           0 :                 ref01_lo = _mm_unpacklo_epi8(ref0, ref1); // interleave neighbouring taps so each byte pair lines up with one coefficient pair
     223           0 :                 ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
     224           0 :                 ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
     225           0 :                 ref23_hi = _mm_unpackhi_epi8(ref2, ref3);
     226             : 
     227           0 :                 sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift); // pixels 0..7: weighted sum, add rounding, shift
     228           0 :                 sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift); // pixels 8..15
     229           0 :                 sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi); // saturate to unsigned 8 bit
     230           0 :                 _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
     231             :             }
     232           0 :             ref_pic += src_stride;
     233           0 :             dst += dst_stride;
     234             :         }
     235             :         // With skip (sub-pred) on, step back one original (undoubled) row pitch and filter that row as well.
     236           0 :         if (skip) {
     237           0 :             ref_pic -= (src_stride >> 1); // src_stride was doubled above, so this is one original row
     238           0 :             dst -= (dst_stride >> 1);
     239           0 :             for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) { // same per-row work as the main loop
     240           0 :                 ref0 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt - 1));
     241           0 :                 ref1 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt));
     242           0 :                 ref2 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 1));
     243           0 :                 ref3 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 2));
     244             : 
     245           0 :                 ref01_lo = _mm_unpacklo_epi8(ref0, ref1);
     246           0 :                 ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
     247           0 :                 ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
     248           0 :                 ref23_hi = _mm_unpackhi_epi8(ref2, ref3);
     249             : 
     250           0 :                 sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift);
     251           0 :                 sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift);
     252           0 :                 sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
     253           0 :                 _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
     254             :             }
     255             :         }
     256             :     }
     257             :     else { //8x
     258             :         __m128i  sum01, sum23, sum;
     259             : 
     260     4125730 :         for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
     261    97917100 :             for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) {
     262   281454000 :                 sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt - 1)), // 8-byte loads of p[x-1] and p[x], interleaved and multiplied by the first tap pair
     263    93817900 :                     _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt))), IFCoeff_1_0);
     264             : 
     265   281454000 :                 sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 1)), // same for p[x+1] and p[x+2] with the second tap pair
     266    93817900 :                     _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 2))), IFCoeff_3_2);
     267             : 
     268   375272000 :                 sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift); // add rounding, shift by 5
     269    93817900 :                 sum_clip_U8 = _mm_packus_epi16(sum, sum); // saturate to unsigned 8 bit; only the low 8 bytes are stored
     270             : 
     271    93817900 :                 _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
     272             :             }
     273     4099150 :             ref_pic += src_stride;
     274     4099150 :             dst += dst_stride;
     275             :         }
     276             : 
     277             :         // With skip (sub-pred) on, step back one original (undoubled) row pitch and filter that row as well.
     278       26580 :         if (skip) {
     279           0 :             ref_pic -= (src_stride >> 1); // src_stride was doubled above, so this is one original row
     280           0 :             dst -= (dst_stride >> 1);
     281           0 :             for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) { // same per-row work as the main loop
     282           0 :                 sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt - 1)),
     283           0 :                     _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt))), IFCoeff_1_0);
     284             : 
     285           0 :                 sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 1)),
     286           0 :                     _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 2))), IFCoeff_3_2);
     287             : 
     288           0 :                 sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
     289           0 :                 sum_clip_U8 = _mm_packus_epi16(sum, sum);
     290             : 
     291           0 :                 _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
     292             :             }
     293             :         }
     294             :     }
     295       26580 : }
     296             : 
     297       53156 : void avc_style_luma_interpolation_filter_vertical_ssse3_intrin( // 4-tap vertical filter: out[y] = clip_u8((c0*p[y-1] + c1*p[y] + c2*p[y+1] + c3*p[y+2] + 16) >> 5), rows indexed relative to ref_pic
     298             :     EbByte ref_pic, // source pixels; reads start one row above this pointer
     299             :     uint32_t src_stride, // source row pitch in bytes; the four taps always use this undoubled pitch
     300             :     EbByte dst, // output pixels
     301             :     uint32_t dst_stride, // output row pitch in bytes (doubled internally when skip is set)
     302             :     uint32_t pu_width, // a multiple of 16 takes the 16-pixel path; anything else is written 8 pixels at a time (assumes a multiple of 8 -- TODO confirm)
     303             :     uint32_t pu_height, // row count (halved internally when skip is set)
     304             :     EbByte temp_buf, // unused
     305             :     EbBool skip, // used directly as a shift count below, so it is expected to be 0 or 1
     306             :     uint32_t frac_pos) // selects the coefficient rows; values 1..3 stay inside the 96-byte table
     307             : {
     308             :     (void)temp_buf;
     309             :     __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
     310             :     uint32_t width_cnt, height_cnt;
     311       53156 :     uint32_t IFShift = 5; // each set of four taps in the table sums to 32
     312       53156 :     uint32_t srcStrideSkip = src_stride << (skip ? 1 : 0); // step between consecutive output rows: two source rows when skip is set
     313             :     EbByte refPicTemp, dstTemp;
     314             : 
     315       53156 :     frac_pos <<= 5; // byte offset into the table: 32 coefficient bytes per fractional position
     316       53156 :     ref_pic -= src_stride; // the first tap sits on the row above the output row
     317       53156 :     IFOffset = _mm_set1_epi16(0x0010); // rounding term (16) added before the shift by 5
     318       53156 :     IFCoeff_1_0 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 32)); // tap pair applied to rows y-1, y; NOTE(review): frac_pos 0 would load from before the table
     319       53156 :     IFCoeff_3_2 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 16)); // tap pair applied to rows y+1, y+2
     320       53156 :     dst_stride <<= skip;
     321       53156 :     pu_height >>= skip;
     322       53156 :     if (!(pu_width & 15)) { //16x
     323             : 
     324             :         __m128i sum_lo, sum_hi, ref0, refs, ref2s, ref3s;
     325             : 
     326           0 :         for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) { // one 16-pixel-wide column strip per iteration
     327           0 :             refPicTemp = ref_pic;
     328           0 :             dstTemp = dst;
     329             : 
     330           0 :             for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
     331           0 :                 ref0 = _mm_loadu_si128((__m128i *)(refPicTemp)); // four consecutive source rows, one per tap
     332           0 :                 refs = _mm_loadu_si128((__m128i *)(refPicTemp + src_stride));
     333           0 :                 ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * src_stride));
     334           0 :                 ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * src_stride));
     335             : 
     336           0 :                 sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0), // pixels 0..7: interleave row pairs and apply the matching tap pair
     337             :                     _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));
     338             : 
     339           0 :                 sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0), // pixels 8..15
     340             :                     _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));
     341             : 
     342           0 :                 sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift); // add rounding, shift by 5
     343           0 :                 sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
     344           0 :                 sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi); // saturate to unsigned 8 bit
     345             :                 _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
     346           0 :                 dstTemp += dst_stride;
     347           0 :                 refPicTemp += srcStrideSkip;
     348             :             }
     349             :             // With skip (sub-pred) on, step back one original (undoubled) row pitch and filter that row as well.
     350           0 :             if (skip) {
     351           0 :                 dstTemp -= (dst_stride >> 1); // dst_stride was doubled above, so this is one original row
     352           0 :                 refPicTemp -= (srcStrideSkip >> 1);
     353             :                 {
     354           0 :                     ref0 = _mm_loadu_si128((__m128i *)(refPicTemp)); // same per-row work as the main loop
     355           0 :                     refs = _mm_loadu_si128((__m128i *)(refPicTemp + src_stride));
     356           0 :                     ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * src_stride));
     357           0 :                     ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * src_stride));
     358             : 
     359           0 :                     sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0),
     360             :                         _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));
     361             : 
     362           0 :                     sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0),
     363             :                         _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));
     364             : 
     365           0 :                     sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift);
     366           0 :                     sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
     367           0 :                     sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
     368             :                     _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
     369             :                 }
     370             :             }
     371           0 :             ref_pic += 16; // advance to the next column strip
     372           0 :             dst += 16;
     373             :         }
     374             :     }
     375             :     else { //8x
     376             :         __m128i sum, sum01, sum23;
     377             : 
     378     1194370 :         for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) { // one 8-pixel-wide column strip per iteration
     379     1141220 :             refPicTemp = ref_pic;
     380     1141220 :             dstTemp = dst;
     381             : 
     382   184618000 :             for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
     383   550431000 :                 sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)), // rows y-1 and y, interleaved and multiplied by the first tap pair
     384   183477000 :                     _mm_loadl_epi64((__m128i *)(refPicTemp + src_stride))), IFCoeff_1_0);
     385             : 
     386   550431000 :                 sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * src_stride)), // rows y+1 and y+2 with the second tap pair
     387   183477000 :                     _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * src_stride))), IFCoeff_3_2);
     388             : 
     389   733908000 :                 sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift); // add rounding, shift by 5
     390   183477000 :                 sum_clip_U8 = _mm_packus_epi16(sum, sum); // saturate to unsigned 8 bit; only the low 8 bytes are stored
     391   183477000 :                 _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);
     392             : 
     393   183477000 :                 dstTemp += dst_stride;
     394   183477000 :                 refPicTemp += srcStrideSkip;
     395             :             }
     396             :             // With skip (sub-pred) on, step back one original (undoubled) row pitch and filter that row as well.
     397     1141220 :             if (skip) {
     398           0 :                 dstTemp -= (dst_stride >> 1); // dst_stride was doubled above, so this is one original row
     399           0 :                 refPicTemp -= (srcStrideSkip >> 1);
     400             :                 {
     401           0 :                     sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)), // same per-row work as the main loop
     402           0 :                         _mm_loadl_epi64((__m128i *)(refPicTemp + src_stride))), IFCoeff_1_0);
     403             : 
     404           0 :                     sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * src_stride)),
     405           0 :                         _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * src_stride))), IFCoeff_3_2);
     406             : 
     407           0 :                     sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
     408           0 :                     sum_clip_U8 = _mm_packus_epi16(sum, sum);
     409           0 :                     _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);
     410             :                 }
     411             :             }
     412     1141220 :             ref_pic += 8; // advance to the next column strip
     413     1141220 :             dst += 8;
     414             :         }
     415             :     }
     416       53156 : }
     417             : 
     418       79736 : void avc_style_luma_interpolation_filter_helper_ssse3( // Dispatcher: forwards all arguments unchanged to the routine chosen by fractional_position
     419             :     EbByte ref_pic,
     420             :     uint32_t src_stride,
     421             :     EbByte dst,
     422             :     uint32_t dst_stride,
     423             :     uint32_t pu_width,
     424             :     uint32_t pu_height,
     425             :     EbByte temp_buf,
     426             :     EbBool skip,
     427             :     uint32_t frac_pos, // passed through; the plain horizontal/vertical kernels use it to pick coefficients, the pos* routines ignore it
     428             :     uint8_t fractional_position) // 0..15 selects the routine; looks like (vertical fraction << 2) | horizontal fraction -- confirm against the caller
     429             : {
     430             : // Cases 1-3 share the horizontal kernel and cases 4, 8 and 12 share the vertical kernel; frac_pos tells them apart.
     431       79736 :     switch (fractional_position) {
     432           0 :     case 0: // plain copy
     433           0 :         avc_style_copy_sse2(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     434           0 :     case 1:
     435           0 :         avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     436       26580 :     case 2:
     437       26580 :         avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     438           0 :     case 3:
     439           0 :         avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     440           0 :     case 4:
     441           0 :         avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     442           0 :     case 5:
     443           0 :         avc_style_luma_interpolation_filter_pose_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     444           0 :     case 6:
     445           0 :         avc_style_luma_interpolation_filter_posf_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     446           0 :     case 7:
     447           0 :         avc_style_luma_interpolation_filter_posg_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     448       53158 :     case 8:
     449       53158 :         avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     450           0 :     case 9:
     451           0 :         avc_style_luma_interpolation_filter_posi_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     452           0 :     case 10:
     453           0 :         avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     454           0 :     case 11:
     455           0 :         avc_style_luma_interpolation_filter_posk_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     456           0 :     case 12:
     457           0 :         avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     458           0 :     case 13:
     459           0 :         avc_style_luma_interpolation_filter_posp_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     460           0 :     case 14:
     461           0 :         avc_style_luma_interpolation_filter_posq_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     462           0 :     case 15:
     463           0 :         avc_style_luma_interpolation_filter_posr_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
     464       79737 :     default: // any value above 15 is a caller error; dst is left untouched
     465             :         assert(0); // NOTE(review): <assert.h> is not among this file's visible includes; presumably it arrives through a project header
     466             :     }
     467       79737 : }

Generated by: LCOV version 1.14