LCOV - code coverage report
Current view: top level - ASM_SSE2 - EbMeSadCalculation_Intrinsic_SSE2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06

                  Hit    Total    Coverage
Lines:             11       71      15.5 %
Functions:          1        4      25.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include "EbMeSadCalculation_SSE2.h"
       7             : #include <emmintrin.h>
       8             : #include <stdint.h>
       9             : 
      10           0 : static INLINE void sad8x4x2_sse2_intrin(const uint8_t *src,
      11             :     const uint32_t src_stride, const uint8_t *ref, const uint32_t ref_stride,
      12             :     __m128i *sad8x8)
      13             : {
      14           0 :     *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 0 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 0 * ref_stride))));
      15           0 :     *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 2 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride))));
      16           0 :     *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 4 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 4 * ref_stride))));
      17           0 :     *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 6 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 6 * ref_stride))));
      18           0 : }
      19             : 
      20           0 : void sad_calculation_8x8_16x16_sse2_intrin(
      21             :     uint8_t   *src,
      22             :     uint32_t   src_stride,
      23             :     uint8_t   *ref,
      24             :     uint32_t   ref_stride,
      25             :     uint32_t  *p_best_sad8x8,
      26             :     uint32_t  *p_best_sad16x16,
      27             :     uint32_t  *p_best_mv8x8,
      28             :     uint32_t  *p_best_mv16x16,
      29             :     uint32_t   mv,
      30             :     uint32_t  *p_sad16x16,
      31             :     EbBool     sub_sad)
      32             : {
      33             :     __m128i xmm_sad16x16, xmm_sad8x8[2], xmm_sad16x16_total, sad8x8_0_3, sad8x8_less_than_bitmask, xmm_N1;
      34             :     __m128i sad8x8_greater_or_eq_bitmask, BestMV8x8, BestSad8x8, xmm_pBestSad8x8, xmm_pBestMV8x8, xmm_mv;
      35             : 
      36           0 :     xmm_sad8x8[0] = xmm_sad8x8[1] = _mm_setzero_si128();
      37             : 
      38             :     //sad8x8_0, sad8x8_1: top-left / top-right 8x8 blocks, even rows (0, 2, 4, 6)
      39           0 :     sad8x4x2_sse2_intrin(src + 0 * src_stride, src_stride, ref + 0 * ref_stride, ref_stride, &xmm_sad8x8[0]);
      40             : 
      41             :     //sad8x8_2, sad8x8_3: bottom-left / bottom-right 8x8 blocks, even rows (8, 10, 12, 14)
      42           0 :     sad8x4x2_sse2_intrin(src + 8 * src_stride, src_stride, ref + 8 * ref_stride, ref_stride, &xmm_sad8x8[1]);
      43             : 
      44           0 :     if (sub_sad) { // sub-sampled SAD: only every other row was summed, so double the result
      45           0 :         xmm_sad8x8[0] = _mm_slli_epi32(xmm_sad8x8[0], 1);
      46           0 :         xmm_sad8x8[1] = _mm_slli_epi32(xmm_sad8x8[1], 1);
      47             :     }
      48             :     else {
      49             :         //sad8x8_0, sad8x8_1: add the odd rows (1, 3, 5, 7)
      50           0 :         sad8x4x2_sse2_intrin(src + 1 * src_stride, src_stride, ref + 1 * ref_stride, ref_stride, &xmm_sad8x8[0]);
      51             : 
      52             :         //sad8x8_2, sad8x8_3: add the odd rows (9, 11, 13, 15)
      53           0 :         sad8x4x2_sse2_intrin(src + 9 * src_stride, src_stride, ref + 9 * ref_stride, ref_stride, &xmm_sad8x8[1]);
      54             :     }
      55             : 
      56           0 :     xmm_sad16x16 = _mm_add_epi32(xmm_sad8x8[0], xmm_sad8x8[1]); // 64-bit lanes: {sad8x8_0 + sad8x8_2, sad8x8_1 + sad8x8_3}
      57           0 :     xmm_sad16x16_total = _mm_add_epi32(_mm_srli_si128(xmm_sad16x16, 8), xmm_sad16x16); // low 32 bits = full 16x16 SAD
      58             : 
      59           0 :     *p_sad16x16 = _mm_cvtsi128_si32(xmm_sad16x16_total);
      60             : 
      61           0 :     sad8x8_0_3 = _mm_packs_epi32(xmm_sad8x8[0], xmm_sad8x8[1]); // four 8x8 SADs in four 32-bit lanes (each fits in 16 bits, so the pack never saturates)
      62             : 
      63           0 :     xmm_mv = _mm_cvtsi64_si128(mv); // mv into lane 0; the two unpacks below broadcast it to all four lanes
      64           0 :     xmm_mv = _mm_unpacklo_epi32(xmm_mv, xmm_mv);
      65           0 :     xmm_mv = _mm_unpacklo_epi64(xmm_mv, xmm_mv);
      66             : 
      67           0 :     xmm_pBestSad8x8 = _mm_loadu_si128((__m128i*)p_best_sad8x8);
      68           0 :     xmm_pBestMV8x8 = _mm_loadu_si128((__m128i*)p_best_mv8x8);
      69             : 
      70             :     // sad8x8_0 < p_best_sad8x8[0] for 0 to 3
      71           0 :     sad8x8_less_than_bitmask = _mm_cmplt_epi32(sad8x8_0_3, xmm_pBestSad8x8);
      72             : 
      73           0 :     xmm_N1 = _mm_cmpeq_epi8(xmm_sad8x8[0], xmm_sad8x8[0]); // anything compared to itself is equal (get 0xFFFFFFFF per lane)
      74             : 
      75           0 :     sad8x8_greater_or_eq_bitmask = _mm_sub_epi32(xmm_N1, sad8x8_less_than_bitmask); // logical NOT of the less-than mask
      76             : 
      77           0 :     BestSad8x8 = _mm_or_si128(_mm_and_si128(xmm_pBestSad8x8, sad8x8_greater_or_eq_bitmask), _mm_and_si128(sad8x8_less_than_bitmask, sad8x8_0_3)); // per lane: keep the old best or take the new, smaller SAD
      78           0 :     BestMV8x8 = _mm_or_si128(_mm_and_si128(xmm_pBestMV8x8, sad8x8_greater_or_eq_bitmask), _mm_and_si128(sad8x8_less_than_bitmask, xmm_mv)); // and select the matching MV
      79             : 
      80             :     _mm_storeu_si128((__m128i*)p_best_sad8x8, BestSad8x8);
      81             :     _mm_storeu_si128((__m128i*)p_best_mv8x8, BestMV8x8);
      82             : 
      83           0 :     uint64_t sad16x16 = _mm_cvtsi128_si64(xmm_sad16x16_total);
      84           0 :     if (sad16x16 < p_best_sad16x16[0]) {
      85           0 :         p_best_sad16x16[0] = (uint32_t)sad16x16;
      86           0 :         p_best_mv16x16[0] = _mm_cvtsi128_si32(xmm_mv);
      87             :     }
      88           0 : }
      89             : 
      90           0 : void sad_calculation_32x32_64x64_sse2_intrin(
      91             :     uint32_t  *p_sad16x16,
      92             :     uint32_t  *p_best_sad32x32,
      93             :     uint32_t  *p_best_sad64x64,
      94             :     uint32_t  *p_best_mv32x32,
      95             :     uint32_t  *p_best_mv64x64,
      96             :     uint32_t   mv)
      97             : {
      98             :     __m128i xmm_N1, sad32x32_greater_than_bitmask, sad32x32_less_than_or_eq_bitmask, BestSad32x32, BestMV32x32, xmm_mv;
      99             :     __m128i Sad16x16_0_7_lo, Sad16x16_0_7_hi, Sad16x16_8_15_lo, Sad16x16_8_15_hi, xmm_sad64x64, xmm_sad64x64_total, xmm_pBestSad32x32, xmm_pBestMV32x32;
     100             : 
     101           0 :     Sad16x16_0_7_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
     102           0 :     Sad16x16_0_7_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
     103           0 :     Sad16x16_8_15_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
     104           0 :     Sad16x16_8_15_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
     105             : 
     106           0 :     xmm_sad64x64 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo), _mm_unpackhi_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo)),
     107             :         _mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi), _mm_unpackhi_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi))); // each 32-bit lane now holds one 32x32 SAD (the name refers to the 64x64 total computed below)
     108             : 
     109           0 :     xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64, 8), xmm_sad64x64);
     110             : 
     111           0 :     xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64_total, 4), xmm_sad64x64_total); // lane 0 = 64x64 SAD (sum of the four 32x32 SADs)
     112             : 
     113           0 :     xmm_mv = _mm_cvtsi32_si128(mv);
     114           0 :     xmm_mv = _mm_unpacklo_epi32(xmm_mv, xmm_mv);
     115           0 :     xmm_mv = _mm_unpacklo_epi64(xmm_mv, xmm_mv);
     116             : 
     117           0 :     xmm_pBestSad32x32 = _mm_loadu_si128((__m128i*)p_best_sad32x32);
     118           0 :     xmm_pBestMV32x32 = _mm_loadu_si128((__m128i*)p_best_mv32x32);
     119             : 
     120           0 :     sad32x32_greater_than_bitmask = _mm_cmpgt_epi32(xmm_pBestSad32x32, xmm_sad64x64); // true where the new 32x32 SAD is below the current best
     121             : 
     122           0 :     xmm_N1 = _mm_cmpeq_epi8(xmm_mv, xmm_mv); // anything compared to itself is equal (get 0xFFFFFFFF)
     123           0 :     sad32x32_less_than_or_eq_bitmask = _mm_sub_epi32(xmm_N1, sad32x32_greater_than_bitmask);
     124             : 
     125           0 :     BestSad32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestSad32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_sad64x64, sad32x32_greater_than_bitmask));
     126           0 :     BestMV32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestMV32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_mv, sad32x32_greater_than_bitmask));
     127             : 
     128             :     _mm_storeu_si128((__m128i*)p_best_sad32x32, BestSad32x32);
     129             :     _mm_storeu_si128((__m128i*)p_best_mv32x32, BestMV32x32);
     130             : 
     131           0 :     uint32_t sad64x64 = _mm_cvtsi128_si32(xmm_sad64x64_total);
     132           0 :     if (sad64x64 < p_best_sad64x64[0]) {
     133           0 :         p_best_sad64x64[0] = sad64x64;
     134           0 :         p_best_mv64x64[0] = _mm_cvtsi128_si32(xmm_mv);
     135             :     }
     136           0 : }
     137             : 
     138       50692 : void initialize_buffer_32bits_sse2_intrin(
     139             :     uint32_t*        pointer,
     140             :     uint32_t        count128,
     141             :     uint32_t        count32,
     142             :     uint32_t        value)
     143             : {
     144             :     __m128i xmm1, xmm2;
     145             :     uint32_t index128;
     146       50692 :     xmm2 = _mm_cvtsi32_si128(value);
     147       50692 :     xmm1 = _mm_or_si128(_mm_slli_si128(xmm2, 4), xmm2); // value in the two low 32-bit lanes
     148       50692 :     xmm2 = _mm_or_si128(_mm_slli_si128(xmm1, 8), xmm1); // value broadcast to all four lanes
     149             : 
     150     2499200 :     for (index128 = 0; index128 < count128; ++index128) {
     151             :         _mm_storeu_si128((__m128i *)pointer, xmm2);
     152     2448510 :         pointer += 4;
     153             :     }
     154       50692 :     if (count32 == 3) { //Initialize 96 bits
     155           0 :         _mm_storel_epi64((__m128i *)(pointer), xmm2);
     156           0 :         *(pointer + 2) = _mm_cvtsi128_si32(xmm2);
     157             :     }
     158       50692 :     else if (count32 == 2) { // Initialize 64 bits
     159           0 :         _mm_storel_epi64((__m128i *)pointer, xmm2);
     160             :     }
     161       50692 :     else if (count32 == 1) { // Initialize 32 bits
     162       44940 :         *(pointer) = _mm_cvtsi128_si32(xmm2);
     163             :     }
     164       50692 : }

Generated by: LCOV version 1.14
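
Note: for reference, the following plain-C sketch reconstructs what sad_calculation_8x8_16x16_sse2_intrin above computes. It is an assumption-based reading of the listing, not part of the covered source; the names sad_8x8_c_sketch and sad_calculation_8x8_16x16_c_sketch are hypothetical.

#include <stdint.h>

/* SAD of one 8x8 block; with sub_sad set, only even rows are summed and the
 * result is doubled, matching the _mm_slli_epi32(..., 1) in the SSE2 code. */
static uint32_t sad_8x8_c_sketch(const uint8_t *src, uint32_t src_stride,
                                 const uint8_t *ref, uint32_t ref_stride,
                                 int sub_sad) {
    uint32_t sad = 0;
    const int step = sub_sad ? 2 : 1;
    for (int y = 0; y < 8; y += step)
        for (int x = 0; x < 8; ++x) {
            const int d = (int)src[y * src_stride + x] - (int)ref[y * ref_stride + x];
            sad += (uint32_t)(d < 0 ? -d : d);
        }
    return sub_sad ? 2 * sad : sad;
}

/* Scalar equivalent of the 8x8/16x16 kernel: the four 8x8 SADs of a 16x16
 * block (raster order) update p_best_sad8x8/p_best_mv8x8 per entry, the 16x16
 * total is written to *p_sad16x16 and checked against p_best_sad16x16[0]. */
static void sad_calculation_8x8_16x16_c_sketch(
    const uint8_t *src, uint32_t src_stride,
    const uint8_t *ref, uint32_t ref_stride,
    uint32_t *p_best_sad8x8, uint32_t *p_best_sad16x16,
    uint32_t *p_best_mv8x8, uint32_t *p_best_mv16x16,
    uint32_t mv, uint32_t *p_sad16x16, int sub_sad) {
    uint32_t sad16x16 = 0;
    for (int i = 0; i < 4; ++i) {
        const uint32_t dy = (uint32_t)(i >> 1) * 8, dx = (uint32_t)(i & 1) * 8;
        const uint32_t sad8x8 = sad_8x8_c_sketch(src + dy * src_stride + dx, src_stride,
                                                 ref + dy * ref_stride + dx, ref_stride,
                                                 sub_sad);
        sad16x16 += sad8x8;
        if (sad8x8 < p_best_sad8x8[i]) {   /* the SSE2 code uses signed 32-bit compares;
                                              SAD values stay far below 2^31 */
            p_best_sad8x8[i] = sad8x8;
            p_best_mv8x8[i]  = mv;
        }
    }
    *p_sad16x16 = sad16x16;                /* consumed later by the 32x32/64x64 stage */
    if (sad16x16 < p_best_sad16x16[0]) {
        p_best_sad16x16[0] = sad16x16;
        p_best_mv16x16[0]  = mv;
    }
}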
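
Note: initialize_buffer_32bits_sse2_intrin fills count128 * 4 + count32 consecutive 32-bit words with value (count32 is expected to be 0..3). A hypothetical call site, assuming the prototype is declared in EbMeSadCalculation_SSE2.h, could split an arbitrary word count like this:

#include <stdint.h>
#include "EbMeSadCalculation_SSE2.h"

/* Fill len 32-bit words with value (hypothetical helper, not part of the source above). */
static void fill_u32_sketch(uint32_t *buf, uint32_t len, uint32_t value) {
    initialize_buffer_32bits_sse2_intrin(buf, len / 4, len % 4, value);
}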