LCOV - code coverage report
Current view: top level - ASM_SSE2 - EbComputeMean_Intrinsic_SSE2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 11 74 14.9 %
Date: 2019-11-25 17:38:06 Functions: 1 5 20.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include "emmintrin.h"
       7             : #include "EbComputeMean_SSE2.h"
       8             : 
       9           0 : uint64_t compute_subd_mean_of_squared_values8x8_sse2_intrin(
      10             :     uint8_t *  input_samples,      // input parameter, input samples Ptr
      11             :     uint16_t   input_stride)       // input parameter, input stride
      12             : 
      13             : {
                           // Accumulates the sum of squared samples over rows 0, 2, 4 and 6 of an
                           // 8-byte-wide block (the loads for rows 1, 3, 5 and 7 are commented out
                           // below) and returns that sum shifted left by 11.
                           // Reads 8 bytes at input_samples + k * input_stride for k = 0, 2, 4, 6.
                           // NOTE(review): sum << 11 equals (sum / 32) << 16, so the result looks like
                           // the mean of the 32 sampled squares with 16 fractional bits - confirm
                           // against the callers.
      14             :     __m128i xmm0, xmm_blockMean, xmm_input;
      15             : 
      16           0 :     xmm0 = _mm_setzero_si128();
                           // Interleaving with zero widens the 8 loaded bytes to 16-bit lanes;
                           // madd(x, x) then squares each lane and adds adjacent pairs, giving
                           // four 32-bit partial sums of squares for the row.
      17           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input_samples), xmm0);
      18           0 :     xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);
      19             : 
      20             :     /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
      21             :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/
      22             : 
      23           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
      24           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      25             : 
      26             :     /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples+3*input_stride)), xmm0);
      27             :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/
      28             : 
      29           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 4 * input_stride)), xmm0);
      30           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      31             : 
      32             :     //xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples+5*input_stride)), xmm0);
      33             :     //xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      34             : 
      35           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 6 * input_stride)), xmm0);
      36           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      37             : 
      38             :     /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples+7*input_stride)), xmm0);
      39             :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/
      40             : 
                           // Horizontal reduction: fold the upper 64 bits onto the lower, then the
                           // second 32-bit lane onto the first, so lane 0 holds the full sum.
      41           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
      42           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));
      43             : 
                           // Only the low 32-bit lane is extracted; with 32 samples of at most
                           // 255 * 255 each, the total stays well inside a positive 32-bit int.
      44           0 :     return (uint64_t)_mm_cvtsi128_si32(xmm_blockMean) << 11;
      45             : }
      46             : 
      47      580667 : uint64_t compute_sub_mean8x8_sse2_intrin(
      48             :     uint8_t *  input_samples,      // input parameter, input samples Ptr
      49             :     uint16_t   input_stride)       // input parameter, input stride
      50             : 
      51             : {
                           // Sums the samples of rows 0, 2, 4 and 6 of an 8-byte-wide block (the
                           // odd-row loads are commented out below) and returns the sum shifted
                           // left by 3.
                           // NOTE(review): sum << 3 equals (sum / 32) << 8, so the result looks like
                           // the mean of the 32 sampled pixels with 8 fractional bits - confirm
                           // against the callers.
      52      580667 :     __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm3, xmm_sum1, xmm_sum2;
      53             : 
                           // A sum of absolute differences against an all-zero vector is simply the
                           // sum of the 8 loaded bytes of the row.
      54      580667 :     xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
      55             :     //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
      56     1742000 :     xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
      57             :     //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
                           // 16-bit adds are sufficient: four rows of 8 bytes sum to at most 8160.
      58      580667 :     xmm_sum1 = _mm_add_epi16(xmm1, xmm3);
      59             : 
                           // Step down four rows and repeat for rows 4 and 6 of the block.
      60      580667 :     input_samples += 4 * input_stride;
      61      580667 :     xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
      62             :     //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
      63     1742000 :     xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
      64             :     //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
      65      580667 :     xmm_sum2 = _mm_add_epi16(xmm1, xmm3);
      66      580667 :     xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);
      67             : 
                           // The row sums live in the low lane, which is what gets extracted here.
      68      580667 :     return (uint64_t)_mm_cvtsi128_si32(xmm_sum2) << 3;
      69             : }
      70             : 
      71           0 : uint64_t compute_mean_of_squared_values8x8_sse2_intrin(
      72             :     uint8_t *  input_samples,      // input parameter, input samples Ptr
      73             :     uint32_t   input_stride,       // input parameter, input stride
      74             :     uint32_t   input_area_width,    // input parameter, input area width
      75             :     uint32_t   input_area_height)   // input parameter, input area height
      76             : {
                           // Accumulates the sum of squared samples over all 8 rows of an
                           // 8-byte-wide block and returns that sum shifted left by 10.
                           // input_area_width and input_area_height are accepted but never used;
                           // the block size is fixed at 8x8 by the code below.
                           // NOTE(review): sum << 10 equals (sum / 64) << 16, so the result looks like
                           // the mean of the 64 squares with 16 fractional bits - confirm against
                           // the callers.
      77             :     __m128i xmm0, xmm_blockMean, xmm_input;
      78             :     (void)input_area_width;
      79             :     (void)input_area_height;
      80           0 :     xmm0 = _mm_setzero_si128();
                           // Per row: widen the 8 bytes to 16-bit lanes by interleaving with zero,
                           // then madd(x, x) squares each lane and adds adjacent pairs into four
                           // 32-bit partial sums, which are accumulated across the rows.
      81           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input_samples), xmm0);
      82           0 :     xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);
      83             : 
      84           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
      85           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      86             : 
      87           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
      88           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      89             : 
      90           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
      91           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      92             : 
      93           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 4 * input_stride)), xmm0);
      94           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      95             : 
      96           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 5 * input_stride)), xmm0);
      97           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
      98             : 
      99           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 6 * input_stride)), xmm0);
     100           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
     101             : 
     102           0 :     xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 7 * input_stride)), xmm0);
     103           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));
     104             : 
                           // Horizontal reduction of the four 32-bit partial sums into lane 0.
     105           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
     106           0 :     xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));
     107             : 
                           // 64 samples of at most 255 * 255 each keep the total inside a positive
                           // 32-bit int, so extracting the low lane as int loses nothing.
     108           0 :     return (uint64_t)_mm_cvtsi128_si32(xmm_blockMean) << 10;
     109             : }
     110             : 
     111           0 : uint64_t compute_mean8x8_sse2_intrin(
     112             :     uint8_t *  input_samples,      // input parameter, input samples Ptr
     113             :     uint32_t   input_stride,       // input parameter, input stride
     114             :     uint32_t   input_area_width,    // input parameter, input area width
     115             :     uint32_t   input_area_height)   // input parameter, input area height
     116             : {
                           // Sums the samples of all 8 rows of an 8-byte-wide block and returns the
                           // sum shifted left by 2.
                           // input_area_width and input_area_height are accepted but never used.
                           // NOTE(review): sum << 2 equals (sum / 64) << 8, so the result looks like
                           // the mean of the 64 pixels with 8 fractional bits - confirm against
                           // the callers.
     117           0 :     __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm2, xmm3, xmm4, xmm_sum1, xmm_sum2;
     118             : 
                           // A sum of absolute differences against zero gives the byte sum of each
                           // 8-byte row; rows 0-3 are combined first.
     119           0 :     xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
     120           0 :     xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
     121           0 :     xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
     122           0 :     xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
     123           0 :     xmm_sum1 = _mm_add_epi16(_mm_add_epi16(xmm1, xmm2), _mm_add_epi16(xmm3, xmm4));
     124             : 
                           // Step down four rows and do the same for rows 4-7.
     125           0 :     input_samples += 4 * input_stride;
     126           0 :     xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
     127           0 :     xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
     128           0 :     xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
     129           0 :     xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
     130           0 :     xmm_sum2 = _mm_add_epi16(_mm_add_epi16(xmm1, xmm2), _mm_add_epi16(xmm3, xmm4));
                           // 16-bit adds cannot overflow here: 64 bytes sum to at most 16320.
     131           0 :     xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);
     132             : 
     133             :     (void)input_area_width;
     134             :     (void)input_area_height;
     135             : 
     136           0 :     return (uint64_t)_mm_cvtsi128_si32(xmm_sum2) << 2;
     137             : }
     138             : 
     139           0 : void compute_interm_var_four8x8_helper_sse2(
     140             :     uint8_t *  input_samples,
     141             :     uint16_t   input_stride,
     142             :     uint64_t * mean_of8x8_blocks,      // mean of four  8x8
     143             :     uint64_t * mean_of_squared8x8_blocks)  // meanSquared
     144             : {
                           // Runs the two row-sub-sampled kernels on four blocks that start at byte
                           // offsets 0, 8, 16 and 24 from input_samples (same rows, same stride) and
                           // stores the results in entries [0..3] of the two output arrays.
                           // Both output arrays must therefore hold at least four elements.
     145           0 :     uint32_t blockIndex = 0;
     146             :     // (0,1)
     147           0 :     mean_of8x8_blocks[0] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     148           0 :     mean_of_squared8x8_blocks[0] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     149             : 
     150             :     // (0,2)
                           // Each step moves 8 bytes to the right, i.e. one block width.
     151           0 :     blockIndex = blockIndex + 8;
     152           0 :     mean_of8x8_blocks[1] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     153           0 :     mean_of_squared8x8_blocks[1] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     154             : 
     155             :     // (0,3)
     156           0 :     blockIndex = blockIndex + 8;
     157           0 :     mean_of8x8_blocks[2] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     158           0 :     mean_of_squared8x8_blocks[2] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     159             : 
     160             :     // (0,4)
     161           0 :     blockIndex = blockIndex + 8;
     162           0 :     mean_of8x8_blocks[3] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     163           0 :     mean_of_squared8x8_blocks[3] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);
     164           0 : }

Generated by: LCOV version 1.14