/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef EBVARIANCE_SSE2_H
#define EBVARIANCE_SSE2_H

#include "EbDefinitions.h"
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "aom_dsp_rtcd.h"
#include "synonyms.h"

// Read 4 samples from each of row and row + 1. Interleave the two rows and
// zero-extend them to 16-bit samples stored in the lower half of an SSE
// register.
//static __m128i read64(const uint8_t *p, int32_t stride, int32_t row) {
//  __m128i row0 = xx_loadl_32(p + (row + 0) * stride);
//  __m128i row1 = xx_loadl_32(p + (row + 1) * stride);
//  return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1), _mm_setzero_si128());
//}

static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int32_t stride) {
    const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
    const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
    return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
}
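
// Lane illustration (hypothetical values): with row bytes
// r0 = {a0, a1, a2, a3} and r1 = {b0, b1, b2, b3}, _mm_unpacklo_epi32(p0, p1)
// packs them into the byte vector {a0, a1, a2, a3, b0, b1, b2, b3}, and the
// _mm_unpacklo_epi8 against zero widens those eight bytes into eight unsigned
// 16-bit lanes.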

// Load 8 contiguous 8-bit samples and zero-extend them to eight 16-bit lanes.
static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
    const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
    return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
}

// Accumulate the four 32-bit lanes of val into a single 32-bit sum.
static INLINE uint32_t add32x4_sse2(__m128i val) {
    val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
    val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
    return _mm_cvtsi128_si32(val);
}
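
// Worked trace (illustrative values): for val = {a, b, c, d} (lanes, low to
// high), the 8-byte shift-and-add yields {a + c, b + d, c, d}; the 4-byte
// shift-and-add then leaves a + b + c + d in lane 0, which
// _mm_cvtsi128_si32 extracts.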

// Compute squared differences: _mm_madd_epi16(diff, diff) squares the eight
// 16-bit differences and sums adjacent pairs, accumulating four 32-bit
// partial SSE terms into *sse.
static INLINE void variance_kernel_no_sum_sse2(const __m128i src, const __m128i ref,
    __m128i *const sse) {
    const __m128i diff = _mm_sub_epi16(src, ref);
    *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
}

// Can handle the accumulator for up to 128 pixels (such as 8x16 or 16x8).
// Slightly faster than variance_final_256_pel_no_sum_sse2().
// The difference sum of 128 pixels still fits in a 16-bit integer.
static INLINE void variance_final_128_pel_no_sum_sse2(__m128i vsse,
    uint32_t *const sse) {
    *sse = add32x4_sse2(vsse);
}
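
// Arithmetic behind the 16-bit remark above: each 8-bit difference lies in
// [-255, 255], so a sum over 128 pixels is bounded by 128 * 255 = 32640 and
// fits a signed 16-bit integer; over 256 pixels the bound is 65280, which
// does not fit, hence the separate 256-pel variant below.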

// Can handle the accumulator for up to 256 pixels (such as 16x16).
static INLINE void variance_final_256_pel_no_sum_sse2(__m128i vsse,
    uint32_t *const sse) {
    *sse = add32x4_sse2(vsse);
}

static INLINE void variance4_no_sum_sse2(const uint8_t *src, const int32_t src_stride,
    const uint8_t *ref, const int32_t ref_stride,
    const int32_t h, __m128i *const sse) {
    assert(h <= 256);  // May overflow for larger height.

    for (int32_t i = 0; i < h; i += 2) {
        const __m128i s = load4x2_sse2(src, src_stride);
        const __m128i r = load4x2_sse2(ref, ref_stride);

        variance_kernel_no_sum_sse2(s, r, sse);
        src += 2 * src_stride;
        ref += 2 * ref_stride;
    }
}
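
// Headroom note (editorial observation, not from the original source): in
// this no-sum variant each 32-bit lane of *sse grows by at most 2 * 255^2 per
// iteration, i.e. at most 128 * 2 * 255^2 ~= 1.7e7 at h == 256, so the 32-bit
// accumulator has ample margin; the h <= 256 bound appears to mirror the
// companion summing variants, whose 16-bit difference sums would overflow
// beyond this height.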

static INLINE void variance8_no_sum_sse2(const uint8_t *src, const int32_t src_stride,
    const uint8_t *ref, const int32_t ref_stride,
    const int32_t h, __m128i *const sse) {
    assert(h <= 128);  // May overflow for larger height.
    for (int32_t i = 0; i < h; i++) {
        const __m128i s = load8_8to16_sse2(src);
        const __m128i r = load8_8to16_sse2(ref);

        variance_kernel_no_sum_sse2(s, r, sse);
        src += src_stride;
        ref += ref_stride;
    }
}
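
// A minimal usage sketch (hypothetical caller; sse_8x16 is not part of this
// header): computing the SSE between two 8x16 blocks and reducing it to a
// scalar.
//
//   static uint32_t sse_8x16(const uint8_t *src, int32_t src_stride,
//                            const uint8_t *ref, int32_t ref_stride) {
//       __m128i vsse = _mm_setzero_si128();
//       uint32_t sse;
//       variance8_no_sum_sse2(src, src_stride, ref, ref_stride, 16, &vsse);
//       variance_final_128_pel_no_sum_sse2(vsse, &sse);  // 8 * 16 = 128 pels
//       return sse;
//   }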

#endif  // EBVARIANCE_SSE2_H
