LCOV - code coverage report
Current view: top level - ASM_SSE2 - synonyms.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 38 48 79.2 %
Date: 2019-11-25 17:38:06 Functions: 12 14 85.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_DSP_X86_SYNONYMS_H_
      13             : #define AOM_DSP_X86_SYNONYMS_H_
      14             : 
      15             : #include <immintrin.h>
      16             : #include "EbDefinitions.h"
      17             : 
      18             :  /**
      19             :   * Various reusable shorthands for x86 SIMD intrinsics.
      20             :   *
      21             :   * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
      22             :   * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
      23             :   */
      24             : 
      25  3270493500 : static INLINE __m128i xx_loadl_64(const void *a) {
      26  3270493500 :     return _mm_loadl_epi64((const __m128i *)a);
      27             : }
      28             : 
      29  2117031143 : static INLINE __m128i xx_loadu_128(const void *a) {
      30  2117031143 :     return _mm_loadu_si128((const __m128i *)a);
      31             : }
      32             : 
      33   322560553 : static INLINE void xx_storel_32(void *const a, const __m128i v) {
      34   322560553 :     *(uint32_t *)a = _mm_cvtsi128_si32(v);
      35   322560553 : }
      36             : 
      37   926129000 : static INLINE void xx_storel_64(void *const a, const __m128i v) {
      38   926129000 :     _mm_storel_epi64((__m128i *)a, v);
      39   926129000 : }
      40             : 
      41   696154000 : static INLINE void xx_storeu_128(void *const a, const __m128i v) {
      42             :     _mm_storeu_si128((__m128i *)a, v);
      43   696154000 : }
      44             : 
      45    31471800 : static INLINE __m128i _mm_loadh_epi64(const void *const p, const __m128i s) {
      46    94414700 :     return _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(s), (double *)p));
      47             : }
      48             : 
      49   212645742 : static INLINE void _mm_storeh_epi64(__m128i *const p, const __m128i x) {
      50   212645742 :     _mm_storeh_pd((double *)p, _mm_castsi128_pd(x));
      51   212645742 : }
      52             : 
      53    31470600 : static INLINE __m128i load8bit_8x2_sse2(const void *const src,
      54             :     const ptrdiff_t strideInByte) {
      55    31470600 :     const __m128i s = _mm_loadl_epi64((__m128i *)src);
      56    31470600 :     return _mm_loadh_epi64((__m128i *)((uint8_t *)src + strideInByte), s);
      57             : }
      58             : 
      59    30893290 : static INLINE __m128i load_u8_8x2_sse2(const uint8_t *const src,
      60             :     const ptrdiff_t stride) {
      61    30893290 :     return load8bit_8x2_sse2(src, sizeof(*src) * stride);
      62             : }
      63             : 
      64      576822 : static INLINE __m128i load_u16_4x2_sse2(const uint16_t *const src,
      65             :     const ptrdiff_t stride) {
      66      576822 :     return load8bit_8x2_sse2(src, sizeof(*src) * stride);
      67             : }
      68             : 
      69             : SIMD_INLINE void store_u8_4x2_sse2(const __m128i src, uint8_t *const dst,
      70             :     const int32_t stride) {
      71    35268622 :     xx_storel_32(dst, src);
      72    35268722 :     *(int32_t *)(dst + stride) = _mm_extract_epi32(src, 1);
      73    35268722 : }
      74             : 
      75             : SIMD_INLINE void store_u16_2x2_sse2(const __m128i src, uint16_t *const dst,
      76             :     const int32_t stride) {
      77           0 :     xx_storel_32(dst, src);
      78           0 :     *(int32_t *)(dst + stride) = _mm_extract_epi32(src, 1);
      79           0 : }
      80             : 
      81             : SIMD_INLINE void store_s16_4x2_sse2(const __m128i src, int16_t *const dst,
      82             :     const int32_t stride) {
      83    72419400 :     _mm_storel_epi64((__m128i *)dst, src);
      84    72419400 :     _mm_storeh_epi64((__m128i *)(dst + stride), src);
      85    72413800 : }
      86             : 
      87             : SIMD_INLINE void store_u16_4x2_sse2(const __m128i src, uint16_t *const dst,
      88             :     const int32_t stride) {
      89      733244 :     _mm_storel_epi64((__m128i *)dst, src);
      90      733244 :     _mm_storeh_epi64((__m128i *)(dst + stride), src);
      91      733245 : }
      92             : 
      93             : // The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
      94             : // compilers. The following function is equivalent to _mm_set_epi64x()
      95             : // with each 32-bit argument zero-extended to 64 bits.
      96             : static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
      97             : #if defined(_MSC_VER) && _MSC_VER < 1900
      98             :     return _mm_set_epi32(0, e1, 0, e0);
      99             : #else
     100             :     return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
     101             : #endif
     102             : }
     103             : 
     104             : // The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
     105             : // compilers. The following function is equivalent to _mm_set1_epi64x()
     106             : // with its 32-bit argument zero-extended to 64 bits.
     107    23050000 : static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
     108             : #if defined(_MSC_VER) && _MSC_VER < 1900
     109             :     return _mm_set_epi32(0, a, 0, a);
     110             : #else
     111    46099900 :     return _mm_set1_epi64x((uint32_t)a);
     112             : #endif
     113             : }
     114             : 
     115           0 : static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
     116           0 :     return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
     117             : }
     118             : 
     119   293965719 : static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int32_t bits) {
     120   587931438 :     const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
     121   587931438 :     return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
     122             : }
     123             : 
     124             : static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int32_t bits) {
     125             :     const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
     126             :     const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
     127             :     return _mm_srli_epi32(v_tmp_d, bits);
     128             : }
     129             : 
     130             : // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
     131             : static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int32_t bits) {
     132             :     const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
     133             :     const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
     134             :     return _mm_srai_epi32(v_tmp_d, bits);
     135             : }
     136             : 
     137             : // This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
     138           0 : static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int32_t bits) {
     139           0 :     const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
     140           0 :     const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
     141             :     const __m128i v_tmp_d =
     142           0 :         _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
     143           0 :     return _mm_srai_epi32(v_tmp_d, bits);
     144             : }
     145             : 
     146             : static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int32_t bits) {
     147             :     const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
     148             :     const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
     149             :     const __m128i v_tmp_d =
     150             :         _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
     151             :     return _mm_srai_epi16(v_tmp_d, bits);
     152             : }
     153             : 
     154             : // This function fails the gcc Linux ABI build.
     155             : // The workaround is to replace each call with the body of the function.
     156             : 
     157             : //static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
     158             : //  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
     159             : //  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
     160             : //}
     161             : //
     162             : 
     163             : // Note:
     164             : // The _mm256_insert_epi16 intrinsic is available from VS2017 onward.
     165             : // We define this macro for VS2015 and earlier. The
     166             : // intrinsics used here are listed in the VS2015 documentation:
     167             : // https://msdn.microsoft.com/en-us/library/hh977022.aspx
     168             : // Input parameters:
     169             : // a: __m256i,
     170             : // d: int16_t,
     171             : // indx: imm8 (0 - 15)
     172             : //#if _MSC_VER <= 1900
     173             : #if defined(_MSC_VER) && _MSC_VER < 1910
     174             : #define _mm256_insert_epi16(a, d, indx)                                      \
     175             :   _mm256_insertf128_si256(                                                   \
     176             :       a,                                                                     \
     177             :       _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
     178             :       indx >> 3)
     179             : 
     180             : static INLINE int32_t _mm256_extract_epi32(__m256i a, const int32_t i) {
     181             :     return a.m256i_i32[i & 7];
     182             : }
     183             : 
     184             : static INLINE int32_t _mm256_extract_epi16(__m256i a, const int32_t i) {
     185             :     return a.m256i_i16[i & 15];
     186             : }
     187             : 
     188             : static INLINE __m256i _mm256_insert_epi32(__m256i a, int32_t b, const int32_t i) {
     189             :     __m256i c = a;
     190             :     c.m256i_i32[i & 7] = b;
     191             :     return c;
     192             : }
     193             : #endif
     194             : 
     195             : #endif  // AOM_DSP_X86_SYNONYMS_H_

Generated by: LCOV version 1.14