LCOV - code coverage report
Current view: top level - ASM_SSE2 - av1_txfm_sse2.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 18 18 100.0 %
Date: 2019-11-25 17:38:06 Functions: 5 5 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : #ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
      12             : #define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
      13             : 
      14             : #include <emmintrin.h>  // SSE2
      15             : 
      16             : #ifdef __cplusplus
      17             : extern "C" {
      18             : #endif
      19             : 
      20             :     #define pair_set_epi16(a, b) \
      21             :       _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
      22             : 
      23             :     static INLINE void btf_16_w4_sse2(
      24             :         const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
      25             :         const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
      26             :         __m128i *const out0, __m128i *const out1) {
      27             :         const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
      28             :         const __m128i u0 = _mm_madd_epi16(t0, *w0);
      29             :         const __m128i v0 = _mm_madd_epi16(t0, *w1);
      30             :         const __m128i a0 = _mm_add_epi32(u0, __rounding);
      31             :         const __m128i b0 = _mm_add_epi32(v0, __rounding);
      32             :         const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
      33             :         const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
      34             : 
      35             :         *out0 = _mm_packs_epi32(c0, c0);
      36             :         *out1 = _mm_packs_epi32(d0, c0);
      37             :     }
      38             : 
      39             : #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
      40             :   {                                                  \
      41             :     __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
      42             :     __m128i u0 = _mm_madd_epi16(t0, w0);             \
      43             :     __m128i v0 = _mm_madd_epi16(t0, w1);             \
      44             :                                                      \
      45             :     __m128i a0 = _mm_add_epi32(u0, __rounding);      \
      46             :     __m128i b0 = _mm_add_epi32(v0, __rounding);      \
      47             :                                                      \
      48             :     __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
      49             :     __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
      50             :                                                      \
      51             :     out0 = _mm_packs_epi32(c0, c0);                  \
      52             :     out1 = _mm_packs_epi32(d0, d0);                  \
      53             :   }
      54             : 
      55             : #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
      56             :   {                                               \
      57             :     __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
      58             :     __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
      59             :     __m128i u0 = _mm_madd_epi16(t0, w0);          \
      60             :     __m128i u1 = _mm_madd_epi16(t1, w0);          \
      61             :     __m128i v0 = _mm_madd_epi16(t0, w1);          \
      62             :     __m128i v1 = _mm_madd_epi16(t1, w1);          \
      63             :                                                   \
      64             :     __m128i a0 = _mm_add_epi32(u0, __rounding);   \
      65             :     __m128i a1 = _mm_add_epi32(u1, __rounding);   \
      66             :     __m128i b0 = _mm_add_epi32(v0, __rounding);   \
      67             :     __m128i b1 = _mm_add_epi32(v1, __rounding);   \
      68             :                                                   \
      69             :     __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
      70             :     __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
      71             :     __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
      72             :     __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
      73             :                                                   \
      74             :     out0 = _mm_packs_epi32(c0, c1);               \
      75             :     out1 = _mm_packs_epi32(d0, d1);               \
      76             :   }
      77             : 
      78             :     static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
      79             :         return _mm_load_si128((const __m128i *)a);
      80             :     }
      81             : 
      82   228781000 :     static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
      83   228781000 :         const __m128i a_low = _mm_load_si128((const __m128i *)a);
      84   457563000 :         return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
      85             :     }
      86             : 
      87   102215000 :     static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
      88   102215000 :         const __m128i a_low = _mm_load_si128((const __m128i *)a);
      89   102215000 :         return _mm_packs_epi32(a_low, a_low);
      90             :     }
      91             : 
      92             :     // Store 4 16 bit values. Sign extend the values.
      93             :     static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
      94             :         const __m128i a_lo = _mm_unpacklo_epi16(a, a);
      95             :         const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
      96             :         _mm_store_si128((__m128i *)b, a_1);
      97             :     }
      98             : 
      99             :     // Store 8 16 bit values. Sign extend the values.
     100             :     static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
     101             :         const __m128i a_lo = _mm_unpacklo_epi16(a, a);
     102             :         const __m128i a_hi = _mm_unpackhi_epi16(a, a);
     103             :         const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
     104             :         const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
     105             :         _mm_store_si128((__m128i *)b, a_1);
     106             :         _mm_store_si128((__m128i *)(b + 4), a_2);
     107             :     }
     108             : 
     109             :     static INLINE __m128i scale_round_sse2(const __m128i a, const int32_t scale) {
     110             :         const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
     111             :         const __m128i b = _mm_madd_epi16(a, scale_rounding);
     112             :         return _mm_srai_epi32(b, NewSqrt2Bits);
     113             :     }
     114             : 
     115             :     static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
     116             :         int32_t *const b) {
     117             :         const __m128i one = _mm_set1_epi16(1);
     118             :         const __m128i a_lo = _mm_unpacklo_epi16(a, one);
     119             :         const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
     120             :         _mm_store_si128((__m128i *)b, b_lo);
     121             :     }
     122             : 
     123             :     static INLINE void store_rect_16bit_to_32bit(const __m128i a,
     124             :         int32_t *const b) {
     125             :         const __m128i one = _mm_set1_epi16(1);
     126             :         const __m128i a_lo = _mm_unpacklo_epi16(a, one);
     127             :         const __m128i a_hi = _mm_unpackhi_epi16(a, one);
     128             :         const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
     129             :         const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
     130             :         _mm_store_si128((__m128i *)b, b_lo);
     131             :         _mm_store_si128((__m128i *)(b + 4), b_hi);
     132             :     }
     133             : 
     134             :     static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
     135             :         const int32_t stride,
     136             :         __m128i *const out,
     137             :         const int32_t out_size) {
     138             :         for (int32_t i = 0; i < out_size; ++i)
     139             :             out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
     140             :     }
     141             : 
     142             :     static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
     143             :         const int32_t stride,
     144             :         __m128i *const out,
     145             :         const int32_t out_size) {
     146             :         for (int32_t i = 0; i < out_size; ++i)
     147             :             out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
     148             :     }
     149             : 
     150             :     static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int32_t stride,
     151             :         __m128i *out, int32_t out_size) {
     152             :         for (int32_t i = 0; i < out_size; ++i)
     153             :             out[i] = load_16bit_to_16bit(in + i * stride);
     154             :     }
     155             : 
     156             :     static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
     157             :         int32_t stride, __m128i *out,
     158             :         int32_t out_size) {
     159             :         for (int32_t i = 0; i < out_size; ++i)
     160             :             out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
     161             :     }
     162             : 
     163    29627500 :     static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int32_t stride,
     164             :         __m128i *out, int32_t out_size) {
     165   239257000 :         for (int32_t i = 0; i < out_size; ++i)
     166   209625000 :             out[i] = load_32bit_to_16bit(in + i * stride);
     167    29631200 :     }
     168             : 
     169    18840700 :     static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int32_t stride,
     170             :         __m128i *out, int32_t out_size) {
     171   121057000 :         for (int32_t i = 0; i < out_size; ++i)
     172   102215000 :             out[i] = load_32bit_to_16bit_w4(in + i * stride);
     173    18841900 :     }
     174             : 
     175             :     static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
     176             :         int32_t stride, __m128i *out,
     177             :         int32_t out_size) {
     178             :         for (int32_t i = 0; i < out_size; ++i)
     179             :             out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
     180             :     }
     181             : 
     182             :     static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
     183             :         int32_t *const out,
     184             :         const int32_t stride,
     185             :         const int32_t out_size) {
     186             :         for (int32_t i = 0; i < out_size; ++i)
     187             :             store_16bit_to_32bit_w4(in[i], out + i * stride);
     188             :     }
     189             : 
     190             :     static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
     191             :         int32_t *const out,
     192             :         const int32_t stride,
     193             :         const int32_t out_size) {
     194             :         for (int32_t i = 0; i < out_size; ++i)
     195             :             store_16bit_to_32bit(in[i], out + i * stride);
     196             :     }
     197             : 
     198             :     static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
     199             :         int32_t *const out,
     200             :         const int32_t stride,
     201             :         const int32_t out_size) {
     202             :         for (int32_t i = 0; i < out_size; ++i)
     203             :             store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
     204             :     }
     205             : 
     206             :     static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
     207             :         int32_t *const out,
     208             :         const int32_t stride,
     209             :         const int32_t out_size) {
     210             :         for (int32_t i = 0; i < out_size; ++i)
     211             :             store_rect_16bit_to_32bit(in[i], out + i * stride);
     212             :     }
     213             : 
     214             :     static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
     215             :         uint16_t *out,
     216             :         const int32_t stride) {
     217             :         for (int32_t i = 0; i < 8; ++i)
     218             :             _mm_store_si128((__m128i *)(out + i * stride), in[i]);
     219             :     }
     220             : 
     221             :     static INLINE void round_shift_16bit(__m128i *in, int32_t size, int32_t bit) {
     222             :         if (bit < 0) {
     223             :             bit = -bit;
     224             :             __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
     225             :             for (int32_t i = 0; i < size; ++i) {
     226             :                 in[i] = _mm_adds_epi16(in[i], rounding);
     227             :                 in[i] = _mm_srai_epi16(in[i], bit);
     228             :             }
     229             :         }
     230             :         else if (bit > 0) {
     231             :             for (int32_t i = 0; i < size; ++i)
     232             :                 in[i] = _mm_slli_epi16(in[i], bit);
     233             :         }
     234             :     }
     235             : 
     236     2117690 :     static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int32_t size) {
     237    16089900 :         for (int32_t i = 0; i < size; ++i)
     238    13972200 :             out[size - i - 1] = in[i];
     239     2117690 :     }
     240             : 
     241             :     void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
     242             :         int32_t stride, TxType tx_type, int32_t bd);
     243             : 
     244             :     void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
     245             :         int32_t stride, TxType tx_type, int32_t bd);
     246             : 
     247             :     void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
     248             :         int32_t stride, TxType tx_type, int32_t bd);
     249             : 
     250             :     void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
     251             :         int32_t stride, TxType tx_type, int32_t bd);
     252             : 
     253             :     void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
     254             :         int32_t stride, TxType tx_type, int32_t bd);
     255             : 
     256             :     void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
     257             :         int32_t stride, TxType tx_type, int32_t bd);
     258             : 
     259             :     void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
     260             :         int32_t stride, TxType tx_type, int32_t bd);
     261             : 
     262             :     void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
     263             :         int32_t stride, TxType tx_type, int32_t bd);
     264             : 
     265             :     void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
     266             :         int32_t stride, TxType tx_type, int32_t bd);
     267             : 
     268             :     void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
     269             :         int32_t stride, TxType tx_type, int32_t bd);
     270             : 
     271             :     void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
     272             :         int32_t stride, TxType tx_type, int32_t bd);
     273             : 
     274             :     void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
     275             :         int32_t stride, TxType tx_type, int32_t bd);
     276             : 
     277             :     void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
     278             :         int32_t stride, TxType tx_type, int32_t bd);
     279             : 
     280             :     void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
     281             :         int32_t stride, TxType tx_type, int32_t bd);
     282             : 
     283             :     void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
     284             :         int32_t stride, TxType tx_type, int32_t bd);
     285             : 
     286             :     void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
     287             :         int32_t stride, TxType tx_type, int32_t bd);
     288             : 
     289             :     typedef void(*transform_1d_sse2)(const __m128i *input, __m128i *output,
     290             :         int8_t cos_bit);
     291             : 
     292             :     typedef struct {
     293             :         transform_1d_sse2 col, row;  // vertical and horizontal
     294             :     } transform_2d_sse2;
     295             : 
     296             : #ifdef __cplusplus
     297             : }
     298             : #endif  // __cplusplus
     299             : #endif  // AV1_COMMON_X86_AV1_TXFM_SSE2_H_

Generated by: LCOV version 1.14