LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - highbd_fwd_txfm_sse4.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 235 236 99.6 %
Date: 2019-11-25 17:38:06 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <assert.h>
      18             : #include <smmintrin.h> /* SSE4.1 */
      19             : 
      20             : #include "EbDefinitions.h"
      21             : #include "aom_dsp_rtcd.h"
      22             : #include "emmintrin.h"
      23             : #include "EbTransforms.h"
      24             : #include "highbd_txfm_utility_sse4.h"
      25             : 
      26             : #include "av1_txfm_sse4.h"
      27             : 
      28             : static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
                           // Table of pointers to per-transform-size shift arrays, declared with
                           // TX_SIZES_ALL slots. It is looked up with a TX_* constant (TX_4X4 in
                           // eb_av1_fwd_txfm2d_4x4_sse4_1). The fwd_shift_* arrays themselves are
                           // defined outside this listing.
      29             :     fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
      30             :     fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
      31             :     fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
      32             :     fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
      33             :     fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
      34             : };
      35             : 
      36             : typedef void(*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int32_t bit,
      37             :     const int32_t num_cols);
                       // Pointer type for a 1-D forward transform kernel: input vectors, output
                       // vectors, a bit-precision argument and a column count. The fdct/fadst/fidtx
                       // 4x4 kernels in this listing have the same parameter list.
                       // NOTE(review): no code visible in this listing uses this typedef - confirm
                       // whether it is still needed.
      38             : 
      39    22754100 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
      40             :     int32_t stride, int32_t flipud, int32_t fliplr,
      41             :     int32_t shift) {
                           // Loads a 4x4 block of int16 samples into four vectors of 32-bit lanes.
                           //   input  : first sample of the block; rows are `stride` elements apart.
                           //   in     : receives four __m128i values, one per row.
                           //   flipud : non-zero loads the rows in reverse order (row 3 first).
                           //   fliplr : non-zero reverses the four samples inside every row.
                           //   shift  : left shift applied to every sample after widening to 32 bits.

                           // _mm_loadl_epi64 reads 64 bits, i.e. the four int16 samples of one row.
      42    22754100 :     if (!flipud) {
      43    21627900 :         in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      44    21627900 :         in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      45    21627900 :         in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      46    43255700 :         in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      47             :     }
      48             :     else {
      49     1126220 :         in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      50     1126220 :         in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      51     1126220 :         in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      52     2252440 :         in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      53             :     }
      54             : 
                           // Shuffle control 0x1b picks source lanes 3,2,1,0, which reverses the
                           // four low 16-bit lanes of each row.
      55    22754100 :     if (fliplr) {
      56     1149590 :         in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      57     1149590 :         in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      58     1149590 :         in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
      59     1149590 :         in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
      60             :     }
      61             : 
                           // Sign-extend the four low int16 lanes of each row to int32.
      62    22754100 :     in[0] = _mm_cvtepi16_epi32(in[0]);
      63    22754100 :     in[1] = _mm_cvtepi16_epi32(in[1]);
      64    22754100 :     in[2] = _mm_cvtepi16_epi32(in[2]);
      65    22754100 :     in[3] = _mm_cvtepi16_epi32(in[3]);
      66             : 
                           // Scale every 32-bit lane up by `shift` bits.
      67    22754100 :     in[0] = _mm_slli_epi32(in[0], shift);
      68    22754100 :     in[1] = _mm_slli_epi32(in[1], shift);
      69    22754100 :     in[2] = _mm_slli_epi32(in[2], shift);
      70    22754100 :     in[3] = _mm_slli_epi32(in[3], shift);
      71    22754100 : }
      72             : 
      73     6187440 : static void fidtx4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t col_num) {
                           // 4x4 identity pass: scales every 32-bit lane by NewSqrt2 with rounding,
                           // then transposes the 4x4 block.
                           //   in      : four input vectors, read at in[i * col_num].
                           //   out     : four output vectors, written at out[0..3] (unit stride).
                           //   bit     : unused by this kernel.
                           //   col_num : distance, in vectors, between consecutive input rows.
                           // NewSqrt2 / NewSqrt2Bits are defined outside this listing; presumably a
                           // fixed-point sqrt(2) and its fractional bit count - TODO confirm.
      74             :     (void)bit;
      75     6187440 :     __m128i fact = _mm_set1_epi32(NewSqrt2);
      76     6187440 :     __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
      77             :     __m128i a_low;
      78             :     __m128i v[4];
      79             : 
                           // out[i] = (in[i * col_num] * NewSqrt2 + rounding offset) >> NewSqrt2Bits
      80    30933400 :     for (int32_t i = 0; i < 4; i++) {
      81    49491800 :         a_low = _mm_mullo_epi32(in[i * col_num], fact);
      82    24745900 :         a_low = _mm_add_epi32(a_low, offset);
      83    49491800 :         out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
      84             :     }
      85             : 
      86             :     // Transpose for 4x4
      87     6187440 :     v[0] = _mm_unpacklo_epi32(out[0], out[1]);
      88     6187440 :     v[1] = _mm_unpackhi_epi32(out[0], out[1]);
      89     6187440 :     v[2] = _mm_unpacklo_epi32(out[2], out[3]);
      90     6187440 :     v[3] = _mm_unpackhi_epi32(out[2], out[3]);
      91             : 
      92     6187440 :     out[0] = _mm_unpacklo_epi64(v[0], v[2]);
      93     6187440 :     out[1] = _mm_unpackhi_epi64(v[0], v[2]);
      94     6187440 :     out[2] = _mm_unpacklo_epi64(v[1], v[3]);
      95     6187440 :     out[3] = _mm_unpackhi_epi64(v[1], v[3]);
      96     6187440 : }
      97             : 
      98             : // We only use stage-2 bit;
      99             : // shift[0] is used in load_buffer_4x4()
     100             : // shift[1] is used in txfm_func_col()
     101             : // shift[2] is used in txfm_func_row()
                       //
                       // 4-point forward DCT pass over four vectors of 32-bit lanes, followed by a
                       // 4x4 transpose of the result.
                       //   in      : input rows, read at in[0], in[num_col], in[2 * num_col], in[3 * num_col].
                       //   out     : four transposed result vectors, written at out[0..3].
                       //   bit     : selects the cosine table (cospi_arr(bit)) and is the rounding
                       //             right-shift applied after each multiply-accumulate.
                       //   num_col : distance, in vectors, between consecutive input rows.
     102    27355300 : static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit,
     103             :     const int32_t num_col) {
     104    27355300 :     const int32_t *cospi = cospi_arr(bit);
     105    27356200 :     const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
     106    27356200 :     const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
     107    27356200 :     const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
     108    27356200 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
     109             :     __m128i s0, s1, s2, s3;
     110             :     __m128i u0, u1, u2, u3;
     111             :     __m128i v0, v1, v2, v3;
     112             : 
                           // Butterfly: s0 = row0 + row3, s3 = row0 - row3,
                           //            s1 = row1 + row2, s2 = row1 - row2.
     113    27356200 :     int32_t endidx = 3 * num_col;
     114    27356200 :     s0 = _mm_add_epi32(in[0], in[endidx]);
     115    27356200 :     s3 = _mm_sub_epi32(in[0], in[endidx]);
     116    27356200 :     endidx -= num_col;
     117    27356200 :     s1 = _mm_add_epi32(in[num_col], in[endidx]);
     118    54712500 :     s2 = _mm_sub_epi32(in[num_col], in[endidx]);
     119             : 
     120             :     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
                           // u0 = (s0 * cospi32 + s1 * cospi32 + rnding) >> bit
                           // u2 = (s0 * cospi32 - s1 * cospi32 + rnding) >> bit
     121    27356200 :     u0 = _mm_mullo_epi32(s0, cospi32);
     122    27356200 :     u1 = _mm_mullo_epi32(s1, cospi32);
     123    27356200 :     u2 = _mm_add_epi32(u0, u1);
     124    27356200 :     v0 = _mm_sub_epi32(u0, u1);
     125             : 
     126    27356200 :     u3 = _mm_add_epi32(u2, rnding);
     127    27356200 :     v1 = _mm_add_epi32(v0, rnding);
     128             : 
     129    27356200 :     u0 = _mm_srai_epi32(u3, bit);
     130    27356200 :     u2 = _mm_srai_epi32(v1, bit);
     131             : 
     132             :     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
                           // u1 = (s2 * cospi48 + s3 * cospi16 + rnding) >> bit
     133    27356200 :     v0 = _mm_mullo_epi32(s2, cospi48);
     134    27356200 :     v1 = _mm_mullo_epi32(s3, cospi16);
     135    27356200 :     v2 = _mm_add_epi32(v0, v1);
     136             : 
     137    27356200 :     v3 = _mm_add_epi32(v2, rnding);
     138    27356200 :     u1 = _mm_srai_epi32(v3, bit);
     139             : 
                           // u3 = (s3 * cospi48 - s2 * cospi16 + rnding) >> bit
     140    27356200 :     v0 = _mm_mullo_epi32(s2, cospi16);
     141    27356200 :     v1 = _mm_mullo_epi32(s3, cospi48);
     142    27356200 :     v2 = _mm_sub_epi32(v1, v0);
     143             : 
     144    27356200 :     v3 = _mm_add_epi32(v2, rnding);
     145    27356200 :     u3 = _mm_srai_epi32(v3, bit);
     146             : 
     147             :     // Note: shift[1] and shift[2] are zeros
     148             : 
     149             :     // Transpose 4x4 32-bit
     150    27356200 :     v0 = _mm_unpacklo_epi32(u0, u1);
     151    27356200 :     v1 = _mm_unpackhi_epi32(u0, u1);
     152    27356200 :     v2 = _mm_unpacklo_epi32(u2, u3);
     153    27356200 :     v3 = _mm_unpackhi_epi32(u2, u3);
     154             : 
     155    27356200 :     out[0] = _mm_unpacklo_epi64(v0, v2);
     156    27356200 :     out[1] = _mm_unpackhi_epi64(v0, v2);
     157    27356200 :     out[2] = _mm_unpacklo_epi64(v1, v3);
     158    27356200 :     out[3] = _mm_unpackhi_epi64(v1, v3);
     159    27356200 : }
     160             : 
     161    22771100 : static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
                           // Stores four vectors (16 int32 values) to output[0..15], four values per
                           // vector. _mm_store_si128 is the aligned store, so `output` must be
                           // 16-byte aligned.
     162    22771100 :     _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
     163    22771100 :     _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
     164    22771100 :     _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
     165    22771100 :     _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
     166    22771100 : }
     167             : 
     168    11994900 : static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit,
     169             :     const int32_t num_col) {
                           // 4-point forward ADST pass over four vectors of 32-bit lanes, followed by
                           // a 4x4 transpose of the result.
                           //   in      : input rows, read at in[0], in[num_col], in[2 * num_col], in[3 * num_col].
                           //   out     : four transposed result vectors, written at out[0..3].
                           //   bit     : selects the sine table (sinpi_arr(bit)) and is the rounding
                           //             right-shift applied to the four results.
                           //   num_col : distance, in vectors, between consecutive input rows.
     170    11994900 :     const int32_t *sinpi = sinpi_arr(bit);
     171    11993800 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
     172    11993800 :     const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
     173    11993800 :     const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
     174    11993800 :     const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
     175    11993800 :     const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
     176             :     __m128i t;
     177             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     178             :     __m128i x0, x1, x2, x3;
     179             :     __m128i u0, u1, u2, u3;
     180             :     __m128i v0, v1, v2, v3;
     181             : 
                           // Products with the sine constants (rowK = in[K * num_col]):
                           //   s0 = row0 * sinpi1, s1 = row0 * sinpi4,
                           //   s2 = row1 * sinpi2, s3 = row1 * sinpi1,
                           //   s4 = row2 * sinpi3,
                           //   s5 = row3 * sinpi4, s6 = row3 * sinpi2,
                           //   s7 = row0 + row1 - row3 (not yet multiplied).
     182    11993800 :     int32_t idx = 0 * num_col;
     183    11993800 :     s0 = _mm_mullo_epi32(in[idx], sinpi1);
     184    11993800 :     s1 = _mm_mullo_epi32(in[idx], sinpi4);
     185    11993800 :     t = _mm_add_epi32(in[idx], in[idx + num_col]);
     186    11993800 :     idx += num_col;
     187    11993800 :     s2 = _mm_mullo_epi32(in[idx], sinpi2);
     188    11993800 :     s3 = _mm_mullo_epi32(in[idx], sinpi1);
     189    11993800 :     idx += num_col;
     190    11993800 :     s4 = _mm_mullo_epi32(in[idx], sinpi3);
     191    11993800 :     idx += num_col;
     192    11993800 :     s5 = _mm_mullo_epi32(in[idx], sinpi4);
     193    11993800 :     s6 = _mm_mullo_epi32(in[idx], sinpi2);
     194    23987600 :     s7 = _mm_sub_epi32(t, in[idx]);
     195             : 
                           // x0 = s0 + s2 + s5, x1 = s7 * sinpi3, x2 = s1 - s3 + s6, x3 = s4
     196    11993800 :     t = _mm_add_epi32(s0, s2);
     197    11993800 :     x0 = _mm_add_epi32(t, s5);
     198    11993800 :     x1 = _mm_mullo_epi32(s7, sinpi3);
     199    11993800 :     t = _mm_sub_epi32(s1, s3);
     200    11993800 :     x2 = _mm_add_epi32(t, s6);
     201    11993800 :     x3 = s4;
     202             : 
                           // Unrounded outputs: x0 + x3, x1, x2 - x3, x2 - x0 + x3
     203    11993800 :     s0 = _mm_add_epi32(x0, x3);
     204    11993800 :     s1 = x1;
     205    11993800 :     s2 = _mm_sub_epi32(x2, x3);
     206    11993800 :     t = _mm_sub_epi32(x2, x0);
     207    11993800 :     s3 = _mm_add_epi32(t, x3);
     208             : 
                           // Round and arithmetic-shift each output right by `bit`.
     209    11993800 :     u0 = _mm_add_epi32(s0, rnding);
     210    11993800 :     u0 = _mm_srai_epi32(u0, bit);
     211             : 
     212    11993800 :     u1 = _mm_add_epi32(s1, rnding);
     213    11993800 :     u1 = _mm_srai_epi32(u1, bit);
     214             : 
     215    11993800 :     u2 = _mm_add_epi32(s2, rnding);
     216    11993800 :     u2 = _mm_srai_epi32(u2, bit);
     217             : 
     218    11993800 :     u3 = _mm_add_epi32(s3, rnding);
     219    11993800 :     u3 = _mm_srai_epi32(u3, bit);
     220             : 
                           // Transpose the 4x4 block of 32-bit lanes into out[0..3].
     221    11993800 :     v0 = _mm_unpacklo_epi32(u0, u1);
     222    11993800 :     v1 = _mm_unpackhi_epi32(u0, u1);
     223    11993800 :     v2 = _mm_unpacklo_epi32(u2, u3);
     224    11993800 :     v3 = _mm_unpackhi_epi32(u2, u3);
     225             : 
     226    11993800 :     out[0] = _mm_unpacklo_epi64(v0, v2);
     227    11993800 :     out[1] = _mm_unpackhi_epi64(v0, v2);
     228    11993800 :     out[2] = _mm_unpacklo_epi64(v1, v3);
     229    11993800 :     out[3] = _mm_unpackhi_epi64(v1, v3);
     230    11993800 : }
     231             : 
     232    22760500 : void eb_av1_fwd_txfm2d_4x4_sse4_1(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t  bd)
     233             : {
                           // 4x4 forward 2-D transform entry point (SSE4.1).
                           //   input   : int16 samples; rows are `stride` elements apart.
                           //   coeff   : receives 16 int32 results through write_buffer_4x4, which
                           //             uses aligned 128-bit stores (16-byte alignment required).
                           //   stride  : row pitch of `input`, in elements.
                           //   tx_type : selects the two 1-D kernels and the flips applied on load.
                           //   bd      : unused.
                           // Every case follows the same shape: load the block (scaled by shift[0],
                           // optionally flipped), run two 1-D kernels in place on `in`, then store.
                           // Each kernel ends with a 4x4 transpose, so the second call operates along
                           // the other dimension of the block.
                           // An unhandled tx_type reaches assert(0); when asserts are compiled out
                           // nothing is written to `coeff`.
     234             :     __m128i in[4];
     235    22760500 :     const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
     236    22760500 :     const int32_t txw_idx = get_txw_idx(TX_4X4);
     237    22760100 :     const int32_t txh_idx = get_txh_idx(TX_4X4);
     238             : 
     239    22779700 :     switch (tx_type) {
     240     9814640 :     case DCT_DCT:
     241     9814640 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     242     9817500 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     243     9817690 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     244     9817680 :         write_buffer_4x4(in, coeff);
     245     9816920 :         break;
     246     2282410 :     case ADST_DCT:
     247     2282410 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     248     2282620 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     249     2282630 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     250     2282690 :         write_buffer_4x4(in, coeff);
     251     2282630 :         break;
     252     2341050 :     case DCT_ADST:
     253     2341050 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     254     2341260 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     255     2341280 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     256     2341270 :         write_buffer_4x4(in, coeff);
     257     2341190 :         break;
     258     1966580 :     case ADST_ADST:
     259     1966580 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     260     1966730 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     261     1966760 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     262     1966810 :         write_buffer_4x4(in, coeff);
     263     1966780 :         break;
                               // FLIPADST variants reuse the ADST kernel; the flip is applied to the
                               // input by load_buffer_4x4 (flipud and/or fliplr set to 1).
     264      275069 :     case FLIPADST_DCT:
     265      275069 :         load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
     266      275074 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     267      275072 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     268      275072 :         write_buffer_4x4(in, coeff);
     269      275072 :         break;
     270      279377 :     case DCT_FLIPADST:
     271      279377 :         load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
     272      279381 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     273      279383 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     274      279384 :         write_buffer_4x4(in, coeff);
     275      279383 :         break;
     276      284573 :     case FLIPADST_FLIPADST:
     277      284573 :         load_buffer_4x4(input, in, stride, 1, 1, shift[0]);
     278      284575 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     279      284574 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     280      284573 :         write_buffer_4x4(in, coeff);
     281      284573 :         break;
     282      291436 :     case ADST_FLIPADST:
     283      291436 :         load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
     284      291438 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     285      291437 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     286      291437 :         write_buffer_4x4(in, coeff);
     287      291436 :         break;
     288      294438 :     case FLIPADST_ADST:
     289      294438 :         load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
     290      294439 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     291      294440 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     292      294441 :         write_buffer_4x4(in, coeff);
     293      294440 :         break;
                               // Identity-based types. fidtx4x4_sse4_1 ignores its bit argument, so
                               // the cos-bit table passed to it has no effect on the result.
     294     1238040 :     case IDTX:
     295     1238040 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     296     1238120 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     297     1238100 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     298     1238120 :         write_buffer_4x4(in, coeff);
     299     1238110 :         break;
     300     1251030 :     case V_DCT:
     301     1251030 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     302     1251100 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     303     1251100 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     304     1251100 :         write_buffer_4x4(in, coeff);
     305     1251080 :         break;
     306     1315130 :     case H_DCT:
     307     1315130 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     308     1315190 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     309     1315180 :         fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     310     1315220 :         write_buffer_4x4(in, coeff);
     311     1315200 :         break;
     312      277724 :     case V_ADST:
     313      277724 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     314      277724 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     315      277725 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     316      277725 :         write_buffer_4x4(in, coeff);
     317      277725 :         break;
     318      300236 :     case H_ADST:
     319      300236 :         load_buffer_4x4(input, in, stride, 0, 0, shift[0]);
     320      300238 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     321      300238 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     322      300237 :         write_buffer_4x4(in, coeff);
     323      300237 :         break;
                               // NOTE(review): V_FLIPADST and H_FLIPADST pass fwd_cos_bit_row to the
                               // fadst call, whereas V_ADST and H_ADST pass fwd_cos_bit_col to it.
                               // Whether the two tables hold different values for 4x4 is not visible
                               // in this listing - confirm, and align with the ADST cases if intended.
     324      273713 :     case V_FLIPADST:
     325      273713 :         load_buffer_4x4(input, in, stride, 1, 0, shift[0]);
     326      273716 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     327      273718 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     328      273718 :         write_buffer_4x4(in, coeff);
     329      273718 :         break;
     330      294212 :     case H_FLIPADST:
     331      294212 :         load_buffer_4x4(input, in, stride, 0, 1, shift[0]);
     332      294214 :         fidtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     333      294211 :         fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     334      294212 :         write_buffer_4x4(in, coeff);
     335      294211 :         break;
     336           0 :     default: assert(0);
     337             :     }
     338             :     (void)bd;
     339    22782700 : }

Generated by: LCOV version 1.14