LCOV - code coverage report
Current view: top level - ASM_SSE2 - fft_sse2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 78 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 7 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             : s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <xmmintrin.h>
      13             : 
      14             : #include "EbDefinitions.h"
      15             : #include "fft_common.h"
      16             : 
      17           0 : static INLINE void transpose4x4(const float *A, float *B, const int32_t lda,
      18             :     const int32_t ldb) {
      19           0 :     __m128 row1 = _mm_load_ps(&A[0 * lda]);
      20           0 :     __m128 row2 = _mm_load_ps(&A[1 * lda]);
      21           0 :     __m128 row3 = _mm_load_ps(&A[2 * lda]);
      22           0 :     __m128 row4 = _mm_load_ps(&A[3 * lda]);
      23           0 :     _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
      24             :     _mm_store_ps(&B[0 * ldb], row1);
      25           0 :     _mm_store_ps(&B[1 * ldb], row2);
      26           0 :     _mm_store_ps(&B[2 * ldb], row3);
      27           0 :     _mm_store_ps(&B[3 * ldb], row4);
      28           0 : }
      29             : 
      30           0 : void eb_aom_transpose_float_sse2(const float *A, float *B, int32_t n) {
      31           0 :     for (int32_t y = 0; y < n; y += 4) {
      32           0 :         for (int32_t x = 0; x < n; x += 4)
      33           0 :             transpose4x4(A + y * n + x, B + x * n + y, n, n);
      34             :     }
      35           0 : }
      36             : 
      37           0 : void eb_aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int32_t n) {
      38           0 :     const int32_t n2 = n / 2;
      39           0 :     output[0] = packed[0];
      40           0 :     output[1] = 0;
      41           0 :     output[2 * (n2 * n)] = packed[n2 * n];
      42           0 :     output[2 * (n2 * n) + 1] = 0;
      43             : 
      44           0 :     output[2 * n2] = packed[n2];
      45           0 :     output[2 * n2 + 1] = 0;
      46           0 :     output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
      47           0 :     output[2 * (n2 * n + n2) + 1] = 0;
      48             : 
      49           0 :     for (int32_t c = 1; c < n2; ++c) {
      50           0 :         output[2 * (0 * n + c)] = packed[c];
      51           0 :         output[2 * (0 * n + c) + 1] = packed[c + n2];
      52           0 :         output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
      53           0 :         output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
      54             :     }
      55           0 :     for (int32_t r = 1; r < n2; ++r) {
      56           0 :         output[2 * (r * n + 0)] = packed[r * n];
      57           0 :         output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
      58           0 :         output[2 * (r * n + n2) + 0] = packed[r * n + n2];
      59           0 :         output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
      60             : 
      61           0 :         for (int32_t c = 1; c < AOMMIN(n2, 4); ++c) {
      62           0 :             output[2 * (r * n + c)] =
      63           0 :                 packed[r * n + c] - packed[(r + n2) * n + c + n2];
      64           0 :             output[2 * (r * n + c) + 1] =
      65           0 :                 packed[(r + n2) * n + c] + packed[r * n + c + n2];
      66             :         }
      67             : 
      68           0 :         for (int32_t c = 4; c < n2; c += 4) {
      69           0 :             __m128 real1 = _mm_load_ps(packed + r * n + c);
      70           0 :             __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
      71           0 :             __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
      72           0 :             __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
      73           0 :             real1 = _mm_sub_ps(real1, real2);
      74           0 :             imag1 = _mm_add_ps(imag1, imag2);
      75           0 :             _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
      76           0 :             _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
      77             :         }
      78             : 
      79           0 :         int32_t r2 = r + n2;
      80           0 :         int32_t r3 = n - r2;
      81           0 :         output[2 * (r2 * n + 0)] = packed[r3 * n];
      82           0 :         output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
      83           0 :         output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
      84           0 :         output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
      85           0 :         for (int32_t c = 1; c < AOMMIN(4, n2); ++c) {
      86           0 :             output[2 * (r2 * n + c)] =
      87           0 :                 packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
      88           0 :             output[2 * (r2 * n + c) + 1] =
      89           0 :                 -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
      90             :         }
      91           0 :         for (int32_t c = 4; c < n2; c += 4) {
      92           0 :             __m128 real1 = _mm_load_ps(packed + r3 * n + c);
      93           0 :             __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
      94           0 :             __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
      95           0 :             __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
      96           0 :             real1 = _mm_add_ps(real1, real2);
      97           0 :             imag1 = _mm_sub_ps(imag2, imag1);
      98           0 :             _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
      99           0 :             _mm_store_ps(output + 2 * (r2 * n + c + 2),
     100             :                 _mm_unpackhi_ps(real1, imag1));
     101             :         }
     102             :     }
     103           0 : }
     104             : 
     105             : // Generate definitions for 1d transforms using float and __mm128
     106           0 : GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
     107             :     _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
     108             : 
     109           0 : void eb_aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
     110           0 :     eb_aom_fft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_sse2,
     111             :         eb_aom_transpose_float_sse2, eb_aom_fft_unpack_2d_output_sse2, 4);
     112           0 : }
     113             : 
     114             : // Generate definitions for 1d inverse transforms using float and mm128
     115           0 : GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
     116             :     _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
     117             : 
     118           0 : void eb_aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
     119           0 :     eb_aom_ifft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_float, eb_aom_fft1d_4_sse2,
     120             :         eb_aom_ifft1d_4_sse2, eb_aom_transpose_float_sse2, 4);
     121           0 : }

Generated by: LCOV version 1.14