Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <xmmintrin.h>
13 :
14 : #include "EbDefinitions.h"
15 : #include "fft_common.h"
16 :
17 0 : static INLINE void transpose4x4(const float *A, float *B, const int32_t lda,
18 : const int32_t ldb) {
19 0 : __m128 row1 = _mm_load_ps(&A[0 * lda]);
20 0 : __m128 row2 = _mm_load_ps(&A[1 * lda]);
21 0 : __m128 row3 = _mm_load_ps(&A[2 * lda]);
22 0 : __m128 row4 = _mm_load_ps(&A[3 * lda]);
23 0 : _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
24 : _mm_store_ps(&B[0 * ldb], row1);
25 0 : _mm_store_ps(&B[1 * ldb], row2);
26 0 : _mm_store_ps(&B[2 * ldb], row3);
27 0 : _mm_store_ps(&B[3 * ldb], row4);
28 0 : }
29 :
30 0 : void eb_aom_transpose_float_sse2(const float *A, float *B, int32_t n) {
31 0 : for (int32_t y = 0; y < n; y += 4) {
32 0 : for (int32_t x = 0; x < n; x += 4)
33 0 : transpose4x4(A + y * n + x, B + x * n + y, n, n);
34 : }
35 0 : }
36 :
37 0 : void eb_aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int32_t n) {
38 0 : const int32_t n2 = n / 2;
39 0 : output[0] = packed[0];
40 0 : output[1] = 0;
41 0 : output[2 * (n2 * n)] = packed[n2 * n];
42 0 : output[2 * (n2 * n) + 1] = 0;
43 :
44 0 : output[2 * n2] = packed[n2];
45 0 : output[2 * n2 + 1] = 0;
46 0 : output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
47 0 : output[2 * (n2 * n + n2) + 1] = 0;
48 :
49 0 : for (int32_t c = 1; c < n2; ++c) {
50 0 : output[2 * (0 * n + c)] = packed[c];
51 0 : output[2 * (0 * n + c) + 1] = packed[c + n2];
52 0 : output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
53 0 : output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
54 : }
55 0 : for (int32_t r = 1; r < n2; ++r) {
56 0 : output[2 * (r * n + 0)] = packed[r * n];
57 0 : output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
58 0 : output[2 * (r * n + n2) + 0] = packed[r * n + n2];
59 0 : output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
60 :
61 0 : for (int32_t c = 1; c < AOMMIN(n2, 4); ++c) {
62 0 : output[2 * (r * n + c)] =
63 0 : packed[r * n + c] - packed[(r + n2) * n + c + n2];
64 0 : output[2 * (r * n + c) + 1] =
65 0 : packed[(r + n2) * n + c] + packed[r * n + c + n2];
66 : }
67 :
68 0 : for (int32_t c = 4; c < n2; c += 4) {
69 0 : __m128 real1 = _mm_load_ps(packed + r * n + c);
70 0 : __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
71 0 : __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
72 0 : __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
73 0 : real1 = _mm_sub_ps(real1, real2);
74 0 : imag1 = _mm_add_ps(imag1, imag2);
75 0 : _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
76 0 : _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
77 : }
78 :
79 0 : int32_t r2 = r + n2;
80 0 : int32_t r3 = n - r2;
81 0 : output[2 * (r2 * n + 0)] = packed[r3 * n];
82 0 : output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
83 0 : output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
84 0 : output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
85 0 : for (int32_t c = 1; c < AOMMIN(4, n2); ++c) {
86 0 : output[2 * (r2 * n + c)] =
87 0 : packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
88 0 : output[2 * (r2 * n + c) + 1] =
89 0 : -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
90 : }
91 0 : for (int32_t c = 4; c < n2; c += 4) {
92 0 : __m128 real1 = _mm_load_ps(packed + r3 * n + c);
93 0 : __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
94 0 : __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
95 0 : __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
96 0 : real1 = _mm_add_ps(real1, real2);
97 0 : imag1 = _mm_sub_ps(imag2, imag1);
98 0 : _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
99 0 : _mm_store_ps(output + 2 * (r2 * n + c + 2),
100 : _mm_unpackhi_ps(real1, imag1));
101 : }
102 : }
103 0 : }
104 :
105 : // Generate definitions for 1d transforms using float and __mm128
106 0 : GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
107 : _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
108 :
109 0 : void eb_aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
110 0 : eb_aom_fft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_sse2,
111 : eb_aom_transpose_float_sse2, eb_aom_fft_unpack_2d_output_sse2, 4);
112 0 : }
113 :
114 : // Generate definitions for 1d inverse transforms using float and mm128
115 0 : GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
116 : _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
117 :
118 0 : void eb_aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
119 0 : eb_aom_ifft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_float, eb_aom_fft1d_4_sse2,
120 : eb_aom_ifft1d_4_sse2, eb_aom_transpose_float_sse2, 4);
121 0 : }
|