Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "EbDefinitions.h"
13 : #include "fft_common.h"
14 :
15 0 : static INLINE void simple_transpose(const float *A, float *B, int32_t n) {
16 0 : for (int32_t y = 0; y < n; y++) {
17 0 : for (int32_t x = 0; x < n; x++)
18 0 : B[y * n + x] = A[x * n + y];
19 : }
20 0 : }
21 :
22 : // The 1d transform is real to complex and packs the complex results in
23 : // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
24 : // components, followed by the n/2 - 1 imaginary components). After the
25 : // transform is done on the rows, the first n/2 + 1 columns are real, and
26 : // the remaining are the imaginary components. After the transform on the
27 : // columns, the region of [0, n/2]x[0, n/2] contains the real part of
28 : // fft of the real columns. The real part of the 2d fft also includes the
29 : // imaginary part of transformed imaginary columns. This function assembles
30 : // the correct outputs while putting the real and imaginary components
31 : // next to each other.
32 0 : static INLINE void unpack_2d_output(const float *col_fft, float *output,
33 : int32_t n) {
34 0 : for (int32_t y = 0; y <= n / 2; ++y) {
35 0 : const int32_t y2 = y + n / 2;
36 0 : const int32_t y_extra = y2 > n / 2 && y2 < n;
37 :
38 0 : for (int32_t x = 0; x <= n / 2; ++x) {
39 0 : const int32_t x2 = x + n / 2;
40 0 : const int32_t x_extra = x2 > n / 2 && x2 < n;
41 0 : output[2 * (y * n + x)] =
42 0 : col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
43 0 : output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
44 0 : (x_extra ? col_fft[y * n + x2] : 0);
45 0 : if (y_extra) {
46 0 : output[2 * ((n - y) * n + x)] =
47 0 : col_fft[y * n + x] +
48 0 : (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
49 0 : output[2 * ((n - y) * n + x) + 1] =
50 0 : -(y_extra ? col_fft[y2 * n + x] : 0) +
51 0 : (x_extra ? col_fft[y * n + x2] : 0);
52 : }
53 : }
54 : }
55 0 : }
56 :
57 0 : void eb_aom_fft_2d_gen(const float *input, float *temp, float *output, int32_t n,
58 : aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
59 : aom_fft_unpack_func_t unpack, int32_t vec_size) {
60 0 : for (int32_t x = 0; x < n; x += vec_size)
61 0 : tform(input + x, output + x, n);
62 0 : transpose(output, temp, n);
63 :
64 0 : for (int32_t x = 0; x < n; x += vec_size)
65 0 : tform(temp + x, output + x, n);
66 0 : transpose(output, temp, n);
67 :
68 0 : unpack(temp, output, n);
69 0 : }
70 :
71 0 : static INLINE void store_float(float *output, float input) { *output = input; }
72 0 : static INLINE float add_float(float a, float b) { return a + b; }
73 0 : static INLINE float sub_float(float a, float b) { return a - b; }
74 0 : static INLINE float mul_float(float a, float b) { return a * b; }
75 :
76 0 : GEN_FFT_2(void, float, float, float, *, store_float);
77 0 : GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
78 : sub_float);
79 0 : GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
80 : sub_float, mul_float);
81 0 : GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
82 : sub_float, mul_float);
83 0 : GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
84 : sub_float, mul_float);
85 :
86 0 : void eb_aom_fft2x2_float_c(const float *input, float *temp, float *output) {
87 0 : eb_aom_fft_2d_gen(input, temp, output, 2, eb_aom_fft1d_2_float, simple_transpose,
88 : unpack_2d_output, 1);
89 0 : }
90 :
91 0 : void eb_aom_fft4x4_float_c(const float *input, float *temp, float *output) {
92 0 : eb_aom_fft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_float, simple_transpose,
93 : unpack_2d_output, 1);
94 0 : }
95 :
96 0 : void eb_aom_fft8x8_float_c(const float *input, float *temp, float *output) {
97 0 : eb_aom_fft_2d_gen(input, temp, output, 8, eb_aom_fft1d_8_float, simple_transpose,
98 : unpack_2d_output, 1);
99 0 : }
100 :
101 0 : void eb_aom_fft16x16_float_c(const float *input, float *temp, float *output) {
102 0 : eb_aom_fft_2d_gen(input, temp, output, 16, eb_aom_fft1d_16_float, simple_transpose,
103 : unpack_2d_output, 1);
104 0 : }
105 :
106 0 : void eb_aom_fft32x32_float_c(const float *input, float *temp, float *output) {
107 0 : eb_aom_fft_2d_gen(input, temp, output, 32, eb_aom_fft1d_32_float, simple_transpose,
108 : unpack_2d_output, 1);
109 0 : }
110 :
111 0 : void eb_aom_ifft_2d_gen(const float *input, float *temp, float *output, int32_t n,
112 : aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
113 : aom_fft_1d_func_t ifft_multi,
114 : aom_fft_transpose_func_t transpose, int32_t vec_size) {
115 : // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
116 : // and get real outputs.
117 0 : for (int32_t y = 0; y <= n / 2; ++y) {
118 0 : output[y * n] = input[2 * y * n];
119 0 : output[y * n + 1] = input[2 * (y * n + n / 2)];
120 : }
121 0 : for (int32_t y = n / 2 + 1; y < n; ++y) {
122 0 : output[y * n] = input[2 * (y - n / 2) * n + 1];
123 0 : output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
124 : }
125 :
126 0 : for (int32_t i = 0; i < 2; i += vec_size)
127 0 : ifft_multi(output + i, temp + i, n);
128 : // For the other columns, since we don't have a full ifft for complex inputs
129 : // we have to split them into the real and imaginary counterparts.
130 : // Pack the real component, then the imaginary components.
131 0 : for (int32_t y = 0; y < n; ++y) {
132 0 : for (int32_t x = 1; x < n / 2; ++x)
133 0 : output[y * n + (x + 1)] = input[2 * (y * n + x)];
134 0 : for (int32_t x = 1; x < n / 2; ++x)
135 0 : output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
136 : }
137 0 : for (int32_t y = 2; y < vec_size; y++)
138 0 : fft_single(output + y, temp + y, n);
139 : // This is the part that can be sped up with SIMD
140 0 : for (int32_t y = AOMMAX(2, vec_size); y < n; y += vec_size)
141 0 : fft_multi(output + y, temp + y, n);
142 : // Put the 0 and n/2 th results in the correct place.
143 0 : for (int32_t x = 0; x < n; ++x) {
144 0 : output[x] = temp[x * n];
145 0 : output[(n / 2) * n + x] = temp[x * n + 1];
146 : }
147 : // This rearranges and transposes.
148 0 : for (int32_t y = 1; y < n / 2; ++y) {
149 : // Fill in the real columns
150 0 : for (int32_t x = 0; x <= n / 2; ++x) {
151 0 : output[x + y * n] =
152 0 : temp[(y + 1) + x * n] +
153 0 : ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
154 : }
155 0 : for (int32_t x = n / 2 + 1; x < n; ++x) {
156 0 : output[x + y * n] = temp[(y + 1) + (n - x) * n] -
157 0 : temp[(y + n / 2) + ((n - x) + n / 2) * n];
158 : }
159 : // Fill in the imag columns
160 0 : for (int32_t x = 0; x <= n / 2; ++x) {
161 0 : output[x + (y + n / 2) * n] =
162 0 : temp[(y + n / 2) + x * n] -
163 0 : ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
164 : }
165 0 : for (int32_t x = n / 2 + 1; x < n; ++x) {
166 0 : output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
167 0 : temp[(y + n / 2) + (n - x) * n];
168 : }
169 : }
170 0 : for (int32_t y = 0; y < n; y += vec_size)
171 0 : ifft_multi(output + y, temp + y, n);
172 0 : transpose(temp, output, n);
173 0 : }
174 :
175 0 : GEN_IFFT_2(void, float, float, float, *, store_float);
176 0 : GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
177 : sub_float);
178 0 : GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
179 : sub_float, mul_float);
180 0 : GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
181 : sub_float, mul_float);
182 0 : GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
183 : sub_float, mul_float);
184 :
185 0 : void eb_aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
186 0 : eb_aom_ifft_2d_gen(input, temp, output, 2, eb_aom_fft1d_2_float, eb_aom_fft1d_2_float,
187 : eb_aom_ifft1d_2_float, simple_transpose, 1);
188 0 : }
189 :
190 0 : void eb_aom_ifft4x4_float_c(const float* input, float* temp, float* output) {
191 0 : eb_aom_ifft_2d_gen(input, temp, output, 4, eb_aom_fft1d_4_float, eb_aom_fft1d_4_float,
192 : eb_aom_ifft1d_4_float, simple_transpose, 1);
193 0 : }
194 :
195 0 : void eb_aom_ifft8x8_float_c(const float* input, float* temp, float* output) {
196 0 : eb_aom_ifft_2d_gen(input, temp, output, 8, eb_aom_fft1d_8_float, eb_aom_fft1d_8_float,
197 : eb_aom_ifft1d_8_float, simple_transpose, 1);
198 0 : }
199 :
200 0 : void eb_aom_ifft16x16_float_c(const float* input, float* temp, float* output) {
201 0 : eb_aom_ifft_2d_gen(input, temp, output, 16, eb_aom_fft1d_16_float,
202 : eb_aom_fft1d_16_float, eb_aom_ifft1d_16_float, simple_transpose, 1);
203 0 : }
204 :
205 0 : void eb_aom_ifft32x32_float_c(const float* input, float* temp, float* output) {
206 0 : eb_aom_ifft_2d_gen(input, temp, output, 32, eb_aom_fft1d_32_float,
207 : eb_aom_fft1d_32_float, eb_aom_ifft1d_32_float, simple_transpose, 1);
208 0 : }
|