Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include "EbDefinitions.h"
14 : #include "aom_dsp_rtcd.h"
15 : #include "convolve.h"
16 :
// Note: Fixed-size intermediate buffers place limits on the parameters
// of some functions. 2D filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 263.
29 : #define WIENER_MAX_EXT_SIZE 263
30 :
31 0 : static INLINE int32_t horz_scalar_product(const uint8_t *a, const int16_t *b) {
32 0 : int32_t sum = 0;
33 0 : for (int32_t k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
34 0 : return sum;
35 : }
36 :
37 0 : static INLINE int32_t highbd_horz_scalar_product(const uint16_t *a,
38 : const int16_t *b) {
39 0 : int32_t sum = 0;
40 0 : for (int32_t k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
41 0 : return sum;
42 : }
43 :
44 0 : static INLINE int32_t highbd_vert_scalar_product(const uint16_t *a,
45 : ptrdiff_t a_stride,
46 : const int16_t *b) {
47 0 : int32_t sum = 0;
48 0 : for (int32_t k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
49 0 : return sum;
50 : }
51 :
52 0 : static const InterpKernel *get_filter_base(const int16_t *filter) {
53 : // NOTE: This assumes that the filter table is 256-byte aligned.
54 : // TODO(agrange) Modify to make independent of table alignment.
55 0 : return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
56 : }
57 :
58 0 : static int32_t get_filter_offset(const int16_t *f, const InterpKernel *base) {
59 0 : return (int32_t)((const InterpKernel *)(intptr_t)f - base);
60 : }
61 :
62 0 : static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
63 : uint16_t *dst, ptrdiff_t dst_stride,
64 : const InterpKernel *x_filters, int32_t x0_q4,
65 : int32_t x_step_q4, int32_t w, int32_t h,
66 : int32_t round0_bits) {
67 0 : const int32_t bd = 8;
68 0 : src -= SUBPEL_TAPS / 2 - 1;
69 0 : for (int32_t y = 0; y < h; ++y) {
70 0 : int32_t x_q4 = x0_q4;
71 0 : for (int32_t x = 0; x < w; ++x) {
72 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
73 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
74 0 : const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
75 0 : (1 << (bd + FILTER_BITS - 1));
76 0 : const int32_t sum = horz_scalar_product(src_x, x_filter) + rounding;
77 0 : dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
78 0 : WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
79 0 : x_q4 += x_step_q4;
80 : }
81 0 : src += src_stride;
82 0 : dst += dst_stride;
83 : }
84 0 : }
85 :
86 0 : static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
87 : uint8_t *dst, ptrdiff_t dst_stride,
88 : const InterpKernel *y_filters, int32_t y0_q4,
89 : int32_t y_step_q4, int32_t w, int32_t h,
90 : int32_t round1_bits) {
91 0 : const int32_t bd = 8;
92 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
93 :
94 0 : for (int32_t x = 0; x < w; ++x) {
95 0 : int32_t y_q4 = y0_q4;
96 0 : for (int32_t y = 0; y < h; ++y) {
97 0 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
98 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
99 0 : const int32_t rounding =
100 0 : ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
101 0 : (1 << (bd + round1_bits - 1));
102 0 : const int32_t sum =
103 0 : highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
104 0 : dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
105 0 : y_q4 += y_step_q4;
106 : }
107 0 : ++src;
108 0 : ++dst;
109 : }
110 0 : }
111 :
112 0 : void eb_av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
113 : uint8_t *dst, ptrdiff_t dst_stride,
114 : const int16_t *filter_x, int32_t x_step_q4,
115 : const int16_t *filter_y, int32_t y_step_q4,
116 : int32_t w, int32_t h,
117 : const ConvolveParams *conv_params) {
118 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
119 0 : const int32_t x0_q4 = get_filter_offset(filter_x, filters_x);
120 :
121 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
122 0 : const int32_t y0_q4 = get_filter_offset(filter_y, filters_y);
123 :
124 : uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
125 0 : const int32_t intermediate_height =
126 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
127 :
128 0 : assert(w <= MAX_SB_SIZE);
129 0 : assert(h <= MAX_SB_SIZE);
130 0 : assert(y_step_q4 <= 32);
131 0 : assert(x_step_q4 <= 32);
132 :
133 0 : convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
134 : src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
135 : x_step_q4, w, intermediate_height,
136 : conv_params->round_0);
137 0 : convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
138 : MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
139 : y_step_q4, w, h, conv_params->round_1);
140 0 : }
141 :
142 0 : static void highbd_convolve_add_src_horiz_hip(
143 : const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
144 : ptrdiff_t dst_stride, const InterpKernel *x_filters, int32_t x0_q4,
145 : int32_t x_step_q4, int32_t w, int32_t h, int32_t round0_bits, int32_t bd) {
146 0 : const int32_t extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
147 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
148 0 : src -= SUBPEL_TAPS / 2 - 1;
149 0 : for (int32_t y = 0; y < h; ++y) {
150 0 : int32_t x_q4 = x0_q4;
151 0 : for (int32_t x = 0; x < w; ++x) {
152 0 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
153 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
154 0 : const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
155 0 : (1 << (bd + FILTER_BITS - 1));
156 0 : const int32_t sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
157 0 : dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
158 : extraprec_clamp_limit - 1);
159 0 : x_q4 += x_step_q4;
160 : }
161 0 : src += src_stride;
162 0 : dst += dst_stride;
163 : }
164 0 : }
165 :
166 0 : static void highbd_convolve_add_src_vert_hip(
167 : const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
168 : ptrdiff_t dst_stride, const InterpKernel *y_filters, int32_t y0_q4,
169 : int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits, int32_t bd) {
170 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
171 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
172 0 : for (int32_t x = 0; x < w; ++x) {
173 0 : int32_t y_q4 = y0_q4;
174 0 : for (int32_t y = 0; y < h; ++y) {
175 0 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
176 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
177 0 : const int32_t rounding =
178 0 : ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
179 0 : (1 << (bd + round1_bits - 1));
180 0 : const int32_t sum =
181 0 : highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
182 0 : dst[y * dst_stride] =
183 0 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
184 0 : y_q4 += y_step_q4;
185 : }
186 0 : ++src;
187 0 : ++dst;
188 : }
189 0 : }
190 :
191 0 : void eb_av1_highbd_wiener_convolve_add_src_c(
192 : const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
193 : ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4,
194 : const int16_t *filter_y, int32_t y_step_q4, int32_t w, int32_t h,
195 : const ConvolveParams *conv_params, int32_t bd) {
196 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
197 0 : const int32_t x0_q4 = get_filter_offset(filter_x, filters_x);
198 :
199 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
200 0 : const int32_t y0_q4 = get_filter_offset(filter_y, filters_y);
201 :
202 : uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
203 0 : const int32_t intermediate_height =
204 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
205 :
206 0 : assert(w <= MAX_SB_SIZE);
207 0 : assert(h <= MAX_SB_SIZE);
208 0 : assert(y_step_q4 <= 32);
209 0 : assert(x_step_q4 <= 32);
210 0 : assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
211 :
212 0 : highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
213 : src_stride, temp, MAX_SB_SIZE, filters_x,
214 : x0_q4, x_step_q4, w, intermediate_height,
215 : conv_params->round_0, bd);
216 0 : highbd_convolve_add_src_vert_hip(
217 : temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
218 : filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
219 0 : }
220 : #if OBMC_FLAG
221 :
222 0 : static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
223 : const int16_t *b) {
224 0 : int sum = 0;
225 0 : for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
226 0 : return sum;
227 : }
228 :
229 0 : static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
230 : uint8_t *dst, ptrdiff_t dst_stride,
231 : const InterpKernel *x_filters, int x0_q4,
232 : int x_step_q4, int w, int h) {
233 0 : src -= SUBPEL_TAPS / 2 - 1;
234 0 : for (int y = 0; y < h; ++y) {
235 0 : int x_q4 = x0_q4;
236 0 : for (int x = 0; x < w; ++x) {
237 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
238 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
239 0 : const int sum = horz_scalar_product(src_x, x_filter);
240 0 : dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
241 0 : x_q4 += x_step_q4;
242 : }
243 0 : src += src_stride;
244 0 : dst += dst_stride;
245 : }
246 0 : }
247 :
248 0 : static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
249 : uint8_t *dst, ptrdiff_t dst_stride,
250 : const InterpKernel *y_filters, int y0_q4,
251 : int y_step_q4, int w, int h) {
252 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
253 :
254 0 : for (int x = 0; x < w; ++x) {
255 0 : int y_q4 = y0_q4;
256 0 : for (int y = 0; y < h; ++y) {
257 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
258 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
259 0 : const int sum = vert_scalar_product(src_y, src_stride, y_filter);
260 0 : dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
261 0 : y_q4 += y_step_q4;
262 : }
263 0 : ++src;
264 0 : ++dst;
265 : }
266 0 : }
267 :
268 0 : void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
269 : uint8_t *dst, ptrdiff_t dst_stride,
270 : const int16_t *filter_x, int x_step_q4,
271 : const int16_t *filter_y, int y_step_q4, int w,
272 : int h) {
273 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
274 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
275 :
276 : (void)filter_y;
277 : (void)y_step_q4;
278 :
279 0 : convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
280 : w, h);
281 0 : }
282 :
283 0 : void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
284 : uint8_t *dst, ptrdiff_t dst_stride,
285 : const int16_t *filter_x, int x_step_q4,
286 : const int16_t *filter_y, int y_step_q4, int w,
287 : int h) {
288 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
289 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
290 :
291 : (void)filter_x;
292 : (void)x_step_q4;
293 :
294 0 : convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
295 : w, h);
296 0 : }
297 : static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
298 : const InterpFilterParams filter_params, const int32_t subpel) ;
299 :
300 :
301 : #endif
|