Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #include "EbDefinitions.h"
12 : #include <assert.h>
13 : #include <emmintrin.h> // SSE2
14 : #include "aom_dsp_rtcd.h"
15 : #include "EbVariance_SSE2.h"
16 : #include "synonyms.h"
17 :
18 : #ifdef __cplusplus
19 : extern "C" {
20 : #endif
21 :
22 : #ifdef __cplusplus
23 : }
24 : #endif
25 :
26 0 : uint32_t eb_aom_get_mb_ss_sse2(const int16_t *src) {
27 0 : __m128i vsum = _mm_setzero_si128();
28 : int32_t i;
29 :
30 0 : for (i = 0; i < 32; ++i) {
31 0 : const __m128i v = xx_loadu_128(src);
32 0 : vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
33 0 : src += 8;
34 : }
35 :
36 0 : vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
37 0 : vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
38 0 : return _mm_cvtsi128_si32(vsum);
39 : }
40 :
41 : // Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
42 : // Slightly faster than variance_final_256_pel_sse2()
43 : // diff sum of 128 pixels can still fit in 16bit integer
44 436567 : static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
45 : unsigned int *const sse,
46 : int *const sum) {
47 436567 : *sse = add32x4_sse2(vsse);
48 :
49 436563 : vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
50 436563 : vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
51 436563 : vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
52 436563 : *sum = (int16_t)_mm_extract_epi16(vsum, 0);
53 436563 : }
54 :
55 : // Can handle 256 pixels' diff sum (such as 16x16)
56 33831 : static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
57 : unsigned int *const sse,
58 : int *const sum) {
59 33831 : *sse = add32x4_sse2(vsse);
60 :
61 33831 : vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
62 33831 : vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
63 33831 : *sum = (int16_t)_mm_extract_epi16(vsum, 0);
64 33831 : *sum += (int16_t)_mm_extract_epi16(vsum, 1);
65 33831 : }
66 :
67 4461300 : static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
68 : __m128i *const sse,
69 : __m128i *const sum) {
70 4461300 : const __m128i diff = _mm_sub_epi16(src, ref);
71 4461300 : *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
72 4461300 : *sum = _mm_add_epi16(*sum, diff);
73 4461300 : }
74 :
75 170835 : static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
76 : const uint8_t *ref, const int ref_stride,
77 : const int h, __m128i *const sse,
78 : __m128i *const sum) {
79 170835 : assert(h <= 256); // May overflow for larger height.
80 170835 : *sum = _mm_setzero_si128();
81 :
82 1096600 : for (int i = 0; i < h; i += 2) {
83 925766 : const __m128i s = load4x2_sse2(src, src_stride);
84 925765 : const __m128i r = load4x2_sse2(ref, ref_stride);
85 :
86 925767 : variance_kernel_sse2(s, r, sse, sum);
87 925768 : src += 2 * src_stride;
88 925768 : ref += 2 * ref_stride;
89 : }
90 170837 : }
91 :
92 299509 : static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
93 : const uint8_t *ref, const int ref_stride,
94 : const int h, __m128i *const sse,
95 : __m128i *const sum) {
96 299509 : assert(h <= 128); // May overflow for larger height.
97 299509 : *sum = _mm_setzero_si128();
98 3835210 : for (int i = 0; i < h; i++) {
99 3535650 : const __m128i s = load8_8to16_sse2(src);
100 3535610 : const __m128i r = load8_8to16_sse2(ref);
101 :
102 3535550 : variance_kernel_sse2(s, r, sse, sum);
103 3535700 : src += src_stride;
104 3535700 : ref += ref_stride;
105 : }
106 299563 : }
107 :
// Defines eb_aom_variance<bw>x<bh>_sse2(), which computes the variance of a
// bw x bh block in a single pass of variance<bw>_sse2().
//   bw, bh     - block width and height; bw also selects the row helper.
//   bits       - right shift applied to sum * sum before it is subtracted.
//   max_pixels - selects variance_final_<max_pixels>_pel_sse2().
// The generated function stores the sum of squared differences in *sse and
// returns *sse - ((sum * sum) >> bits).
#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
  unsigned int eb_aom_variance##bw##x##bh##_sse2(                             \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      unsigned int *sse) {                                                    \
    __m128i sse_acc = _mm_setzero_si128();                                    \
    __m128i sum_acc;                                                          \
    int sum = 0;                                                              \
    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &sse_acc,       \
                        &sum_acc);                                            \
    variance_final_##max_pixels##_pel_sse2(sse_acc, sum_acc, sse, &sum);      \
    assert(sum <= 255 * bw * bh);                                             \
    assert(sum >= -255 * bw * bh);                                            \
    const int64_t sum_squared = (int64_t)sum * sum;                           \
    return *sse - (uint32_t)(sum_squared >> bits);                            \
  }
121 :
122 66998 : AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
123 119964 : AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
124 154712 : AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
125 :
126 120116 : AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
127 269242 : AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
128 142004 : AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
129 67662 : AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
130 : #if OBMC_FLAG
131 :
132 51623100 : static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
133 : const InterpFilterParams filter_params, const int32_t subpel) {
134 51623100 : return filter_params.filter_ptr + filter_params.taps * subpel;
135 : }
136 : DECLARE_ALIGNED(256, static const InterpKernel,
137 : av1_bilinear_filters[SUBPEL_SHIFTS]) = {
138 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
139 : { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
140 : { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
141 : { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
142 : { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
143 : { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
144 : { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
145 : { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
146 : };
147 :
148 : DECLARE_ALIGNED(256, static const InterpKernel,
149 : av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
150 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
151 : { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
152 : { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
153 : { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
154 : { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
155 : { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
156 : { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
157 : { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
158 : };
159 : DECLARE_ALIGNED(256, static const InterpKernel,
160 : av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
161 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
162 : { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
163 : { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
164 : { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
165 : { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
166 : { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
167 : { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
168 : { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
169 : };
170 : DECLARE_ALIGNED(256, static const InterpKernel,
171 : av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
172 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
173 : { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
174 : { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
175 : { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
176 : { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
177 : { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
178 : { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
179 : { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
180 : };
181 :
182 : DECLARE_ALIGNED(256, static const InterpKernel,
183 : av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
184 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
185 : { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
186 : { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
187 : { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
188 : { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
189 : { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
190 : { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
191 : { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
192 : };
193 :
194 : DECLARE_ALIGNED(256, static const InterpKernel,
195 : av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
196 : { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
197 : { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
198 : { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
199 : { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
200 : { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
201 : { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
202 : { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
203 : { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
204 : };
205 : // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
206 : static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
207 : { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
208 : EIGHTTAP_REGULAR },
209 : { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
210 : EIGHTTAP_SMOOTH },
211 : { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
212 : EIGHTTAP_REGULAR },
213 : { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
214 : BILINEAR },
215 : };
216 : static const InterpFilterParams
217 : av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
218 : { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
219 : EIGHTTAP_REGULAR },
220 : { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
221 : SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
222 : { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
223 : MULTITAP_SHARP },
224 : { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
225 : BILINEAR }
226 : };
227 0 : static INLINE const InterpFilterParams *get_4tap_interp_filter_params(
228 : const InterpFilter interp_filter) {
229 0 : return &av1_interp_4tap[interp_filter];
230 : }
231 36553500 : static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
232 36553500 : assert(subpel_search >= USE_2_TAPS);
233 :
234 36553500 : switch (subpel_search) {
235 0 : case USE_2_TAPS: return get_4tap_interp_filter_params(BILINEAR);
236 0 : case USE_4_TAPS: return get_4tap_interp_filter_params(EIGHTTAP_REGULAR);
237 36554000 : case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
238 0 : default: assert(0); return NULL;
239 : }
240 : }
241 :
242 :
243 36553600 : void aom_upsampled_pred_sse2(MacroBlockD *xd, const struct AV1Common *const cm,
244 : int mi_row, int mi_col, const MV *const mv,
245 : uint8_t *comp_pred, int width, int height,
246 : int subpel_x_q3, int subpel_y_q3,
247 : const uint8_t *ref, int ref_stride,
248 : int subpel_search) {
249 : (void)xd;
250 : (void)cm;
251 : (void)mi_row;
252 : (void)mi_col;
253 : (void)mv;
254 36553600 : const InterpFilterParams *filter = av1_get_filter(subpel_search);
255 : // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
256 : // 2-tap yet.
257 36554700 : int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
258 :
259 36554700 : if (!subpel_x_q3 && !subpel_y_q3) {
260 3082220 : if (width >= 16) {
261 : int i;
262 1750220 : assert(!(width & 15));
263 : /*Read 16 pixels one row at a time.*/
264 40672600 : for (i = 0; i < height; i++) {
265 : int j;
266 109480000 : for (j = 0; j < width; j += 16) {
267 70557900 : xx_storeu_128(comp_pred, xx_loadu_128(ref));
268 70557600 : comp_pred += 16;
269 70557600 : ref += 16;
270 : }
271 38922400 : ref += ref_stride - width;
272 : }
273 1332000 : } else if (width >= 8) {
274 : int i;
275 1332140 : assert(!(width & 7));
276 1332140 : assert(!(height & 1));
277 : /*Read 8 pixels two rows at a time.*/
278 11107500 : for (i = 0; i < height; i += 2) {
279 9775420 : __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
280 9775420 : __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
281 9775390 : xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
282 9775390 : comp_pred += 16;
283 9775390 : ref += 2 * ref_stride;
284 : }
285 : } else {
286 : int i;
287 0 : assert(!(width & 3));
288 0 : assert(!(height & 3));
289 : /*Read 4 pixels four rows at a time.*/
290 0 : for (i = 0; i < height; i++) {
291 0 : const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
292 0 : const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
293 0 : const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
294 0 : const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
295 0 : const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
296 : _mm_unpacklo_epi32(row2, row3));
297 0 : xx_storeu_128(comp_pred, reg);
298 0 : comp_pred += 16;
299 0 : ref += 4 * ref_stride;
300 : }
301 : }
302 33472400 : } else if (!subpel_y_q3) {
303 : const int16_t *const kernel =
304 7618850 : av1_get_interp_filter_subpel_kernel(*filter, subpel_x_q3 << 1);
305 7618870 : aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
306 : width, height);
307 25853600 : } else if (!subpel_x_q3) {
308 : const int16_t *const kernel =
309 7715320 : av1_get_interp_filter_subpel_kernel(*filter, subpel_y_q3 << 1);
310 7715260 : aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
311 : width, height);
312 : } else {
313 : DECLARE_ALIGNED(16, uint8_t,
314 : temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
315 : const int16_t *const kernel_x =
316 18138300 : av1_get_interp_filter_subpel_kernel(*filter, subpel_x_q3 << 1);
317 : const int16_t *const kernel_y =
318 18173000 : av1_get_interp_filter_subpel_kernel(*filter, subpel_y_q3 << 1);
319 18169900 : const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
320 18169900 : uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
321 0 : ? temp + (filter_taps >> 1) * MAX_SB_SIZE
322 18169900 : : temp;
323 18169900 : uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
324 18169900 : int intermediate_height =
325 18169900 : (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
326 18169900 : assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
327 18169900 : aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
328 : kernel_x, 16, NULL, -1, width, intermediate_height);
329 18181400 : aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
330 : kernel_y, 16, width, height);
331 : }
332 36597100 : }
333 : #endif
|