/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "aom_dsp_rtcd.h"

#include "convolve.h"
#include "convolve_avx2.h"
#include "aom_ports/mem.h"  // provides DECLARE_ALIGNED, used below

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
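
// Illustrative note (added): whichever branch is taken above, the macro
// broadcasts one 128-bit register into both lanes of a 256-bit register.
// A minimal sketch of its effect:
//   __m128i half = _mm_set1_epi8(7);
//   __m256i full = MM256_BROADCASTSI128_SI256(half);  // both lanes == half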

typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
                                uint32_t output_height, const int16_t *filter);
void aom_filter_block1d4_v8_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter);
void aom_filter_block1d16_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   uint8_t *output_ptr, ptrdiff_t out_pitch,
                                   uint32_t output_height, const int16_t *filter);
void aom_filter_block1d16_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   uint8_t *output_ptr, ptrdiff_t out_pitch,
                                   uint32_t output_height, const int16_t *filter);
void aom_filter_block1d8_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
                                  uint32_t output_height, const int16_t *filter);
void aom_filter_block1d8_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
                                  uint32_t output_height, const int16_t *filter);
void aom_filter_block1d4_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
                                  uint32_t output_height, const int16_t *filter);
void aom_filter_block1d4_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
                                  uint32_t output_height, const int16_t *filter);

filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_sse2
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3

#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
  void aom_convolve8_##name##_##opt(                                         \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
    (void)filter_x;                                                          \
    (void)x_step_q4;                                                         \
    (void)filter_y;                                                          \
    (void)y_step_q4;                                                         \
    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
    assert(step_q4 == 16);                                                   \
    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
        (filter[2] | filter[5])) {                                           \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    } else if (filter[0] | filter[1] | filter[2]) {                          \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    } else {                                                                 \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    }                                                                        \
    if (w) {                                                                 \
      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
                               x_step_q4, filter_y, y_step_q4, w, h);        \
    }                                                                        \
  }
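
// Usage sketch (hedged; the concrete instantiations live further down this
// file and are assumed, not shown here):
//   FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
//   FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
// Each expansion defines aom_convolve8_<name>_avx2(), which picks the 4-tap,
// 8-tap, or 2-tap kernel from the filter's support, processes the frame in
// 16/8/4-wide columns, and falls back to aom_convolve8_<name>_c() for any
// remaining width.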

// shuffle masks pairing adjacent source bytes for the horizontal filters
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
  2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
  5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
  7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
  12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
  7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
  3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};
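
// Worked example (added): shuffling 16 source bytes s0..s15 with the first
// 16 entries of filt_global_avx2 (0, 1, 1, 2, ..., 7, 8) yields the adjacent
// pairs (s0,s1), (s1,s2), ..., (s7,s8); _mm256_maddubs_epi16 then multiplies
// each pair by one repeated coefficient pair and adds within the pair, giving
// a 16-bit partial sum per output pixel for that tap pair.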

static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
  *((uint32_t *)(output_ptr + stride)) =
      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
  return a;
}

static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_storel_epi64((__m128i *)(output_ptr + stride),
                   _mm256_extractf128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
  return a;
}

static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
                                   const ptrdiff_t stride, const __m256i *a) {
  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_store_si128((__m128i *)(output_ptr + stride),
                  _mm256_extractf128_si256(*a, 1));
}
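
// Minimal usage sketch of the helpers above (buf, dst and stride are
// illustrative assumptions, not names used elsewhere in this file):
//   __m256i rows = xx_loadu2_mi128(buf + stride, buf);  // row n -> lane 0,
//                                                       // row n+1 -> lane 1
//   xx_store2_mi128(dst, stride, &rows);  // writes both rows; note this one
//                                         // uses aligned 16-byte stores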

static void aom_filter_block1d4_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
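  // Scaling note (added): the >> 1 above halves the 128-scaled taps so each
  // pair-sum from _mm256_maddubs_epi16 fits in int16 (255 * 64 * 2 < 32767);
  // the +32 / >> 6 applied below then computes round(halved_sum / 64), i.e.
  // round(full_sum / 128).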

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
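    // Note (added): after maddubs each output pixel still holds two 16-bit
    // partial sums (one per tap pair); _mm256_hadds_epi16 adds the adjacent
    // pairs, leaving the four per-row pixel sums in the low 64 bits of each
    // lane.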

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (4 bytes)
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}

static void aom_filter_block1d4_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg;
  __m256i firstFilters, secondFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 32 bits
  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
  // duplicate only the second 32 bits
  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);

  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    // filter the source buffer
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (4 bytes)
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (8 bytes)
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second bytes) across the
  // 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth bytes) across the
  // 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (8 bytes)
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d16_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // read 2 rows of the next 16 bytes (part of which was read earlier)
    srcReg32b2 =
        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (16 bytes)
  if (i > 0) {
    __m256i srcReg1, srcReg12;
    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;

    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
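    // Note (added): 0x94 selects qwords {0, 1, 1, 2}, so srcReg12 = {q0, q1 |
    // q1, q2}; lane 0 covers source bytes 0..15 and lane 1 covers bytes
    // 8..23, letting one 256-bit pass produce all 16 leftover outputs.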

    // filter the source buffer
    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt1_1));
  }
}

static void aom_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second bytes) across the
  // 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth bytes) across the
  // 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of the source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

    // read 2 rows of the next 16 bytes (part of which was read earlier)
    srcReg32b2 =
        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // round and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the last row (16 bytes)
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes (part of which was read earlier)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // round and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);

    // shrink each 16-bit value to 8 bits; the low half holds the first eight
    // pixels and the high half the second eight
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i resReg23_34_lo, resReg45_56_lo;
  __m256i resReglo, resReg;
  __m256i secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // place consecutive rows in the same 256-bit register
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
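  // Note (added): _mm256_permute2x128_si256(a, b, 0x21) stitches a's high
  // lane to b's low lane, so srcReg23 = {row2 | row3} and srcReg4x = {row4 | x}
  // give srcReg34 = {row3 | row4}; the unpacklo above then interleaves
  // vertically adjacent rows byte by byte, ready for maddubs.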

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows (8 bytes each) and keep every two consecutive
    // rows in the same 256-bit register
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);

    // add and saturate the results together
    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);

    // round and shift each 16-bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // carry the registers over for the next rows
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}

static void aom_filter_block1d8_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8-bit (byte), replicating the
  // same data in both halves of the 128-bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second bytes) across the
  // 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth bytes) across the
  // 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes) across the
  // 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth bytes) across the
  // 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 8 bytes from each of the first 7 rows, one src_pitch apart
  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
  srcReg32b3 =
      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 =
      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

  // place consecutive rows in the same 256-bit register
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
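  // Note (added): each unpacked register interleaves two vertically adjacent
  // rows, e.g. srcReg32b10 = (row0[0], row1[0], row0[1], row1[1], ...), so one
  // maddubs against a repeated coefficient pair computes
  // src[k] * f0 + src[k + pitch] * f1 for eight pixels at once.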

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows (8 bytes each) and keep every two consecutive
    // rows in the same 256-bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(
        srcReg32b10, _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // round and shift each 16-bit value right by 6
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane the second row's
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // carry the registers over for the next rows
    srcReg32b10 = srcReg32b11;
    srcReg32b11 = srcReg32b2;
    srcReg32b2 = srcReg32b4;
    srcReg32b7 = srcReg32b9;
  }
  // if the number of rows is odd, process the last row (8 bytes)
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
    // load the last 8 bytes
    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));

    // round and shift each 16-bit value right by 6
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
  }
}
1185 :
1186 0 : static void aom_filter_block1d16_v4_avx2(
1187 : const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1188 : ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1189 : __m128i filtersReg;
1190 : __m256i filtersReg32, addFilterReg32;
1191 : __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
1192 : __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
1193 : __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
1194 : __m256i resReglo, resReghi, resReg;
1195 : __m256i secondFilters, thirdFilters;
1196 : unsigned int i;
1197 : ptrdiff_t src_stride, dst_stride;
1198 :
1199 0 : addFilterReg32 = _mm256_set1_epi16(32);
1200 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
1201 : // converting the 16 bit (short) to 8 bit (byte) and have the
1202 : // same data in both lanes of 128 bit register.
1203 0 : filtersReg = _mm_srai_epi16(filtersReg, 1);
1204 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1205 : // have the same data in both lanes of a 256 bit register
1206 0 : filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1207 :
1208 : // duplicate only the second 16 bits (third and forth byte)
1209 : // across 256 bit register
1210 0 : secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
1211 : // duplicate only the third 16 bits (fifth and sixth byte)
1212 : // across 256 bit register
1213 0 : thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
1214 :
1215 : // multiple the size of the source and destination stride by two
1216 0 : src_stride = src_pitch << 1;
1217 0 : dst_stride = out_pitch << 1;
1218 :
1219 0 : srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1220 0 : srcReg4x = _mm256_castsi128_si256(
1221 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
1222 :
1223 : // have consecutive loads on the same 256 register
1224 0 : srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
1225 :
1226 0 : srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
1227 0 : srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
1228 :
1229 0 : for (i = output_height; i > 1; i -= 2) {
1230 :     // load the next 2 rows of 16 bytes and keep every two
1231 :     // consecutive rows in the same 256-bit register
1232 0 : srcReg5x = _mm256_castsi128_si256(
1233 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
1234 : srcReg45 =
1235 0 : _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
1236 :
1237 0 : srcReg6x = _mm256_castsi128_si256(
1238 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1239 : srcReg56 =
1240 0 : _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
1241 :
1242 : // merge every two consecutive registers
1243 0 : srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
1244 0 : srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
1245 :
1246 : // multiply 2 adjacent elements with the filter and add the result
1247 0 : resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
1248 0 : resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
1249 :
1250 : // add and saturate the results together
1251 0 : resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
1252 :
1253 : // multiply 2 adjacent elements with the filter and add the result
1254 0 : resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
1255 0 : resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
1256 :
1257 : // add and saturate the results together
1258 0 : resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
1259 :
1260 :     // add the rounding bias (32) and shift each 16-bit value right by 6
1261 0 : resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
1262 0 : resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
1263 0 : resReglo = _mm256_srai_epi16(resReglo, 6);
1264 0 : resReghi = _mm256_srai_epi16(resReghi, 6);
1265 :
1266 :     // pack each 16-bit value down to 8 bits; the first lane contains the
1267 :     // first row's convolve result and the second lane contains the second
1268 :     // row's
1269 0 : resReg = _mm256_packus_epi16(resReglo, resReghi);
1270 :
1271 0 : src_ptr += src_stride;
1272 :
1273 0 : xx_store2_mi128(output_ptr, out_pitch, &resReg);
1274 :
1275 0 : output_ptr += dst_stride;
1276 :
1277 :     // save part of the registers for the next iteration
1278 0 : srcReg23_34_lo = srcReg45_56_lo;
1279 0 : srcReg23_34_hi = srcReg45_56_hi;
1280 0 : srcReg4x = srcReg6x;
1281 : }
1282 0 : }
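// --- Editor's note: an illustrative sketch, not original source, of the row
// pairing used by the vertical kernels above and below.
// _mm256_permute2x128_si256(a, b, 0x21) concatenates the high 128-bit lane of
// a with the low lane of b, so registers loaded as {rowN | rowN+1} can be
// shifted down one row without touching memory again:
static inline __m256i next_row_pair_sketch(__m256i rows01, __m256i rows2x) {
  // rows01 = {row0 | row1}, rows2x = {row2 | don't care}
  // returns {row1 | row2}; unpacking rows01 with the result interleaves
  // vertically adjacent pixels, ready for _mm256_maddubs_epi16.
  return _mm256_permute2x128_si256(rows01, rows2x, 0x21);
}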
1283 :
1284 24315400 : static void aom_filter_block1d16_v8_avx2(
1285 : const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1286 : ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1287 : __m128i filtersReg;
1288 : __m256i addFilterReg32;
1289 : __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
1290 : __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
1291 : __m256i srcReg32b11, srcReg32b12, filtersReg32;
1292 : __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
1293 : unsigned int i;
1294 : ptrdiff_t src_stride, dst_stride;
1295 :
1296 24315400 : addFilterReg32 = _mm256_set1_epi16(32);
1297 24315400 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
1298 :   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
1299 :   // the same data in both halves of the 128-bit register.
1300 24315400 : filtersReg = _mm_srai_epi16(filtersReg, 1);
1301 24315400 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1302 : // have the same data in both lanes of a 256 bit register
1303 24315400 : filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1304 :
1305 : // duplicate only the first 16 bits (first and second byte)
1306 : // across 256 bit register
1307 48630900 : firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
1308 :   // duplicate only the second 16 bits (third and fourth byte)
1309 : // across 256 bit register
1310 48630900 : secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
1311 : // duplicate only the third 16 bits (fifth and sixth byte)
1312 : // across 256 bit register
1313 48630900 : thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
1314 :   // duplicate only the fourth 16 bits (seventh and eighth byte)
1315 : // across 256 bit register
1316 24315400 : forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
1317 :
1318 :   // multiply the source and destination strides by two
1319 24315400 : src_stride = src_pitch << 1;
1320 24315400 : dst_stride = out_pitch << 1;
1321 :
1322 :   // load the first 7 rows of 16 bytes, one src_pitch apart
1323 24315400 : srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
1324 : srcReg32b3 =
1325 24320200 : xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1326 : srcReg32b5 =
1327 24317800 : xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
1328 24311400 : srcReg32b7 = _mm256_castsi128_si256(
1329 24311400 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1330 :
1331 :   // place each pair of consecutive row loads in the same 256-bit register
1332 24311400 : srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
1333 24311400 : srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
1334 24311400 : srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
1335 : // merge every two consecutive registers except the last one
1336 24311400 : srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
1337 48622800 : srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
1338 :
1339 :   // save the interleaved row pairs for reuse across loop iterations
1340 24311400 : srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
1341 24311400 : srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
1342 24311400 : srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
1343 24311400 : srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
1344 :
1345 317356000 : for (i = output_height; i > 1; i -= 2) {
1346 :     // load the next 2 rows of 16 bytes and keep every two
1347 :     // consecutive rows in the same 256-bit register
1348 585390000 : srcReg32b8 = _mm256_castsi128_si256(
1349 292695000 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
1350 292695000 : srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
1351 : _mm256_castsi256_si128(srcReg32b8), 1);
1352 585390000 : srcReg32b9 = _mm256_castsi128_si256(
1353 292695000 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
1354 292695000 : srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
1355 : _mm256_castsi256_si128(srcReg32b9), 1);
1356 :
1357 :     // merge every two consecutive registers and save the new row pairs
1358 :     // for the rotation at the bottom of the loop
1359 292695000 : srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
1360 292695000 : srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
1361 :
1362 : // multiply 2 adjacent elements with the filter and add the result
1363 292695000 : srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
1364 292695000 : srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
1365 :
1366 : // add and saturate the results together
1367 292695000 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
1368 :
1369 : // multiply 2 adjacent elements with the filter and add the result
1370 292695000 : srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
1371 292695000 : srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
1372 :
1373 : // add and saturate the results together
1374 292695000 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
1375 : _mm256_adds_epi16(srcReg32b8, srcReg32b12));
1376 :
1377 : // multiply 2 adjacent elements with the filter and add the result
1378 585390000 : srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
1379 292695000 : srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
1380 :
1381 585390000 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
1382 :
1383 : // multiply 2 adjacent elements with the filter and add the result
1384 292695000 : srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
1385 292695000 : srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
1386 :
1387 : // add and saturate the results together
1388 585390000 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
1389 : _mm256_adds_epi16(srcReg32b8, srcReg32b12));
1390 :
1391 :     // add the rounding bias (32) and shift each 16-bit value right by 6
1392 292695000 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
1393 585390000 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
1394 292695000 : srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
1395 292695000 : srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
1396 :
1397 :     // pack each 16-bit value down to 8 bits; the first lane contains the
1398 :     // first row's convolve result and the second lane contains the second
1399 :     // row's
1400 292695000 : srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
1401 :
1402 292695000 : src_ptr += src_stride;
1403 :
1404 292695000 : xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
1405 :
1406 293044000 : output_ptr += dst_stride;
1407 :
1408 :     // save part of the registers for the next iteration
1409 293044000 : srcReg32b10 = srcReg32b11;
1410 293044000 : srcReg32b1 = srcReg32b3;
1411 293044000 : srcReg32b11 = srcReg32b2;
1412 293044000 : srcReg32b3 = srcReg32b5;
1413 293044000 : srcReg32b2 = srcReg32b4;
1414 293044000 : srcReg32b5 = srcReg32b7;
1415 293044000 : srcReg32b7 = srcReg32b9;
1416 : }
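  // --- Editor's note: the loop above writes two rows per iteration, so when
  // output_height is odd one final row remains; the tail below computes it
  // with 128-bit ops, reusing the low lanes of the already-unpacked row pairs.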
1417 24661000 : if (i > 0) {
1418 : __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
1419 : __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
1420 : // load the last 16 bytes
1421 0 : srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
1422 :
1423 :     // merge the last 2 source rows together
1424 : srcRegFilt4 =
1425 0 : _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
1426 : srcRegFilt7 =
1427 0 : _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
1428 :
1429 : // multiply 2 adjacent elements with the filter and add the result
1430 0 : srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
1431 : _mm256_castsi256_si128(firstFilters));
1432 : srcRegFilt4 =
1433 0 : _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
1434 0 : srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
1435 : _mm256_castsi256_si128(firstFilters));
1436 : srcRegFilt7 =
1437 0 : _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
1438 :
1439 : // add and saturate the results together
1440 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
1441 0 : srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
1442 :
1443 : // multiply 2 adjacent elements with the filter and add the result
1444 0 : srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
1445 : _mm256_castsi256_si128(secondFilters));
1446 0 : srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
1447 : _mm256_castsi256_si128(secondFilters));
1448 :
1449 : // multiply 2 adjacent elements with the filter and add the result
1450 0 : srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
1451 : _mm256_castsi256_si128(thirdFilters));
1452 0 : srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
1453 : _mm256_castsi256_si128(thirdFilters));
1454 :
1455 : // add and saturate the results together
1456 : srcRegFilt1 =
1457 0 : _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
1458 : srcRegFilt3 =
1459 0 : _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
1460 :
1461 :     // add the rounding bias (32) and shift each 16-bit value right by 6
1462 : srcRegFilt1 =
1463 0 : _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
1464 : srcRegFilt3 =
1465 0 : _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
1466 0 : srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
1467 0 : srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
1468 :
1469 :     // pack each 16-bit value down to 8 bits, combining the low and high
1470 :     // halves of the 16-pixel convolve result
1472 0 : srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
1473 :
1474 : // save 16 bytes
1475 : _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
1476 : }
1477 24661000 : }
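// --- Editor's note: a scalar reference sketch of what
// aom_filter_block1d16_v8_avx2 computes, added for clarity and not part of
// the original source. Output row y reads source rows y .. y+7 (the
// FUN_CONV_1D vert wrapper at the end of this file pre-offsets src by
// -3 * src_stride so the taps are centered). The intermediate 16-bit
// saturation of the SIMD path is not modeled.
static void block1d16_v8_scalar_sketch(const uint8_t *src, ptrdiff_t src_pitch,
                                       uint8_t *dst, ptrdiff_t dst_pitch,
                                       uint32_t h, const int16_t *filter) {
  for (uint32_t y = 0; y < h; ++y) {
    for (int x = 0; x < 16; ++x) {
      int sum = 0;
      // 8-tap vertical filter with halved taps, as in the SIMD kernels
      for (int k = 0; k < 8; ++k)
        sum += (filter[k] >> 1) * src[(y + k) * src_pitch + x];
      sum = (sum + 32) >> 6;
      dst[y * dst_pitch + x] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}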
1478 :
1479 0 : static void aom_filter_block1d4_v4_avx2(
1480 : const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1481 : ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1482 : __m128i filtersReg;
1483 : __m256i filtersReg32, addFilterReg32;
1484 : __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
1485 : __m256i srcReg23_34_lo, srcReg45_56_lo;
1486 : __m256i srcReg2345_3456_lo;
1487 : __m256i resReglo, resReg;
1488 : __m256i firstFilters;
1489 : unsigned int i;
1490 : ptrdiff_t src_stride, dst_stride;
1491 :
1492 0 : addFilterReg32 = _mm256_set1_epi16(32);
1493 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
1494 :   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
1495 :   // the same data in both halves of the 128-bit register.
1496 0 : filtersReg = _mm_srai_epi16(filtersReg, 1);
1497 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1498 : // have the same data in both lanes of a 256 bit register
1499 0 : filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1500 :
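  // duplicate only the middle 32 bits (third through sixth byte)
  // across 256 bit register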
1501 : firstFilters =
1502 0 : _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
1503 :
1504 :   // multiply the source and destination strides by two
1505 0 : src_stride = src_pitch << 1;
1506 0 : dst_stride = out_pitch << 1;
1507 :
1508 0 : srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1509 0 : srcReg4x = _mm256_castsi128_si256(
1510 0 : _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
1511 :
1512 :   // place consecutive row loads in the same 256-bit register
1513 0 : srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
1514 :
1515 0 : srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
1516 :
1517 0 : for (i = output_height; i > 1; i -= 2) {
1518 :     // load the next 2 rows of 8 bytes and keep every two
1519 :     // consecutive rows in the same 256-bit register
1520 0 : srcReg5x = _mm256_castsi128_si256(
1521 0 : _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
1522 : srcReg45 =
1523 0 : _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
1524 :
1525 0 : srcReg6x = _mm256_castsi128_si256(
1526 0 : _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
1527 : srcReg56 =
1528 0 : _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
1529 :
1530 : // merge every two consecutive registers
1531 0 : srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
1532 :
1533 0 : srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
1534 :
1535 : // multiply 2 adjacent elements with the filter and add the result
1536 0 : resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
1537 :
1538 0 : resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
1539 :
1540 :     // add the rounding bias (32) and shift each 16-bit value right by 6
1541 0 : resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
1542 0 : resReglo = _mm256_srai_epi16(resReglo, 6);
1543 :
1544 :     // pack each 16-bit value down to 8 bits; each 128-bit lane now holds
1545 :     // one row's 4-pixel result in its low bytes
1547 0 : resReg = _mm256_packus_epi16(resReglo, resReglo);
1548 :
1549 0 : src_ptr += src_stride;
1550 :
1551 0 : xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
1552 :
1553 0 : output_ptr += dst_stride;
1554 :
1555 :     // save part of the registers for the next iteration
1556 0 : srcReg23_34_lo = srcReg45_56_lo;
1557 0 : srcReg4x = srcReg6x;
1558 : }
1559 0 : }
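// --- Editor's note: an explanatory sketch, not original source. In the
// 4-wide kernel above, each 32-bit lane holds four vertically adjacent pixels
// {r2, r3, r4, r5} (from the epi8 then epi16 unpacks) and firstFilters holds
// taps {2, 3, 4, 5}, so a single _mm256_maddubs_epi16 forms the two partial
// sums and _mm256_hadds_epi16 completes the 4-tap convolve. Per output pixel,
// in scalar form:
//   int p0 = (filter[2] >> 1) * r2 + (filter[3] >> 1) * r3;  // maddubs
//   int p1 = (filter[4] >> 1) * r4 + (filter[5] >> 1) * r5;  // maddubs
//   int sum = p0 + p1;                                       // hadds
//   sum = (sum + 32) >> 6;
//   uint8_t out = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));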
1560 :
1561 : #if HAVE_AVX2 && HAVE_SSSE3
1562 : // void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
1563 : // uint8_t *dst, ptrdiff_t dst_stride,
1564 : // const int16_t *filter_x, int x_step_q4,
1565 : // const int16_t *filter_y, int y_step_q4,
1566 : // int w, int h);
1567 : // void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
1568 : // uint8_t *dst, ptrdiff_t dst_stride,
1569 : // const int16_t *filter_x, int x_step_q4,
1570 : // const int16_t *filter_y, int y_step_q4,
1571 : // int w, int h);
1572 86833700 : FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
1573 87282400 : FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
1574 :
1575 : #endif  // HAVE_AVX2 && HAVE_SSSE3
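// --- Editor's note: a hedged usage sketch, not original source. The two
// FUN_CONV_1D expansions above instantiate aom_convolve8_horiz_avx2 and
// aom_convolve8_vert_avx2 with the prototypes shown in the comment block;
// with a step of 16 (no scaling) the vert version dispatches to the
// aom_filter_block1d*_v* kernels in this file. A caller filtering a w x h
// block with an 8-tap vertical kernel (filter_kernel is illustrative) would
// look roughly like:
//
//   aom_convolve8_vert_avx2(src, src_stride, dst, dst_stride,
//                           filter_kernel, 16,  // filter_x, x_step_q4
//                           filter_kernel, 16,  // filter_y, y_step_q4 == 16
//                           w, h);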