Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
#include <assert.h>
#include <stdint.h>

#include "immintrin.h"
#include "EbCombinedAveragingSAD_Inline_AVX2.h"
#include "EbCombinedAveragingSAD_Intrinsic_AVX2.h"
#include "EbMemory_AVX2.h"
#include "EbMemory_SSE4_1.h"
#include "EbComputeSAD_SSE2.h"
12 :
13 :
14 5462290 : uint32_t combined_averaging_8xm_sad_avx2_intrin(
15 : uint8_t *src,
16 : uint32_t src_stride,
17 : uint8_t *ref1,
18 : uint32_t ref1_stride,
19 : uint8_t *ref2,
20 : uint32_t ref2_stride,
21 : uint32_t height,
22 : uint32_t width)
23 : {
24 5462290 : __m256i sum = _mm256_setzero_si256();
25 : __m128i sad;
26 5462290 : uint32_t y = height;
27 : (void)width;
28 :
29 : do {
30 17511500 : const __m256i s = load_u8_8x4_avx2(src, src_stride);
31 17510100 : const __m256i r1 = load_u8_8x4_avx2(ref1, ref1_stride);
32 17508200 : const __m256i r2 = load_u8_8x4_avx2(ref2, ref2_stride);
33 17511300 : const __m256i avg = _mm256_avg_epu8(r1, r2);
34 17511300 : const __m256i sad = _mm256_sad_epu8(s, avg);
35 17511300 : sum = _mm256_add_epi32(sum, sad);
36 17511300 : src += src_stride << 2;
37 17511300 : ref1 += ref1_stride << 2;
38 17511300 : ref2 += ref2_stride << 2;
39 17511300 : y -= 4;
40 17511300 : } while (y);
41 :
42 5462050 : sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
43 5462050 : _mm256_extracti128_si256(sum, 1));
44 10924100 : sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));
45 :
46 5462050 : return _mm_cvtsi128_si32(sad);
47 : }
48 :
49 23621100 : static INLINE __m256i CombinedAveragingSad16x2_AVX2(const uint8_t *const src,
50 : const uint32_t src_stride, const uint8_t *const ref1, const uint32_t ref1_stride,
51 : const uint8_t *const ref2, const uint32_t ref2_stride, const __m256i sum)
52 : {
53 23621100 : const __m256i s = loadu_u8_16x2_avx2(src, src_stride);
54 23619700 : const __m256i r1 = loadu_u8_16x2_avx2(ref1, ref1_stride);
55 23618600 : const __m256i r2 = loadu_u8_16x2_avx2(ref2, ref2_stride);
56 23618100 : const __m256i avg = _mm256_avg_epu8(r1, r2);
57 23618100 : const __m256i sad = _mm256_sad_epu8(s, avg);
58 23618100 : return _mm256_add_epi32(sum, sad);
59 : }
60 :
61 2733860 : uint32_t combined_averaging_16xm_sad_avx2_intrin(
62 : uint8_t *src,
63 : uint32_t src_stride,
64 : uint8_t *ref1,
65 : uint32_t ref1_stride,
66 : uint8_t *ref2,
67 : uint32_t ref2_stride,
68 : uint32_t height,
69 : uint32_t width)
70 : {
71 2733860 : __m256i sum = _mm256_setzero_si256();
72 : __m128i sad;
73 2733860 : uint32_t y = height;
74 : (void)width;
75 :
76 : do {
77 23618400 : sum = CombinedAveragingSad16x2_AVX2(src, src_stride, ref1, ref1_stride,
78 : ref2, ref2_stride, sum);
79 23618300 : src += src_stride << 1;
80 23618300 : ref1 += ref1_stride << 1;
81 23618300 : ref2 += ref2_stride << 1;
82 23618300 : y -= 2;
83 23618300 : } while (y);
84 :
85 2733760 : sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
86 2733760 : _mm256_extracti128_si256(sum, 1));
87 5467520 : sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));
88 :
89 2733760 : return _mm_cvtsi128_si32(sad);
90 : }
91 :
/* Accumulates into `sum` the SAD between 32 bytes of `src` and the byte-wise
 * average of `ref1`/`ref2`.
 * NOTE(review): despite the "24" in the name this loads a full 32 bytes, so
 * it reads 8 bytes past a 24-wide row — presumably the codec guarantees
 * padded buffers; confirm against callers. The extra 8 columns land in the
 * top 64-bit SAD partial, which the caller's final reduction discards
 * (see combined_averaging_24xm_sad_avx2_intrin). */
static INLINE __m256i CombinedAveragingSad24_AVX2(const uint8_t *const src,
    const uint8_t *const ref1, const uint8_t *const ref2, const __m256i sum)
{
    const __m256i s = _mm256_loadu_si256((__m256i*)src);
    const __m256i r1 = _mm256_loadu_si256((__m256i*)ref1);
    const __m256i r2 = _mm256_loadu_si256((__m256i*)ref2);
    const __m256i avg = _mm256_avg_epu8(r1, r2);
    const __m256i sad = _mm256_sad_epu8(s, avg);
    return _mm256_add_epi32(sum, sad);
}
102 :
/* SAD between a 24-wide source block and the byte-wise average of two
 * references. Two rows per iteration (height assumed even, non-zero);
 * width is unused (fixed 24 columns). The helper processes 32 columns;
 * the surplus is removed in the final reduction below. */
uint32_t combined_averaging_24xm_sad_avx2_intrin(
    uint8_t *src,
    uint32_t src_stride,
    uint8_t *ref1,
    uint32_t ref1_stride,
    uint8_t *ref2,
    uint32_t ref2_stride,
    uint32_t height,
    uint32_t width)
{
    __m256i sum = _mm256_setzero_si256();
    __m128i sad;
    uint32_t y = height;
    (void)width;

    do {
        sum = CombinedAveragingSad24_AVX2(src + 0 * src_stride,
            ref1 + 0 * ref1_stride, ref2 + 0 * ref2_stride, sum);
        sum = CombinedAveragingSad24_AVX2(src + 1 * src_stride,
            ref1 + 1 * ref1_stride, ref2 + 1 * ref2_stride, sum);
        src += src_stride << 1;
        ref1 += ref1_stride << 1;
        ref2 += ref2_stride << 1;
        y -= 2;
    } while (y);

    /* Lane fold that EXCLUDES the top 8 columns: `sum` holds four 64-bit
     * SAD partials [q0 q1 | q2 q3], where q3 covers bytes 24-31 (outside
     * the 24-wide block). Shifting the upper lane left by 8 bytes yields
     * [0 q2], so after both adds the low 32 bits are q0 + q1 + q2 only. */
    sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
        _mm_slli_si128(_mm256_extracti128_si256(sum, 1), 8));
    sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));

    return _mm_cvtsi128_si32(sad);
}
135 :
136 46378400 : static INLINE __m256i CombinedAveragingSad32_AVX2(const uint8_t *const src,
137 : const uint8_t *const ref1, const uint8_t *const ref2, const __m256i sum)
138 : {
139 46378400 : const __m256i s = _mm256_loadu_si256((__m256i*)src);
140 46378400 : const __m256i r1 = _mm256_loadu_si256((__m256i*)ref1);
141 46378400 : const __m256i r2 = _mm256_loadu_si256((__m256i*)ref2);
142 46378400 : const __m256i avg = _mm256_avg_epu8(r1, r2);
143 46378400 : const __m256i sad = _mm256_sad_epu8(s, avg);
144 46378400 : return _mm256_add_epi32(sum, sad);
145 : }
146 :
147 1376620 : uint32_t combined_averaging_32xm_sad_avx2_intrin(
148 : uint8_t *src,
149 : uint32_t src_stride,
150 : uint8_t *ref1,
151 : uint32_t ref1_stride,
152 : uint8_t *ref2,
153 : uint32_t ref2_stride,
154 : uint32_t height,
155 : uint32_t width)
156 : {
157 1376620 : __m256i sum = _mm256_setzero_si256();
158 : __m128i sad;
159 1376620 : uint32_t y = height;
160 : (void)width;
161 :
162 : do {
163 12680300 : sum = CombinedAveragingSad32_AVX2(src + 0 * src_stride,
164 : ref1 + 0 * ref1_stride, ref2 + 0 * ref2_stride, sum);
165 12680400 : sum = CombinedAveragingSad32_AVX2(src + 1 * src_stride,
166 12680400 : ref1 + 1 * ref1_stride, ref2 + 1 * ref2_stride, sum);
167 12680300 : src += src_stride << 1;
168 12680300 : ref1 += ref1_stride << 1;
169 12680300 : ref2 += ref2_stride << 1;
170 12680300 : y -= 2;
171 12680300 : } while (y);
172 :
173 1376620 : sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
174 1376620 : _mm256_extracti128_si256(sum, 1));
175 2753250 : sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));
176 :
177 1376620 : return _mm_cvtsi128_si32(sad);
178 : }
179 :
180 0 : uint32_t combined_averaging_48xm_sad_avx2_intrin(
181 : uint8_t *src,
182 : uint32_t src_stride,
183 : uint8_t *ref1,
184 : uint32_t ref1_stride,
185 : uint8_t *ref2,
186 : uint32_t ref2_stride,
187 : uint32_t height,
188 : uint32_t width)
189 : {
190 0 : __m256i sum = _mm256_setzero_si256();
191 : __m128i sad;
192 0 : uint32_t y = height;
193 : (void)width;
194 :
195 : do {
196 0 : sum = CombinedAveragingSad32_AVX2(src + 0 * src_stride,
197 : ref1 + 0 * ref1_stride, ref2 + 0 * ref2_stride, sum);
198 0 : sum = CombinedAveragingSad32_AVX2(src + 1 * src_stride,
199 0 : ref1 + 1 * ref1_stride, ref2 + 1 * ref2_stride, sum);
200 0 : sum = CombinedAveragingSad16x2_AVX2(src + 32, src_stride, ref1 + 32,
201 0 : ref1_stride, ref2 + 32, ref2_stride, sum);
202 :
203 0 : src += src_stride << 1;
204 0 : ref1 += ref1_stride << 1;
205 0 : ref2 += ref2_stride << 1;
206 0 : y -= 2;
207 0 : } while (y);
208 :
209 0 : sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
210 0 : _mm256_extracti128_si256(sum, 1));
211 0 : sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));
212 :
213 0 : return _mm_cvtsi128_si32(sad);
214 : }
215 :
216 342375 : uint32_t combined_averaging_64xm_sad_avx2_intrin(
217 : uint8_t *src,
218 : uint32_t src_stride,
219 : uint8_t *ref1,
220 : uint32_t ref1_stride,
221 : uint8_t *ref2,
222 : uint32_t ref2_stride,
223 : uint32_t height,
224 : uint32_t width)
225 : {
226 342375 : __m256i sum = _mm256_setzero_si256();
227 : __m128i sad;
228 342375 : uint32_t y = height;
229 : (void)width;
230 :
231 : do {
232 10528000 : sum = CombinedAveragingSad32_AVX2(src + 0x00,
233 : ref1 + 0x00, ref2 + 0x00, sum);
234 10528100 : sum = CombinedAveragingSad32_AVX2(src + 0x20,
235 10528100 : ref1 + 0x20, ref2 + 0x20, sum);
236 10528000 : src += src_stride;
237 10528000 : ref1 += ref1_stride;
238 10528000 : ref2 += ref2_stride;
239 10528000 : } while (--y);
240 :
241 342372 : sad = _mm_add_epi32(_mm256_castsi256_si128(sum),
242 342372 : _mm256_extracti128_si256(sum, 1));
243 684744 : sad = _mm_add_epi32(sad, _mm_srli_si128(sad, 8));
244 :
245 342372 : return _mm_cvtsi128_si32(sad);
246 : }
/* Returns the total pixel sum of an 8x8 block shifted left by 2, i.e.
 * (sum/64) << 8 — the block mean in 8-bit fixed point.
 * NOTE(review): presumably this matches the project's MEAN_PRECISION
 * convention — confirm against the scalar reference implementation.
 * input_area_width/height are ignored: the kernel is hard-wired to 8x8. */
uint64_t compute_mean8x8_avx2_intrin(
    uint8_t * input_samples, // input parameter, input samples Ptr
    uint32_t input_stride, // input parameter, input stride
    uint32_t input_area_width, // input parameter, input area width
    uint32_t input_area_height) // input parameter, input area height
{
    __m256i sum, sum2, xmm2, xmm1, sum1, xmm0 = _mm256_setzero_si256();
    __m128i upper, lower, mean = _mm_setzero_si128();
    uint64_t result;
    /* Rows 0..3: each _mm256_sad_epu8 against zero yields one per-row
     * 8-byte sum in each 64-bit lane (two rows per register). Row sums
     * fit in 16 bits (max 8*255), so epi16 adds cannot carry across. */
    xmm1 = _mm256_sad_epu8(xmm0, _mm256_set_m128i(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), _mm_loadl_epi64((__m128i *)(input_samples))));
    xmm2 = _mm256_sad_epu8(xmm0, _mm256_set_m128i(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), _mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride))));
    sum1 = _mm256_add_epi16(xmm1, xmm2);

    input_samples += 4 * input_stride;

    /* Rows 4..7, same layout. */
    xmm1 = _mm256_sad_epu8(xmm0, _mm256_set_m128i(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), _mm_loadl_epi64((__m128i *)(input_samples))));
    xmm2 = _mm256_sad_epu8(xmm0, _mm256_set_m128i(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), _mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride))));
    sum2 = _mm256_add_epi16(xmm1, xmm2);

    sum = _mm256_add_epi16(sum1, sum2);
    upper = _mm256_extractf128_si256(sum, 1); // extract upper 128 bits
    upper = _mm_add_epi32(upper, _mm_srli_si128(upper, 8)); // shift by 8 BYTES to add the two 64-bit row partials (old comment said "16 bits" — wrong)

    lower = _mm256_extractf128_si256(sum, 0); //extract lower 128 bits
    lower = _mm_add_epi32(lower, _mm_srli_si128(lower, 8)); // same 8-byte fold for the lower lane

    mean = _mm_add_epi32(lower, upper);

    (void)input_area_width;
    (void)input_area_height;

    /* << 2 turns the 8x8 pixel sum into the fixed-point mean (see header). */
    result = (uint64_t)_mm_cvtsi128_si32(mean) << 2;
    return result;
}
281 :
282 : /********************************************************************************************************************************/
/* For four horizontally adjacent 8x8 blocks (a 32-column region), computes
 * intermediate variance terms:
 *   mean_of8x8_blocks[i]         = (row-sampled pixel sum of block i)  << 3
 *   mean_of_squared8x8_blocks[i] = (row-sampled squared sum of block i) << 11
 * Only rows 0, 2, 4 and 6 are read (loads at 0 and 2*stride, then the
 * pointer advances 4*stride) — a 2:1 vertical subsampling of each 8x8
 * block. NOTE(review): the <<3 / <<11 shifts presumably compensate for the
 * subsampling and put results in the project's fixed-point mean precision —
 * confirm against the scalar reference kernel. */
void compute_interm_var_four8x8_avx2_intrin(
    uint8_t * input_samples,
    uint16_t input_stride,
    uint64_t * mean_of8x8_blocks, // mean of four 8x8
    uint64_t * mean_of_squared8x8_blocks) // meanSquared
{
    __m256i ymm1, ymm2, ymm3, ymm4, ymm_sum1, ymm_sum2, ymm_FinalSum, ymm_shift,
        ymm_in, ymm_in_2S, ymm_in_second, ymm_in_2S_second, ymm_shiftSquared, ymm_permute8,
        ymm_result, ymm_blockMeanSquaredlow, ymm_blockMeanSquaredHi, ymm_inputlo, ymm_inputhi;

    __m128i ymm_blockMeanSquaredlo, ymm_blockMeanSquaredhi, ymm_resultlo, ymm_resulthi;

    __m256i ymm_zero = _mm256_setzero_si256();
    __m128i xmm_zero = _mm_setzero_si128();

    /* Rows 0 and 2: one 32-byte row each (8 bytes per 8x8 block). */
    ymm_in = _mm256_loadu_si256((__m256i *) input_samples);
    ymm_in_2S = _mm256_loadu_si256((__m256i *)(input_samples + 2 * input_stride));

    /* SAD vs. zero: each 64-bit lane holds the byte sum of one block's row. */
    ymm1 = _mm256_sad_epu8(ymm_in, ymm_zero);
    ymm2 = _mm256_sad_epu8(ymm_in_2S, ymm_zero);

    ymm_sum1 = _mm256_add_epi16(ymm1, ymm2);

    /* Rows 4 and 6. */
    input_samples += 4 * input_stride;
    ymm_in_second = _mm256_loadu_si256((__m256i *)input_samples);
    ymm_in_2S_second = _mm256_loadu_si256((__m256i *)(input_samples + 2 * input_stride));

    ymm3 = _mm256_sad_epu8(ymm_in_second, ymm_zero);
    ymm4 = _mm256_sad_epu8(ymm_in_2S_second, ymm_zero);

    ymm_sum2 = _mm256_add_epi16(ymm3, ymm4);

    /* Four 64-bit per-block sums (rows 0,2,4,6 of each block). */
    ymm_FinalSum = _mm256_add_epi16(ymm_sum1, ymm_sum2);

    /* Scale each sum by 8 (<< 3) before storing four uint64 results. */
    ymm_shift = _mm256_set_epi64x(3, 3, 3, 3);
    ymm_FinalSum = _mm256_sllv_epi64(ymm_FinalSum, ymm_shift);

    _mm256_storeu_si256((__m256i *)(mean_of8x8_blocks), ymm_FinalSum);

    /*******************************Squared Mean******************************/

    /* Widen bytes to 16 bits, then madd squares pairs and accumulates as
     * 32-bit partial sums; low/high unpack halves are tracked separately. */
    ymm_inputlo = _mm256_unpacklo_epi8(ymm_in, ymm_zero);
    ymm_inputhi = _mm256_unpackhi_epi8(ymm_in, ymm_zero);

    ymm_blockMeanSquaredlow = _mm256_madd_epi16(ymm_inputlo, ymm_inputlo);
    ymm_blockMeanSquaredHi = _mm256_madd_epi16(ymm_inputhi, ymm_inputhi);

    ymm_inputlo = _mm256_unpacklo_epi8(ymm_in_2S, ymm_zero);
    ymm_inputhi = _mm256_unpackhi_epi8(ymm_in_2S, ymm_zero);

    ymm_blockMeanSquaredlow = _mm256_add_epi32(ymm_blockMeanSquaredlow, _mm256_madd_epi16(ymm_inputlo, ymm_inputlo));
    ymm_blockMeanSquaredHi = _mm256_add_epi32(ymm_blockMeanSquaredHi, _mm256_madd_epi16(ymm_inputhi, ymm_inputhi));

    ymm_inputlo = _mm256_unpacklo_epi8(ymm_in_second, ymm_zero);
    ymm_inputhi = _mm256_unpackhi_epi8(ymm_in_second, ymm_zero);

    ymm_blockMeanSquaredlow = _mm256_add_epi32(ymm_blockMeanSquaredlow, _mm256_madd_epi16(ymm_inputlo, ymm_inputlo));
    ymm_blockMeanSquaredHi = _mm256_add_epi32(ymm_blockMeanSquaredHi, _mm256_madd_epi16(ymm_inputhi, ymm_inputhi));

    ymm_inputlo = _mm256_unpacklo_epi8(ymm_in_2S_second, ymm_zero);
    ymm_inputhi = _mm256_unpackhi_epi8(ymm_in_2S_second, ymm_zero);

    ymm_blockMeanSquaredlow = _mm256_add_epi32(ymm_blockMeanSquaredlow, _mm256_madd_epi16(ymm_inputlo, ymm_inputlo));
    ymm_blockMeanSquaredHi = _mm256_add_epi32(ymm_blockMeanSquaredHi, _mm256_madd_epi16(ymm_inputhi, ymm_inputhi));

    /* Horizontal add within each 128-bit lane: fold 8 bytes then 4 bytes so
     * element 0 of each lane holds that lane's total. */
    ymm_blockMeanSquaredlow = _mm256_add_epi32(ymm_blockMeanSquaredlow, _mm256_srli_si256(ymm_blockMeanSquaredlow, 8));
    ymm_blockMeanSquaredHi = _mm256_add_epi32(ymm_blockMeanSquaredHi, _mm256_srli_si256(ymm_blockMeanSquaredHi, 8));

    ymm_blockMeanSquaredlow = _mm256_add_epi32(ymm_blockMeanSquaredlow, _mm256_srli_si256(ymm_blockMeanSquaredlow, 4));
    ymm_blockMeanSquaredHi = _mm256_add_epi32(ymm_blockMeanSquaredHi, _mm256_srli_si256(ymm_blockMeanSquaredHi, 4));

    /* Gather lane totals (elements 0 and 4) into the low 64 bits. */
    ymm_permute8 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0);
    ymm_blockMeanSquaredlow = _mm256_permutevar8x32_epi32(ymm_blockMeanSquaredlow, ymm_permute8/*8*/);
    ymm_blockMeanSquaredHi = _mm256_permutevar8x32_epi32(ymm_blockMeanSquaredHi, ymm_permute8);

    ymm_blockMeanSquaredlo = _mm256_castsi256_si128(ymm_blockMeanSquaredlow); //lower 128
    ymm_blockMeanSquaredhi = _mm256_extracti128_si256(ymm_blockMeanSquaredHi, 0); //lower 128

    /* Interleave low/high halves and reorder so the four per-block squared
     * sums land in block order 0..3 as 64-bit slots. */
    ymm_result = _mm256_unpacklo_epi32(_mm256_castsi128_si256(ymm_blockMeanSquaredlo), _mm256_castsi128_si256(ymm_blockMeanSquaredhi));
    ymm_resultlo = _mm_unpacklo_epi64(_mm256_castsi256_si128(ymm_result), xmm_zero);
    ymm_resulthi = _mm_unpackhi_epi64(_mm256_castsi256_si128(ymm_result), xmm_zero);

    ymm_result = _mm256_set_m128i(ymm_resulthi, ymm_resultlo);

    ymm_permute8 = _mm256_set_epi32(7, 5, 6, 4, 3, 1, 2, 0);
    ymm_result = _mm256_permutevar8x32_epi32(ymm_result, ymm_permute8);

    /* Scale squared sums by 2048 (<< 11) before storing four uint64s. */
    ymm_shiftSquared = _mm256_set1_epi64x(11);

    ymm_result = _mm256_sllv_epi64(ymm_result, ymm_shiftSquared);

    _mm256_storeu_si256((__m256i *)(mean_of_squared8x8_blocks), ymm_result);
}
376 :
/* Sum of squared differences between `src` and the byte-wise average of
 * `ref1`/`ref2`, dispatched by block width:
 *   - width with bit 2 set (4, 12, 20, ...): SSE4.1 4-column strips, two
 *     rows per iteration (height assumed even, non-zero);
 *   - width 8 / 16 / 32 / 64: dedicated AVX2 paths;
 *   - other multiples of 8: generic 8-column strip loop.
 * Returns the 32-bit SSD. */
uint32_t combined_averaging_ssd_avx2(uint8_t *src, ptrdiff_t src_stride,
    uint8_t *ref1, ptrdiff_t ref1_stride,
    uint8_t *ref2, ptrdiff_t ref2_stride,
    uint32_t height, uint32_t width) {
    uint32_t y = height;
    __m128i sum_128;

    if (width & 4) {
        const __m128i zero = _mm_setzero_si128();

        sum_128 = _mm_setzero_si128();

        do {
            uint32_t x = 0;
            do {
                /* 4 pixels from each of two rows per strip */
                const __m128i s = load_u8_4x2_sse4_1(src + x, src_stride);
                const __m128i r1 = load_u8_4x2_sse4_1(ref1 + x, ref1_stride);
                const __m128i r2 = load_u8_4x2_sse4_1(ref2 + x, ref2_stride);
                const __m128i avg = _mm_avg_epu8(r1, r2);
                /* widen to 16 bits, square-and-add differences */
                const __m128i s16 = _mm_unpacklo_epi8(s, zero);
                const __m128i avg16 = _mm_unpacklo_epi8(avg, zero);
                const __m128i dif = _mm_sub_epi16(s16, avg16);
                const __m128i sqr = _mm_madd_epi16(dif, dif);
                sum_128 = _mm_add_epi32(sum_128, sqr);
                x += 4;
            } while (x < width);

            src += 2 * src_stride;
            ref1 += 2 * ref1_stride;
            ref2 += 2 * ref2_stride;
            y -= 2;
        } while (y);
    }
    else {
        __m256i sum = _mm256_setzero_si256();

        if (width == 8) {
            /* two 8-pixel rows per call */
            do {
                ssd8x2_avx2(src,
                    src_stride,
                    ref1,
                    ref1_stride,
                    ref2,
                    ref2_stride,
                    &sum);
                src += 2 * src_stride;
                ref1 += 2 * ref1_stride;
                ref2 += 2 * ref2_stride;
                y -= 2;
            } while (y);
        }
        else if (width == 16) {
            /* one 16-pixel row per iteration, widened via cvtepu8 */
            do {
                const __m128i s = _mm_loadu_si128((__m128i *)src);
                const __m128i r1 = _mm_loadu_si128((__m128i *)ref1);
                const __m128i r2 = _mm_loadu_si128((__m128i *)ref2);
                const __m128i avg = _mm_avg_epu8(r1, r2);
                const __m256i s_256 = _mm256_cvtepu8_epi16(s);
                const __m256i avg_256 = _mm256_cvtepu8_epi16(avg);
                const __m256i dif = _mm256_sub_epi16(s_256, avg_256);
                const __m256i sqr = _mm256_madd_epi16(dif, dif);
                sum = _mm256_add_epi32(sum, sqr);

                src += src_stride;
                ref1 += ref1_stride;
                ref2 += ref2_stride;
            } while (--y);
        }
        else if (width == 32) {
            do {
                ssd32_avx2(src, ref1, ref2, &sum);
                src += src_stride;
                ref1 += ref1_stride;
                ref2 += ref2_stride;
            } while (--y);
        }
        else if (width == 64) {
            /* each row as two 32-byte halves */
            do {
                ssd32_avx2(src + 0 * 32, ref1 + 0 * 32, ref2 + 0 * 32, &sum);
                ssd32_avx2(src + 1 * 32, ref1 + 1 * 32, ref2 + 1 * 32, &sum);
                src += src_stride;
                ref1 += ref1_stride;
                ref2 += ref2_stride;
            } while (--y);
        }
        else {
            /* generic multiple-of-8 width: 8x2 strips across the row */
            do {
                uint32_t x = 0;
                do {
                    ssd8x2_avx2(src + x,
                        src_stride,
                        ref1 + x,
                        ref1_stride,
                        ref2 + x,
                        ref2_stride,
                        &sum);
                    x += 8;
                } while (x < width);

                src += 2 * src_stride;
                ref1 += 2 * ref1_stride;
                ref2 += 2 * ref2_stride;
                y -= 2;
            } while (y);
        }

        /* fold the two 128-bit lanes of the AVX2 accumulator */
        const __m128i sum0_128 = _mm256_castsi256_si128(sum);
        const __m128i sum1_128 = _mm256_extracti128_si256(sum, 1);
        sum_128 = _mm_add_epi32(sum0_128, sum1_128);
    }

    /* final horizontal reduction of four 32-bit partials */
    sum_128 = _mm_add_epi32(sum_128, _mm_srli_si128(sum_128, 8));
    sum_128 = _mm_add_epi32(sum_128, _mm_srli_si128(sum_128, 4));
    return _mm_cvtsi128_si32(sum_128);
}
492 :
/* Dispatches the width-specific averaged-SAD kernel. Widths 40 and 56 have
 * no kernel and yield 0 (matching the previous no-op behavior); any other
 * unlisted width asserts in debug builds and yields 0 otherwise. */
uint32_t nxm_sad_avg_kernel_helper_avx2(
    uint8_t *src,
    uint32_t src_stride,
    uint8_t *ref1,
    uint32_t ref1_stride,
    uint8_t *ref2,
    uint32_t ref2_stride,
    uint32_t height,
    uint32_t width)
{
    switch (width) {
    case 4:
        return combined_averaging_4xm_sad_sse2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 8:
        return combined_averaging_8xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 16:
        return combined_averaging_16xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 24:
        return combined_averaging_24xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 32:
        return combined_averaging_32xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 48:
        return combined_averaging_48xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 64:
        return combined_averaging_64xm_sad_avx2_intrin(src, src_stride, ref1, ref1_stride, ref2, ref2_stride, height, width);
    case 40: /* fallthrough */
    case 56:
        return 0; /* no AVX2 kernel for these widths */
    default:
        assert(0);
        return 0;
    }
}
|