1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "EbDefinitions.h"
13 : #include <immintrin.h>
14 : #include "aom_dsp_rtcd.h"
15 : #include "EbVariance_SSE2.h"
16 :
17 :
18 : // Alpha blending with alpha values from the range [0, 256], where 256
19 : // means use the first input and 0 means use the second input.
20 : #define AOM_BLEND_A256_ROUND_BITS 8
21 : #define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
22 :
23 : #define AOM_BLEND_A256(a, v0, v1) \
24 : ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
25 : AOM_BLEND_A256_ROUND_BITS)
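// Illustrative note (not part of the original file): AOM_BLEND_A256 forms a
// weighted average of v0 and v1 with an 8-bit fixed-point weight, assuming the
// usual ROUND_POWER_OF_TWO(x, n) = (x + (1 << (n - 1))) >> n rounding helper.
// For example, with a = 64, v0 = 200, v1 = 100:
//   (64 * 200 + (256 - 64) * 100 + 128) >> 8 = (12800 + 19200 + 128) >> 8 = 125,
// i.e. one quarter of the way from v1 toward v0.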
26 :
27 793944 : static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
28 1587890 : return _mm_add_epi16(_mm256_castsi256_si128(val),
29 793944 : _mm256_extractf128_si256(val, 1));
30 : }
31 :
32 3427330 : static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
33 6854670 : return _mm_add_epi32(_mm256_castsi256_si128(val),
34 3427330 : _mm256_extractf128_si256(val, 1));
35 : }
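// Illustrative note (not part of the original file): both helpers above fold a
// 256-bit accumulator into 128 bits by adding its high 128-bit lane (extracted
// with _mm256_extractf128_si256(val, 1)) to its low lane (the cost-free
// _mm256_castsi256_si128(val)). In scalar terms, for the 32-bit variant:
//   folded[i] = lanes[i] + lanes[i + 4];   // i = 0..3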
36 :
37 20677700 : static INLINE void variance_kernel_no_sum_avx2(const __m256i src, const __m256i ref,
38 : __m256i *const sse) {
39 20677700 : const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1)
40 :
41 : // unpack into pairs of source and reference values
42 20677700 : const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
43 20677700 : const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
44 :
45 : // subtract adjacent elements using src*1 + ref*-1
46 20677700 : const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
47 20677700 : const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
48 20677700 : const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
49 20677700 : const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
50 :
51 : // add to the running totals
52 20677700 : *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
53 20677700 : }
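// Illustrative scalar reference (hypothetical helper, not part of the original
// file): for one 32-byte chunk of src and ref, the AVX2 kernel above accumulates
// the same total as the loop below, except that the SIMD version keeps it spread
// across eight 32-bit lanes until the variance_final_*() reductions further down.
// _mm256_unpacklo/hi_epi8 interleaves src and ref bytes, _mm256_maddubs_epi16
// forms src * 1 + ref * (-1) = src - ref for each interleaved pair, and
// _mm256_madd_epi16 squares and pairwise-adds the differences.
static INLINE void variance_kernel_scalar_sketch(const uint8_t *src,
                                                 const uint8_t *ref,
                                                 uint32_t *const sse) {
    for (int i = 0; i < 32; i++) {
        const int32_t diff = (int32_t)src[i] - (int32_t)ref[i];
        *sse += (uint32_t)(diff * diff);
    }
}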
54 :
55 2590120 : static INLINE void variance_final_from_32bit_no_sum_avx2(__m256i vsse,
56 : uint32_t *const sse) {
57 : // extract the low lane and add it to the high lane
58 2590120 : const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
59 2590060 : const __m128i zero = _mm_setzero_si128();
60 :
61 : // unpack sse and sum registers and add
62 2590060 : const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, zero);
63 2590060 : const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, zero);
64 2590060 : const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
65 :
66 : // perform the final summation and extract the results
67 5180130 : const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
68 2590060 : *((int32_t *)sse) = _mm_cvtsi128_si32(res);
69 2590060 : }
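// Illustrative scalar equivalent (hypothetical helper, not part of the original
// file): the zero-interleave / shift / add sequence above is simply a horizontal
// sum of the eight 32-bit SSE lanes, written with SSE2 shuffles.
static INLINE uint32_t reduce_add_lanes_sketch(const int32_t lanes[8]) {
    uint32_t total = 0;
    for (int i = 0; i < 8; i++)
        total += (uint32_t)lanes[i];
    return total;
}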
70 :
71 : // handle blocks with at most 512 pixels
72 2590150 : static INLINE void variance_final_512_no_sum_avx2(__m256i vsse,
73 : uint32_t *const sse) {
74 : // extract the low lane and add it to the high lane
75 2590150 : variance_final_from_32bit_no_sum_avx2(vsse, sse);
76 2589590 : }
77 :
78 : // handle 1024 pixels (32x32, 16x64, 64x16)
79 0 : static INLINE void variance_final_1024_no_sum_avx2(__m256i vsse,
80 : uint32_t *const sse) {
81 : // extract the low lane and add it to the high lane
82 0 : variance_final_from_32bit_no_sum_avx2(vsse, sse);
83 0 : }
84 :
85 28067 : static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
86 28067 : const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
87 : const __m256i sum_hi =
88 56134 : _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
89 28067 : return _mm256_add_epi32(sum_lo, sum_hi);
90 : }
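// Illustrative note (not part of the original file): each 16-bit lane of the
// running sum holds a signed total of (src - ref) differences, each in
// [-255, 255]. A lane can absorb at most about 32767 / 255 ~= 128 differences
// before it may overflow, so the larger block sizes widen the partial sums to
// 32 bits with sum_to_32bit_avx2() before any further accumulation.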
91 :
92 : // handle 2048 pixels (32x64, 64x32)
93 0 : static INLINE void variance_final_2048_no_sum_avx2(__m256i vsse,
94 : uint32_t *const sse) {
95 0 : variance_final_from_32bit_no_sum_avx2(vsse, sse);
96 0 : }
97 :
98 20689000 : static INLINE void variance16_kernel_no_sum_avx2(
99 : const uint8_t *const src, const int32_t src_stride, const uint8_t *const ref,
100 : const int32_t ref_stride, __m256i *const sse) {
101 20689000 : const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
102 41378000 : const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
103 20689000 : const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
104 41378000 : const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
105 20689000 : const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
106 20689000 : const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
107 20689000 : variance_kernel_no_sum_avx2(s, r, sse);
108 20683500 : }
109 :
110 0 : static INLINE void variance32_kernel_no_sum_avx2(const uint8_t *const src,
111 : const uint8_t *const ref,
112 : __m256i *const sse) {
113 0 : const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
114 0 : const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
115 0 : variance_kernel_no_sum_avx2(s, r, sse);
116 0 : }
117 :
118 2589790 : static INLINE void variance16_no_sum_avx2(const uint8_t *src,
119 : const int32_t src_stride, const uint8_t *ref, const int32_t ref_stride,
120 : const int32_t h, __m256i *const vsse) {
121 23273100 : for (int32_t i = 0; i < h; i += 2) {
122 20679200 : variance16_kernel_no_sum_avx2(src, src_stride, ref, ref_stride, vsse);
123 20683300 : src += 2 * src_stride;
124 20683300 : ref += 2 * ref_stride;
125 : }
126 2593840 : }
127 :
128 0 : static INLINE void variance32_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
129 : const uint8_t *ref, const int32_t ref_stride,
130 : const int32_t h, __m256i *const vsse) {
131 0 : for (int32_t i = 0; i < h; i++) {
132 0 : variance32_kernel_no_sum_avx2(src, ref, vsse);
133 0 : src += src_stride;
134 0 : ref += ref_stride;
135 : }
136 0 : }
137 :
138 0 : static INLINE void variance64_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
139 : const uint8_t *ref, const int32_t ref_stride,
140 : const int32_t h, __m256i *const vsse) {
141 0 : for (int32_t i = 0; i < h; i++) {
142 0 : variance32_kernel_no_sum_avx2(src + 0, ref + 0, vsse);
143 0 : variance32_kernel_no_sum_avx2(src + 32, ref + 32, vsse);
144 0 : src += src_stride;
145 0 : ref += ref_stride;
146 : }
147 0 : }
148 :
149 0 : static INLINE void variance128_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
150 : const uint8_t *ref, const int32_t ref_stride,
151 : const int32_t h, __m256i *const vsse) {
152 0 : for (int32_t i = 0; i < h; i++) {
153 0 : variance32_kernel_no_sum_avx2(src + 0, ref + 0, vsse);
154 0 : variance32_kernel_no_sum_avx2(src + 32, ref + 32, vsse);
155 0 : variance32_kernel_no_sum_avx2(src + 64, ref + 64, vsse);
156 0 : variance32_kernel_no_sum_avx2(src + 96, ref + 96, vsse);
157 0 : src += src_stride;
158 0 : ref += ref_stride;
159 : }
160 0 : }
161 :
162 : #define AOM_VAR_NO_LOOP_NO_SUM_AVX2(bw, bh, bits, max_pixel) \
163 : void eb_aom_variance##bw##x##bh##_no_sum_avx2( \
164 : const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, \
165 : uint32_t *sse) { \
166 : __m256i vsse = _mm256_setzero_si256(); \
167 : variance##bw##_no_sum_avx2(src, src_stride, ref, ref_stride, bh, &vsse); \
168 : variance_final_##max_pixel##_no_sum_avx2(vsse, sse); \
169 : }
170 :
171 5179510 : AOM_VAR_NO_LOOP_NO_SUM_AVX2(16, 16, 8, 512);
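// For reference, a sketch of what the instantiation above expands to (the bits
// argument is unused by this no-sum macro; max_pixel selects the final
// reduction helper):
//
//   void eb_aom_variance16x16_no_sum_avx2(const uint8_t *src, int32_t src_stride,
//                                         const uint8_t *ref, int32_t ref_stride,
//                                         uint32_t *sse) {
//       __m256i vsse = _mm256_setzero_si256();
//       variance16_no_sum_avx2(src, src_stride, ref, ref_stride, 16, &vsse);
//       variance_final_512_no_sum_avx2(vsse, sse);
//   }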
172 :
173 2589580 : uint32_t eb_aom_mse16x16_avx2(const uint8_t *src, int32_t src_stride,
174 : const uint8_t *ref, int32_t ref_stride,
175 : uint32_t *sse) {
176 2589580 : eb_aom_variance16x16_no_sum_avx2(src, src_stride, ref, ref_stride, sse);
177 2589600 : return *sse;
178 : }
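// Illustrative usage sketch (hypothetical buffers): despite the name, the MSE
// helper returns the raw sum of squared differences over the 16x16 block; it
// neither removes the mean nor divides by the 256-pixel count, which is why the
// cheaper *_no_sum_* kernels (no sum accumulator) are sufficient here.
//
//   uint32_t sse;
//   const uint32_t block_sse =
//       eb_aom_mse16x16_avx2(src_blk, src_stride, pred_blk, pred_stride, &sse);
//   // block_sse == sse == sum over 256 pixels of (src - pred)^2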
179 :
180 0 : void highbd_variance64_avx2(const uint8_t *a8, int32_t a_stride,
181 : const uint8_t *b8, int32_t b_stride, int32_t w, int32_t h,
182 : uint64_t *sse) {
183 0 : const uint8_t *a = a8;
184 0 : const uint8_t *b = b8;
185 :
186 0 : if (w == 4) {
187 0 : __m128i vsse = _mm_setzero_si128();
188 : uint32_t tsse;
189 0 : variance4_no_sum_sse2(a8, a_stride, b8, b_stride, h, &vsse);
190 0 : variance_final_128_pel_no_sum_sse2(vsse, &tsse);
191 0 : *sse = tsse;
192 : }
193 0 : else if (w == 8) {
194 0 : __m128i vsse = _mm_setzero_si128();
195 : uint32_t tsse;
196 0 : variance8_no_sum_sse2(a8, a_stride, b8, b_stride, h, &vsse);
197 0 : variance_final_256_pel_no_sum_sse2(vsse, &tsse);
198 0 : *sse = tsse;
199 : }
200 0 : else if (w == 16) {
201 0 : __m256i vsse = _mm256_setzero_si256();
202 : uint32_t tsse;
203 0 : variance16_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
204 0 : variance_final_1024_no_sum_avx2(vsse, &tsse);
205 0 : *sse = tsse;
206 : }
207 0 : else if (w == 32) {
208 0 : if (h <= 64) {
209 0 : __m256i vsse = _mm256_setzero_si256();
210 : uint32_t tsse;
211 0 : variance32_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
212 0 : variance_final_2048_no_sum_avx2(vsse, &tsse);
213 0 : *sse = tsse;
214 : }
215 : else {
216 0 : __m256i vsse = _mm256_setzero_si256();
217 : uint32_t tsse;
218 0 : variance32_no_sum_avx2(a8, a_stride, b8, b_stride, 64, &vsse);
219 0 : variance32_no_sum_avx2(a8 + 64 * a_stride, a_stride, b8 + 64 * b_stride,
220 : b_stride, h - 64, &vsse);
221 0 : variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
222 0 : *sse = tsse;
223 : }
224 : }
225 0 : else if (w == 64) {
226 0 : if (h <= 32) {
227 0 : __m256i vsse = _mm256_setzero_si256();
228 : uint32_t tsse;
229 0 : variance64_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
230 0 : variance_final_2048_no_sum_avx2(vsse, &tsse);
231 0 : *sse = tsse;
232 : }
233 : else {
234 0 : __m256i vsse = _mm256_setzero_si256();
235 : uint32_t tsse;
236 :
237 0 : int32_t i = 0;
238 : do {
239 0 : variance64_no_sum_avx2(a8, a_stride, b8, b_stride, 32, &vsse);
240 0 : a8 += 32 * a_stride;
241 0 : b8 += 32 * b_stride;
242 0 : } while (++i < (h / 32));
243 0 : variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
244 0 : *sse = tsse;
245 : }
246 : }
247 0 : else if (w == 128) {
248 0 : __m256i vsse = _mm256_setzero_si256();
249 : uint32_t tsse;
250 :
251 0 : int32_t i = 0;
252 : do {
253 0 : variance128_no_sum_avx2(a8, a_stride, b8, b_stride, 16, &vsse);
254 0 : a8 += 16 * a_stride;
255 0 : b8 += 16 * b_stride;
256 0 : } while (++i < (h / 16));
257 0 : variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
258 0 : *sse = tsse;
259 : }
260 : else
261 0 : highbd_variance64_c(a, a_stride, b, b_stride, w, h, sse);
262 : #ifdef _WIN32
263 :     // Add this redundant instruction to work around a Visual Studio compiler
264 :     // bug that stores a 64-bit intermediate result into *sse in the
265 :     // variance_final_from_32bit_*_avx2() helpers instead of the intended
266 :     // 32-bit result. AND *sse with a 32-bit mask to restore the correct value.
267 : // No overflow happens here, since for the largest 8-bit 128x128 block,
268 : // *sse is at most 255 * 255 * 128 * 128, i.e., 0x000000003F804000L.
269 : *sse &= 0x00000000FFFFFFFFL;
270 : #endif
271 0 : }
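// Illustrative usage sketch (hypothetical buffers and dimensions): as written
// here, the helper above treats a8/b8 as ordinary 8-bit buffers; it dispatches
// on the block width, reuses the no-sum kernels above, and widens the sum of
// squared differences into a 64-bit output.
//
//   uint64_t sse64;
//   highbd_variance64_avx2(src8, src_stride, ref8, ref_stride, 64, 64, &sse64);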
272 :
273 815633 : static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
274 : unsigned int *const sse) {
275 : // extract the low lane and add it to the high lane
276 815633 : const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
277 :
278 : // unpack sse and sum registers and add
279 815557 : const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
280 815557 : const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
281 815557 : const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
282 :
283 : // perform the final summation and extract the results
284 1631110 : const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
285 815557 : *((int *)sse) = _mm_cvtsi128_si32(res);
286 815557 : return _mm_extract_epi32(res, 1);
287 : }
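// Illustrative scalar reference (hypothetical helper, not part of the original
// file): interleaving the folded SSE lanes with vsum lets one shuffle/add
// sequence reduce both accumulators at once; the SSE total lands in *sse and
// the sum total is returned.
static INLINE int final_from_32bit_sum_scalar_sketch(const int32_t sse_lanes[8],
                                                     const int32_t sum_lanes[4],
                                                     unsigned int *const sse) {
    int64_t sse_total = 0;
    int64_t sum_total = 0;
    for (int i = 0; i < 8; i++) sse_total += sse_lanes[i];
    for (int i = 0; i < 4; i++) sum_total += sum_lanes[i];
    *sse = (unsigned int)sse_total;
    return (int)sum_total;
}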
288 :
289 : // handle blocks with at most 512 pixels
290 734495 : static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
291 : unsigned int *const sse) {
292 : // extract the low lane and add it to the high lane
293 734495 : const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
294 1468740 : const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
295 734371 : const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
296 734371 : return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
297 : }
298 :
299 : // handle 1024 pixels (32x32, 16x64, 64x16)
300 59596 : static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
301 : unsigned int *const sse) {
302 : // extract the low lane and add it to the high lane
303 59596 : const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
304 : const __m128i vsum_64 =
305 119188 : _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
306 59594 : _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
307 59594 : return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
308 : }
309 :
310 15284 : static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
311 : unsigned int *const sse) {
312 15284 : vsum = sum_to_32bit_avx2(vsum);
313 15284 : const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
314 15284 : return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
315 : }
316 :
317 9168220 : static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
318 : __m256i *const sse,
319 : __m256i *const sum) {
320 9168220 :     const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1)
321 :
322 : // unpack into pairs of source and reference values
323 9168220 : const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
324 9168220 : const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
325 :
326 : // subtract adjacent elements using src*1 + ref*-1
327 9168220 : const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
328 9168220 : const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
329 9168220 : const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
330 9168220 : const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
331 :
332 : // add to the running totals
333 18336400 : *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
334 9168220 : *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
335 9168220 : }
336 :
337 5278350 : static INLINE void variance16_kernel_avx2(
338 : const uint8_t *const src, const int src_stride, const uint8_t *const ref,
339 : const int ref_stride, __m256i *const sse, __m256i *const sum) {
340 5278350 : const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
341 10556700 : const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
342 5278350 : const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
343 10556700 : const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
344 5278350 : const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
345 5278350 : const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
346 5278350 : variance_kernel_avx2(s, r, sse, sum);
347 5298790 : }
348 :
349 3901550 : static INLINE void variance32_kernel_avx2(const uint8_t *const src,
350 : const uint8_t *const ref,
351 : __m256i *const sse,
352 : __m256i *const sum) {
353 3901550 : const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
354 3901550 : const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
355 3901550 : variance_kernel_avx2(s, r, sse, sum);
356 3905580 : }
357 :
358 690824 : static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
359 : const uint8_t *ref, const int ref_stride,
360 : const int h, __m256i *const vsse,
361 : __m256i *const vsum) {
362 690824 : *vsum = _mm256_setzero_si256();
363 :
364 5987560 : for (int i = 0; i < h; i += 2) {
365 5282870 : variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
366 5296740 : src += 2 * src_stride;
367 5296740 : ref += 2 * ref_stride;
368 : }
369 704690 : }
370 :
371 98455 : static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
372 : const uint8_t *ref, const int ref_stride,
373 : const int h, __m256i *const vsse,
374 : __m256i *const vsum) {
375 98455 : *vsum = _mm256_setzero_si256();
376 :
377 2341720 : for (int i = 0; i < h; i++) {
378 2243270 : variance32_kernel_avx2(src, ref, vsse, vsum);
379 2243260 : src += src_stride;
380 2243260 : ref += ref_stride;
381 : }
382 98448 : }
383 :
384 31984 : static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
385 : const uint8_t *ref, const int ref_stride,
386 : const int h, __m256i *const vsse,
387 : __m256i *const vsum) {
388 31984 : *vsum = _mm256_setzero_si256();
389 :
390 865303 : for (int i = 0; i < h; i++) {
391 833222 : variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
392 834068 : variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
393 833319 : src += src_stride;
394 833319 : ref += ref_stride;
395 : }
396 32081 : }
397 :
398 0 : static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
399 : const uint8_t *ref, const int ref_stride,
400 : const int h, __m256i *const vsse,
401 : __m256i *const vsum) {
402 0 : *vsum = _mm256_setzero_si256();
403 :
404 0 : for (int i = 0; i < h; i++) {
405 0 : variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
406 0 : variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
407 0 : variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
408 0 : variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
409 0 : src += src_stride;
410 0 : ref += ref_stride;
411 : }
412 0 : }
413 :
414 : #define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
415 : unsigned int eb_aom_variance##bw##x##bh##_avx2( \
416 : const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
417 : unsigned int *sse) { \
418 : __m256i vsse = _mm256_setzero_si256(); \
419 : __m256i vsum; \
420 : variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
421 : const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \
422 : return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
423 : }
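// Illustrative note (not part of the original file): the return expression above
// implements the single-pass variance identity over N = bw * bh = 1 << bits
// pixels,
//
//   variance = SSE - (sum * sum) / N,
//
// with the division done as a right shift because N is a power of two. For
// example, a 16x16 block (bits = 8) where every difference is exactly 3 gives
//   SSE = 256 * 9 = 2304, sum = 256 * 3 = 768,
//   variance = 2304 - ((768 * 768) >> 8) = 2304 - 2304 = 0,
// as expected for a constant offset.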
424 :
425 157744 : AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
426 109710 : AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
427 1046330 : AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
428 42842 : AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
429 25292 : AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
430 :
431 68046 : AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
432 42958 : AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
433 70076 : AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
434 15832 : AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
435 :
436 23814 : AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
437 14736 : AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
438 :
439 : #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
440 : unsigned int eb_aom_variance##bw##x##bh##_avx2( \
441 : const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
442 : unsigned int *sse) { \
443 : __m256i vsse = _mm256_setzero_si256(); \
444 : __m256i vsum = _mm256_setzero_si256(); \
445 : for (int i = 0; i < (bh / uh); i++) { \
446 : __m256i vsum16; \
447 : variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
448 : &vsum16); \
449 : vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
450 : src += uh * src_stride; \
451 : ref += uh * ref_stride; \
452 : } \
453 : const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
454 : const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
455 : return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
456 : }
457 :
458 31746 : AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32)
459 0 : AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32)
460 0 : AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16)
461 0 : AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16)
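// Illustrative note (not part of the original file): the looped variant processes
// the block uh rows at a time so that each strip stays within the safe range of
// the 16-bit sum lanes (roughly 128 accumulated differences per lane, see the
// note at sum_to_32bit_avx2() above), widening the per-strip sums to 32 bits
// between strips. The trailing comments give the strip shape and count, e.g.
// AOM_VAR_LOOP_AVX2(64, 128, 13, 32) handles a 64x128 block as four 64x32 strips.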