Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbDefinitions.h"
7 : #include <stdio.h>
8 : #include <immintrin.h>
9 : #include "EbPictureOperators_AVX2.h"
10 : #include "EbPictureOperators_Inline_AVX2.h"
11 : #include "EbPictureOperators_SSE2.h"
12 : #include "EbMemory_AVX2.h"
13 : #include "synonyms.h"
14 :
15 : #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
16 : _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
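/* Editor's note: the macro above builds a 256-bit vector from two 128-bit
 * halves (hi -> bits 255..128, lo -> bits 127..0); it is presumably defined
 * locally for toolchains that do not provide _mm256_set_m128i. */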
17 :
18 0 : void compressed_packmsb_avx2_intrin(
19 : uint8_t *in8_bit_buffer,
20 : uint32_t in8_stride,
21 : uint8_t *inn_bit_buffer,
22 : uint16_t *out16_bit_buffer,
23 : uint32_t inn_stride,
24 : uint32_t out_stride,
25 : uint32_t width,
26 : uint32_t height)
27 : {
28 : uint32_t y;
29 :
30 0 : if (width == 32)
31 : {
32 : __m256i inNBit, in8Bit, inNBitStride, in8BitStride, concat0, concat1, concat2, concat3;
33 : __m256i out0_15, out16_31, out_s0_s15, out_s16_s31;
34 :
35 : __m128i in2Bit, ext0, ext1, ext2, ext3, ext01, ext23, ext01h, ext23h, ext0_15, ext16_31, ext32_47, ext48_63;
36 : __m128i msk0;
37 :
38 0 : msk0 = _mm_set1_epi8((int8_t)0xC0);//1100.0000
39 :
40 : //processing 2 lines for chroma
41 0 : for (y = 0; y < height; y += 2)
42 : {
43 : //2 lines stored in 1D format - could be replaced by 2 _mm_loadl_epi64
44 0 : in2Bit = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)));
45 :
46 0 : ext0 = _mm_and_si128(in2Bit, msk0);
47 0 : ext1 = _mm_and_si128(_mm_slli_epi16(in2Bit, 2), msk0);
48 0 : ext2 = _mm_and_si128(_mm_slli_epi16(in2Bit, 4), msk0);
49 0 : ext3 = _mm_and_si128(_mm_slli_epi16(in2Bit, 6), msk0);
50 :
51 0 : ext01 = _mm_unpacklo_epi8(ext0, ext1);
52 0 : ext23 = _mm_unpacklo_epi8(ext2, ext3);
53 0 : ext0_15 = _mm_unpacklo_epi16(ext01, ext23);
54 0 : ext16_31 = _mm_unpackhi_epi16(ext01, ext23);
55 :
56 0 : ext01h = _mm_unpackhi_epi8(ext0, ext1);
57 0 : ext23h = _mm_unpackhi_epi8(ext2, ext3);
58 0 : ext32_47 = _mm_unpacklo_epi16(ext01h, ext23h);
59 0 : ext48_63 = _mm_unpackhi_epi16(ext01h, ext23h);
60 :
61 0 : inNBit = _mm256_set_m128i(ext16_31, ext0_15);
62 0 : inNBitStride = _mm256_set_m128i(ext48_63, ext32_47);
63 :
64 0 : in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
65 0 : in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
66 :
67 : //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
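// Editor's note: unpacklo_epi8(inNBit, in8Bit) forms the 16-bit value
// (in8 << 8) | nbit, so the shift right by 6 below yields
// (in8 << 2) | (nbit >> 6), i.e. the 10-bit sample.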
68 0 : concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
69 0 : concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
70 0 : concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
71 0 : concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
72 :
73 : //Re-organize the packing for writing to the out buffer
74 0 : out0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
75 0 : out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
76 0 : out_s0_s15 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
77 0 : out_s16_s31 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
78 :
79 : _mm256_store_si256((__m256i*) out16_bit_buffer, out0_15);
80 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
81 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
82 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
83 :
84 0 : in8_bit_buffer += in8_stride << 1;
85 0 : inn_bit_buffer += inn_stride << 1;
86 0 : out16_bit_buffer += out_stride << 1;
87 : }
88 : }
89 0 : else if (width == 64)
90 : {
91 : __m256i inNBit, in8Bit, inNBit32, in8Bit32;
92 : __m256i concat0, concat1, concat2, concat3;
93 : __m256i out_0_15, out16_31, out32_47, out_48_63;
94 :
95 : __m128i in2Bit, ext0, ext1, ext2, ext3, ext01, ext23, ext01h, ext23h, ext0_15, ext16_31, ext32_47, ext48_63;
96 : __m128i msk;
97 :
98 0 : msk = _mm_set1_epi8((int8_t)0xC0);//1100.0000
99 :
100 : //One row per iter
101 0 : for (y = 0; y < height; y++)
102 : {
103 0 : in2Bit = _mm_loadu_si128((__m128i*)inn_bit_buffer);
104 :
105 0 : ext0 = _mm_and_si128(in2Bit, msk);
106 0 : ext1 = _mm_and_si128(_mm_slli_epi16(in2Bit, 2), msk);
107 0 : ext2 = _mm_and_si128(_mm_slli_epi16(in2Bit, 4), msk);
108 0 : ext3 = _mm_and_si128(_mm_slli_epi16(in2Bit, 6), msk);
109 :
110 0 : ext01 = _mm_unpacklo_epi8(ext0, ext1);
111 0 : ext23 = _mm_unpacklo_epi8(ext2, ext3);
112 0 : ext0_15 = _mm_unpacklo_epi16(ext01, ext23);
113 0 : ext16_31 = _mm_unpackhi_epi16(ext01, ext23);
114 :
115 0 : ext01h = _mm_unpackhi_epi8(ext0, ext1);
116 0 : ext23h = _mm_unpackhi_epi8(ext2, ext3);
117 0 : ext32_47 = _mm_unpacklo_epi16(ext01h, ext23h);
118 0 : ext48_63 = _mm_unpackhi_epi16(ext01h, ext23h);
119 :
120 0 : inNBit = _mm256_set_m128i(ext16_31, ext0_15);
121 0 : inNBit32 = _mm256_set_m128i(ext48_63, ext32_47);
122 :
123 0 : in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
124 0 : in8Bit32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + 32));
125 :
126 : //(outPixel | nBitPixel) concatenation
127 0 : concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
128 0 : concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
129 0 : concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit32, in8Bit32), 6);
130 0 : concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit32, in8Bit32), 6);
131 :
132 : //Re-organize the packing for writing to the out buffer
133 0 : out_0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
134 0 : out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
135 0 : out32_47 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
136 0 : out_48_63 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
137 :
138 : _mm256_store_si256((__m256i*) out16_bit_buffer, out_0_15);
139 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
140 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 32), out32_47);
141 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 48), out_48_63);
142 :
143 0 : in8_bit_buffer += in8_stride;
144 0 : inn_bit_buffer += inn_stride;
145 0 : out16_bit_buffer += out_stride;
146 : }
147 : }
148 0 : }
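/* Editor's note: reference-only scalar sketch (not part of the instrumented
 * source) of what compressed_packmsb_avx2_intrin computes, assuming the
 * compressed n-bit plane stores four 2-bit values per byte, most-significant
 * pair first, and the output is the 10-bit sample (in8 << 2) | 2-bit LSBs.
 * The helper name is hypothetical. */
static void compressed_packmsb_sketch_c(
    const uint8_t *in8_bit, uint32_t in8_stride,
    const uint8_t *inn_bit, uint32_t inn_stride,
    uint16_t *out16, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            // 2-bit LSBs live in byte x/4, bit pair selected by x%4 (MSB-first)
            uint8_t two_bit = (inn_bit[x >> 2] >> (6 - 2 * (x & 3))) & 0x3;
            out16[x] = (uint16_t)((in8_bit[x] << 2) | two_bit);
        }
        in8_bit += in8_stride;
        inn_bit += inn_stride;
        out16 += out_stride;
    }
}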
149 :
150 0 : void c_pack_avx2_intrin(
151 : const uint8_t *inn_bit_buffer,
152 : uint32_t inn_stride,
153 : uint8_t *in_compn_bit_buffer,
154 : uint32_t out_stride,
155 : uint8_t *local_cache,
156 : uint32_t width,
157 : uint32_t height)
158 : {
159 : uint32_t y;
160 :
161 0 : if (width == 32)
162 : {
163 : __m256i inNBit;
164 :
165 : __m256i ext0, ext1, ext2, ext3, ext0123, ext0123n, extp;
166 : __m256i msk0, msk1, msk2, msk3;
167 :
168 0 : msk0 = _mm256_set1_epi32(0x000000C0);//1100.0000
169 0 : msk1 = _mm256_set1_epi32(0x00000030);//0011.0000
170 0 : msk2 = _mm256_set1_epi32(0x0000000C);//0000.1100
171 0 : msk3 = _mm256_set1_epi32(0x00000003);//0000.0011
172 :
173 : //One row per iter
174 0 : for (y = 0; y < height; y++)
175 : {
176 0 : inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
177 :
178 0 : ext0 = _mm256_and_si256(inNBit, msk0);
179 0 : ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
180 0 : ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
181 0 : ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
182 :
183 0 : ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
184 :
185 0 : ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
186 :
187 0 : extp = _mm256_packus_epi32(ext0123, ext0123n);
188 0 : extp = _mm256_packus_epi16(extp, extp);
189 :
190 0 : _mm_storel_epi64((__m128i*) in_compn_bit_buffer, _mm256_castsi256_si128(extp));
191 0 : in_compn_bit_buffer += out_stride;
192 0 : inn_bit_buffer += inn_stride;
193 : }
194 : }
195 0 : else if (width == 64)
196 : {
197 : __m256i inNBit;
198 : __m256i ext0, ext1, ext2, ext3, ext0123, ext0123n, extp, extp1;
199 : __m256i msk0, msk1, msk2, msk3;
200 :
201 0 : msk0 = _mm256_set1_epi32(0x000000C0);//1100.0000
202 0 : msk1 = _mm256_set1_epi32(0x00000030);//0011.0000
203 0 : msk2 = _mm256_set1_epi32(0x0000000C);//0000.1100
204 0 : msk3 = _mm256_set1_epi32(0x00000003);//0000.0011
205 0 : if (height == 64)
206 : {
207 0 : uint8_t* localPtr = local_cache;
208 :
209 0 : for (y = 0; y < height; y++)
210 : {
211 0 : inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
212 :
213 0 : ext0 = _mm256_and_si256(inNBit, msk0);
214 0 : ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
215 0 : ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
216 0 : ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
217 :
218 0 : ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
219 :
220 0 : ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
221 :
222 0 : extp = _mm256_packus_epi32(ext0123, ext0123n);
223 0 : extp = _mm256_packus_epi16(extp, extp);
224 :
225 0 : inNBit = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
226 :
227 0 : ext0 = _mm256_and_si256(inNBit, msk0);
228 0 : ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
229 0 : ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
230 0 : ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
231 :
232 0 : ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
233 :
234 0 : ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
235 :
236 0 : extp1 = _mm256_packus_epi32(ext0123, ext0123n);
237 0 : extp1 = _mm256_packus_epi16(extp1, extp1);
238 :
239 0 : extp = _mm256_unpacklo_epi64(extp, extp1);
240 :
241 0 : _mm_storeu_si128((__m128i*) (localPtr + 16 * (y & 3)), _mm256_castsi256_si128(extp));
242 :
243 0 : if ((y & 3) == 3)
244 : {
245 0 : __m256i c0 = _mm256_loadu_si256((__m256i*)(localPtr));
246 0 : __m256i c1 = _mm256_loadu_si256((__m256i*)(localPtr + 32));
247 0 : _mm_storeu_si128((__m128i*) (in_compn_bit_buffer), _mm256_extractf128_si256(c0, 0));
248 0 : _mm_storeu_si128((__m128i*) (in_compn_bit_buffer + out_stride), _mm256_extractf128_si256(c0, 1));
249 0 : _mm_storeu_si128((__m128i*) (in_compn_bit_buffer + 2 * out_stride), _mm256_extractf128_si256(c1, 0));
250 0 : _mm_storeu_si128((__m128i*) (in_compn_bit_buffer + 3 * out_stride), _mm256_extractf128_si256(c1, 1));
251 0 : in_compn_bit_buffer += 4 * out_stride;
252 : }
253 :
254 0 : inn_bit_buffer += inn_stride;
255 : }
256 : }
257 : else {
258 : //One row per iter
259 0 : for (y = 0; y < height; y++)
260 : {
261 0 : inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
262 :
263 0 : ext0 = _mm256_and_si256(inNBit, msk0);
264 0 : ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
265 0 : ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
266 0 : ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
267 :
268 0 : ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
269 :
270 0 : ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
271 :
272 0 : extp = _mm256_packus_epi32(ext0123, ext0123n);
273 0 : extp = _mm256_packus_epi16(extp, extp);
274 :
275 0 : inNBit = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
276 :
277 0 : ext0 = _mm256_and_si256(inNBit, msk0);
278 0 : ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
279 0 : ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
280 0 : ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
281 :
282 0 : ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
283 :
284 0 : ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
285 :
286 0 : extp1 = _mm256_packus_epi32(ext0123, ext0123n);
287 0 : extp1 = _mm256_packus_epi16(extp1, extp1);
288 :
289 0 : extp = _mm256_unpacklo_epi64(extp, extp1);
290 :
291 0 : _mm_storeu_si128((__m128i*) in_compn_bit_buffer, _mm256_castsi256_si128(extp));
292 :
293 0 : in_compn_bit_buffer += out_stride;
294 :
295 0 : inn_bit_buffer += inn_stride;
296 : }
297 : }
298 : }
299 0 : }
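/* Editor's note: reference-only scalar sketch (not part of the instrumented
 * source). c_pack_avx2_intrin appears to take an n-bit plane whose two
 * significant bits sit in bits 7..6 of each byte and pack four of them into
 * one output byte, most-significant pair first. The helper name is
 * hypothetical. */
static void c_pack_sketch_c(
    const uint8_t *inn_bit, uint32_t inn_stride,
    uint8_t *out_comp, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x += 4) {
            // four 2-bit values -> one byte: pixel x in bits 7..6, pixel x+3 in bits 1..0
            out_comp[x >> 2] = (uint8_t)((inn_bit[x + 0] & 0xC0) |
                                         ((inn_bit[x + 1] >> 2) & 0x30) |
                                         ((inn_bit[x + 2] >> 4) & 0x0C) |
                                         ((inn_bit[x + 3] >> 6) & 0x03));
        }
        inn_bit += inn_stride;
        out_comp += out_stride;
    }
}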
300 :
301 0 : void eb_enc_msb_pack2d_avx2_intrin_al(
302 : uint8_t *in8_bit_buffer,
303 : uint32_t in8_stride,
304 : uint8_t *inn_bit_buffer,
305 : uint16_t *out16_bit_buffer,
306 : uint32_t inn_stride,
307 : uint32_t out_stride,
308 : uint32_t width,
309 : uint32_t height)
310 : {
311 : //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
312 :
313 : uint32_t y, x;
314 :
315 : __m128i out0, out1;
316 :
317 0 : if (width == 4)
318 : {
319 0 : for (y = 0; y < height; y += 2) {
320 0 : out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)inn_bit_buffer), _mm_cvtsi32_si128(*(uint32_t *)in8_bit_buffer)), 6);
321 0 : out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)(inn_bit_buffer + inn_stride)), _mm_cvtsi32_si128(*(uint32_t *)(in8_bit_buffer + in8_stride))), 6);
322 :
323 0 : _mm_storel_epi64((__m128i*) out16_bit_buffer, out0);
324 0 : _mm_storel_epi64((__m128i*) (out16_bit_buffer + out_stride), out1);
325 :
326 0 : in8_bit_buffer += in8_stride << 1;
327 0 : inn_bit_buffer += inn_stride << 1;
328 0 : out16_bit_buffer += out_stride << 1;
329 : }
330 : }
331 0 : else if (width == 8)
332 : {
333 0 : for (y = 0; y < height; y += 2) {
334 0 : out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)in8_bit_buffer)), 6);
335 0 : out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)), _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6);
336 :
337 : _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
338 0 : _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out1);
339 :
340 0 : in8_bit_buffer += in8_stride << 1;
341 0 : inn_bit_buffer += inn_stride << 1;
342 0 : out16_bit_buffer += out_stride << 1;
343 : }
344 : }
345 0 : else if (width == 16)
346 : {
347 : __m128i inNBit, in8Bit, inNBitStride, in8BitStride, out0, out1, out2, out3;
348 :
349 0 : for (y = 0; y < height; y += 2) {
350 0 : inNBit = _mm_loadu_si128((__m128i*)inn_bit_buffer);
351 0 : in8Bit = _mm_loadu_si128((__m128i*)in8_bit_buffer);
352 0 : inNBitStride = _mm_loadu_si128((__m128i*)(inn_bit_buffer + inn_stride));
353 0 : in8BitStride = _mm_loadu_si128((__m128i*)(in8_bit_buffer + in8_stride));
354 :
355 0 : out0 = _mm_srli_epi16(_mm_unpacklo_epi8(inNBit, in8Bit), 6);
356 0 : out1 = _mm_srli_epi16(_mm_unpackhi_epi8(inNBit, in8Bit), 6);
357 0 : out2 = _mm_srli_epi16(_mm_unpacklo_epi8(inNBitStride, in8BitStride), 6);
358 0 : out3 = _mm_srli_epi16(_mm_unpackhi_epi8(inNBitStride, in8BitStride), 6);
359 :
360 : _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
361 0 : _mm_storeu_si128((__m128i*) (out16_bit_buffer + 8), out1);
362 0 : _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out2);
363 0 : _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride + 8), out3);
364 :
365 0 : in8_bit_buffer += in8_stride << 1;
366 0 : inn_bit_buffer += inn_stride << 1;
367 0 : out16_bit_buffer += out_stride << 1;
368 : }
369 : }
370 0 : else if (width == 32)
371 : {
372 : __m256i inNBit, in8Bit, inNBitStride, in8BitStride, concat0, concat1, concat2, concat3;
373 : __m256i out0_15, out16_31, out_s0_s15, out_s16_s31;
374 :
375 0 : for (y = 0; y < height; y += 2) {
376 0 : inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
377 0 : in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
378 0 : inNBitStride = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride));
379 0 : in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
380 :
381 : //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
382 0 : concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
383 0 : concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
384 0 : concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
385 0 : concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
386 :
387 : //Re-organize the packing for writing to the out buffer
388 0 : out0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
389 0 : out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
390 0 : out_s0_s15 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
391 0 : out_s16_s31 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
392 :
393 : _mm256_store_si256((__m256i*) out16_bit_buffer, out0_15);
394 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
395 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
396 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
397 :
398 0 : in8_bit_buffer += in8_stride << 1;
399 : //inn_bit_buffer += inn_stride << 1;
400 0 : inn_bit_buffer += inn_stride * 2;
401 0 : out16_bit_buffer += out_stride << 1;
402 : }
403 : }
404 0 : else if (width == 64)
405 : {
406 : __m256i inNBit, in8Bit, inNBitStride, in8BitStride, inNBit32, in8Bit32, inNBitStride32, in8BitStride32;
407 : __m256i concat0, concat1, concat2, concat3, concat4, concat5, concat6, concat7;
408 : __m256i out_0_15, out16_31, out32_47, out_48_63, out_s0_s15, out_s16_s31, out_s32_s47, out_s48_s63;
409 :
410 0 : for (y = 0; y < height; y += 2) {
411 0 : inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
412 0 : in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
413 0 : inNBit32 = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
414 0 : in8Bit32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + 32));
415 0 : inNBitStride = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride));
416 0 : in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
417 0 : inNBitStride32 = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride + 32));
418 0 : in8BitStride32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride + 32));
419 : //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
420 0 : concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
421 0 : concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
422 0 : concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit32, in8Bit32), 6);
423 0 : concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit32, in8Bit32), 6);
424 0 : concat4 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
425 0 : concat5 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
426 0 : concat6 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride32, in8BitStride32), 6);
427 0 : concat7 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride32, in8BitStride32), 6);
428 :
429 : //Re-organize the packing for writing to the out buffer
430 0 : out_0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
431 0 : out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
432 0 : out32_47 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
433 0 : out_48_63 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
434 0 : out_s0_s15 = _mm256_inserti128_si256(concat4, _mm256_castsi256_si128(concat5), 1);
435 0 : out_s16_s31 = _mm256_inserti128_si256(concat5, _mm256_extracti128_si256(concat4, 1), 0);
436 0 : out_s32_s47 = _mm256_inserti128_si256(concat6, _mm256_castsi256_si128(concat7), 1);
437 0 : out_s48_s63 = _mm256_inserti128_si256(concat7, _mm256_extracti128_si256(concat6, 1), 0);
438 :
439 : _mm256_store_si256((__m256i*) out16_bit_buffer, out_0_15);
440 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
441 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 32), out32_47);
442 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + 48), out_48_63);
443 :
444 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
445 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
446 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 32), out_s32_s47);
447 0 : _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 48), out_s48_s63);
448 :
449 0 : in8_bit_buffer += in8_stride << 1;
450 : //inn_bit_buffer += inn_stride << 1;
451 0 : inn_bit_buffer += inn_stride * 2;
452 0 : out16_bit_buffer += out_stride << 1;
453 : }
454 : }
455 : else
456 : {
457 0 : uint32_t innStrideDiff = 2 * inn_stride;
458 0 : uint32_t in8StrideDiff = 2 * in8_stride;
459 0 : uint32_t outStrideDiff = 2 * out_stride;
460 0 : innStrideDiff -= width;
461 0 : in8StrideDiff -= width;
462 0 : outStrideDiff -= width;
463 :
464 0 : if (!(width & 7)) {
465 0 : for (x = 0; x < height; x += 2) {
466 0 : for (y = 0; y < width; y += 8) {
467 0 : out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)in8_bit_buffer)), 6);
468 0 : out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)), _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6);
469 :
470 : _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
471 0 : _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out1);
472 :
473 0 : in8_bit_buffer += 8;
474 0 : inn_bit_buffer += 8;
475 0 : out16_bit_buffer += 8;
476 : }
477 0 : in8_bit_buffer += in8StrideDiff;
478 0 : inn_bit_buffer += innStrideDiff;
479 0 : out16_bit_buffer += outStrideDiff;
480 : }
481 : }
482 : else {
483 0 : for (x = 0; x < height; x += 2) {
484 0 : for (y = 0; y < width; y += 4) {
485 0 : out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)inn_bit_buffer), _mm_cvtsi32_si128(*(uint32_t *)in8_bit_buffer)), 6);
486 0 : out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)(inn_bit_buffer + inn_stride)), _mm_cvtsi32_si128(*(uint32_t *)(in8_bit_buffer + in8_stride))), 6);
487 :
488 0 : _mm_storel_epi64((__m128i*) out16_bit_buffer, out0);
489 0 : _mm_storel_epi64((__m128i*) (out16_bit_buffer + out_stride), out1);
490 :
491 0 : in8_bit_buffer += 4;
492 0 : inn_bit_buffer += 4;
493 0 : out16_bit_buffer += 4;
494 : }
495 0 : in8_bit_buffer += in8StrideDiff;
496 0 : inn_bit_buffer += innStrideDiff;
497 0 : out16_bit_buffer += outStrideDiff;
498 : }
499 : }
500 : }
501 0 : }
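/* Editor's note: reference-only scalar sketch (not part of the instrumented
 * source). eb_enc_msb_pack2d combines an 8-bit plane with an n-bit plane
 * (two significant bits in bits 7..6 of each byte) into one 10-bit plane.
 * The helper name is hypothetical. */
static void msb_pack2d_sketch_c(
    const uint8_t *in8_bit, uint32_t in8_stride,
    const uint8_t *inn_bit, uint32_t inn_stride,
    uint16_t *out16, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++)
            // ((in8 << 8) | inn) >> 6 == (in8 << 2) | (inn >> 6)
            out16[x] = (uint16_t)((in8_bit[x] << 2) | (inn_bit[x] >> 6));
        in8_bit += in8_stride;
        inn_bit += inn_stride;
        out16 += out_stride;
    }
}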
502 :
503 : #define ALSTORE 1
504 : #define B256 1
505 :
506 0 : void unpack_avg_avx2_intrin(
507 : uint16_t *ref16_l0,
508 : uint32_t ref_l0_stride,
509 : uint16_t *ref16_l1,
510 : uint32_t ref_l1_stride,
511 : uint8_t *dst_ptr,
512 : uint32_t dst_stride,
513 : uint32_t width,
514 : uint32_t height)
515 : {
516 : uint32_t y;
517 : __m128i inPixel0, inPixel1;
518 :
519 0 : if (width == 4)
520 : {
521 : __m128i out8_0_U8_L0, out8_0_U8_L1;
522 : __m128i avg8_0_U8;
523 :
524 0 : for (y = 0; y < height; y += 2)
525 : {
526 : //--------
527 : //Line One
528 : //--------
529 :
530 : //List0
531 0 : inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l0);
532 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
533 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
534 :
535 : //List1
536 0 : inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l1);
537 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
538 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
539 :
540 : //AVG
541 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
542 :
543 0 : *(uint32_t*)dst_ptr = _mm_cvtsi128_si32(avg8_0_U8);
544 :
545 : //--------
546 : //Line Two
547 : //--------
548 :
549 : //List0
550 0 : inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l0 + ref_l0_stride));
551 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
552 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
553 :
554 : //List1
555 :
556 0 : inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l1 + ref_l1_stride));
557 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
558 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
559 :
560 : //AVG
561 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
562 :
563 0 : *(uint32_t*)(dst_ptr + dst_stride) = _mm_cvtsi128_si32(avg8_0_U8);
564 :
565 0 : dst_ptr += 2 * dst_stride;
566 0 : ref16_l0 += 2 * ref_l0_stride;
567 0 : ref16_l1 += 2 * ref_l1_stride;
568 : }
569 : }
570 0 : else if (width == 8)
571 : {
572 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
573 : __m128i avg8_0_U8, avg8_2_U8;
574 :
575 0 : for (y = 0; y < height; y += 2)
576 : {
577 : //--------
578 : //Line One
579 : //--------
580 :
581 : //List0
582 :
583 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
584 :
585 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
586 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
587 :
588 : //List1
589 :
590 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
591 :
592 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
593 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
594 :
595 : //AVG
596 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
597 :
598 0 : _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
599 :
600 : //--------
601 : //Line Two
602 : //--------
603 :
604 : //List0
605 :
606 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
607 :
608 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
609 0 : out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
610 :
611 : //List1
612 :
613 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
614 :
615 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
616 0 : out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
617 :
618 : //AVG
619 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
620 :
621 0 : _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
622 :
623 0 : dst_ptr += 2 * dst_stride;
624 0 : ref16_l0 += 2 * ref_l0_stride;
625 0 : ref16_l1 += 2 * ref_l1_stride;
626 : }
627 : }
628 0 : else if (width == 16)
629 : {
630 : __m128i inPixel4, inPixel5;
631 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
632 : __m128i avg8_0_U8, avg8_2_U8;
633 :
634 0 : for (y = 0; y < height; y += 2)
635 : {
636 : //--------
637 : //Line One
638 : //--------
639 :
640 : //List0
641 :
642 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
643 0 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
644 :
645 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
646 :
647 : //List1
648 :
649 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
650 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
651 :
652 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
653 :
654 : //AVG
655 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
656 : #if ALSTORE
657 : _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
658 : #else
659 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
660 : #endif
661 :
662 : //--------
663 : //Line Two
664 : //--------
665 :
666 : //List0
667 :
668 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
669 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
670 :
671 0 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
672 :
673 : //List1
674 :
675 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
676 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
677 :
678 0 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
679 :
680 : //AVG
681 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
682 : #if ALSTORE
683 0 : _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
684 : #else
685 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
686 : #endif
687 0 : dst_ptr += 2 * dst_stride;
688 0 : ref16_l0 += 2 * ref_l0_stride;
689 0 : ref16_l1 += 2 * ref_l1_stride;
690 : }
691 : }
692 0 : else if (width == 32)
693 : {
694 : #if B256
695 : __m256i inVal16b_0, inVal16b_1;
696 : __m256i data8b_32_0_L0, data8b_32_0_L1;
697 : __m256i avg8b_32_0;
698 : #else
699 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
700 : __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
701 : __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
702 : __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
703 : #endif
704 :
705 0 : for (y = 0; y < height; y += 2)
706 : {
707 : #if B256
708 : //--------
709 : //Line One
710 : //--------
711 :
712 : //List0
713 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
714 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
715 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
716 : //List1
717 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
718 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
719 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
720 :
721 : //Avg
722 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
723 :
724 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
725 :
726 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
727 :
728 : //--------
729 : //Line Two
730 : //--------
731 : //List0
732 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride));
733 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride + 16));
734 :
735 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
736 :
737 : //List1
738 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride));
739 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride + 16));
740 :
741 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
742 :
743 : //Avg
744 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
745 :
746 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
747 :
748 0 : _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), avg8b_32_0);
749 :
750 : #else
751 : //--------
752 : //Line One
753 : //--------
754 :
755 : //List0
756 :
757 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
758 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
759 : inPixel2 = _mm_loadu_si128((__m128i*) (ref16_l0 + 16));
760 : inPixel3 = _mm_loadu_si128((__m128i*) (ref16_l0 + 24));
761 :
762 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
763 : out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
764 :
765 : //List1
766 :
767 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
768 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
769 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
770 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
771 :
772 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
773 : out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
774 :
775 : //AVG
776 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
777 : avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
778 : #if ALSTORE
779 : _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
780 : _mm_store_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
781 : #else
782 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
783 : _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
784 : #endif
785 :
786 : //--------
787 : //Line Two
788 : //--------
789 :
790 : //List0
791 :
792 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
793 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
794 : inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 16));
795 : inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 24));
796 :
797 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
798 : out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
799 :
800 : //List1
801 :
802 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
803 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
804 : inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 16));
805 : inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 24));
806 :
807 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
808 : out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
809 :
810 : //AVG
811 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
812 : avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
813 : #if ALSTORE
814 : _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
815 : _mm_store_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
816 : #else
817 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
818 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
819 : #endif
820 :
821 : #endif
822 0 : dst_ptr += 2 * dst_stride;
823 0 : ref16_l0 += 2 * ref_l0_stride;
824 0 : ref16_l1 += 2 * ref_l1_stride;
825 : }
826 : }
827 0 : else if (width == 64)
828 : {
829 : #if B256
830 : __m256i inVal16b_0, inVal16b_1, inVal16b_2, inVal16b_3;
831 : __m256i data8b_32_0_L0, data8b_32_1_L0, data8b_32_0_L1, data8b_32_1_L1;
832 : __m256i avg8b_32_0, avg8b_32_1;
833 : #else
834 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
835 : __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
836 : __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
837 : __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
838 :
839 : #endif
840 :
841 0 : for (y = 0; y < height; ++y)
842 : {
843 : #if B256 // _mm256_lddqu_si256
844 :
845 : //List0
846 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
847 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
848 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
849 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
850 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
851 0 : data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
852 : //List1
853 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
854 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
855 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
856 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
857 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
858 0 : data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
859 :
860 : //Avg
861 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
862 0 : avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
863 :
864 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
865 0 : avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
866 :
867 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
868 0 : _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
869 : #else
870 : //List0
871 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
872 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l0 + 8));
873 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l0 + 16));
874 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l0 + 24));
875 : inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l0 + 32));
876 : inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l0 + 40));
877 : inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l0 + 48));
878 : inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l0 + 56));
879 :
880 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
881 : out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
882 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
883 : out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
884 :
885 : //List1
886 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
887 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
888 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
889 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
890 : inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l1 + 32));
891 : inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l1 + 40));
892 : inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l1 + 48));
893 : inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l1 + 56));
894 :
895 : //Note: the old version used _mm_and_si128 to mask off the MSBs of the pixels
896 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
897 : out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
898 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
899 : out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
900 :
901 : //AVG
902 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
903 : avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
904 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
905 : avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
906 : #if ALSTORE
907 : _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
908 : _mm_store_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
909 : _mm_store_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
910 : _mm_store_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
911 : #else
912 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
913 : _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
914 : _mm_storeu_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
915 : _mm_storeu_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
916 : #endif
917 :
918 : #endif
919 0 : dst_ptr += dst_stride;
920 0 : ref16_l0 += ref_l0_stride;
921 0 : ref16_l1 += ref_l1_stride;
922 : }
923 : }
924 :
925 0 : return;
926 : }
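/* Editor's note: reference-only scalar sketch (not part of the instrumented
 * source). unpack_avg converts two 10-bit reference blocks to 8 bits (>> 2,
 * assuming the values fit 8 bits after the shift) and writes their rounded
 * average. The helper name is hypothetical. */
static void unpack_avg_sketch_c(
    const uint16_t *ref_l0, uint32_t l0_stride,
    const uint16_t *ref_l1, uint32_t l1_stride,
    uint8_t *dst, uint32_t dst_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            uint32_t p0 = ref_l0[x] >> 2;
            uint32_t p1 = ref_l1[x] >> 2;
            dst[x] = (uint8_t)((p0 + p1 + 1) >> 1); // same rounding as _mm_avg_epu8
        }
        ref_l0 += l0_stride;
        ref_l1 += l1_stride;
        dst += dst_stride;
    }
}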
927 :
928 0 : int32_t sum_residual8bit_avx2_intrin(
929 : int16_t * in_ptr,
930 : uint32_t size,
931 : uint32_t stride_in)
932 : {
933 : int32_t sumBlock;
934 :
935 : __m128i in0, in1, in01, in2, in3, in23, sum, sumL, sumH;
936 : __m256i sum0, sum1, sum2, sum3, sum0L, sum0H, sumT, sum01, sumTPerm;
937 : uint32_t row_index;
938 :
939 : //Assumption: 9-bit or 11-bit residual data. For bigger block sizes or bit depths, re-assess the dynamic range of the internal calculation
940 :
941 0 : if (size == 4) { //SSSE3
942 :
943 0 : __m128i zer = _mm_setzero_si128();
944 :
945 0 : in0 = _mm_loadl_epi64((__m128i*)in_ptr);
946 0 : in1 = _mm_loadl_epi64((__m128i*)(in_ptr + stride_in));
947 0 : in1 = _mm_shuffle_epi32(in1, 0x4A); //01.00.10.10
948 0 : in01 = _mm_or_si128(in1, in0);
949 :
950 0 : in2 = _mm_loadl_epi64((__m128i*)(in_ptr + 2 * stride_in));
951 0 : in3 = _mm_loadl_epi64((__m128i*)(in_ptr + 3 * stride_in));
952 0 : in3 = _mm_shuffle_epi32(in3, 0x4A); //01.00.10.10
953 0 : in23 = _mm_or_si128(in3, in2);
954 :
955 0 : sum = _mm_add_epi16(in01, in23);
956 0 : sum = _mm_hadd_epi16(sum, zer);
957 0 : sum = _mm_hadd_epi16(sum, zer);
958 0 : sum = _mm_hadd_epi16(sum, zer);
959 :
960 0 : sum = _mm_cvtepi16_epi32(sum);
961 0 : sumBlock = _mm_cvtsi128_si32(sum);
962 :
963 0 : return sumBlock;
964 : }
965 0 : else if (size == 8) {//SSSE3
966 :
967 0 : __m128i zer = _mm_setzero_si128();
968 :
969 0 : sum = _mm_add_epi16(_mm_loadu_si128((__m128i*)(in_ptr + 0 * stride_in)), _mm_loadu_si128((__m128i*)(in_ptr + 1 * stride_in)));
970 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 2 * stride_in)));
971 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 3 * stride_in)));
972 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 4 * stride_in)));
973 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 5 * stride_in)));
974 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 6 * stride_in)));
975 0 : sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 7 * stride_in)));
976 :
977 0 : sum = _mm_hadd_epi16(sum, zer);
978 0 : sum = _mm_hadd_epi16(sum, zer);
979 : // The sum is 16-bit; for negative values we sign-extend to 32-bit so that
980 : // the extraction to int32_t below is correct.
981 0 : sum = _mm_cvtepi16_epi32(sum);
982 0 : sum = _mm_hadd_epi32(sum, zer);
983 :
984 0 : sumBlock = _mm_cvtsi128_si32(sum);
985 :
986 0 : return sumBlock;
987 : }
988 0 : else if (size == 16) {//AVX2
989 :
990 0 : sum0 = _mm256_add_epi16(_mm256_loadu_si256((__m256i *)(in_ptr + 0 * stride_in)), _mm256_loadu_si256((__m256i *)(in_ptr + 1 * stride_in)));
991 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 2 * stride_in)));
992 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 3 * stride_in)));
993 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 4 * stride_in)));
994 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 5 * stride_in)));
995 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 6 * stride_in)));
996 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 7 * stride_in)));
997 :
998 0 : in_ptr += 8 * stride_in;
999 0 : sum1 = _mm256_add_epi16(_mm256_loadu_si256((__m256i *)(in_ptr + 0 * stride_in)), _mm256_loadu_si256((__m256i *)(in_ptr + 1 * stride_in)));
1000 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 2 * stride_in)));
1001 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 3 * stride_in)));
1002 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 4 * stride_in)));
1003 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 5 * stride_in)));
1004 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 6 * stride_in)));
1005 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 7 * stride_in)));
1006 :
1007 0 : sum01 = _mm256_add_epi16(sum0, sum1);
1008 :
1009 : //go from 16bit to 32bit (to support big values)
1010 0 : sumL = _mm256_castsi256_si128(sum01);
1011 0 : sumH = _mm256_extracti128_si256(sum01, 1);
1012 0 : sum0L = _mm256_cvtepi16_epi32(sumL);
1013 0 : sum0H = _mm256_cvtepi16_epi32(sumH);
1014 :
1015 0 : sumT = _mm256_add_epi32(sum0L, sum0H);
1016 :
1017 0 : sumT = _mm256_hadd_epi32(sumT, sumT);
1018 0 : sumT = _mm256_hadd_epi32(sumT, sumT);
1019 0 : sumTPerm = _mm256_permute4x64_epi64(sumT, 2); //00.00.00.10
1020 0 : sumT = _mm256_add_epi32(sumT, sumTPerm);
1021 :
1022 0 : sum = _mm256_castsi256_si128(sumT);
1023 0 : sumBlock = _mm_cvtsi128_si32(sum);
1024 :
1025 0 : return sumBlock;
1026 : }
1027 0 : else if (size == 32) {//AVX2
1028 0 : int16_t *inPtrTemp = in_ptr;
1029 :
1030 0 : sum0 = sum1 = sum2 = sum3 = _mm256_setzero_si256();
1031 0 : for (row_index = 0; row_index < size; row_index += 2) { // Parse every two rows
1032 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(inPtrTemp)));
1033 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(inPtrTemp + 16)));
1034 0 : inPtrTemp += stride_in;
1035 0 : sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i *)(inPtrTemp)));
1036 0 : sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i *)(inPtrTemp + 16)));
1037 0 : inPtrTemp += stride_in;
1038 : }
1039 : //go from 16bit to 32bit (to support big values)
1040 0 : sumL = _mm256_castsi256_si128(sum0);
1041 0 : sumH = _mm256_extracti128_si256(sum0, 1);
1042 0 : sum0L = _mm256_cvtepi16_epi32(sumL);
1043 0 : sum0H = _mm256_cvtepi16_epi32(sumH);
1044 0 : sumT = _mm256_add_epi32(sum0L, sum0H);
1045 :
1046 0 : sumL = _mm256_castsi256_si128(sum1);
1047 0 : sumH = _mm256_extracti128_si256(sum1, 1);
1048 0 : sum0L = _mm256_cvtepi16_epi32(sumL);
1049 0 : sum0H = _mm256_cvtepi16_epi32(sumH);
1050 0 : sumT = _mm256_add_epi32(sumT, sum0L);
1051 0 : sumT = _mm256_add_epi32(sumT, sum0H);
1052 :
1053 0 : sumL = _mm256_castsi256_si128(sum2);
1054 0 : sumH = _mm256_extracti128_si256(sum2, 1);
1055 0 : sum0L = _mm256_cvtepi16_epi32(sumL);
1056 0 : sum0H = _mm256_cvtepi16_epi32(sumH);
1057 0 : sumT = _mm256_add_epi32(sumT, sum0L);
1058 0 : sumT = _mm256_add_epi32(sumT, sum0H);
1059 :
1060 0 : sumL = _mm256_castsi256_si128(sum3);
1061 0 : sumH = _mm256_extracti128_si256(sum3, 1);
1062 0 : sum0L = _mm256_cvtepi16_epi32(sumL);
1063 0 : sum0H = _mm256_cvtepi16_epi32(sumH);
1064 0 : sumT = _mm256_add_epi32(sumT, sum0L);
1065 0 : sumT = _mm256_add_epi32(sumT, sum0H);
1066 :
1067 0 : sumT = _mm256_hadd_epi32(sumT, sumT);
1068 0 : sumT = _mm256_hadd_epi32(sumT, sumT);
1069 0 : sumTPerm = _mm256_permute4x64_epi64(sumT, 2); //00.00.00.10
1070 0 : sumT = _mm256_add_epi32(sumT, sumTPerm);
1071 :
1072 0 : sum = _mm256_castsi256_si128(sumT);
1073 0 : sumBlock = _mm_cvtsi128_si32(sum);
1074 :
1075 0 : return sumBlock;
1076 : }
1077 :
1078 0 : else if (size == 64) {//AVX2
1079 : // Up to 11 bits of valid residual data is fine; if the residual exceeds 11 bits, the following code needs to be rewritten.
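// Editor's note: in this 64x64 path each 16-bit lane of sum0..sum7 accumulates
// 32 samples (2 at init plus 2 per loop iteration over 15 iterations); with
// residual magnitudes up to 1023 the running sum stays within [-32736, 32736],
// which is why the epi16 accumulation is safe for 11-bit data.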
1080 0 : int16_t* inPtrTemp = in_ptr;
1081 : __m256i sum4, sum5, sum6, sum7;
1082 :
1083 0 : const __m256i ones = _mm256_set1_epi16(1);
1084 0 : const __m256i zeros = _mm256_setzero_si256();
1085 :
1086 0 : sum0 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
1087 0 : sum1 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
1088 0 : sum2 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
1089 0 : sum3 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
1090 :
1091 0 : inPtrTemp += 2 * stride_in;
1092 :
1093 0 : sum4 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
1094 0 : sum5 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
1095 0 : sum6 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
1096 0 : sum7 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
1097 :
1098 0 : inPtrTemp += 2 * stride_in;
1099 :
1100 0 : for (row_index = 4; row_index < 64; row_index += 4) { // Parse every four rows
1101 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i*)(inPtrTemp)));
1102 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
1103 0 : sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i*)(inPtrTemp + 32)));
1104 0 : sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
1105 :
1106 0 : sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)));
1107 0 : sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
1108 0 : sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)));
1109 0 : sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
1110 :
1111 0 : inPtrTemp += 2 * stride_in;
1112 :
1113 0 : sum4 = _mm256_add_epi16(sum4, _mm256_loadu_si256((__m256i*)(inPtrTemp)));
1114 0 : sum5 = _mm256_add_epi16(sum5, _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
1115 0 : sum6 = _mm256_add_epi16(sum6, _mm256_loadu_si256((__m256i*)(inPtrTemp + 32)));
1116 0 : sum7 = _mm256_add_epi16(sum7, _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
1117 :
1118 0 : sum4 = _mm256_add_epi16(sum4, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)));
1119 0 : sum5 = _mm256_add_epi16(sum5, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
1120 0 : sum6 = _mm256_add_epi16(sum6, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)));
1121 0 : sum7 = _mm256_add_epi16(sum7, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
1122 :
1123 0 : inPtrTemp += 2 * stride_in;
1124 : }
1125 :
1126 0 : sum0 = _mm256_madd_epi16(sum0, ones);
1127 0 : sum1 = _mm256_madd_epi16(sum1, ones);
1128 0 : sum2 = _mm256_madd_epi16(sum2, ones);
1129 0 : sum3 = _mm256_madd_epi16(sum3, ones);
1130 :
1131 0 : sum4 = _mm256_madd_epi16(sum4, ones);
1132 0 : sum5 = _mm256_madd_epi16(sum5, ones);
1133 0 : sum6 = _mm256_madd_epi16(sum6, ones);
1134 0 : sum7 = _mm256_madd_epi16(sum7, ones);
1135 :
1136 0 : sum0 = _mm256_add_epi32(sum0, sum1);
1137 0 : sum2 = _mm256_add_epi32(sum2, sum3);
1138 0 : sum4 = _mm256_add_epi32(sum4, sum5);
1139 0 : sum6 = _mm256_add_epi32(sum6, sum7);
1140 :
1141 0 : sum0 = _mm256_add_epi32(sum0, sum2);
1142 0 : sum4 = _mm256_add_epi32(sum4, sum6);
1143 0 : sum0 = _mm256_hadd_epi32(sum0, zeros);
1144 0 : sum4 = _mm256_hadd_epi32(sum4, zeros);
1145 :
1146 0 : sum0 = _mm256_add_epi32(sum0, sum4);
1147 0 : sum0 = _mm256_hadd_epi32(sum0, zeros);
1148 :
1149 0 : sumBlock = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)));
1150 :
1151 0 : return sumBlock;
1152 : }
1153 : else {
1154 0 : printf("\n add the rest \n");
1155 0 : return 0;
1156 : }
1157 : }
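/* Editor's note: reference-only scalar sketch (not part of the instrumented
 * source). sum_residual8bit returns the sum of all int16 residuals in a
 * size x size block. The helper name is hypothetical. */
static int32_t sum_residual_sketch_c(const int16_t *in_ptr, uint32_t size, uint32_t stride_in)
{
    int32_t sum = 0;
    for (uint32_t y = 0; y < size; y++) {
        for (uint32_t x = 0; x < size; x++)
            sum += in_ptr[x];
        in_ptr += stride_in;
    }
    return sum;
}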
1158 :
1159 0 : void memset16bit_block_avx2_intrin(
1160 : int16_t * in_ptr,
1161 : uint32_t stride_in,
1162 : uint32_t size,
1163 : int16_t value
1164 : )
1165 : {
1166 0 : if (size == 4) {
1167 0 : __m128i line = _mm_set1_epi16(value);
1168 :
1169 0 : _mm_storel_epi64((__m128i *)(in_ptr + 0 * stride_in), line);
1170 0 : _mm_storel_epi64((__m128i *)(in_ptr + 1 * stride_in), line);
1171 0 : _mm_storel_epi64((__m128i *)(in_ptr + 2 * stride_in), line);
1172 0 : _mm_storel_epi64((__m128i *)(in_ptr + 3 * stride_in), line);
1173 : }
1174 0 : else if (size == 8) {
1175 0 : __m128i line = _mm_set1_epi16(value);
1176 :
1177 : _mm_storeu_si128((__m128i *)(in_ptr + 0 * stride_in), line);
1178 0 : _mm_storeu_si128((__m128i *)(in_ptr + 1 * stride_in), line);
1179 0 : _mm_storeu_si128((__m128i *)(in_ptr + 2 * stride_in), line);
1180 0 : _mm_storeu_si128((__m128i *)(in_ptr + 3 * stride_in), line);
1181 0 : _mm_storeu_si128((__m128i *)(in_ptr + 4 * stride_in), line);
1182 0 : _mm_storeu_si128((__m128i *)(in_ptr + 5 * stride_in), line);
1183 0 : _mm_storeu_si128((__m128i *)(in_ptr + 6 * stride_in), line);
1184 0 : _mm_storeu_si128((__m128i *)(in_ptr + 7 * stride_in), line);
1185 : }
1186 0 : else if (size == 16) {
1187 0 : __m256i line = _mm256_set1_epi16(value);
1188 :
1189 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1190 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1191 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1192 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1193 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1194 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1195 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1196 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1197 :
1198 0 : in_ptr += 8 * stride_in;
1199 :
1200 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1201 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1202 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1203 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1204 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1205 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1206 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1207 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1208 : }
1209 0 : else if (size == 32) {
1210 0 : __m256i line = _mm256_set1_epi16(value);
1211 :
1212 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1213 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
1214 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1215 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
1216 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1217 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
1218 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1219 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
1220 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1221 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
1222 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1223 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
1224 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1225 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
1226 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1227 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
1228 :
1229 0 : in_ptr += 8 * stride_in;
1230 :
1231 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1232 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
1233 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1234 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
1235 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1236 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
1237 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1238 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
1239 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1240 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
1241 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1242 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
1243 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1244 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
1245 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1246 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
1247 :
1248 0 : in_ptr += 8 * stride_in;
1249 :
1250 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1251 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
1252 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1253 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
1254 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1255 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
1256 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1257 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
1258 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1259 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
1260 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1261 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
1262 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1263 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
1264 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1265 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
1266 :
1267 0 : in_ptr += 8 * stride_in;
1268 :
1269 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
1270 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
1271 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
1272 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
1273 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
1274 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
1275 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
1276 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
1277 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
1278 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
1279 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
1280 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
1281 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
1282 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
1283 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
1284 0 : _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
1285 : }
1286 :
1287 : else
1288 0 : printf("\n add the rest \n");
1289 0 : }
1290 :
1291 0 : void unpack_avg_safe_sub_avx2_intrin(
1292 : uint16_t *ref16_l0,
1293 : uint32_t ref_l0_stride,
1294 : uint16_t *ref16_l1,
1295 : uint32_t ref_l1_stride,
1296 : uint8_t *dst_ptr,
1297 : uint32_t dst_stride,
1298 : EbBool sub_pred,
1299 : uint32_t width,
1300 : uint32_t height)
1301 : {
1302 : uint32_t y;
1303 : __m128i inPixel0, inPixel1;
1304 :
1305 0 : if (width == 8)
1306 : {
1307 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
1308 : __m128i avg8_0_U8, avg8_2_U8;
1309 :
1310 0 : for (y = 0; y < height; y += 2)
1311 : {
1312 : //--------
1313 : //Line One
1314 : //--------
1315 :
1316 : //List0
1317 :
1318 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
1319 :
1320 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1321 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
1322 :
1323 : //List1
1324 :
1325 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
1326 :
1327 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1328 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
1329 :
1330 : //AVG
1331 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
1332 :
1333 0 : _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
1334 :
1335 : //--------
1336 : //Line Two
1337 : //--------
1338 :
1339 : //List0
1340 :
1341 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
1342 :
1343 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1344 0 : out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
1345 :
1346 : //List1
1347 :
1348 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
1349 :
1350 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1351 0 : out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
1352 :
1353 : //AVG
1354 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
1355 :
1356 0 : _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
1357 :
1358 0 : dst_ptr += 2 * dst_stride;
1359 0 : ref16_l0 += 2 * ref_l0_stride;
1360 0 : ref16_l1 += 2 * ref_l1_stride;
1361 : }
1362 :
1363 0 : if (sub_pred) {
1364 0 : ref16_l0 -= (ref_l0_stride >> 1);
1365 0 : ref16_l1 -= (ref_l1_stride >> 1);
1366 0 : dst_ptr -= (dst_stride >> 1);
1367 : //List0
1368 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
1369 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1370 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
1371 : //List1
1372 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
1373 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
1374 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
1375 : //AVG
1376 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
1377 0 : _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
1378 : }
1379 : }
1380 0 : else if (width == 16)
1381 : {
1382 : __m128i inPixel4, inPixel5;
1383 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
1384 : __m128i avg8_0_U8, avg8_2_U8;
1385 :
1386 0 : for (y = 0; y < height; y += 2)
1387 : {
1388 : //--------
1389 : //Line One
1390 : //--------
1391 :
1392 : //List0
1393 :
1394 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
1395 0 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
1396 :
1397 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
1398 :
1399 : //List1
1400 :
1401 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
1402 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
1403 :
1404 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
1405 :
1406 : //AVG
1407 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
1408 :
1409 : _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
1410 :
1411 : //--------
1412 : //Line Two
1413 : //--------
1414 :
1415 : //List0
1416 :
1417 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
1418 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
1419 :
1420 0 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
1421 :
1422 : //List1
1423 :
1424 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
1425 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
1426 :
1427 0 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
1428 :
1429 : //AVG
1430 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
1431 :
1432 0 : _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
1433 :
1434 0 : dst_ptr += 2 * dst_stride;
1435 0 : ref16_l0 += 2 * ref_l0_stride;
1436 0 : ref16_l1 += 2 * ref_l1_stride;
1437 : }
1438 :
1439 0 : if (sub_pred) {
1440 0 : ref16_l0 -= (ref_l0_stride >> 1);
1441 0 : ref16_l1 -= (ref_l1_stride >> 1);
1442 0 : dst_ptr -= (dst_stride >> 1);
1443 : //List0
1444 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
1445 0 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
1446 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
1447 : //List1
1448 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
1449 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
1450 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
1451 : //AVG
1452 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
1453 : _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
1454 : }
1455 : }
1456 0 : else if (width == 32)
1457 : {
1458 : __m256i inVal16b_0, inVal16b_1;
1459 : __m256i data8b_32_0_L0, data8b_32_0_L1;
1460 : __m256i avg8b_32_0;
1461 :
1462 0 : for (y = 0; y < height; y += 2)
1463 : {
1464 : //--------
1465 : //Line One
1466 : //--------
1467 :
1468 : //List0
1469 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
1470 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
1471 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1472 : //List1
1473 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
1474 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
1475 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1476 :
1477 : //Avg
1478 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
1479 :
1480 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216); // 216 == 0xD8: reorder 64-bit lanes to undo the in-lane interleave of _mm256_packus_epi16
1481 :
1482 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
1483 :
1484 : //--------
1485 : //Line Two
1486 : //--------
1487 : //List0
1488 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride));
1489 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride + 16));
1490 :
1491 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1492 :
1493 : //List1
1494 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride));
1495 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride + 16));
1496 :
1497 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1498 :
1499 : //Avg
1500 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
1501 :
1502 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
1503 :
1504 0 : _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), avg8b_32_0);
1505 :
1506 0 : dst_ptr += 2 * dst_stride;
1507 0 : ref16_l0 += 2 * ref_l0_stride;
1508 0 : ref16_l1 += 2 * ref_l1_stride;
1509 : }
1510 :
1511 0 : if (sub_pred) {
1512 0 : ref16_l0 -= (ref_l0_stride >> 1);
1513 0 : ref16_l1 -= (ref_l1_stride >> 1);
1514 0 : dst_ptr -= (dst_stride >> 1);
1515 : //List0
1516 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
1517 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
1518 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1519 : //List1
1520 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
1521 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
1522 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1523 : //Avg
1524 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
1525 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
1526 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
1527 : }
1528 : }
1529 0 : else if (width == 64)
1530 : {
1531 : __m256i inVal16b_0, inVal16b_1, inVal16b_2, inVal16b_3;
1532 : __m256i data8b_32_0_L0, data8b_32_1_L0, data8b_32_0_L1, data8b_32_1_L1;
1533 : __m256i avg8b_32_0, avg8b_32_1;
1534 :
1535 0 : for (y = 0; y < height; ++y)
1536 : {
1537 : //List0
1538 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
1539 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
1540 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
1541 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
1542 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1543 0 : data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
1544 : //List1
1545 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
1546 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
1547 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
1548 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
1549 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1550 0 : data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
1551 :
1552 : //Avg
1553 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
1554 0 : avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
1555 :
1556 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
1557 0 : avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
1558 :
1559 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
1560 0 : _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
1561 :
1562 0 : dst_ptr += dst_stride;
1563 0 : ref16_l0 += ref_l0_stride;
1564 0 : ref16_l1 += ref_l1_stride;
1565 : }
1566 :
1567 0 : if (sub_pred) {
1568 0 : ref16_l0 -= (ref_l0_stride >> 1);
1569 0 : ref16_l1 -= (ref_l1_stride >> 1);
1570 0 : dst_ptr -= (dst_stride >> 1);
1571 : //List0
1572 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
1573 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
1574 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
1575 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
1576 0 : data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1577 0 : data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
1578 : //List1
1579 0 : inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
1580 0 : inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
1581 0 : inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
1582 0 : inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
1583 0 : data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
1584 0 : data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
1585 :
1586 : //Avg
1587 0 : avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
1588 0 : avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
1589 :
1590 0 : avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
1591 0 : avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
1592 :
1593 : _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
1594 0 : _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
1595 : }
1596 : }
1597 :
1598 0 : return;
1599 : }
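For reference, the per-pixel operation implemented by each width-specialized branch above can be sketched in scalar C. This is only an illustrative model, under the assumption that the 16-bit reference samples carry 10-bit data (so the >> 2 drops the two extra bits before the rounded 8-bit average); the function name and signature are hypothetical and not part of this file, and the sub_pred tail, which steps back by half a stride to produce one extra line, is omitted.

#include <stdint.h>

/* Hypothetical scalar model of the averaging kernel above:
 * dst[x] = rounded average of (ref_l0[x] >> 2) and (ref_l1[x] >> 2). */
static void unpack_avg_scalar_model(const uint16_t *ref_l0, uint32_t l0_stride,
                                    const uint16_t *ref_l1, uint32_t l1_stride,
                                    uint8_t *dst, uint32_t dst_stride,
                                    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            uint8_t a = (uint8_t)(ref_l0[x] >> 2);   /* drop the 2 LSBs: 10-bit -> 8-bit */
            uint8_t b = (uint8_t)(ref_l1[x] >> 2);
            dst[x] = (uint8_t)((a + b + 1) >> 1);    /* same rounding as _mm_avg_epu8 */
        }
        ref_l0 += l0_stride;
        ref_l1 += l1_stride;
        dst    += dst_stride;
    }
}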
1600 :
1601 0 : void picture_addition_kernel4x4_av1_sse2_intrin(
1602 : uint8_t *pred_ptr,
1603 : uint32_t pred_stride,
1604 : int32_t *residual_ptr,
1605 : uint32_t residual_stride,
1606 : uint8_t *recon_ptr,
1607 : uint32_t recon_stride,
1608 : uint32_t width,
1609 : uint32_t height,
1610 : int32_t bd)
1611 : {
1612 : __m128i predReg, xmm0, recon_0_7, resReg;
1613 : uint32_t y;
1614 0 : xmm0 = _mm_setzero_si128();
1615 :
1616 0 : for (y = 0; y < 4; ++y) {
1617 0 : predReg = _mm_cvtsi32_si128(*(uint32_t *)pred_ptr);
1618 0 : predReg = _mm_unpacklo_epi8(predReg, xmm0);
1619 0 : predReg = _mm_unpacklo_epi16(predReg, xmm0);
1620 0 : resReg = _mm_loadu_si128((__m128i *)residual_ptr);
1621 0 : resReg = _mm_add_epi32(resReg, predReg);
1622 0 : recon_0_7 = _mm_packus_epi32(resReg, xmm0);
1623 0 : recon_0_7 = _mm_packus_epi16(recon_0_7, xmm0);
1624 0 : *(uint32_t *)recon_ptr = _mm_cvtsi128_si32(recon_0_7);
1625 0 : pred_ptr += pred_stride;
1626 0 : residual_ptr += residual_stride;
1627 0 : recon_ptr += recon_stride;
1628 : }
1629 : (void)width;
1630 : (void)height;
1631 : (void)bd;
1632 :
1633 0 : return;
1634 : }
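The picture-addition kernels in this file (4x4 through 64x64) all combine an 8-bit prediction with a 32-bit residual and write the reconstruction clamped to the 8-bit range; the block-size variants differ only in how many pixels they widen, add, and repack per row. A minimal scalar sketch of that intent (hypothetical name, not the library's implementation):

#include <stdint.h>

/* Hypothetical scalar model: recon = clamp(pred + residual, 0, 255),
 * the intended effect of the packus stages used in the intrinsics. */
static void picture_addition_scalar_model(const uint8_t *pred, uint32_t pred_stride,
                                          const int32_t *residual, uint32_t residual_stride,
                                          uint8_t *recon, uint32_t recon_stride,
                                          uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            int32_t v = (int32_t)pred[x] + residual[x];
            if (v < 0)   v = 0;
            if (v > 255) v = 255;
            recon[x] = (uint8_t)v;
        }
        pred     += pred_stride;
        residual += residual_stride;
        recon    += recon_stride;
    }
}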
1635 0 : void picture_addition_kernel8x8_av1_sse2_intrin(
1636 : uint8_t *pred_ptr,
1637 : uint32_t pred_stride,
1638 : int32_t *residual_ptr,
1639 : uint32_t residual_stride,
1640 : uint8_t *recon_ptr,
1641 : uint32_t recon_stride,
1642 : uint32_t width,
1643 : uint32_t height,
1644 : int32_t bd)
1645 : {
1646 : __m256i predReg, resReg, recon_0_7, xmm0;
1647 : __m128i predReg_128, predReg_128Lo, predReg_128Hi, xmm0_128, recon_0_7_128;
1648 : uint32_t y;
1649 0 : xmm0_128 = _mm_setzero_si128();
1650 0 : xmm0 = _mm256_setzero_si256();
1651 :
1652 0 : for (y = 0; y < 8; ++y) {
1653 0 : predReg_128 = _mm_cvtsi64_si128(*(uint64_t *)pred_ptr);
1654 0 : predReg_128 = _mm_unpacklo_epi8(predReg_128, xmm0_128);
1655 0 : predReg_128Lo = _mm_unpacklo_epi16(predReg_128, xmm0_128);
1656 0 : predReg_128Hi = _mm_unpackhi_epi16(predReg_128, xmm0_128);
1657 0 : predReg = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
1658 0 : resReg = _mm256_loadu_si256((__m256i*)residual_ptr);
1659 0 : resReg = _mm256_add_epi32(predReg, resReg);
1660 0 : recon_0_7 = _mm256_packus_epi32(resReg, xmm0);
1661 0 : recon_0_7_128 = _mm_slli_si128(_mm256_extracti128_si256(recon_0_7, 1), 8);
1662 0 : recon_0_7_128 = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), recon_0_7_128);
1663 0 : recon_0_7_128 = _mm_packus_epi16(recon_0_7_128, xmm0_128);
1664 0 : *(uint64_t *)recon_ptr = _mm_cvtsi128_si64(recon_0_7_128);
1665 0 : pred_ptr += pred_stride;
1666 0 : residual_ptr += residual_stride;
1667 0 : recon_ptr += recon_stride;
1668 : }
1669 : (void)width;
1670 : (void)height;
1671 : (void)bd;
1672 :
1673 0 : return;
1674 : }
1675 0 : void picture_addition_kernel16x16_av1_sse2_intrin(
1676 : uint8_t *pred_ptr,
1677 : uint32_t pred_stride,
1678 : int32_t *residual_ptr,
1679 : uint32_t residual_stride,
1680 : uint8_t *recon_ptr,
1681 : uint32_t recon_stride,
1682 : uint32_t width,
1683 : uint32_t height,
1684 : int32_t bd)
1685 : {
1686 : __m256i resReg, recon_0_7, xmm0, predRegLo, predRegHi, resRegLo, resRegHi;
1687 : __m128i predReg_128, predReg_128Lo, predReg_128Hi, xmm0_128, predReg_128Lo16Lo, predReg_128Lo16Hi, predReg_128Hi16Lo, predReg_128Hi16Hi;
1688 : uint32_t y;
1689 0 : xmm0_128 = _mm_setzero_si128();
1690 0 : xmm0 = _mm256_setzero_si256();
1691 :
1692 0 : for (y = 0; y < 16; ++y) {
1693 0 : predReg_128 = _mm_loadu_si128((__m128i *)pred_ptr);
1694 0 : predReg_128Lo = _mm_unpacklo_epi8(predReg_128, xmm0_128);
1695 0 : predReg_128Hi = _mm_unpackhi_epi8(predReg_128, xmm0_128);
1696 0 : predReg_128Lo16Lo = _mm_unpacklo_epi16(predReg_128Lo, xmm0_128);
1697 0 : predReg_128Lo16Hi = _mm_unpackhi_epi16(predReg_128Lo, xmm0_128);
1698 0 : predReg_128Hi16Lo = _mm_unpacklo_epi16(predReg_128Hi, xmm0_128);
1699 0 : predReg_128Hi16Hi = _mm_unpackhi_epi16(predReg_128Hi, xmm0_128);
1700 0 : predRegLo = _mm256_set_m128i(predReg_128Lo16Hi, predReg_128Lo16Lo);
1701 0 : predRegHi = _mm256_set_m128i(predReg_128Hi16Hi, predReg_128Hi16Lo);
1702 0 : resRegLo = _mm256_loadu_si256((__m256i*)residual_ptr);
1703 0 : resRegHi = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
1704 0 : predRegLo = _mm256_add_epi32(predRegLo, resRegLo);
1705 0 : predRegHi = _mm256_add_epi32(predRegHi, resRegHi);
1706 0 : resReg = _mm256_packus_epi32(predRegLo, predRegHi);
1707 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1708 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1709 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1710 0 : predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
1711 0 : predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
1712 : _mm_storeu_si128((__m128i*) (recon_ptr), predReg_128Lo);
1713 0 : pred_ptr += pred_stride;
1714 0 : residual_ptr += residual_stride;
1715 0 : recon_ptr += recon_stride;
1716 : }
1717 : (void)width;
1718 : (void)height;
1719 : (void)bd;
1720 :
1721 0 : return;
1722 : }
1723 :
1724 0 : void picture_addition_kernel32x32_av1_sse2_intrin(
1725 : uint8_t *pred_ptr,
1726 : uint32_t pred_stride,
1727 : int32_t *residual_ptr,
1728 : uint32_t residual_stride,
1729 : uint8_t *recon_ptr,
1730 : uint32_t recon_stride,
1731 : uint32_t width,
1732 : uint32_t height,
1733 : int32_t bd)
1734 : {
1735 : __m256i predReg, recon_0_7, xmm0, resReg, predReg_Lo, predReg_Hi,
1736 : predReg_Lo16Lo, predReg_Lo16Hi, predReg_Hi16Lo, predReg_Hi16Hi, resReg1, resReg2, resReg3, resReg4;
1737 : __m128i predReg_128Lo, predReg_128Hi;
1738 : uint32_t y;
1739 0 : xmm0 = _mm256_setzero_si256();
1740 :
1741 0 : for (y = 0; y < 32; ++y) {
1742 0 : predReg = _mm256_loadu_si256((__m256i*)pred_ptr);
1743 0 : predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
1744 0 : predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
1745 0 : predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
1746 0 : predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
1747 0 : predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
1748 0 : predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
1749 :
1750 0 : predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo)); // recombine 128-bit halves so each __m256i holds 8 consecutive 32-bit pixels (0-7, 8-15, 16-23, 24-31)
1751 0 : predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
1752 0 : predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
1753 0 : predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
1754 :
1755 0 : resReg1 = _mm256_loadu_si256((__m256i*)residual_ptr);
1756 0 : resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
1757 0 : resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 16));
1758 0 : resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 24));
1759 :
1760 0 : resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
1761 0 : resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
1762 0 : resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
1763 0 : resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
1764 :
1765 0 : resReg = _mm256_packus_epi32(resReg1, resReg2);
1766 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1767 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1768 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1769 0 : predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
1770 0 : predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
1771 :
1772 0 : resReg = _mm256_packus_epi32(resReg3, resReg4);
1773 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1774 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1775 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1776 0 : predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
1777 0 : predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
1778 0 : recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
1779 : _mm256_storeu_si256((__m256i*)recon_ptr, recon_0_7);
1780 :
1781 0 : pred_ptr += pred_stride;
1782 0 : residual_ptr += residual_stride;
1783 0 : recon_ptr += recon_stride;
1784 : }
1785 : (void)width;
1786 : (void)height;
1787 : (void)bd;
1788 :
1789 0 : return;
1790 : }
1791 0 : void picture_addition_kernel64x64_av1_sse2_intrin(
1792 : uint8_t *pred_ptr,
1793 : uint32_t pred_stride,
1794 : int32_t *residual_ptr,
1795 : uint32_t residual_stride,
1796 : uint8_t *recon_ptr,
1797 : uint32_t recon_stride,
1798 : uint32_t width,
1799 : uint32_t height,
1800 : int32_t bd)
1801 : {
1802 : __m256i predReg, recon_0_7, xmm0, resReg, predReg_Lo, predReg_Hi,
1803 : predReg_Lo16Lo, predReg_Lo16Hi, predReg_Hi16Lo, predReg_Hi16Hi, resReg1, resReg2, resReg3, resReg4;
1804 : __m128i predReg_128Lo, predReg_128Hi;
1805 : uint32_t y;
1806 0 : xmm0 = _mm256_setzero_si256();
1807 :
1808 0 : for (y = 0; y < 64; ++y) {
1809 0 : predReg = _mm256_loadu_si256((__m256i*)pred_ptr);
1810 0 : predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
1811 0 : predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
1812 0 : predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
1813 0 : predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
1814 0 : predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
1815 0 : predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
1816 :
1817 0 : predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo));
1818 0 : predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
1819 0 : predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
1820 0 : predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
1821 :
1822 0 : resReg1 = _mm256_loadu_si256((__m256i*)residual_ptr);
1823 0 : resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
1824 0 : resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 16));
1825 0 : resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 24));
1826 :
1827 0 : resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
1828 0 : resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
1829 0 : resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
1830 0 : resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
1831 :
1832 0 : resReg = _mm256_packus_epi32(resReg1, resReg2);
1833 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1834 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1835 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1836 0 : predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
1837 0 : predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
1838 :
1839 0 : resReg = _mm256_packus_epi32(resReg3, resReg4);
1840 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1841 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1842 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1843 0 : predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
1844 0 : predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
1845 0 : recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
1846 : _mm256_storeu_si256((__m256i*)recon_ptr, recon_0_7);
1847 :
1848 0 : predReg = _mm256_loadu_si256((__m256i*)(pred_ptr + 32));
1849 0 : predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
1850 0 : predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
1851 0 : predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
1852 0 : predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
1853 0 : predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
1854 0 : predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
1855 :
1856 0 : predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo));
1857 0 : predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
1858 0 : predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
1859 0 : predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
1860 :
1861 0 : resReg1 = _mm256_loadu_si256((__m256i*)(residual_ptr + 32));
1862 0 : resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 40));
1863 0 : resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 48));
1864 0 : resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 56));
1865 :
1866 0 : resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
1867 0 : resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
1868 0 : resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
1869 0 : resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
1870 :
1871 0 : resReg = _mm256_packus_epi32(resReg1, resReg2);
1872 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1873 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1874 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1875 0 : predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
1876 0 : predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
1877 :
1878 0 : resReg = _mm256_packus_epi32(resReg3, resReg4);
1879 0 : resReg = _mm256_packus_epi16(resReg, xmm0);
1880 0 : recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
1881 0 : predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
1882 0 : predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
1883 0 : predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
1884 0 : recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
1885 0 : _mm256_storeu_si256((__m256i*)(recon_ptr + 32), recon_0_7);
1886 :
1887 0 : pred_ptr += pred_stride;
1888 0 : residual_ptr += residual_stride;
1889 0 : recon_ptr += recon_stride;
1890 : }
1891 : (void)width;
1892 : (void)height;
1893 : (void)bd;
1894 :
1895 0 : return;
1896 : }
1897 :
1898 5292800 : void full_distortion_kernel32_bits_avx2(
1899 : int32_t *coeff,
1900 : uint32_t coeff_stride,
1901 : int32_t *recon_coeff,
1902 : uint32_t recon_coeff_stride,
1903 : uint64_t distortion_result[DIST_CALC_TOTAL],
1904 : uint32_t area_width,
1905 : uint32_t area_height)
1906 : {
1907 : uint32_t rowCount, col_count;
1908 5292800 : __m256i sum1 = _mm256_setzero_si256();
1909 5292800 : __m256i sum2 = _mm256_setzero_si256();
1910 : __m128i temp1, temp2, temp3;
1911 :
1912 5292800 : rowCount = area_height;
1913 : do {
1914 38433300 : int32_t *coeffTemp = coeff;
1915 38433300 : int32_t *reconCoeffTemp = recon_coeff;
1916 :
1917 38433300 : col_count = area_width / 4;
1918 : do {
1919 : __m128i x0, y0;
1920 : __m256i x, y, z;
1921 80435000 : x0 = _mm_loadu_si128((__m128i *)(coeffTemp));
1922 80435000 : y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp));
1923 80435000 : x = _mm256_cvtepi32_epi64(x0);
1924 80435000 : y = _mm256_cvtepi32_epi64(y0);
1925 80435000 : z= _mm256_mul_epi32(x, x);
1926 80435000 : sum2 = _mm256_add_epi64(sum2, z);
1927 80435000 : x = _mm256_sub_epi64(x, y);
1928 80435000 : x = _mm256_mul_epi32(x, x);
1929 80435000 : sum1 = _mm256_add_epi64(sum1, x); // accumulate the 64-bit squared differences with 64-bit adds, matching sum2, so carries are not lost
1930 80435000 : coeffTemp += 4;
1931 80435000 : reconCoeffTemp += 4;
1932 80435000 : } while (--col_count);
1933 :
1934 38433300 : coeff += coeff_stride;
1935 38433300 : recon_coeff += recon_coeff_stride;
1936 38433300 : rowCount -= 1;
1937 38433300 : } while (rowCount > 0);
1938 :
1939 5292800 : temp1 = _mm256_castsi256_si128(sum1);
1940 5292800 : temp2 = _mm256_extracti128_si256(sum1, 1);
1941 5292800 : temp1 = _mm_add_epi64(temp1, temp2);
1942 5292800 : temp2 = _mm_shuffle_epi32(temp1, 0x4e);
1943 5292800 : temp3 = _mm_add_epi64(temp1, temp2);
1944 5292800 : temp1 = _mm256_castsi256_si128(sum2);
1945 5292800 : temp2 = _mm256_extracti128_si256(sum2, 1);
1946 5292800 : temp1 = _mm_add_epi64(temp1, temp2);
1947 5292800 : temp2 = _mm_shuffle_epi32(temp1, 0x4e);
1948 5292800 : temp1 = _mm_add_epi64(temp1, temp2);
1949 5292800 : temp1 = _mm_unpacklo_epi64(temp3, temp1);
1950 :
1951 : _mm_storeu_si128((__m128i *)distortion_result, temp1);
1952 5292800 : }
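In scalar terms, the kernel above accumulates two block sums: the squared difference between the original and reconstructed transform coefficients, and the energy of the original coefficients, written to distortion_result[0] and distortion_result[1] respectively (that layout follows the final _mm_unpacklo_epi64 above). The cbf-zero variant that follows computes only the coefficient energy. A hedged scalar model, with a hypothetical name:

#include <stdint.h>

/* Hypothetical scalar model of the two accumulations performed above:
 * distortion_result[0] = sum((coeff - recon_coeff)^2)   (residual distortion)
 * distortion_result[1] = sum(coeff^2)                    (coefficient energy) */
static void full_distortion_scalar_model(const int32_t *coeff, uint32_t coeff_stride,
                                         const int32_t *recon_coeff, uint32_t recon_stride,
                                         uint64_t distortion_result[2],
                                         uint32_t area_width, uint32_t area_height)
{
    uint64_t sse = 0, energy = 0;
    for (uint32_t y = 0; y < area_height; y++) {
        for (uint32_t x = 0; x < area_width; x++) {
            int64_t d = (int64_t)coeff[x] - recon_coeff[x];
            sse    += (uint64_t)(d * d);
            energy += (uint64_t)((int64_t)coeff[x] * coeff[x]);
        }
        coeff       += coeff_stride;
        recon_coeff += recon_stride;
    }
    distortion_result[0] = sse;
    distortion_result[1] = energy;
}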
1953 :
1954 8358880 : void full_distortion_kernel_cbf_zero32_bits_avx2(
1955 : int32_t *coeff,
1956 : uint32_t coeff_stride,
1957 : int32_t *recon_coeff,
1958 : uint32_t recon_coeff_stride,
1959 : uint64_t distortion_result[DIST_CALC_TOTAL],
1960 : uint32_t area_width,
1961 : uint32_t area_height)
1962 : {
1963 : uint32_t rowCount, col_count;
1964 8358880 : __m256i sum = _mm256_setzero_si256();
1965 : __m128i temp1, temp2;
1966 :
1967 8358880 : rowCount = area_height;
1968 : do {
1969 58311200 : int32_t *coeffTemp = coeff;
1970 :
1971 58311200 : col_count = area_width / 4;
1972 : do {
1973 : __m128i x0;
1974 : __m256i y0, z0;
1975 126494000 : x0 = _mm_loadu_si128((__m128i *)(coeffTemp));
1976 126494000 : coeffTemp += 4;
1977 126494000 : y0 = _mm256_cvtepi32_epi64(x0);
1978 126494000 : z0 = _mm256_mul_epi32(y0, y0);
1979 126494000 : sum = _mm256_add_epi64(sum, z0);
1980 126494000 : } while (--col_count);
1981 :
1982 58311200 : coeff += coeff_stride;
1983 58311200 : recon_coeff += coeff_stride;
1984 58311200 : rowCount -= 1;
1985 58311200 : } while (rowCount > 0);
1986 :
1987 8358880 : temp1 = _mm256_castsi256_si128(sum);
1988 8358880 : temp2 = _mm256_extracti128_si256(sum, 1);
1989 8358880 : temp1 = _mm_add_epi64(temp1, temp2);
1990 8358880 : temp2 = _mm_shuffle_epi32(temp1, 0x4e);
1991 8358880 : temp1 = _mm_add_epi64(temp1, temp2);
1992 : _mm_storeu_si128((__m128i *)distortion_result, temp1);
1993 : (void)recon_coeff_stride;
1994 8358880 : }
1995 :
1996 217626000 : static INLINE void residual32_avx2(const uint8_t *const input,
1997 : const uint8_t *const pred, int16_t *const residual)
1998 : {
1999 217626000 : const __m256i zero = _mm256_setzero_si256();
2000 217626000 : const __m256i in0 = _mm256_loadu_si256((__m256i *)input);
2001 217626000 : const __m256i pr0 = _mm256_loadu_si256((__m256i *)pred);
2002 217626000 : const __m256i in1 = _mm256_permute4x64_epi64(in0, 0xD8);
2003 217626000 : const __m256i pr1 = _mm256_permute4x64_epi64(pr0, 0xD8);
2004 217626000 : const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
2005 217626000 : const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
2006 217626000 : const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
2007 217626000 : const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
2008 217626000 : const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
2009 217626000 : const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);
2010 : _mm256_storeu_si256((__m256i*)(residual + 0 * 16), re_lo);
2011 217626000 : _mm256_storeu_si256((__m256i*)(residual + 1 * 16), re_hi);
2012 217626000 : }
2013 :
2014 : SIMD_INLINE void residual_kernel32_avx2(
2015 : const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
2016 : const uint32_t pred_stride, int16_t *residual,
2017 : const uint32_t residual_stride, const uint32_t area_height)
2018 : {
2019 6153280 : uint32_t y = area_height;
2020 :
2021 : do {
2022 127535000 : residual32_avx2(input, pred, residual);
2023 127536000 : input += input_stride;
2024 127536000 : pred += pred_stride;
2025 127536000 : residual += residual_stride;
2026 127536000 : } while (--y);
2027 6153500 : }
2028 :
2029 : SIMD_INLINE void residual_kernel64_avx2(
2030 : const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
2031 : const uint32_t pred_stride, int16_t *residual,
2032 : const uint32_t residual_stride, const uint32_t area_height)
2033 : {
2034 1496950 : uint32_t y = area_height;
2035 :
2036 : do {
2037 45290100 : residual32_avx2(input + 0 * 32, pred + 0 * 32, residual + 0 * 32);
2038 45289600 : residual32_avx2(input + 1 * 32, pred + 1 * 32, residual + 1 * 32);
2039 45290100 : input += input_stride;
2040 45290100 : pred += pred_stride;
2041 45290100 : residual += residual_stride;
2042 45290100 : } while (--y);
2043 1496970 : }
2044 :
2045 : SIMD_INLINE void residual_kernel128_avx2(
2046 : const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
2047 : const uint32_t pred_stride, int16_t *residual,
2048 : const uint32_t residual_stride, const uint32_t area_height)
2049 : {
2050 0 : uint32_t y = area_height;
2051 :
2052 : do {
2053 0 : residual32_avx2(input + 0 * 32, pred + 0 * 32, residual + 0 * 32);
2054 0 : residual32_avx2(input + 1 * 32, pred + 1 * 32, residual + 1 * 32);
2055 0 : residual32_avx2(input + 2 * 32, pred + 2 * 32, residual + 2 * 32);
2056 0 : residual32_avx2(input + 3 * 32, pred + 3 * 32, residual + 3 * 32);
2057 0 : input += input_stride;
2058 0 : pred += pred_stride;
2059 0 : residual += residual_stride;
2060 0 : } while (--y);
2061 0 : }
2062 :
2063 67841300 : void residual_kernel8bit_avx2(
2064 : uint8_t *input,
2065 : uint32_t input_stride,
2066 : uint8_t *pred,
2067 : uint32_t pred_stride,
2068 : int16_t *residual,
2069 : uint32_t residual_stride,
2070 : uint32_t area_width,
2071 : uint32_t area_height)
2072 : {
2073 67841300 : switch (area_width) {
2074 20595000 : case 4:
2075 : residual_kernel4_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2076 20589800 : break;
2077 :
2078 24146600 : case 8:
2079 : residual_kernel8_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2080 24143700 : break;
2081 :
2082 15538700 : case 16:
2083 : residual_kernel16_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2084 15540600 : break;
2085 :
2086 6153280 : case 32:
2087 : residual_kernel32_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2088 6153500 : break;
2089 :
2090 1496950 : case 64:
2091 : residual_kernel64_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2092 1496970 : break;
2093 :
2094 0 : default: // 128
2095 : residual_kernel128_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
2096 0 : break;
2097 : }
2098 67924600 : }
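The residual kernels dispatched above all compute the same per-pixel difference, input minus prediction, widened to 16 bits; the width switch only selects how many 256-bit loads cover a row, and the 0xD8 permute in residual32_avx2 reorders the 64-bit lanes so the widened results land in source order despite the in-lane behavior of unpacklo/unpackhi. A scalar reference sketch (hypothetical name, illustrative only):

#include <stdint.h>

/* Hypothetical scalar model: residual = input - pred, widened to int16_t. */
static void residual_kernel_scalar_model(const uint8_t *input, uint32_t input_stride,
                                         const uint8_t *pred, uint32_t pred_stride,
                                         int16_t *residual, uint32_t residual_stride,
                                         uint32_t area_width, uint32_t area_height)
{
    for (uint32_t y = 0; y < area_height; y++) {
        for (uint32_t x = 0; x < area_width; x++)
            residual[x] = (int16_t)((int16_t)input[x] - (int16_t)pred[x]);
        input    += input_stride;
        pred     += pred_stride;
        residual += residual_stride;
    }
}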
2099 :
2100 0 : uint64_t spatial_full_distortion_kernel4x_n_avx2_intrin(
2101 : uint8_t *input,
2102 : uint32_t input_offset,
2103 : uint32_t input_stride,
2104 : uint8_t *recon,
2105 : uint32_t recon_offset,
2106 : uint32_t recon_stride,
2107 : uint32_t area_width,
2108 : uint32_t area_height)
2109 : {
2110 0 : int32_t row_count = area_height;
2111 0 : __m256i sum = _mm256_setzero_si256();
2112 : __m128i sum_L, sum_H, s;
2113 0 : input += input_offset;
2114 0 : recon += recon_offset;
2115 : (void)area_width;
2116 :
2117 : do {
2118 0 : const __m128i in0 = _mm_cvtsi32_si128(*(uint32_t *)(input + 0 * input_stride));
2119 0 : const __m128i in1 = _mm_cvtsi32_si128(*(uint32_t *)(input + 1 * input_stride));
2120 0 : const __m128i re0 = _mm_cvtsi32_si128(*(uint32_t *)(recon + 0 * recon_stride));
2121 0 : const __m128i re1 = _mm_cvtsi32_si128(*(uint32_t *)(recon + 1 * recon_stride));
2122 0 : const __m256i in = _mm256_setr_m128i(in0, in1);
2123 0 : const __m256i re = _mm256_setr_m128i(re0, re1);
2124 0 : Distortion_AVX2_INTRIN(in, re, &sum);
2125 0 : input += 2 * input_stride;
2126 0 : recon += 2 * recon_stride;
2127 0 : row_count -= 2;
2128 0 : } while (row_count);
2129 :
2130 0 : sum_L = _mm256_castsi256_si128(sum);
2131 0 : sum_H = _mm256_extracti128_si256(sum, 1);
2132 0 : s = _mm_add_epi32(sum_L, sum_H);
2133 0 : s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
2134 :
2135 0 : return _mm_cvtsi128_si32(s);
2136 : }
2137 :
2138 0 : uint64_t spatial_full_distortion_kernel8x_n_avx2_intrin(
2139 : uint8_t *input,
2140 : uint32_t input_offset,
2141 : uint32_t input_stride,
2142 : uint8_t *recon,
2143 : uint32_t recon_offset,
2144 : uint32_t recon_stride,
2145 : uint32_t area_width,
2146 : uint32_t area_height)
2147 : {
2148 0 : int32_t row_count = area_height;
2149 0 : __m256i sum = _mm256_setzero_si256();
2150 0 : input += input_offset;
2151 0 : recon += recon_offset;
2152 : (void)area_width;
2153 :
2154 : do {
2155 0 : const __m128i in0 = _mm_loadl_epi64((__m128i *)(input + 0 * input_stride));
2156 0 : const __m128i in1 = _mm_loadl_epi64((__m128i *)(input + 1 * input_stride));
2157 0 : const __m128i re0 = _mm_loadl_epi64((__m128i *)(recon + 0 * recon_stride));
2158 0 : const __m128i re1 = _mm_loadl_epi64((__m128i *)(recon + 1 * recon_stride));
2159 0 : const __m256i in = _mm256_setr_m128i(in0, in1);
2160 0 : const __m256i re = _mm256_setr_m128i(re0, re1);
2161 0 : Distortion_AVX2_INTRIN(in, re, &sum);
2162 0 : input += 2 * input_stride;
2163 0 : recon += 2 * recon_stride;
2164 0 : row_count -= 2;
2165 0 : } while (row_count);
2166 :
2167 0 : return Hadd32_AVX2_INTRIN(sum);
2168 : }
2169 :
2170 0 : uint64_t spatial_full_distortion_kernel16x_n_avx2_intrin(
2171 : uint8_t *input,
2172 : uint32_t input_offset,
2173 : uint32_t input_stride,
2174 : uint8_t *recon,
2175 : uint32_t recon_offset,
2176 : uint32_t recon_stride,
2177 : uint32_t area_width,
2178 : uint32_t area_height)
2179 : {
2180 0 : int32_t row_count = area_height;
2181 0 : __m256i sum = _mm256_setzero_si256();
2182 0 : input += input_offset;
2183 0 : recon += recon_offset;
2184 : (void)area_width;
2185 :
2186 : do {
2187 0 : SpatialFullDistortionKernel16_AVX2_INTRIN(input, recon, &sum);
2188 0 : input += input_stride;
2189 0 : recon += recon_stride;
2190 0 : } while (--row_count);
2191 :
2192 0 : return Hadd32_AVX2_INTRIN(sum);
2193 : }
2194 :
2195 0 : static INLINE void SpatialFullDistortionKernel64_AVX2_INTRIN(
2196 : const uint8_t *const input, const uint8_t *const recon, __m256i *const sum)
2197 : {
2198 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(input + 0 * 32, recon + 0 * 32, sum);
2199 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(input + 1 * 32, recon + 1 * 32, sum);
2200 0 : }
2201 :
2202 0 : uint64_t spatial_full_distortion_kernel32x_n_avx2_intrin(
2203 : uint8_t *input,
2204 : uint32_t input_offset,
2205 : uint32_t input_stride,
2206 : uint8_t *recon,
2207 : uint32_t recon_offset,
2208 : uint32_t recon_stride,
2209 : uint32_t area_width,
2210 : uint32_t area_height)
2211 : {
2212 0 : int32_t row_count = area_height;
2213 0 : __m256i sum = _mm256_setzero_si256();
2214 0 : input += input_offset;
2215 0 : recon += recon_offset;
2216 : (void)area_width;
2217 :
2218 : do {
2219 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(input, recon, &sum);
2220 0 : input += input_stride;
2221 0 : recon += recon_stride;
2222 0 : } while (--row_count);
2223 :
2224 0 : return Hadd32_AVX2_INTRIN(sum);
2225 : }
2226 :
2227 0 : uint64_t spatial_full_distortion_kernel64x_n_avx2_intrin(
2228 : uint8_t *input,
2229 : uint32_t input_offset,
2230 : uint32_t input_stride,
2231 : uint8_t *recon,
2232 : uint32_t recon_offset,
2233 : uint32_t recon_stride,
2234 : uint32_t area_width,
2235 : uint32_t area_height)
2236 : {
2237 0 : int32_t row_count = area_height;
2238 0 : __m256i sum = _mm256_setzero_si256();
2239 0 : input += input_offset;
2240 0 : recon += recon_offset;
2241 : (void)area_width;
2242 :
2243 : do {
2244 0 : SpatialFullDistortionKernel64_AVX2_INTRIN(input, recon, &sum);
2245 0 : input += input_stride;
2246 0 : recon += recon_stride;
2247 0 : } while (--row_count);
2248 :
2249 0 : return Hadd32_AVX2_INTRIN(sum);
2250 : }
2251 :
2252 0 : uint64_t spatial_full_distortion_kernel128x_n_avx2_intrin(
2253 : uint8_t *input,
2254 : uint32_t input_offset,
2255 : uint32_t input_stride,
2256 : uint8_t *recon,
2257 : uint32_t recon_offset,
2258 : uint32_t recon_stride,
2259 : uint32_t area_width,
2260 : uint32_t area_height)
2261 : {
2262 0 : int32_t row_count = area_height;
2263 0 : __m256i sum = _mm256_setzero_si256();
2264 0 : input += input_offset;
2265 0 : recon += recon_offset;
2266 : (void)area_width;
2267 :
2268 : do {
2269 0 : SpatialFullDistortionKernel64_AVX2_INTRIN(input + 0 * 64, recon + 0 * 64, &sum);
2270 0 : SpatialFullDistortionKernel64_AVX2_INTRIN(input + 1 * 64, recon + 1 * 64, &sum);
2271 0 : input += input_stride;
2272 0 : recon += recon_stride;
2273 0 : } while (--row_count);
2274 :
2275 0 : return Hadd32_AVX2_INTRIN(sum);
2276 : }
2277 :
2278 : #include "EbUtility.h"
2279 :
2280 1270770000 : uint64_t spatial_full_distortion_kernel_avx2(
2281 : uint8_t *input,
2282 : uint32_t input_offset,
2283 : uint32_t input_stride,
2284 : uint8_t *recon,
2285 : uint32_t recon_offset,
2286 : uint32_t recon_stride,
2287 : uint32_t area_width,
2288 : uint32_t area_height)
2289 : {
2290 1270770000 : const uint32_t leftover = area_width & 31;
2291 : int32_t h;
2292 1270770000 : __m256i sum = _mm256_setzero_si256();
2293 : __m128i sum_L, sum_H, s;
2294 1270770000 : uint64_t spatialDistortion = 0;
2295 1270770000 : input += input_offset;
2296 1270770000 : recon += recon_offset;
2297 :
2298 1270770000 : if (leftover) {
2299 1066310000 : const uint8_t *inp = input + area_width - leftover;
2300 1066310000 : const uint8_t *rec = recon + area_width - leftover;
2301 :
2302 1066310000 : if (leftover == 4) {
2303 81796100 : h = area_height;
2304 : do {
2305 366005000 : const __m128i in0 = _mm_cvtsi32_si128(*(uint32_t *)inp);
2306 366005000 : const __m128i in1 = _mm_cvtsi32_si128(*(uint32_t *)(inp + input_stride));
2307 366005000 : const __m128i re0 = _mm_cvtsi32_si128(*(uint32_t *)rec);
2308 366005000 : const __m128i re1 = _mm_cvtsi32_si128(*(uint32_t *)(rec + recon_stride));
2309 366005000 : const __m256i in = _mm256_setr_m128i(in0, in1);
2310 366005000 : const __m256i re = _mm256_setr_m128i(re0, re1);
2311 366005000 : Distortion_AVX2_INTRIN(in, re, &sum);
2312 366175000 : inp += 2 * input_stride;
2313 366175000 : rec += 2 * recon_stride;
2314 366175000 : h -= 2;
2315 366175000 : } while (h);
2316 :
2317 81965700 : if (area_width == 4) {
2318 81800600 : sum_L = _mm256_castsi256_si128(sum);
2319 81800600 : sum_H = _mm256_extracti128_si256(sum, 1);
2320 81800600 : s = _mm_add_epi32(sum_L, sum_H);
2321 163601000 : s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
2322 81800600 : spatialDistortion = _mm_cvtsi128_si32(s);
2323 81800600 : return spatialDistortion;
2324 : }
2325 : }
2326 984509000 : else if (leftover == 8) {
2327 633345000 : h = area_height;
2328 : do {
2329 4022560000 : const __m128i in0 = _mm_loadl_epi64((__m128i *)inp);
2330 8045110000 : const __m128i in1 = _mm_loadl_epi64((__m128i *)(inp + input_stride));
2331 4022560000 : const __m128i re0 = _mm_loadl_epi64((__m128i *)rec);
2332 4022560000 : const __m128i re1 = _mm_loadl_epi64((__m128i *)(rec + recon_stride));
2333 4022560000 : const __m256i in = _mm256_setr_m128i(in0, in1);
2334 4022560000 : const __m256i re = _mm256_setr_m128i(re0, re1);
2335 4022560000 : Distortion_AVX2_INTRIN(in, re, &sum);
2336 4035410000 : inp += 2 * input_stride;
2337 4035410000 : rec += 2 * recon_stride;
2338 4035410000 : h -= 2;
2339 4035410000 : } while (h);
2340 : }
2341 351164000 : else if (leftover <= 16) {
2342 371710000 : h = area_height;
2343 : do {
2344 5678600000 : SpatialFullDistortionKernel16_AVX2_INTRIN(inp, rec, &sum);
2345 5678540000 : inp += input_stride;
2346 5678540000 : rec += recon_stride;
2347 5678540000 : } while (--h);
2348 :
2349 371646000 : if (leftover == 12) {
2350 0 : const __m256i mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
2351 0 : sum = _mm256_and_si256(sum, mask);
2352 : }
2353 : }
2354 : else {
2355 0 : __m256i sum1 = _mm256_setzero_si256();
2356 0 : h = area_height;
2357 : do {
2358 0 : SpatialFullDistortionKernel32Leftover_AVX2_INTRIN(inp, rec, &sum, &sum1);
2359 0 : inp += input_stride;
2360 0 : rec += recon_stride;
2361 0 : } while (--h);
2362 :
2363 : __m256i mask[2];
2364 0 : if (leftover == 20) {
2365 0 : mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
2366 0 : mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
2367 : }
2368 0 : else if (leftover == 24) {
2369 0 : mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
2370 0 : mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
2371 : }
2372 : else { // leftover = 28
2373 0 : mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
2374 0 : mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
2375 : }
2376 :
2377 0 : sum = _mm256_and_si256(sum, mask[0]);
2378 0 : sum1 = _mm256_and_si256(sum1, mask[1]);
2379 0 : sum = _mm256_add_epi32(sum, sum1);
2380 : }
2381 : }
2382 :
2383 1222480000 : area_width -= leftover;
2384 :
2385 1222480000 : if (area_width) {
2386 221196000 : const uint8_t *inp = input;
2387 221196000 : const uint8_t *rec = recon;
2388 221196000 : h = area_height;
2389 :
2390 221196000 : if (area_width == 32) {
2391 : do {
2392 3054760000 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp, rec, &sum);
2393 3034610000 : inp += input_stride;
2394 3034610000 : rec += recon_stride;
2395 3034610000 : } while (--h);
2396 : }
2397 43738500 : else if (area_width == 64) {
2398 : do {
2399 1178570000 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
2400 1175810000 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
2401 1175280000 : inp += input_stride;
2402 1175280000 : rec += recon_stride;
2403 1175280000 : } while (--h);
2404 : }
2405 0 : else if (area_width == 96) {
2406 : do {
2407 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
2408 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
2409 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 2 * 32, rec + 2 * 32, &sum);
2410 0 : inp += input_stride;
2411 0 : rec += recon_stride;
2412 0 : } while (--h);
2413 : }
2414 : else { // 128
2415 : do {
2416 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
2417 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
2418 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 2 * 32, rec + 2 * 32, &sum);
2419 0 : SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 3 * 32, rec + 3 * 32, &sum);
2420 0 : inp += input_stride;
2421 0 : rec += recon_stride;
2422 0 : } while (--h);
2423 : }
2424 : }
2425 :
2426 1199040000 : return Hadd32_AVX2_INTRIN(sum);
2427 : }
|