/*
 * Copyright(c) 2019 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */

#include "EbPackUnPack_AVX2.h"

#include <emmintrin.h>
#include <immintrin.h>
#include <stdint.h>

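/* Unpack 16-bit pixels (presumably 10-bit content stored in 16-bit lanes)
 * to 8-bit by shifting each sample right by 2 and packing the results,
 * dispatching to a width-specialized SIMD path. Two rows are processed per
 * iteration wherever the width allows it.
 */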
void eb_enc_un_pack8_bit_data_avx2_intrin(
    uint16_t *in_16bit_buffer,
    uint32_t in_stride,
    uint8_t *out_8bit_buffer,
    uint32_t out_stride,
    uint32_t width,
    uint32_t height) {
    uint32_t x, y;

    __m256i ymm_00FF, in_pixel0, in_pixel1;
    __m256i in_pixel0_shftR_2_U8;
    ymm_00FF = _mm256_set1_epi16(0x00FF);

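    /* width == 8: process two rows per iteration with 128-bit SSE2 ops. */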
    if (width == 8) {
        __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
        __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
        xmm_00FF = _mm_set1_epi16(0x00FF);
        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm_loadu_si128((__m128i*) in_16bit_buffer);
            in_pixel1 = _mm_loadu_si128((__m128i*)
                (in_16bit_buffer + in_stride));

            in_pixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(in_pixel0, 2),
                xmm_00FF);
            in_pixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(in_pixel1, 2),
                xmm_00FF);

            in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                in_pixel0_shftR_2);
            in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                in_pixel1_shftR_2);

            _mm_storel_epi64((__m128i*)out_8bit_buffer, in_pixel0_shftR_2_U8);
            _mm_storel_epi64((__m128i*)(out_8bit_buffer + out_stride),
                in_pixel1_shftR_2_U8);

            out_8bit_buffer += 2 * out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
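    /* width == 16: one 256-bit load per row. _mm256_packus_epi16 packs
     * within each 128-bit lane, so sequential output qwords are recovered
     * with extract indices 0, 2, 1, 3.
     */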
    else if (width == 16) {
        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm256_loadu_si256((__m256i*) in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride));

            in_pixel0_shftR_2_U8 = _mm256_packus_epi16(_mm256_and_si256(
                _mm256_srli_epi16(in_pixel0, 2), ymm_00FF), _mm256_and_si256(
                _mm256_srli_epi16(in_pixel1, 2), ymm_00FF));

            *(uint64_t *)out_8bit_buffer =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 2);
            *(uint64_t *)(out_8bit_buffer + out_stride) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 1);
            *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 3);

            out_8bit_buffer += 2 * out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
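    /* width == 32: two rows per iteration, two 256-bit loads per row;
     * stores use the same 0, 2, 1, 3 qword order as above.
     */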
    else if (width == 32) {
        __m256i in_pixel2, in_pixel3;
        __m256i out8_0_U8, out8_1_U8;

        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 16));
            in_pixel2 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride));
            in_pixel3 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride + 16));

            out8_0_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2), ymm_00FF));
            out8_1_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2), ymm_00FF));

            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_0_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_0_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_0_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_0_U8, 3);
            out_8bit_buffer += out_stride;

            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_1_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_1_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_1_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_1_U8, 3);
            out_8bit_buffer += out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
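    /* width == 64: four 256-bit loads cover one full row per iteration. */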
    else if (width == 64) {
        __m256i in_pixel2, in_pixel3;
        __m256i out8_0_U8, out8_1_U8;

        for (y = 0; y < height; ++y) {
            in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 16));
            in_pixel2 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 32));
            in_pixel3 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 48));

            out8_0_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2), ymm_00FF));
            out8_1_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2), ymm_00FF));

            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_0_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_0_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_0_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_0_U8, 3);
            *(uint64_t *)(out_8bit_buffer + 32) =
                _mm256_extract_epi64(out8_1_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 40) =
                _mm256_extract_epi64(out8_1_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 48) =
                _mm256_extract_epi64(out8_1_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 56) =
                _mm256_extract_epi64(out8_1_U8, 3);

            out_8bit_buffer += out_stride;
            in_16bit_buffer += in_stride;
        }
    }
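    /* Generic widths: the strideDiff terms advance the pointers from the
     * end of a processed row span to the start of the next row (pair).
     */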
    else {
        uint32_t in_strideDiff = (2 * in_stride) - width;
        uint32_t out_strideDiff = (2 * out_stride) - width;

        uint32_t in_strideDiff64 = in_stride - width;
        uint32_t out_strideDiff64 = out_stride - width;

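        /* Width is a multiple of 64: one row per outer iteration. */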
        if (!(width & 63)) {
            __m256i in_pixel2, in_pixel3;
            __m256i out8_0_U8, out8_1_U8;

            for (x = 0; x < height; x += 1) {
                for (y = 0; y < width; y += 64) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 16));
                    in_pixel2 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 32));
                    in_pixel3 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 48));

                    out8_0_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2),
                        ymm_00FF));
                    out8_1_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2),
                        ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(out8_0_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(out8_0_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 16) =
                        _mm256_extract_epi64(out8_0_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 24) =
                        _mm256_extract_epi64(out8_0_U8, 3);
                    *(uint64_t *)(out_8bit_buffer + 32) =
                        _mm256_extract_epi64(out8_1_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 40) =
                        _mm256_extract_epi64(out8_1_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 48) =
                        _mm256_extract_epi64(out8_1_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 56) =
                        _mm256_extract_epi64(out8_1_U8, 3);

                    out_8bit_buffer += 64;
                    in_16bit_buffer += 64;
                }
                in_16bit_buffer += in_strideDiff64;
                out_8bit_buffer += out_strideDiff64;
            }
        }
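        /* Width is a multiple of 32: two rows per outer iteration. */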
        else if (!(width & 31)) {
            __m256i in_pixel2, in_pixel3;
            __m256i out8_0_U8, out8_1_U8;

            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 32) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + 16));
                    in_pixel2 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride));
                    in_pixel3 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride + 16));

                    out8_0_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2),
                        ymm_00FF));
                    out8_1_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2),
                        ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(out8_0_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(out8_0_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 16) =
                        _mm256_extract_epi64(out8_0_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 24) =
                        _mm256_extract_epi64(out8_0_U8, 3);

                    *(uint64_t *)(out_8bit_buffer + out_stride) =
                        _mm256_extract_epi64(out8_1_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                        _mm256_extract_epi64(out8_1_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 16) =
                        _mm256_extract_epi64(out8_1_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 24) =
                        _mm256_extract_epi64(out8_1_U8, 3);

                    out_8bit_buffer += 32;
                    in_16bit_buffer += 32;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
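        /* Width is a multiple of 16: two rows packed per 256-bit step. */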
        else if (!(width & 15)) {
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 16) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*) in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2_U8 = _mm256_packus_epi16(
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2),
                        ymm_00FF), _mm256_and_si256(_mm256_srli_epi16(
                        in_pixel1, 2), ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + out_stride) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 3);

                    out_8bit_buffer += 16;
                    in_16bit_buffer += 16;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
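        /* Width is a multiple of 8: 128-bit SSE2 path, two rows at a time. */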
        else if (!(width & 7)) {
            __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
            __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
            xmm_00FF = _mm_set1_epi16(0x00FF);
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 8) {
                    in_pixel0 = _mm_loadu_si128((__m128i*) in_16bit_buffer);
                    in_pixel1 = _mm_loadu_si128((__m128i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel0, 2), xmm_00FF);
                    in_pixel1_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel1, 2), xmm_00FF);

                    in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                        in_pixel0_shftR_2);
                    in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                        in_pixel1_shftR_2);

                    _mm_storel_epi64((__m128i*)out_8bit_buffer,
                        in_pixel0_shftR_2_U8);
                    _mm_storel_epi64((__m128i*)(out_8bit_buffer + out_stride),
                        in_pixel1_shftR_2_U8);

                    out_8bit_buffer += 8;
                    in_16bit_buffer += 8;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
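        /* Remaining widths: 4 pixels at a time via 64-bit loads, then a
         * scalar tail for any leftover columns.
         */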
        else {
            __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
            __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
            xmm_00FF = _mm_set1_epi16(0x00FF);
            uint32_t width_down4 = width & (~0x3);
            uint16_t in_pixel;
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width_down4; y += 4) {
                    in_pixel0 = _mm_loadl_epi64((__m128i*)in_16bit_buffer);
                    in_pixel1 = _mm_loadl_epi64((__m128i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel0, 2), xmm_00FF);
                    in_pixel1_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel1, 2), xmm_00FF);

                    in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                        in_pixel0_shftR_2);
                    in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                        in_pixel1_shftR_2);

                    *(uint32_t*)out_8bit_buffer =
                        _mm_cvtsi128_si32(in_pixel0_shftR_2_U8);
                    *(uint32_t*)(out_8bit_buffer + out_stride) =
                        _mm_cvtsi128_si32(in_pixel1_shftR_2_U8);

                    out_8bit_buffer += 4;
                    in_16bit_buffer += 4;
                }

                /* Handle the leftover pixels in both rows when width is
                 * not divisible by 4.
                 */
                for (; y < width; y++) {
                    in_pixel = *in_16bit_buffer;
                    *out_8bit_buffer = (uint8_t)(in_pixel >> 2);
                    in_pixel = *(in_16bit_buffer + in_stride);
                    *(out_8bit_buffer + out_stride) = (uint8_t)(in_pixel >> 2);
                    ++out_8bit_buffer;
                    ++in_16bit_buffer;
                }

                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
    }
}