Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbPackUnPack_SSE2.h"
7 :
8 : #include <emmintrin.h>
9 : #include <stdint.h>
10 :
11 : /****************************************************************************************
12 : eb_enc_msb_un_pack2d_sse2_intrin
13 : ******************************************************************************************/
14 :
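/*
 * Splits each 10-bit pixel (held in a uint16_t) of a width x height block into
 * two byte planes:
 *   out8_bit_buffer <- pixel >> 2        (the 8 most significant bits)
 *   outn_bit_buffer <- (pixel & 3) << 6  (the 2 least significant bits, packed
 *                                         into the top of a byte)
 * Dedicated SSE2 paths cover widths 4, 8, 16, 32 and 64; a generic path below
 * handles any other width that is a multiple of 4.
 */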
15 0 : void eb_enc_msb_un_pack2d_sse2_intrin(
16 : uint16_t *in16_bit_buffer,
17 : uint32_t in_stride,
18 : uint8_t *out8_bit_buffer,
19 : uint8_t *outn_bit_buffer,
20 : uint32_t out8_stride,
21 : uint32_t outn_stride,
22 : uint32_t width,
23 : uint32_t height)
24 : {
25 : uint32_t x, y;
26 :
27 : __m128i xmm_3, xmm_00FF, inPixel0, inPixel1, tempPixel0, tempPixel1, inPixel1_shftR_2_U8, inPixel0_shftR_2_U8, inPixel0_shftR_2, inPixel1_shftR_2;
28 : __m128i tempPixel0_U8, tempPixel1_U8;
29 :
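    /* xmm_3 isolates the 2 LSBs of each 16-bit pixel; xmm_00FF keeps only the
       low byte after the pixel has been shifted right by 2. */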
30 0 : xmm_3 = _mm_set1_epi16(0x0003);
31 0 : xmm_00FF = _mm_set1_epi16(0x00FF);
32 :
33 0 : if (width == 4)
34 : {
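        /* Two rows per iteration: the 2 LSBs are shifted up to bits 7..6 for
           outn_bit_buffer, the remaining bits are shifted down by 2 for
           out8_bit_buffer, and _mm_packus_epi16 narrows the 16-bit lanes to
           bytes.  The wider branches below repeat the same pattern with more
           lanes per row. */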
35 0 : for (y = 0; y < height; y += 2)
36 : {
37 0 : inPixel0 = _mm_loadl_epi64((__m128i*)in16_bit_buffer);
38 0 : inPixel1 = _mm_loadl_epi64((__m128i*)(in16_bit_buffer + in_stride));
39 :
40 0 : tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
41 0 : tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
42 :
43 0 : tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
44 0 : tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
45 :
46 0 : inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
47 0 : inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
48 :
49 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
50 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
51 :
52 0 : *(uint32_t*)outn_bit_buffer = _mm_cvtsi128_si32(tempPixel0_U8);
53 0 : *(uint32_t*)(outn_bit_buffer + outn_stride) = _mm_cvtsi128_si32(tempPixel1_U8);
54 0 : *(uint32_t*)out8_bit_buffer = _mm_cvtsi128_si32(inPixel0_shftR_2_U8);
55 0 : *(uint32_t*)(out8_bit_buffer + out8_stride) = _mm_cvtsi128_si32(inPixel1_shftR_2_U8);
56 :
57 0 : outn_bit_buffer += 2 * outn_stride;
58 0 : out8_bit_buffer += 2 * out8_stride;
59 0 : in16_bit_buffer += 2 * in_stride;
60 : }
61 : }
62 0 : else if (width == 8)
63 : {
64 0 : for (y = 0; y < height; y += 2)
65 : {
66 0 : inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
67 0 : inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
68 :
69 0 : tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
70 0 : tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
71 :
72 0 : tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
73 0 : tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
74 :
75 0 : inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
76 0 : inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
77 :
78 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
79 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
80 :
81 0 : _mm_storel_epi64((__m128i*)outn_bit_buffer, tempPixel0_U8);
82 0 : _mm_storel_epi64((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
83 0 : _mm_storel_epi64((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
84 0 : _mm_storel_epi64((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
85 :
86 0 : outn_bit_buffer += 2 * outn_stride;
87 0 : out8_bit_buffer += 2 * out8_stride;
88 0 : in16_bit_buffer += 2 * in_stride;
89 : }
90 : }
91 0 : else if (width == 16)
92 : {
93 : __m128i inPixel2, inPixel3;
94 :
95 0 : for (y = 0; y < height; y += 2)
96 : {
97 0 : inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
98 0 : inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + 8));
99 0 : inPixel2 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
100 0 : inPixel3 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 8));
101 :
102 0 : tempPixel0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
103 0 : tempPixel1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
104 :
105 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
106 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
107 :
108 : _mm_storeu_si128((__m128i*)outn_bit_buffer, tempPixel0_U8);
109 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
110 : _mm_storeu_si128((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
111 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
112 :
113 0 : outn_bit_buffer += 2 * outn_stride;
114 0 : out8_bit_buffer += 2 * out8_stride;
115 0 : in16_bit_buffer += 2 * in_stride;
116 : }
117 : }
118 0 : else if (width == 32)
119 : {
120 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
121 : __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
122 :
123 0 : for (y = 0; y < height; y += 2)
124 : {
125 0 : inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
126 0 : inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
127 0 : inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
128 0 : inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
129 0 : inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride));
130 0 : inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride + 8));
131 0 : inPixel6 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 16));
132 0 : inPixel7 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 24));
133 :
134 0 : outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
135 0 : outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
136 0 : outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
137 0 : outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
138 :
139 0 : out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
140 0 : out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
141 0 : out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
142 0 : out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
143 :
144 : _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
145 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
146 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), outn2_U8);
147 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride + 16), outn3_U8);
148 :
149 : _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
150 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
151 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), out8_2_U8);
152 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride + 16), out8_3_U8);
153 :
154 0 : outn_bit_buffer += 2 * outn_stride;
155 0 : out8_bit_buffer += 2 * out8_stride;
156 0 : in16_bit_buffer += 2 * in_stride;
157 : }
158 : }
159 0 : else if (width == 64)
160 : {
161 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
162 : __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
163 :
164 0 : for (y = 0; y < height; ++y)
165 : {
166 0 : inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
167 0 : inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
168 0 : inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
169 0 : inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
170 0 : inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 32));
171 0 : inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 40));
172 0 : inPixel6 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 48));
173 0 : inPixel7 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 56));
174 :
175 0 : outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
176 0 : outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
177 0 : outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
178 0 : outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
179 :
180 0 : out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
181 0 : out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
182 0 : out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
183 0 : out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
184 :
185 : _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
186 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
187 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 32), outn2_U8);
188 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 48), outn3_U8);
189 :
190 : _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
191 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
192 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 32), out8_2_U8);
193 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 48), out8_3_U8);
194 :
195 0 : outn_bit_buffer += outn_stride;
196 0 : out8_bit_buffer += out8_stride;
197 0 : in16_bit_buffer += in_stride;
198 : }
199 : }
200 : else
201 : {
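        /* Generic path for widths outside the specialized cases above.  The
           *StrideDiff values step the pointers from the end of one processed
           row pair to the start of the next; the *StrideDiff64 values do the
           same for the single-row 64-pixel inner loop. */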
202 0 : uint32_t inStrideDiff = (2 * in_stride) - width;
203 0 : uint32_t out8StrideDiff = (2 * out8_stride) - width;
204 0 : uint32_t outnStrideDiff = (2 * outn_stride) - width;
205 :
206 0 : uint32_t inStrideDiff64 = in_stride - width;
207 0 : uint32_t out8StrideDiff64 = out8_stride - width;
208 0 : uint32_t outnStrideDiff64 = outn_stride - width;
209 :
210 0 : if (!(width & 63))
211 : {
212 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
213 : __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
214 :
215 0 : for (x = 0; x < height; x += 1) {
216 0 : for (y = 0; y < width; y += 64) {
217 0 : inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
218 0 : inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
219 0 : inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
220 0 : inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
221 0 : inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 32));
222 0 : inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 40));
223 0 : inPixel6 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 48));
224 0 : inPixel7 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 56));
225 :
226 0 : outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
227 0 : outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
228 0 : outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
229 0 : outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
230 :
231 0 : out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
232 0 : out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
233 0 : out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
234 0 : out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
235 :
236 : _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
237 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
238 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 32), outn2_U8);
239 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 48), outn3_U8);
240 :
241 : _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
242 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
243 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 32), out8_2_U8);
244 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 48), out8_3_U8);
245 :
246 0 : outn_bit_buffer += 64;
247 0 : out8_bit_buffer += 64;
248 0 : in16_bit_buffer += 64;
249 : }
250 0 : in16_bit_buffer += inStrideDiff64;
251 0 : outn_bit_buffer += outnStrideDiff64;
252 0 : out8_bit_buffer += out8StrideDiff64;
253 : }
254 : }
255 0 : else if (!(width & 31))
256 : {
257 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
258 : __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
259 :
260 0 : for (x = 0; x < height; x += 2)
261 : {
262 0 : for (y = 0; y < width; y += 32)
263 : {
264 0 : inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
265 0 : inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
266 0 : inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
267 0 : inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
268 0 : inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride));
269 0 : inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride + 8));
270 0 : inPixel6 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 16));
271 0 : inPixel7 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 24));
272 :
273 0 : outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
274 0 : outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
275 0 : outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
276 0 : outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
277 :
278 0 : out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
279 0 : out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
280 0 : out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
281 0 : out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
282 :
283 : _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
284 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
285 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), outn2_U8);
286 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride + 16), outn3_U8);
287 :
288 : _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
289 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
290 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), out8_2_U8);
291 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride + 16), out8_3_U8);
292 :
293 0 : outn_bit_buffer += 32;
294 0 : out8_bit_buffer += 32;
295 0 : in16_bit_buffer += 32;
296 : }
297 0 : in16_bit_buffer += inStrideDiff;
298 0 : outn_bit_buffer += outnStrideDiff;
299 0 : out8_bit_buffer += out8StrideDiff;
300 : }
301 : }
302 0 : else if (!(width & 15))
303 : {
304 : __m128i inPixel2, inPixel3;
305 :
306 0 : for (x = 0; x < height; x += 2)
307 : {
308 0 : for (y = 0; y < width; y += 16)
309 : {
310 0 : inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
311 0 : inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + 8));
312 0 : inPixel2 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
313 0 : inPixel3 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 8));
314 :
315 0 : tempPixel0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
316 0 : tempPixel1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
317 :
318 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
319 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
320 :
321 : _mm_storeu_si128((__m128i*)outn_bit_buffer, tempPixel0_U8);
322 0 : _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
323 : _mm_storeu_si128((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
324 0 : _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
325 :
326 0 : outn_bit_buffer += 16;
327 0 : out8_bit_buffer += 16;
328 0 : in16_bit_buffer += 16;
329 : }
330 0 : in16_bit_buffer += inStrideDiff;
331 0 : outn_bit_buffer += outnStrideDiff;
332 0 : out8_bit_buffer += out8StrideDiff;
333 : }
334 : }
335 0 : else if (!(width & 7))
336 : {
337 0 : for (x = 0; x < height; x += 2)
338 : {
339 0 : for (y = 0; y < width; y += 8)
340 : {
341 0 : inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
342 0 : inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
343 :
344 0 : tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
345 0 : tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
346 :
347 0 : tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
348 0 : tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
349 :
350 0 : inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
351 0 : inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
352 :
353 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
354 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
355 :
356 0 : _mm_storel_epi64((__m128i*)outn_bit_buffer, tempPixel0_U8);
357 0 : _mm_storel_epi64((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
358 0 : _mm_storel_epi64((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
359 0 : _mm_storel_epi64((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
360 :
361 0 : outn_bit_buffer += 8;
362 0 : out8_bit_buffer += 8;
363 0 : in16_bit_buffer += 8;
364 : }
365 0 : in16_bit_buffer += inStrideDiff;
366 0 : outn_bit_buffer += outnStrideDiff;
367 0 : out8_bit_buffer += out8StrideDiff;
368 : }
369 : }
370 : else
371 : {
372 0 : for (x = 0; x < height; x += 2)
373 : {
374 0 : for (y = 0; y < width; y += 4)
375 : {
376 0 : inPixel0 = _mm_loadl_epi64((__m128i*)in16_bit_buffer);
377 0 : inPixel1 = _mm_loadl_epi64((__m128i*)(in16_bit_buffer + in_stride));
378 :
379 0 : tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
380 0 : tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
381 :
382 0 : tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
383 0 : tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
384 :
385 0 : inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
386 0 : inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
387 :
388 0 : inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
389 0 : inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
390 :
391 0 : *(uint32_t*)outn_bit_buffer = _mm_cvtsi128_si32(tempPixel0_U8);
392 0 : *(uint32_t*)(outn_bit_buffer + outn_stride) = _mm_cvtsi128_si32(tempPixel1_U8);
393 0 : *(uint32_t*)out8_bit_buffer = _mm_cvtsi128_si32(inPixel0_shftR_2_U8);
394 0 : *(uint32_t*)(out8_bit_buffer + out8_stride) = _mm_cvtsi128_si32(inPixel1_shftR_2_U8);
395 :
396 0 : outn_bit_buffer += 4;
397 0 : out8_bit_buffer += 4;
398 0 : in16_bit_buffer += 4;
399 : }
400 0 : in16_bit_buffer += inStrideDiff;
401 0 : outn_bit_buffer += outnStrideDiff;
402 0 : out8_bit_buffer += out8StrideDiff;
403 : }
404 : }
405 : }
406 0 : return;
407 : }
408 :
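/****************************************************************************************
unpack_avg_sse2_intrin
******************************************************************************************/
/*
 * Reduces two 10-bit reference blocks to 8 bits (pixel >> 2) and stores their
 * rounded average, computed with _mm_avg_epu8 ((a + b + 1) >> 1), in dst_ptr.
 * Only widths of 4, 8, 16, 32 and 64 are handled.
 */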
409 0 : void unpack_avg_sse2_intrin(
410 : uint16_t *ref16_l0,
411 : uint32_t ref_l0_stride,
412 : uint16_t *ref16_l1,
413 : uint32_t ref_l1_stride,
414 : uint8_t *dst_ptr,
415 : uint32_t dst_stride,
416 : uint32_t width,
417 : uint32_t height)
418 : {
419 : uint32_t y;
420 : __m128i inPixel0, inPixel1;
421 :
422 0 : if (width == 4)
423 : {
424 : __m128i out8_0_U8_L0, out8_0_U8_L1;
425 : __m128i avg8_0_U8;
426 :
427 0 : for (y = 0; y < height; y += 2)
428 : {
429 : //--------
430 : //Line One
431 : //--------
432 :
433 : //List0
434 0 : inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l0);
435 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
436 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
437 :
438 : //List1
439 0 : inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l1);
440 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
441 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
442 :
443 : //AVG
444 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
445 :
446 0 : *(uint32_t*)dst_ptr = _mm_cvtsi128_si32(avg8_0_U8);
447 :
448 : //--------
449 : //Line Two
450 : //--------
451 :
452 : //List0
453 0 : inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l0 + ref_l0_stride));
454 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
455 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
456 :
457 : //List1
458 :
459 0 : inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l1 + ref_l1_stride));
460 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
461 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
462 :
463 : //AVG
464 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
465 :
466 0 : *(uint32_t*)(dst_ptr + dst_stride) = _mm_cvtsi128_si32(avg8_0_U8);
467 :
468 0 : dst_ptr += 2 * dst_stride;
469 0 : ref16_l0 += 2 * ref_l0_stride;
470 0 : ref16_l1 += 2 * ref_l1_stride;
471 : }
472 : }
473 0 : else if (width == 8)
474 : {
475 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
476 : __m128i avg8_0_U8, avg8_2_U8;
477 :
478 0 : for (y = 0; y < height; y += 2)
479 : {
480 : //--------
481 : //Line One
482 : //--------
483 :
484 : //List0
485 :
486 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
487 :
488 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
489 0 : out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
490 :
491 : //List1
492 :
493 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
494 :
495 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
496 0 : out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
497 :
498 : //AVG
499 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
500 :
501 0 : _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
502 :
503 : //--------
504 : //Line Two
505 : //--------
506 :
507 : //List0
508 :
509 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
510 :
511 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
512 0 : out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
513 :
514 : //List1
515 :
516 0 : inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
517 :
518 0 : inPixel1 = _mm_srli_epi16(inPixel0, 2);
519 0 : out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
520 :
521 : //AVG
522 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
523 :
524 0 : _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
525 :
526 0 : dst_ptr += 2 * dst_stride;
527 0 : ref16_l0 += 2 * ref_l0_stride;
528 0 : ref16_l1 += 2 * ref_l1_stride;
529 : }
530 : }
531 0 : else if (width == 16)
532 : {
533 : __m128i inPixel4, inPixel5;
534 : __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
535 : __m128i avg8_0_U8, avg8_2_U8;
536 :
537 0 : for (y = 0; y < height; y += 2)
538 : {
539 : //--------
540 : //Line One
541 : //--------
542 :
543 : //List0
544 :
545 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
546 0 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
547 :
548 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
549 :
550 : //List1
551 :
552 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
553 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
554 :
555 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
556 :
557 : //AVG
558 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
559 :
560 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
561 :
562 : //--------
563 : //Line Two
564 : //--------
565 :
566 : //List0
567 :
568 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
569 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
570 :
571 0 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
572 :
573 : //List1
574 :
575 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
576 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
577 :
578 0 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
579 :
580 : //AVG
581 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
582 :
583 0 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
584 :
585 0 : dst_ptr += 2 * dst_stride;
586 0 : ref16_l0 += 2 * ref_l0_stride;
587 0 : ref16_l1 += 2 * ref_l1_stride;
588 : }
589 : }
590 0 : else if (width == 32)
591 : {
592 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
593 : __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
594 : __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
595 : __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
596 :
597 0 : for (y = 0; y < height; y += 2)
598 : {
599 : //--------
600 : //Line One
601 : //--------
602 :
603 : //List0
604 :
605 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
606 0 : inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
607 0 : inPixel2 = _mm_loadu_si128((__m128i*) (ref16_l0 + 16));
608 0 : inPixel3 = _mm_loadu_si128((__m128i*) (ref16_l0 + 24));
609 :
610 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
611 0 : out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
612 :
613 : //List1
614 :
615 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
616 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
617 0 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
618 0 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
619 :
620 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
621 0 : out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
622 :
623 : //AVG
624 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
625 0 : avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
626 :
627 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
628 0 : _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
629 :
630 : //--------
631 : //Line Two
632 : //--------
633 :
634 : //List0
635 :
636 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
637 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
638 0 : inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 16));
639 0 : inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 24));
640 :
641 0 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
642 0 : out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
643 :
644 : //List1
645 :
646 0 : inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
647 0 : inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
648 0 : inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 16));
649 0 : inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 24));
650 :
651 0 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
652 0 : out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
653 :
654 : //AVG
655 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
656 0 : avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
657 :
658 0 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
659 0 : _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
660 :
661 0 : dst_ptr += 2 * dst_stride;
662 0 : ref16_l0 += 2 * ref_l0_stride;
663 0 : ref16_l1 += 2 * ref_l1_stride;
664 : }
665 : }
666 0 : else if (width == 64)
667 : {
668 : __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
669 : __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
670 : __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
671 : __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
672 :
673 0 : for (y = 0; y < height; ++y)
674 : {
675 : //List0
676 :
677 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
678 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l0 + 8));
679 0 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l0 + 16));
680 0 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l0 + 24));
681 0 : inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l0 + 32));
682 0 : inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l0 + 40));
683 0 : inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l0 + 48));
684 0 : inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l0 + 56));
685 :
686 0 : out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
687 0 : out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
688 0 : out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
689 0 : out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
690 :
691 : //List1
692 :
693 0 : inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
694 0 : inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
695 0 : inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
696 0 : inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
697 0 : inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l1 + 32));
698 0 : inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l1 + 40));
699 0 : inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l1 + 48));
700 0 : inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l1 + 56));
701 :
702 : //Note: the old version used _mm_and_si128 to mask off the MSBs of the pixels
703 0 : out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
704 0 : out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
705 0 : out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
706 0 : out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
707 :
708 : //AVG
709 0 : avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
710 0 : avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
711 0 : avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
712 0 : avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
713 :
714 : _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
715 0 : _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
716 0 : _mm_storeu_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
717 0 : _mm_storeu_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
718 :
719 0 : dst_ptr += dst_stride;
720 0 : ref16_l0 += ref_l0_stride;
721 0 : ref16_l1 += ref_l1_stride;
722 : }
723 : }
724 :
725 0 : return;
726 : }
727 : /********************************************************************************************************************
728 : eb_enc_msb_pack2d_sse2_intrin
729 : *********************************************************************************************************************/
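/*
 * Inverse of the unpack kernel above: each 8-bit pixel is interleaved with its
 * 2-bit companion byte and the 16-bit result is shifted right by 6, producing
 *   out16 = (in8 << 2) | (inn >> 6)
 * i.e. a 10-bit pixel stored in a uint16_t.  Dedicated paths cover widths
 * 4, 8, 16, 32 and 64, with a generic path for other multiples of 4.
 */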
730 0 : void eb_enc_msb_pack2d_sse2_intrin(
731 : uint8_t *in8_bit_buffer,
732 : uint32_t in8_stride,
733 : uint8_t *inn_bit_buffer,
734 : uint16_t *out16_bit_buffer,
735 : uint32_t inn_stride,
736 : uint32_t out_stride,
737 : uint32_t width,
738 : uint32_t height)
739 : {
740 : uint32_t count_width, count_height;
741 :
742 0 : if (width == 4) {
743 0 : for (count_height = 0; count_height < height; count_height += 2) {
744 0 : _mm_storel_epi64((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer)),
745 0 : _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer))), 6));
746 0 : _mm_storel_epi64((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer + inn_stride)),
747 0 : _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer + in8_stride))), 6));
748 0 : out16_bit_buffer += (out_stride << 1);
749 0 : in8_bit_buffer += (in8_stride << 1);
750 0 : inn_bit_buffer += (inn_stride << 1);
751 : }
752 : }
753 0 : else if (width == 8) {
754 0 : for (count_height = 0; count_height < height; count_height += 2) {
755 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer)),
756 : _mm_loadl_epi64((__m128i*)(in8_bit_buffer))), 6));
757 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)),
758 0 : _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6));
759 0 : out16_bit_buffer += (out_stride << 1);
760 0 : in8_bit_buffer += (in8_stride << 1);
761 0 : inn_bit_buffer += (inn_stride << 1);
762 : }
763 : }
764 0 : else if (width == 16) {
765 : __m128i outPixel1, outPixel2, outPixel3, outPixel4, innBitBuffer_lo, innBitBuffer_hi, in8BitBuffer_lo, in8BitBuffer_hi;
766 :
767 0 : for (count_height = 0; count_height < height; count_height += 2) {
768 0 : innBitBuffer_lo = _mm_loadu_si128((__m128i *)inn_bit_buffer);
769 0 : innBitBuffer_hi = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride));
770 0 : in8BitBuffer_lo = _mm_loadu_si128((__m128i *)in8_bit_buffer);
771 0 : in8BitBuffer_hi = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride));
772 :
773 0 : outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer_lo, in8BitBuffer_lo), 6);
774 0 : outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer_lo, in8BitBuffer_lo), 6);
775 0 : outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer_hi, in8BitBuffer_hi), 6);
776 0 : outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer_hi, in8BitBuffer_hi), 6);
777 :
778 : _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
779 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
780 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), outPixel3);
781 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 8), outPixel4);
782 :
783 0 : in8_bit_buffer += (in8_stride << 1);
784 0 : inn_bit_buffer += (inn_stride << 1);
785 0 : out16_bit_buffer += (out_stride << 1);
786 : }
787 : }
788 0 : else if (width == 32) {
789 : __m128i innBitBuffer1, innBitBuffer2, innBitBuffer3, innBitBuffer4, in8BitBuffer1, in8BitBuffer2, in8BitBuffer3, in8BitBuffer4;
790 : __m128i outPixel1, outPixel2, outPixel3, outPixel4, outPixel5, outPixel6, outPixel7, outPixel8;
791 :
792 0 : for (count_height = 0; count_height < height; count_height += 2)
793 : {
794 0 : innBitBuffer1 = _mm_loadu_si128((__m128i *)inn_bit_buffer);
795 0 : innBitBuffer2 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 16));
796 0 : innBitBuffer3 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride));
797 0 : innBitBuffer4 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride + 16));
798 :
799 0 : in8BitBuffer1 = _mm_loadu_si128((__m128i *)in8_bit_buffer);
800 0 : in8BitBuffer2 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 16));
801 0 : in8BitBuffer3 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride));
802 0 : in8BitBuffer4 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride + 16));
803 :
804 0 : outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer1, in8BitBuffer1), 6);
805 0 : outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer1, in8BitBuffer1), 6);
806 0 : outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer2, in8BitBuffer2), 6);
807 0 : outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer2, in8BitBuffer2), 6);
808 0 : outPixel5 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer3, in8BitBuffer3), 6);
809 0 : outPixel6 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer3, in8BitBuffer3), 6);
810 0 : outPixel7 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer4, in8BitBuffer4), 6);
811 0 : outPixel8 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer4, in8BitBuffer4), 6);
812 :
813 : _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
814 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
815 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 16), outPixel3);
816 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 24), outPixel4);
817 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), outPixel5);
818 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 8), outPixel6);
819 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 16), outPixel7);
820 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 24), outPixel8);
821 :
822 0 : in8_bit_buffer += (in8_stride << 1);
823 0 : inn_bit_buffer += (inn_stride << 1);
824 0 : out16_bit_buffer += (out_stride << 1);
825 : }
826 : }
827 0 : else if (width == 64) {
828 : __m128i innBitBuffer1, innBitBuffer2, innBitBuffer3, innBitBuffer4, in8BitBuffer1, in8BitBuffer2, in8BitBuffer3, in8BitBuffer4;
829 : __m128i outPixel1, outPixel2, outPixel3, outPixel4, outPixel5, outPixel6, outPixel7, outPixel8;
830 :
831 0 : for (count_height = 0; count_height < height; ++count_height)
832 : {
833 0 : innBitBuffer1 = _mm_loadu_si128((__m128i *)inn_bit_buffer);
834 0 : innBitBuffer2 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 16));
835 0 : innBitBuffer3 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 32));
836 0 : innBitBuffer4 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 48));
837 :
838 0 : in8BitBuffer1 = _mm_loadu_si128((__m128i *)in8_bit_buffer);
839 0 : in8BitBuffer2 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 16));
840 0 : in8BitBuffer3 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 32));
841 0 : in8BitBuffer4 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 48));
842 :
843 0 : outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer1, in8BitBuffer1), 6);
844 0 : outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer1, in8BitBuffer1), 6);
845 0 : outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer2, in8BitBuffer2), 6);
846 0 : outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer2, in8BitBuffer2), 6);
847 0 : outPixel5 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer3, in8BitBuffer3), 6);
848 0 : outPixel6 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer3, in8BitBuffer3), 6);
849 0 : outPixel7 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer4, in8BitBuffer4), 6);
850 0 : outPixel8 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer4, in8BitBuffer4), 6);
851 :
852 : _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
853 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
854 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 16), outPixel3);
855 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 24), outPixel4);
856 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 32), outPixel5);
857 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 40), outPixel6);
858 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 48), outPixel7);
859 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + 56), outPixel8);
860 :
861 0 : in8_bit_buffer += in8_stride;
862 0 : inn_bit_buffer += inn_stride;
863 0 : out16_bit_buffer += out_stride;
864 : }
865 : }
866 : else {
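        /* Generic path: width is a multiple of 8 (first branch) or of 4
           (second branch).  The *StrideDiff values advance the pointers from
           the end of one processed row pair to the start of the next. */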
867 0 : uint32_t innStrideDiff = (inn_stride << 1) - width;
868 0 : uint32_t in8StrideDiff = (in8_stride << 1) - width;
869 0 : uint32_t outStrideDiff = (out_stride << 1) - width;
870 :
871 0 : if (!(width & 7)) {
872 0 : for (count_height = 0; count_height < height; count_height += 2) {
873 0 : for (count_width = 0; count_width < width; count_width += 8) {
874 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer)),
875 : _mm_loadl_epi64((__m128i*)(in8_bit_buffer))), 6));
876 0 : _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)),
877 0 : _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6));
878 0 : out16_bit_buffer += 8;
879 0 : in8_bit_buffer += 8;
880 0 : inn_bit_buffer += 8;
881 : }
882 0 : in8_bit_buffer += in8StrideDiff;
883 0 : inn_bit_buffer += innStrideDiff;
884 0 : out16_bit_buffer += outStrideDiff;
885 : }
886 : }
887 : else {
888 0 : for (count_height = 0; count_height < height; count_height += 2) {
889 0 : for (count_width = 0; count_width < width; count_width += 4) {
890 0 : _mm_storel_epi64((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer)),
891 0 : _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer))), 6));
892 0 : _mm_storel_epi64((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer + inn_stride)),
893 0 : _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer + in8_stride))), 6));
894 0 : out16_bit_buffer += 4;
895 0 : in8_bit_buffer += 4;
896 0 : inn_bit_buffer += 4;
897 : }
898 0 : in8_bit_buffer += in8StrideDiff;
899 0 : inn_bit_buffer += innStrideDiff;
900 0 : out16_bit_buffer += outStrideDiff;
901 : }
902 : }
903 : }
904 0 : }
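/*
 * Illustrative usage sketch (not part of the library; kept out of the build
 * with #if 0).  It round-trips a small 4x2 block through the pack and unpack
 * kernels above; the function name, buffer contents and strides are example
 * choices only.
 */
#if 0
static void pack_unpack_example(void)
{
    uint8_t  in8[8]   = {10, 20, 30, 40, 50, 60, 70, 80}; /* 8-bit plane   */
    uint8_t  inn[8]   = {0};                               /* 2-bit plane   */
    uint16_t out16[8];                                     /* 10-bit pixels */
    uint8_t  out8[8], outn[8];

    /* (8-bit, 2-bit) planes -> 10-bit pixels; width 4, height 2, strides 4 */
    eb_enc_msb_pack2d_sse2_intrin(in8, 4, inn, out16, 4, 4, 4, 2);

    /* 10-bit pixels -> (8-bit, 2-bit) planes */
    eb_enc_msb_un_pack2d_sse2_intrin(out16, 4, out8, outn, 4, 4, 4, 2);
}
#endif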
|