/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbDefinitions.h"

#include "emmintrin.h"
#include "tmmintrin.h"

#ifndef PREFETCH
#define PREFETCH 0 // prefetching: enables prefetching of data before interpolation
#endif

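// 8-tap HEVC luma sub-pel interpolation filter coefficients, indexed by the
// fractional position (0 = integer, 1 = 1/4, 2 = 1/2, 3 = 3/4). Each row sums
// to 64 (a 6-bit filter gain).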
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
const int16_t lumaFilterCoeff[4][8] =
{
    { 0, 0,  0, 64,  0,  0, 0,  0},
    {-1, 4,-10, 58, 17, -5, 1,  0},
    {-1, 4,-11, 40, 40,-11, 4, -1},
    { 0, 1, -5, 17, 58,-10, 4, -1}
};

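// Same luma filter with the 3/4-pel row shifted left by one position, so both
// quarter-pel filters keep their nonzero taps in positions 0..6. This is the
// table used by the second-pass (*Raw7 / *RawM) kernels below.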
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
const int16_t lumaFilterCoeff7[4][8] =
{
    { 0, 0,  0, 64,  0,  0,  0, 0},
    {-1, 4,-10, 58, 17, -5,  1, 0},
    {-1, 4,-11, 40, 40,-11,  4,-1},
    { 1,-5, 17, 58,-10,  4, -1, 0}
};

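// 4-tap HEVC chroma sub-pel interpolation filter coefficients for the eight
// 1/8-pel fractional positions; each row sums to 64.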
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
const int16_t chromaFilterCoeff[8][4] =
{
    { 0, 64,  0,  0},
    {-2, 58, 10, -2},
    {-4, 54, 16, -2},
    {-6, 46, 28, -4},
    {-4, 36, 36, -4},
    {-4, 28, 46, -6},
    {-2, 16, 54, -4},
    {-2, 10, 58, -2},
};

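// Touch the first and last byte of every row so the block is pulled into the
// cache before interpolation. Compiles to a no-op unless PREFETCH is enabled.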
static void PrefetchBlock(uint8_t *src, uint32_t src_stride, uint32_t blkWidth, uint32_t blkHeight)
{
#if PREFETCH
    uint32_t row_count = blkHeight;

    do {
        uint8_t *addr0 = src;
        uint8_t *addr1 = addr0 + blkWidth - 1;
        src += src_stride;

        _mm_prefetch((char*)addr0, _MM_HINT_T0);
        _mm_prefetch((char*)addr1, _MM_HINT_T0);
    } while (--row_count != 0);
#else
    (void)src;
    (void)src_stride;
    (void)blkWidth;
    (void)blkHeight;
#endif
}

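// Second (vertical) pass of the 2-D luma interpolation. Consumes the 16-bit
// intermediates written by the horizontal pass, applies the vertical filter
// with three 16x16->32 madds per output pair, then rounds, shifts right by 12
// and packs/clips to 8-bit pixels in dst.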
void LumaInterpolationFilterTwoDInRaw7_SSSE3(int16_t *first_pass_if_dst, EbByte dst, uint32_t dst_stride, uint32_t pu_width, uint32_t pu_height, uint32_t frac_pos_y)
{
    int32_t row_count, col_count;
    __m128i c0, c1, c2;
    __m128i a0, a1, a2, a3, a4, a5, a6;
    __m128i sum0, sum1;
    __m128i b0l, b0h, b1l, b1h, b2l, b2h;

    EbByte qtr;

    // Broadcast the filter taps as three 32-bit coefficient pairs.
    c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[frac_pos_y]);
    c2 = _mm_shuffle_epi32(c0, 0xaa);
    c1 = _mm_shuffle_epi32(c0, 0x55);
    c0 = _mm_shuffle_epi32(c0, 0x00);

    if (pu_width & 4)
    {
        // 4-wide column strip: two output rows per iteration.
        row_count = pu_height;

        qtr = dst;

        do {
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
            // In the quarter-pel rows of lumaFilterCoeff7 the last nonzero tap
            // is the negative of the first, so folding row 6 into row 0 lets
            // three madds cover all taps.
            a0 = _mm_sub_epi16(a0, a6);

            // 1 << 11 rounding term for the >> 12 below, plus a bias that
            // undoes the offset subtracted by the first-pass kernels.
            sum0 = _mm_set1_epi32(257 << 11);
            sum1 = _mm_set1_epi32(257 << 11);

            b0l = _mm_unpacklo_epi16(a0, a1);
            b0h = _mm_unpackhi_epi16(a0, a1);
            b1l = _mm_unpacklo_epi16(a2, a3);
            b1h = _mm_unpackhi_epi16(a2, a3);
            b2l = _mm_unpacklo_epi16(a4, a5);
            b2h = _mm_unpackhi_epi16(a4, a5);

            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b0l, c0));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b0h, c0));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b1l, c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b1h, c1));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b2l, c2));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b2h, c2));

            sum0 = _mm_srai_epi32(sum0, 12);
            sum1 = _mm_srai_epi32(sum1, 12);
            sum0 = _mm_packs_epi32(sum0, sum1);
            sum0 = _mm_packus_epi16(sum0, sum0);

            *(uint32_t *)qtr = _mm_cvtsi128_si32(sum0); qtr += dst_stride;
            *(uint32_t *)qtr = _mm_cvtsi128_si32(_mm_srli_si128(sum0, 4)); qtr += dst_stride;

            first_pass_if_dst += 8;
            row_count -= 2;
        } while (row_count > 0);

        pu_width -= 4;
        if (pu_width == 0)
            return;
        // Skip the extra first-pass rows of this column strip; their count
        // depends on the fractional position.
        first_pass_if_dst += (frac_pos_y == 2) ? 32 : 24;
        dst += 4;
    }

    // Remaining width in 8-wide column strips, one output row per iteration.
    col_count = pu_width;
    do {
        EbByte qtr = dst;

        row_count = pu_height;
        do {
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
            a0 = _mm_sub_epi16(a0, a6);

            sum0 = _mm_set1_epi32(257 << 11);
            sum1 = _mm_set1_epi32(257 << 11);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a4, a5), c2));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a4, a5), c2));

            sum0 = _mm_srai_epi32(sum0, 12);
            sum1 = _mm_srai_epi32(sum1, 12);
            sum0 = _mm_packs_epi32(sum0, sum1);
            sum0 = _mm_packus_epi16(sum0, sum0);

            _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dst_stride;

            first_pass_if_dst += 8;
            row_count--;
        } while (row_count > 0);

        first_pass_if_dst += (frac_pos_y == 2) ? 56 : 48;
        dst += 8;
        col_count -= 8;
    } while (col_count > 0);
}

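// Same vertical pass as LumaInterpolationFilterTwoDInRaw7_SSSE3, but the
// result is kept as 16-bit intermediates (>> 6, no offset and no clipping to
// pixel range), e.g. for later bi-prediction averaging.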
void LumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(int16_t *first_pass_if_dst, int16_t *dst, uint32_t pu_width, uint32_t pu_height, uint32_t frac_pos_y)
{
    int32_t row_count, col_count;

    __m128i a0, a1, a2, a3, a4, a5, a6;
    __m128i c0, c1, c2;
    c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[frac_pos_y]);
    c2 = _mm_shuffle_epi32(c0, 0xaa);
    c1 = _mm_shuffle_epi32(c0, 0x55);
    c0 = _mm_shuffle_epi32(c0, 0x00);

    if (pu_width & 4)
    {
        row_count = pu_height;

        do {
            __m128i sum0, sum1;
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
            a0 = _mm_sub_epi16(a0, a6);

            sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
            sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a4, a5), c2));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a4, a5), c2));

            sum0 = _mm_srai_epi32(sum0, 6);
            sum1 = _mm_srai_epi32(sum1, 6);
            sum0 = _mm_packs_epi32(sum0, sum1);

            _mm_storeu_si128((__m128i *)dst, sum0);
            dst += 8;

            first_pass_if_dst += 8;
            row_count -= 2;
        } while (row_count > 0);

        pu_width -= 4;
        if (pu_width == 0)
            return;
        first_pass_if_dst += (frac_pos_y == 2) ? 32 : 24;
    }

    col_count = pu_width;
    do {
        row_count = pu_height;
        do {
            __m128i b0l, b0h, b1l, b1h, b2l, b2h;
            __m128i sum0, sum1;

            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
            a0 = _mm_sub_epi16(a0, a6);

            b0l = _mm_unpacklo_epi16(a0, a1);
            b0h = _mm_unpackhi_epi16(a0, a1);
            b1l = _mm_unpacklo_epi16(a2, a3);
            b1h = _mm_unpackhi_epi16(a2, a3);
            b2l = _mm_unpacklo_epi16(a4, a5);
            b2h = _mm_unpackhi_epi16(a4, a5);

            sum0 = _mm_madd_epi16(b0l, c0);
            sum1 = _mm_madd_epi16(b0h, c0);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b1l, c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b1h, c1));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b2l, c2));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b2h, c2));

            sum0 = _mm_srai_epi32(sum0, 6);
            sum1 = _mm_srai_epi32(sum1, 6);
            sum0 = _mm_packs_epi32(sum0, sum1);

            _mm_storeu_si128((__m128i *)dst, sum0);
            dst += 8;

            first_pass_if_dst += 8;
            row_count--;
        } while (row_count > 0);

        first_pass_if_dst += (frac_pos_y == 2) ? 56 : 48;
        col_count -= 8;
    } while (col_count > 0);
}

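// Vertical second pass for the half-pel position (lumaFilterCoeff7[2]). The
// 8-tap half-pel filter is symmetric, so mirrored input rows are added first
// (a0+a7, a1+a6, ...) and only two coefficient pairs are needed.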
void LumaInterpolationFilterTwoDInRawM_SSSE3(int16_t *first_pass_if_dst, EbByte dst, uint32_t dst_stride, uint32_t pu_width, uint32_t pu_height)
{
    int32_t row_count, col_count;

    __m128i c0, c1;
    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
    __m128i sum0, sum1;

    EbByte qtr;

    c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[2]);
    c1 = _mm_shuffle_epi32(c0, 0x55);
    c0 = _mm_shuffle_epi32(c0, 0x00);

    if (pu_width & 4) {
        row_count = pu_height;
        qtr = dst;

        do {
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
            a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 4));

            sum0 = _mm_set1_epi32(257 << 11);
            sum1 = _mm_set1_epi32(257 << 11);

            // The half-pel filter is symmetric, so mirrored rows share a tap.
            a0 = _mm_add_epi16(a0, a7);
            a1 = _mm_add_epi16(a1, a6);
            a2 = _mm_add_epi16(a2, a5);
            a3 = _mm_add_epi16(a3, a4);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));

            sum0 = _mm_srai_epi32(sum0, 12);
            sum1 = _mm_srai_epi32(sum1, 12);
            sum0 = _mm_packs_epi32(sum0, sum1);
            sum0 = _mm_packus_epi16(sum0, sum0);

            *(uint32_t *)qtr = _mm_cvtsi128_si32(sum0); qtr += dst_stride;
            *(uint32_t *)qtr = _mm_cvtsi128_si32(_mm_srli_si128(sum0, 4)); qtr += dst_stride;
            first_pass_if_dst += 8;
            row_count -= 2;
        } while (row_count > 0);

        pu_width -= 4;
        if (pu_width == 0)
            return;
        first_pass_if_dst += 32;
        dst += 4;
    }

    col_count = pu_width;
    do {
        qtr = dst;

        row_count = pu_height;
        do {
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
            a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 8));

            sum0 = _mm_set1_epi32(257 << 11);
            sum1 = _mm_set1_epi32(257 << 11);
            a0 = _mm_add_epi16(a0, a7);
            a1 = _mm_add_epi16(a1, a6);
            a2 = _mm_add_epi16(a2, a5);
            a3 = _mm_add_epi16(a3, a4);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));

            sum0 = _mm_srai_epi32(sum0, 12);
            sum1 = _mm_srai_epi32(sum1, 12);
            sum0 = _mm_packs_epi32(sum0, sum1);
            sum0 = _mm_packus_epi16(sum0, sum0);

            _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dst_stride;
            first_pass_if_dst += 8;
        } while (--row_count > 0);

        first_pass_if_dst += 56;
        dst += 8;
        col_count -= 8;
    } while (col_count > 0);
}

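// Half-pel vertical pass that keeps the 16-bit intermediate output (>> 6);
// the counterpart of LumaInterpolationFilterTwoDInRawOutRaw7_SSSE3.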
void LumaInterpolationFilterTwoDInRawOutRawM_SSSE3(int16_t *first_pass_if_dst, int16_t *dst, uint32_t pu_width, uint32_t pu_height)
{
    int32_t row_count, col_count;

    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
    __m128i c0, c1;
    c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[2]);
    c1 = _mm_shuffle_epi32(c0, 0x55);
    c0 = _mm_shuffle_epi32(c0, 0x00);

    if (pu_width & 4) {
        row_count = pu_height;

        do {
            __m128i sum0, sum1;
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
            a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 4));

            a0 = _mm_add_epi16(a0, a7);
            a1 = _mm_add_epi16(a1, a6);
            a2 = _mm_add_epi16(a2, a5);
            a3 = _mm_add_epi16(a3, a4);
            sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
            sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));

            sum0 = _mm_srai_epi32(sum0, 6);
            sum1 = _mm_srai_epi32(sum1, 6);
            sum0 = _mm_packs_epi32(sum0, sum1);

            _mm_storeu_si128((__m128i *)dst, sum0);
            dst += 8;
            first_pass_if_dst += 8;
            row_count -= 2;
        } while (row_count > 0);

        pu_width -= 4;
        if (pu_width == 0)
            return;
        first_pass_if_dst += 32;
    }

    col_count = pu_width;
    do {
        row_count = pu_height;
        do {
            __m128i sum0, sum1;
            a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
            a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
            a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
            a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
            a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
            a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
            a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
            a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 8));

            a0 = _mm_add_epi16(a0, a7);
            a1 = _mm_add_epi16(a1, a6);
            a2 = _mm_add_epi16(a2, a5);
            a3 = _mm_add_epi16(a3, a4);
            sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
            sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
            sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
            sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));

            sum0 = _mm_srai_epi32(sum0, 6);
            sum1 = _mm_srai_epi32(sum1, 6);
            sum0 = _mm_packs_epi32(sum0, sum1);

            _mm_storeu_si128((__m128i *)dst, sum0);
            dst += 8;
            first_pass_if_dst += 8;
        } while (--row_count > 0);

        first_pass_if_dst += 56;
        col_count -= 8;
    } while (col_count > 0);
}

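// Copy an integer-position block into the 16-bit intermediate format expected
// by the second-pass kernels: each pixel is widened to 16 bits, shifted left
// by 6 (the filter gain) and the given offset is subtracted. The width is
// processed in 2-, 4- and 8-pixel column strips.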
void PictureCopyKernelOutRaw_SSSE3(
    EbByte   ref_pic,
    uint32_t src_stride,
    int16_t *dst,
    uint32_t pu_width,
    uint32_t pu_height,
    int16_t  offset)
{
    uint32_t row_count, col_count;
    __m128i o;

    PrefetchBlock(ref_pic, src_stride, pu_width, pu_height);

    o = _mm_set1_epi16(offset);

    if (pu_width & 2) {
        // 2-wide strip: gather four rows of two pixels into one register.
        __m128i a0;
        EbByte ptr = ref_pic;
        row_count = pu_height;
        a0 = _mm_setzero_si128();
        do {
            a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 0); ptr += src_stride;
            a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 1); ptr += src_stride;
            a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 2); ptr += src_stride;
            a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 3); ptr += src_stride;
            a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
            a0 = _mm_slli_epi16(a0, 6);
            a0 = _mm_sub_epi16(a0, o);
            _mm_storeu_si128((__m128i *)dst, a0);

            dst += 8;
            row_count -= 4;
        } while (row_count != 0);

        pu_width -= 2;
        if (pu_width == 0)
            return;
        ref_pic += 2;
    }

    if (pu_width & 4) {
        // 4-wide strip: two rows per iteration.
        EbByte ptr = ref_pic;
        row_count = pu_height;
        do {
            __m128i a0, a1;
            a0 = _mm_cvtsi32_si128(*(uint32_t *)ptr); ptr += src_stride;
            a1 = _mm_cvtsi32_si128(*(uint32_t *)ptr); ptr += src_stride;
            a0 = _mm_unpacklo_epi32(a0, a1);
            a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
            a0 = _mm_slli_epi16(a0, 6);
            a0 = _mm_sub_epi16(a0, o);
            _mm_storeu_si128((__m128i *)dst, a0);

            dst += 8;
            row_count -= 2;
        } while (row_count != 0);

        pu_width -= 4;
        if (pu_width == 0)
            return;
        ref_pic += 4;
    }

    // Remaining width in 8-wide strips, one row per iteration.
    col_count = pu_width;
    do {
        __m128i a0;
        EbByte ptr = ref_pic;
        row_count = pu_height;
        do {
            a0 = _mm_loadl_epi64((__m128i *)ptr); ptr += src_stride;
            a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
            a0 = _mm_slli_epi16(a0, 6);
            a0 = _mm_sub_epi16(a0, o);
            _mm_storeu_si128((__m128i *)dst, a0);
            dst += 8;
        } while (--row_count != 0);

        col_count -= 8;
        ref_pic += 8;
    } while (col_count != 0);
}