Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h>
13 : #include "EbHighbdIntraPrediction_SSE2.h"
14 : #include "EbDefinitions.h"
15 : #include "aom_dsp_rtcd.h"
16 :
17 : // -----------------------------------------------------------------------------
18 : // H_PRED
19 :
// Horizontal predictor for a 4x4 block of 16-bit samples: output row r is
// left[r] repeated four times. `above` and `bd` are not used.
void eb_aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    // left[0..3] occupy the low 64 bits. Each shuffle immediate broadcasts one
    // of those four words across the low half, which is the only part stored.
    const __m128i edge = _mm_loadl_epi64((const __m128i *)left);
    _mm_storel_epi64((__m128i *)(dst + 0 * stride), _mm_shufflelo_epi16(edge, 0x00));
    _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_shufflelo_epi16(edge, 0x55));
    _mm_storel_epi64((__m128i *)(dst + 2 * stride), _mm_shufflelo_epi16(edge, 0xaa));
    _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_shufflelo_epi16(edge, 0xff));
}
38 :
// Horizontal predictor for a 4-wide, 8-tall block: two stacked 4x4 tiles, each
// fed with the next four entries of `left`.
void eb_aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    for (int32_t tile = 0; tile < 2; ++tile) {
        eb_aom_highbd_h_predictor_4x4_sse2(dst + tile * 4 * stride, stride, above,
                                           left + tile * 4, bd);
    }
}
47 :
// Horizontal predictor for a 4-wide, 16-tall block: four stacked 4x4 tiles,
// each consuming the next four entries of `left`.
void eb_aom_highbd_h_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    for (int32_t tile = 0; tile < 4; ++tile) {
        eb_aom_highbd_h_predictor_4x4_sse2(dst + tile * 4 * stride, stride, above,
                                           left + tile * 4, bd);
    }
}
62 :
// Horizontal predictor for an 8-wide, 4-tall block of 16-bit samples: output
// row r is left[r] repeated eight times. `above` and `bd` are not used.
//
// Only left[0..3] are consumed, so they are fetched with a 64-bit load. This
// avoids reading four entries past the block height and removes the 16-byte
// alignment requirement that a full-register aligned load would put on `left`.
// Each destination row is written with an aligned 128-bit store, so every row
// start (dst + r * stride) must be 16-byte aligned.
void eb_aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
    // Broadcast one left sample across the low four lanes ...
    const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
    const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
    const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
    const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
    (void)above;
    (void)bd;
    // ... then duplicate the low half into the high half to fill all eight.
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
    dst += stride;
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
    dst += stride;
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
    dst += stride;
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
}
81 :
// Horizontal predictor for an 8x8 block of 16-bit samples: output row r is
// left[r] repeated eight times. `above` and `bd` are not used. `left` is read
// with an aligned 128-bit load and every row is written with an aligned
// 128-bit store.
void eb_aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    const __m128i edge = _mm_load_si128((const __m128i *)left);
    uint16_t *out = dst;
    __m128i bcast;

    // Rows 0-3: broadcast inside the low half, then mirror low -> high.
    bcast = _mm_shufflelo_epi16(edge, 0x00);
    _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflelo_epi16(edge, 0x55);
    _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflelo_epi16(edge, 0xaa);
    _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflelo_epi16(edge, 0xff);
    _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(bcast, bcast));
    out += stride;

    // Rows 4-7: broadcast inside the high half, then mirror high -> low.
    bcast = _mm_shufflehi_epi16(edge, 0x00);
    _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflehi_epi16(edge, 0x55);
    _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflehi_epi16(edge, 0xaa);
    _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(bcast, bcast));
    out += stride;
    bcast = _mm_shufflehi_epi16(edge, 0xff);
    _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(bcast, bcast));
}
112 :
// Horizontal predictor for an 8-wide, 16-tall block: two stacked 8x8 tiles,
// the second one using left[8..15].
void eb_aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    for (int32_t tile = 0; tile < 2; ++tile) {
        eb_aom_highbd_h_predictor_8x8_sse2(dst + tile * 8 * stride, stride, above,
                                           left + tile * 8, bd);
    }
}
121 :
// Horizontal predictor for an 8-wide, 32-tall block: four stacked 8x8 tiles,
// each consuming the next eight entries of `left`.
void eb_aom_highbd_h_predictor_8x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    for (int32_t tile = 0; tile < 4; ++tile) {
        eb_aom_highbd_h_predictor_8x8_sse2(dst + tile * 8 * stride, stride, above,
                                           left + tile * 8, bd);
    }
}
136 :
137 0 : static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
138 : const __m128i *row) {
139 0 : const __m128i val = _mm_unpacklo_epi64(*row, *row);
140 0 : _mm_store_si128((__m128i *)*dst, val);
141 0 : _mm_store_si128((__m128i *)(*dst + 8), val);
142 0 : *dst += stride;
143 0 : }
144 :
145 0 : static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
146 : const __m128i *row) {
147 0 : const __m128i val = _mm_unpackhi_epi64(*row, *row);
148 0 : _mm_store_si128((__m128i *)(*dst), val);
149 0 : _mm_store_si128((__m128i *)(*dst + 8), val);
150 0 : *dst += stride;
151 0 : }
152 :
153 0 : static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
154 : const uint16_t *left) {
155 0 : const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
156 0 : const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
157 0 : const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
158 0 : const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
159 0 : const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
160 0 : const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
161 0 : const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
162 0 : const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
163 0 : const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
164 0 : h_store_16_unpacklo(&dst, stride, &row0);
165 0 : h_store_16_unpacklo(&dst, stride, &row1);
166 0 : h_store_16_unpacklo(&dst, stride, &row2);
167 0 : h_store_16_unpacklo(&dst, stride, &row3);
168 0 : h_store_16_unpackhi(&dst, stride, &row4);
169 0 : h_store_16_unpackhi(&dst, stride, &row5);
170 0 : h_store_16_unpackhi(&dst, stride, &row6);
171 0 : h_store_16_unpackhi(&dst, stride, &row7);
172 0 : }
173 :
// Horizontal predictor for a 16-wide, 8-tall block: a single 16x8 tile.
// `above` and `bd` are not used.
void eb_aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    h_predictor_16x8(dst, stride, left);
    (void)bd;
    (void)above;
}
181 :
// Horizontal predictor for a 16x16 block: two stacked 16x8 tiles, the second
// one driven by left[8..15]. `above` and `bd` are not used.
void eb_aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    h_predictor_16x8(dst, stride, left);
    h_predictor_16x8(dst + 8 * stride, stride, left + 8);
}
194 :
// Horizontal predictor for a 16-wide, 32-tall block: four stacked 16x8 tiles,
// each consuming the next eight entries of `left`. `above` and `bd` are not
// used.
void eb_aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    for (int32_t band = 0; band < 4; ++band) {
        h_predictor_16x8(dst + band * 8 * stride, stride, left + band * 8);
    }
}
207 :
208 0 : static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
209 : const __m128i *row) {
210 0 : const __m128i val = _mm_unpacklo_epi64(*row, *row);
211 0 : _mm_store_si128((__m128i *)(*dst), val);
212 0 : _mm_store_si128((__m128i *)(*dst + 8), val);
213 0 : _mm_store_si128((__m128i *)(*dst + 16), val);
214 0 : _mm_store_si128((__m128i *)(*dst + 24), val);
215 0 : *dst += stride;
216 0 : }
217 :
218 0 : static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
219 : const __m128i *row) {
220 0 : const __m128i val = _mm_unpackhi_epi64(*row, *row);
221 0 : _mm_store_si128((__m128i *)(*dst), val);
222 0 : _mm_store_si128((__m128i *)(*dst + 8), val);
223 0 : _mm_store_si128((__m128i *)(*dst + 16), val);
224 0 : _mm_store_si128((__m128i *)(*dst + 24), val);
225 0 : *dst += stride;
226 0 : }
227 :
228 0 : static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
229 : const uint16_t *left) {
230 0 : const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
231 0 : const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
232 0 : const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
233 0 : const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
234 0 : const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
235 0 : const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
236 0 : const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
237 0 : const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
238 0 : const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
239 0 : h_store_32_unpacklo(&dst, stride, &row0);
240 0 : h_store_32_unpacklo(&dst, stride, &row1);
241 0 : h_store_32_unpacklo(&dst, stride, &row2);
242 0 : h_store_32_unpacklo(&dst, stride, &row3);
243 0 : h_store_32_unpackhi(&dst, stride, &row4);
244 0 : h_store_32_unpackhi(&dst, stride, &row5);
245 0 : h_store_32_unpackhi(&dst, stride, &row6);
246 0 : h_store_32_unpackhi(&dst, stride, &row7);
247 0 : }
248 :
// Horizontal predictor for a 32-wide, 16-tall block: two stacked 32x8 tiles,
// the second one driven by left[8..15]. `above` and `bd` are not used.
void eb_aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    h_predictor_32x8(dst, stride, left);
    h_predictor_32x8(dst + 8 * stride, stride, left + 8);
}
261 :
// Horizontal predictor for a 32x32 block: four stacked 32x8 tiles, each
// consuming the next eight entries of `left`. `above` and `bd` are not used.
void eb_aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    for (int32_t band = 0; band < 4; ++band) {
        h_predictor_32x8(dst + band * 8 * stride, stride, left + band * 8);
    }
}
274 :
275 : // -----------------------------------------------------------------------------
276 : // DC_TOP, DC_LEFT, DC_128
277 :
278 : // 4x4
279 :
280 0 : static INLINE void dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int32_t height,
281 : const __m128i *dc) {
282 0 : const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
283 : int32_t i;
284 0 : for (i = 0; i < height; ++i, dst += stride)
285 0 : _mm_storel_epi64((__m128i *)dst, dc_dup);
286 0 : }
287 :
288 0 : void eb_aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
289 : const uint16_t *above,
290 : const uint16_t *left, int32_t bd) {
291 0 : const __m128i two = _mm_cvtsi32_si128(2);
292 0 : const __m128i sum = dc_sum_4(left);
293 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
294 : (void)above;
295 : (void)bd;
296 0 : dc_store_4xh(dst, stride, 4, &dc);
297 0 : }
298 :
299 0 : void eb_aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
300 : const uint16_t *above,
301 : const uint16_t *left, int32_t bd) {
302 0 : const __m128i two = _mm_cvtsi32_si128(2);
303 0 : const __m128i sum = dc_sum_4(above);
304 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
305 : (void)left;
306 : (void)bd;
307 0 : dc_store_4xh(dst, stride, 4, &dc);
308 0 : }
309 :
// DC_128 for a 4x4 block: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    (void)above;
    (void)left;
    const int32_t mid_level = 1 << (bd - 1);
    const __m128i dc = _mm_cvtsi32_si128(mid_level);
    dc_store_4xh(dst, stride, 4, &dc);
}
318 :
319 : // -----------------------------------------------------------------------------
320 : // 4x8
321 :
322 0 : void eb_aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
323 : const uint16_t *above,
324 : const uint16_t *left, int32_t bd) {
325 0 : const __m128i sum = dc_sum_8(left);
326 0 : const __m128i four = _mm_cvtsi32_si128(4);
327 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
328 : (void)above;
329 : (void)bd;
330 0 : dc_store_4xh(dst, stride, 8, &dc);
331 0 : }
332 :
333 0 : void eb_aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
334 : const uint16_t *above,
335 : const uint16_t *left, int32_t bd) {
336 0 : const __m128i two = _mm_cvtsi32_si128(2);
337 0 : const __m128i sum = dc_sum_4(above);
338 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
339 : (void)left;
340 : (void)bd;
341 0 : dc_store_4xh(dst, stride, 8, &dc);
342 0 : }
343 :
// DC_128 for a 4-wide, 8-tall block: fills the block with the constant
// 1 << (bd - 1). `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    (void)above;
    (void)left;
    const int32_t mid_level = 1 << (bd - 1);
    const __m128i dc = _mm_cvtsi32_si128(mid_level);
    dc_store_4xh(dst, stride, 8, &dc);
}
352 :
353 : // -----------------------------------------------------------------------------
354 : // 4x16
355 :
356 0 : static INLINE __m128i dc_sum_16(const uint16_t *const src) {
357 0 : const __m128i s_lo = _mm_load_si128((const __m128i *)(src + 0));
358 0 : const __m128i s_hi = _mm_load_si128((const __m128i *)(src + 8));
359 : __m128i sum, sum_hi;
360 0 : sum = _mm_add_epi16(s_lo, s_hi);
361 0 : sum_hi = _mm_srli_si128(sum, 8);
362 0 : sum = _mm_add_epi16(sum, sum_hi);
363 0 : sum_hi = _mm_srli_si128(sum, 4);
364 0 : sum = _mm_add_epi16(sum, sum_hi);
365 0 : sum_hi = _mm_srli_si128(sum, 2);
366 0 : sum = _mm_add_epi16(sum, sum_hi);
367 0 : return sum;
368 : }
369 :
370 0 : void eb_aom_highbd_dc_left_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
371 : const uint16_t *above,
372 : const uint16_t *left, int32_t bd) {
373 0 : const __m128i sum = dc_sum_16(left);
374 0 : const __m128i eight = _mm_cvtsi32_si128(8);
375 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
376 : (void)above;
377 : (void)bd;
378 0 : dc_store_4xh(dst, stride, 16, &dc);
379 0 : }
380 :
381 0 : void eb_aom_highbd_dc_top_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
382 : const uint16_t *above,
383 : const uint16_t *left, int32_t bd) {
384 0 : const __m128i two = _mm_cvtsi32_si128(2);
385 0 : const __m128i sum = dc_sum_4(above);
386 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
387 : (void)left;
388 : (void)bd;
389 0 : dc_store_4xh(dst, stride, 16, &dc);
390 0 : }
391 :
// DC_128 for a 4-wide, 16-tall block: fills the block with the constant
// 1 << (bd - 1). `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int32_t bd) {
    (void)above;
    (void)left;
    const int32_t mid_level = 1 << (bd - 1);
    const __m128i dc = _mm_cvtsi32_si128(mid_level);
    dc_store_4xh(dst, stride, 16, &dc);
}
400 :
401 : // -----------------------------------------------------------------------------
402 : // 8xh
403 :
404 0 : static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int32_t height,
405 : const __m128i *dc) {
406 0 : const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
407 0 : const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
408 : int32_t i;
409 0 : for (i = 0; i < height; ++i, dst += stride)
410 : _mm_store_si128((__m128i *)dst, dc_dup);
411 0 : }
412 :
413 : // -----------------------------------------------------------------------------
414 : // DC_TOP
415 :
416 0 : static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
417 : int32_t height, const uint16_t *above) {
418 0 : const __m128i four = _mm_cvtsi32_si128(4);
419 0 : const __m128i sum = dc_sum_8(above);
420 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
421 0 : dc_store_8xh(dst, stride, height, &dc);
422 0 : }
423 :
// DC_TOP for an 8-wide, 4-tall block; forwards to dc_top_predictor_8xh with a
// height of 4. `left` and `bd` are not used.
void eb_aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    const int32_t rows = 4;
    dc_top_predictor_8xh(dst, stride, rows, above);
    (void)bd;
    (void)left;
}
431 :
// DC_TOP for an 8x8 block; forwards to dc_top_predictor_8xh with a height of
// 8. `left` and `bd` are not used.
void eb_aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    const int32_t rows = 8;
    dc_top_predictor_8xh(dst, stride, rows, above);
    (void)bd;
    (void)left;
}
439 :
// DC_TOP for an 8-wide, 16-tall block; forwards to dc_top_predictor_8xh with
// a height of 16. `left` and `bd` are not used.
void eb_aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int32_t bd) {
    const int32_t rows = 16;
    dc_top_predictor_8xh(dst, stride, rows, above);
    (void)bd;
    (void)left;
}
447 :
448 : // -----------------------------------------------------------------------------
449 : // DC_LEFT
450 :
451 0 : static INLINE __m128i dc_sum_32(const uint16_t *ref) {
452 0 : const __m128i zero = _mm_setzero_si128();
453 0 : const __m128i sum_a = dc_sum_16(ref);
454 0 : const __m128i sum_b = dc_sum_16(ref + 16);
455 : // 12 bit bd will outrange, so expand to 32 bit before adding final total
456 0 : return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
457 : _mm_unpacklo_epi16(sum_b, zero));
458 : }
459 :
460 0 : void eb_aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
461 : const uint16_t *above,
462 : const uint16_t *left, int32_t bd) {
463 0 : const __m128i two = _mm_cvtsi32_si128(2);
464 0 : const __m128i sum = dc_sum_4(left);
465 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
466 : (void)above;
467 : (void)bd;
468 0 : dc_store_8xh(dst, stride, 4, &dc);
469 0 : }
470 :
471 0 : void eb_aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
472 : const uint16_t *above,
473 : const uint16_t *left, int32_t bd) {
474 0 : const __m128i four = _mm_cvtsi32_si128(4);
475 0 : const __m128i sum = dc_sum_8(left);
476 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
477 : (void)above;
478 : (void)bd;
479 0 : dc_store_8xh(dst, stride, 8, &dc);
480 0 : }
481 :
482 0 : void eb_aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
483 : const uint16_t *above,
484 : const uint16_t *left, int32_t bd) {
485 0 : const __m128i eight = _mm_cvtsi32_si128(8);
486 0 : const __m128i sum = dc_sum_16(left);
487 0 : const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
488 : (void)above;
489 : (void)bd;
490 0 : dc_store_8xh(dst, stride, 16, &dc);
491 0 : }
492 :
493 0 : void eb_aom_highbd_dc_left_predictor_8x32_sse2(uint16_t *dst, ptrdiff_t stride,
494 : const uint16_t *above,
495 : const uint16_t *left, int32_t bd) {
496 0 : const __m128i sixteen = _mm_cvtsi32_si128(16);
497 0 : const __m128i sum = dc_sum_32(left);
498 0 : const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
499 : (void)above;
500 : (void)bd;
501 0 : dc_store_8xh(dst, stride, 32, &dc);
502 0 : }
503 :
504 : // -----------------------------------------------------------------------------
505 : // DC_128
506 :
507 0 : static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
508 : int32_t height, int32_t bd) {
509 0 : const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
510 0 : dc_store_8xh(dst, stride, height, &dc);
511 0 : }
512 :
// DC_128 for an 8-wide, 4-tall block; forwards to dc_128_predictor_8xh with a
// height of 4. `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    const int32_t rows = 4;
    dc_128_predictor_8xh(dst, stride, rows, bd);
    (void)left;
    (void)above;
}
520 :
// DC_128 for an 8x8 block; forwards to dc_128_predictor_8xh with a height of
// 8. `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int32_t bd) {
    const int32_t rows = 8;
    dc_128_predictor_8xh(dst, stride, rows, bd);
    (void)left;
    (void)above;
}
528 :
// DC_128 for an 8-wide, 16-tall block; forwards to dc_128_predictor_8xh with
// a height of 16. `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int32_t bd) {
    const int32_t rows = 16;
    dc_128_predictor_8xh(dst, stride, rows, bd);
    (void)left;
    (void)above;
}
536 :
// DC_128 for an 8-wide, 32-tall block; forwards to dc_128_predictor_8xh with
// a height of 32. `above` and `left` are not used.
void eb_aom_highbd_dc_128_predictor_8x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int32_t bd) {
    const int32_t rows = 32;
    dc_128_predictor_8xh(dst, stride, rows, bd);
    (void)left;
    (void)above;
}
544 :
545 : // -----------------------------------------------------------------------------
546 : // V_PRED
547 :
// Vertical predictor for a 4-wide, 8-tall block: every output row is a copy
// of above[0..3]. `left` and `bd` are not used.
void eb_aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    (void)left;
    (void)bd;
    const __m128i top = _mm_loadl_epi64((const __m128i *)above);
    uint16_t *out = dst;
    for (int32_t r = 0; r < 8; ++r) {
        _mm_storel_epi64((__m128i *)out, top);
        out += stride;
    }
}
563 :
// Vertical predictor for a 4-wide, 16-tall block: every output row is a copy
// of above[0..3]. `left` and `bd` are not used.
void eb_aom_highbd_v_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    (void)left;
    (void)bd;
    const __m128i top = _mm_loadl_epi64((const __m128i *)above);
    uint16_t *out = dst;
    for (int32_t r = 0; r < 16; ++r) {
        _mm_storel_epi64((__m128i *)out, top);
        out += stride;
    }
}
579 :
// Vertical predictor for an 8-wide, 4-tall block: every output row is a copy
// of above[0..7]. `above` is read with an aligned 128-bit load and rows are
// written with aligned 128-bit stores. `left` and `bd` are not used.
void eb_aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int32_t bd) {
    (void)left;
    (void)bd;
    const __m128i top = _mm_load_si128((const __m128i *)above);
    for (int32_t r = 0; r < 4; ++r) {
        _mm_store_si128((__m128i *)(dst + r * stride), top);
    }
}
591 :
// Vertical predictor for an 8-wide, 16-tall block: every output row is a copy
// of above[0..7]. `above` is read with an aligned 128-bit load and rows are
// written with aligned 128-bit stores. `left` and `bd` are not used.
void eb_aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    (void)left;
    (void)bd;
    const __m128i top = _mm_load_si128((const __m128i *)above);
    uint16_t *out = dst;
    for (int32_t r = 0; r < 16; ++r) {
        _mm_store_si128((__m128i *)out, top);
        out += stride;
    }
}
607 :
// Vertical predictor for an 8-wide, 32-tall block: every output row is a copy
// of above[0..7]. `above` is read with an aligned 128-bit load and rows are
// written with aligned 128-bit stores. `left` and `bd` are not used.
void eb_aom_highbd_v_predictor_8x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int32_t bd) {
    (void)left;
    (void)bd;
    const __m128i top = _mm_load_si128((const __m128i *)above);
    uint16_t *out = dst;
    for (int32_t r = 0; r < 32; ++r) {
        _mm_store_si128((__m128i *)out, top);
        out += stride;
    }
}
623 :
624 : // -----------------------------------------------------------------------------
625 : // DC_PRED
626 :
627 0 : static INLINE __m128i dc_sum_8_32(const uint16_t *const src_8,
628 : const uint16_t *const src_32) {
629 0 : const __m128i s_8 = _mm_load_si128((const __m128i *)src_8);
630 0 : const __m128i s_32_0 = _mm_load_si128((const __m128i *)(src_32 + 0x00));
631 0 : const __m128i s_32_1 = _mm_load_si128((const __m128i *)(src_32 + 0x08));
632 0 : const __m128i s_32_2 = _mm_load_si128((const __m128i *)(src_32 + 0x10));
633 0 : const __m128i s_32_3 = _mm_load_si128((const __m128i *)(src_32 + 0x18));
634 0 : const __m128i s_32_sum0 = _mm_add_epi16(s_32_0, s_32_1);
635 0 : const __m128i s_32_sum1 = _mm_add_epi16(s_32_2, s_32_3);
636 0 : const __m128i s_32_sum = _mm_add_epi16(s_32_sum0, s_32_sum1);
637 0 : const __m128i sum = _mm_add_epi16(s_8, s_32_sum);
638 0 : return dc_sum_8x16bit_large(sum);
639 : }
640 :
641 0 : void eb_aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
642 : const uint16_t *above,
643 : const uint16_t *left, int32_t bd) {
644 : (void)bd;
645 0 : const __m128i sum_above = dc_sum_4(above);
646 0 : const __m128i sum_left = dc_sum_8(left);
647 0 : const __m128i sum = _mm_add_epi16(sum_above, sum_left);
648 0 : uint32_t sum32 = _mm_extract_epi16(sum, 0);
649 0 : sum32 += 6;
650 0 : sum32 /= 12;
651 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
652 : int32_t i;
653 0 : for (i = 0; i < 4; ++i) {
654 0 : _mm_storel_epi64((__m128i *)dst, row);
655 0 : dst += stride;
656 0 : _mm_storel_epi64((__m128i *)dst, row);
657 0 : dst += stride;
658 : }
659 0 : }
660 :
661 0 : void eb_aom_highbd_dc_predictor_4x16_sse2(uint16_t *dst, ptrdiff_t stride,
662 : const uint16_t *above,
663 : const uint16_t *left, int32_t bd) {
664 : (void)bd;
665 0 : __m128i sum_above = dc_sum_4(above);
666 0 : __m128i sum_left = dc_sum_16(left);
667 0 : const __m128i zero = _mm_setzero_si128();
668 0 : sum_left = _mm_unpacklo_epi16(sum_left, zero);
669 0 : sum_above = _mm_unpacklo_epi16(sum_above, zero);
670 0 : const __m128i sum = _mm_add_epi32(sum_left, sum_above);
671 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
672 0 : sum32 += 10;
673 0 : sum32 /= 20;
674 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
675 : int32_t i;
676 0 : for (i = 0; i < 8; ++i) {
677 0 : _mm_storel_epi64((__m128i *)dst, row);
678 0 : dst += stride;
679 0 : _mm_storel_epi64((__m128i *)dst, row);
680 0 : dst += stride;
681 : }
682 0 : }
683 :
684 0 : void eb_aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
685 : const uint16_t *above,
686 : const uint16_t *left, int32_t bd) {
687 : (void)bd;
688 0 : const __m128i sum_left = dc_sum_4(left);
689 0 : const __m128i sum_above = dc_sum_8(above);
690 0 : const __m128i sum = _mm_add_epi16(sum_above, sum_left);
691 0 : uint32_t sum32 = _mm_extract_epi16(sum, 0);
692 0 : sum32 += 6;
693 0 : sum32 /= 12;
694 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
695 :
696 : _mm_store_si128((__m128i *)dst, row);
697 0 : dst += stride;
698 : _mm_store_si128((__m128i *)dst, row);
699 0 : dst += stride;
700 : _mm_store_si128((__m128i *)dst, row);
701 0 : dst += stride;
702 : _mm_store_si128((__m128i *)dst, row);
703 0 : }
704 :
705 0 : void eb_aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
706 : const uint16_t *above,
707 : const uint16_t *left, int32_t bd) {
708 : (void)bd;
709 0 : __m128i sum_left = dc_sum_16(left);
710 0 : __m128i sum_above = dc_sum_8(above);
711 0 : const __m128i zero = _mm_setzero_si128();
712 0 : sum_left = _mm_unpacklo_epi16(sum_left, zero);
713 0 : sum_above = _mm_unpacklo_epi16(sum_above, zero);
714 0 : const __m128i sum = _mm_add_epi32(sum_left, sum_above);
715 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
716 0 : sum32 += 12;
717 0 : sum32 /= 24;
718 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
719 : int32_t i;
720 0 : for (i = 0; i < 4; ++i) {
721 : _mm_store_si128((__m128i *)dst, row);
722 0 : dst += stride;
723 : _mm_store_si128((__m128i *)dst, row);
724 0 : dst += stride;
725 : _mm_store_si128((__m128i *)dst, row);
726 0 : dst += stride;
727 : _mm_store_si128((__m128i *)dst, row);
728 0 : dst += stride;
729 : }
730 0 : }
731 :
732 0 : void eb_aom_highbd_dc_predictor_8x32_sse2(uint16_t *dst, ptrdiff_t stride,
733 : const uint16_t *above,
734 : const uint16_t *left, int32_t bd) {
735 : (void)bd;
736 0 : const __m128i sum = dc_sum_8_32(above, left);
737 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
738 0 : sum32 += 20;
739 0 : sum32 /= 40;
740 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
741 : int32_t i;
742 0 : for (i = 0; i < 8; ++i) {
743 : _mm_store_si128((__m128i *)dst, row);
744 0 : dst += stride;
745 : _mm_store_si128((__m128i *)dst, row);
746 0 : dst += stride;
747 : _mm_store_si128((__m128i *)dst, row);
748 0 : dst += stride;
749 : _mm_store_si128((__m128i *)dst, row);
750 0 : dst += stride;
751 : }
752 0 : }
753 0 : void eb_aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
754 : const uint16_t *above,
755 : const uint16_t *left, int32_t bd) {
756 : (void)bd;
757 0 : __m128i sum_left = dc_sum_16(left);
758 0 : __m128i sum_above = dc_sum_32(above);
759 0 : const __m128i zero = _mm_setzero_si128();
760 0 : sum_left = _mm_unpacklo_epi16(sum_left, zero);
761 0 : const __m128i sum = _mm_add_epi32(sum_left, sum_above);
762 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
763 0 : sum32 += 24;
764 0 : sum32 /= 48;
765 0 : const __m128i row = _mm_set1_epi16((uint16_t)sum32);
766 : int32_t i;
767 0 : for (i = 0; i < 4; ++i) {
768 : _mm_store_si128((__m128i *)dst, row);
769 0 : _mm_store_si128((__m128i *)(dst + 8), row);
770 0 : _mm_store_si128((__m128i *)(dst + 16), row);
771 0 : _mm_store_si128((__m128i *)(dst + 24), row);
772 0 : dst += stride;
773 : _mm_store_si128((__m128i *)dst, row);
774 0 : _mm_store_si128((__m128i *)(dst + 8), row);
775 0 : _mm_store_si128((__m128i *)(dst + 16), row);
776 0 : _mm_store_si128((__m128i *)(dst + 24), row);
777 0 : dst += stride;
778 : _mm_store_si128((__m128i *)dst, row);
779 0 : _mm_store_si128((__m128i *)(dst + 8), row);
780 0 : _mm_store_si128((__m128i *)(dst + 16), row);
781 0 : _mm_store_si128((__m128i *)(dst + 24), row);
782 0 : dst += stride;
783 : _mm_store_si128((__m128i *)dst, row);
784 0 : _mm_store_si128((__m128i *)(dst + 8), row);
785 0 : _mm_store_si128((__m128i *)(dst + 16), row);
786 0 : _mm_store_si128((__m128i *)(dst + 24), row);
787 0 : dst += stride;
788 : }
789 0 : }
|