Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <string.h>
#include "EbDefinitions.h"
#include "EbCabacContextModel.h"
#include "EbCommonUtils.h"
18 :
19 336937000 : static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
20 1010780000 : return _mm_castps_si128(
21 : _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
22 : }
23 :
24 105187000 : static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
25 : const int32_t byte_stride) {
26 105187000 : return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
27 105187000 : *(const int32_t *)((int8_t *)src + 1 * byte_stride),
28 105187000 : *(const int32_t *)((int8_t *)src + 2 * byte_stride),
29 105187000 : *(const int32_t *)((int8_t *)src + 3 * byte_stride));
30 : }
31 :
32 336973000 : static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
33 : const int32_t byte_stride) {
34 : __m128i dst;
35 336973000 : dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
36 336973000 : dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
37 336895000 : return dst;
38 : }
39 :
40 21057500 : static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
41 : const int32_t stride,
42 : const ptrdiff_t *const offsets,
43 : __m128i *const level) {
44 21057500 : level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
45 21054900 : level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
46 21048600 : level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
47 21047800 : level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
48 21046500 : level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
49 21046200 : }
50 :
51 67658400 : static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
52 : const int32_t stride,
53 : const ptrdiff_t *const offsets,
54 : __m128i *const level) {
55 67658400 : level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
56 67632000 : level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
57 67597000 : level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
58 67573100 : level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
59 67564800 : level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
60 67564800 : }
61 :
62 189985000 : static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
63 : const int32_t stride,
64 : const ptrdiff_t *const offsets,
65 : __m128i *const level) {
66 189985000 : level[0] = _mm_loadu_si128((__m128i *)(src + 1));
67 189985000 : level[1] = _mm_loadu_si128((__m128i *)(src + stride));
68 189985000 : level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
69 189985000 : level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
70 189985000 : level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
71 189985000 : }
72 :
73 277396000 : static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
74 277396000 : const __m128i const_3 = _mm_set1_epi8(3);
75 277396000 : const __m128i const_4 = _mm_set1_epi8(4);
76 : __m128i count;
77 :
78 277396000 : count = _mm_min_epu8(level[0], const_3);
79 277396000 : level[1] = _mm_min_epu8(level[1], const_3);
80 277396000 : level[2] = _mm_min_epu8(level[2], const_3);
81 277396000 : level[3] = _mm_min_epu8(level[3], const_3);
82 277396000 : level[4] = _mm_min_epu8(level[4], const_3);
83 277396000 : count = _mm_add_epi8(count, level[1]);
84 277396000 : count = _mm_add_epi8(count, level[2]);
85 277396000 : count = _mm_add_epi8(count, level[3]);
86 554793000 : count = _mm_add_epi8(count, level[4]);
87 554793000 : count = _mm_avg_epu8(count, _mm_setzero_si128());
88 277396000 : count = _mm_min_epu8(count, const_4);
89 277396000 : return count;
90 : }
91 :
92 11074900 : static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
93 : const int32_t height,
94 : const ptrdiff_t *const offsets,
95 : int8_t *const coeff_contexts) {
96 11074900 : const int32_t stride = 4 + TX_PAD_HOR;
97 11074900 : const __m128i pos_to_offset_large = _mm_set1_epi8(21);
98 11074900 : __m128i pos_to_offset =
99 : (height == 4)
100 7527890 : ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
101 22149900 : : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
102 : 21, 21);
103 : __m128i count;
104 : __m128i level[5];
105 11074900 : int8_t *cc = coeff_contexts;
106 11074900 : int32_t row = height;
107 :
108 : assert(!(height % 4));
109 :
110 : do {
111 17515500 : load_levels_4x4x5_sse2(levels, stride, offsets, level);
112 17510000 : count = get_coeff_contexts_kernel_sse2(level);
113 17519500 : count = _mm_add_epi8(count, pos_to_offset);
114 : _mm_store_si128((__m128i *)cc, count);
115 17519500 : pos_to_offset = pos_to_offset_large;
116 17519500 : levels += 4 * stride;
117 17519500 : cc += 16;
118 17519500 : row -= 4;
119 17519500 : } while (row);
120 :
121 11078900 : coeff_contexts[0] = 0;
122 11078900 : }
123 :
124 11872400 : static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
125 : const int32_t height,
126 : const ptrdiff_t *const offsets,
127 : int8_t *coeff_contexts) {
128 11872400 : const int32_t stride = 8 + TX_PAD_HOR;
129 11872400 : int8_t *cc = coeff_contexts;
130 11872400 : int32_t row = height;
131 : __m128i count;
132 : __m128i level[5];
133 : __m128i pos_to_offset[3];
134 :
135 : assert(!(height % 2));
136 :
137 11872400 : if (height == 8) {
138 7254990 : pos_to_offset[0] =
139 7254990 : _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
140 7254990 : pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
141 : 21, 21, 21, 21, 21);
142 : }
143 4617450 : else if (height < 8) {
144 2024490 : pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
145 : 21, 21, 21, 21);
146 2024490 : pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
147 : 21, 21, 21, 21, 21);
148 : }
149 : else {
150 2592960 : pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
151 : 11, 11, 11, 11, 11);
152 2592960 : pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
153 : 21, 21, 21, 21, 21);
154 : }
155 11872400 : pos_to_offset[2] = _mm_set1_epi8(21);
156 :
157 : do {
158 58312000 : load_levels_8x2x5_sse2(levels, stride, offsets, level);
159 58208600 : count = get_coeff_contexts_kernel_sse2(level);
160 116634000 : count = _mm_add_epi8(count, pos_to_offset[0]);
161 : _mm_store_si128((__m128i *)cc, count);
162 58317100 : pos_to_offset[0] = pos_to_offset[1];
163 58317100 : pos_to_offset[1] = pos_to_offset[2];
164 58317100 : levels += 2 * stride;
165 58317100 : cc += 16;
166 58317100 : row -= 2;
167 58317100 : } while (row);
168 :
169 11877500 : coeff_contexts[0] = 0;
170 11877500 : }
171 :
172 9094900 : static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
173 : const int32_t real_width,
174 : const int32_t real_height,
175 : const int32_t width, const int32_t height,
176 : const ptrdiff_t *const offsets,
177 : int8_t *coeff_contexts) {
178 9094900 : const int32_t stride = width + TX_PAD_HOR;
179 9094900 : int8_t *cc = coeff_contexts;
180 9094900 : int32_t row = height;
181 : __m128i pos_to_offset[5];
182 : __m128i pos_to_offset_large[3];
183 : __m128i count;
184 : __m128i level[5];
185 :
186 : assert(!(width % 16));
187 :
188 9094900 : pos_to_offset_large[2] = _mm_set1_epi8(21);
189 9094900 : if (real_width == real_height) {
190 3379460 : pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
191 : 21, 21, 21, 21);
192 3379460 : pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
193 : 21, 21, 21, 21, 21);
194 3379460 : pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
195 : 21, 21, 21, 21, 21);
196 3379460 : pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
197 : 21, 21, 21, 21, 21);
198 3379460 : pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
199 3379460 : pos_to_offset_large[2];
200 : }
201 5715440 : else if (real_width > real_height) {
202 4723200 : pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
203 : 21, 21, 21, 21, 21);
204 4723200 : pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
205 : 21, 21, 21, 21, 21);
206 4723200 : pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
207 : 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
208 4723200 : pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
209 : }
210 : else { // real_width < real_height
211 992249 : pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
212 : 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
213 992249 : pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
214 : 21, 21, 21, 21, 21);
215 992249 : pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
216 : 21, 21, 21, 21, 21);
217 992249 : pos_to_offset[4] = pos_to_offset_large[2];
218 992249 : pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
219 : }
220 :
221 : do {
222 137745000 : int32_t w = width;
223 :
224 : do {
225 186634000 : load_levels_16x1x5_sse2(levels, stride, offsets, level);
226 186376000 : count = get_coeff_contexts_kernel_sse2(level);
227 373800000 : count = _mm_add_epi8(count, pos_to_offset[0]);
228 : _mm_store_si128((__m128i *)cc, count);
229 186900000 : levels += 16;
230 186900000 : cc += 16;
231 186900000 : w -= 16;
232 186900000 : pos_to_offset[0] = pos_to_offset_large[0];
233 186900000 : } while (w);
234 :
235 138011000 : pos_to_offset[0] = pos_to_offset[1];
236 138011000 : pos_to_offset[1] = pos_to_offset[2];
237 138011000 : pos_to_offset[2] = pos_to_offset[3];
238 138011000 : pos_to_offset[3] = pos_to_offset[4];
239 138011000 : pos_to_offset_large[0] = pos_to_offset_large[1];
240 138011000 : pos_to_offset_large[1] = pos_to_offset_large[2];
241 138011000 : levels += TX_PAD_HOR;
242 138011000 : } while (--row);
243 :
244 9361100 : coeff_contexts[0] = 0;
245 9361100 : }
246 :
247 1455480 : static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
248 : const int32_t height,
249 : const ptrdiff_t *const offsets,
250 : int8_t *coeff_contexts) {
251 1455480 : const int32_t stride = 4 + TX_PAD_HOR;
252 : const __m128i pos_to_offset =
253 1455480 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
254 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
255 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
256 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
257 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
258 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
259 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
260 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
261 : __m128i count;
262 : __m128i level[5];
263 1455480 : int32_t row = height;
264 :
265 : assert(!(height % 4));
266 :
267 : do {
268 1883200 : load_levels_4x4x5_sse2(levels, stride, offsets, level);
269 1883120 : count = get_coeff_contexts_kernel_sse2(level);
270 1883280 : count = _mm_add_epi8(count, pos_to_offset);
271 : _mm_store_si128((__m128i *)coeff_contexts, count);
272 1883280 : levels += 4 * stride;
273 1883280 : coeff_contexts += 16;
274 1883280 : row -= 4;
275 1883280 : } while (row);
276 1455560 : }
277 :
278 1302920 : static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
279 : const int32_t height,
280 : const ptrdiff_t *const offsets,
281 : int8_t *coeff_contexts) {
282 1302920 : const int32_t stride = 4 + TX_PAD_HOR;
283 1302920 : const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
284 : __m128i pos_to_offset =
285 1302920 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
286 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
287 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
288 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
289 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
290 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
291 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
292 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
293 : __m128i count;
294 : __m128i level[5];
295 1302920 : int32_t row = height;
296 :
297 : assert(!(height % 4));
298 :
299 : do {
300 1660330 : load_levels_4x4x5_sse2(levels, stride, offsets, level);
301 1660290 : count = get_coeff_contexts_kernel_sse2(level);
302 1660400 : count = _mm_add_epi8(count, pos_to_offset);
303 : _mm_store_si128((__m128i *)coeff_contexts, count);
304 1660400 : pos_to_offset = pos_to_offset_large;
305 1660400 : levels += 4 * stride;
306 1660400 : coeff_contexts += 16;
307 1660400 : row -= 4;
308 1660400 : } while (row);
309 1302990 : }
310 :
311 1164390 : static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
312 : const int32_t height,
313 : const ptrdiff_t *const offsets,
314 : int8_t *coeff_contexts) {
315 1164390 : const int32_t stride = 8 + TX_PAD_HOR;
316 : const __m128i pos_to_offset =
317 1164390 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
318 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
319 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
320 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
321 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
322 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
323 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
324 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
325 1164390 : int32_t row = height;
326 : __m128i count;
327 : __m128i level[5];
328 :
329 : assert(!(height % 2));
330 :
331 : do {
332 4984300 : load_levels_8x2x5_sse2(levels, stride, offsets, level);
333 4983560 : count = get_coeff_contexts_kernel_sse2(level);
334 4984370 : count = _mm_add_epi8(count, pos_to_offset);
335 : _mm_store_si128((__m128i *)coeff_contexts, count);
336 4984370 : levels += 2 * stride;
337 4984370 : coeff_contexts += 16;
338 4984370 : row -= 2;
339 4984370 : } while (row);
340 1164460 : }
341 :
342 1037100 : static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
343 : const int32_t height,
344 : const ptrdiff_t *const offsets,
345 : int8_t *coeff_contexts) {
346 1037100 : const int32_t stride = 8 + TX_PAD_HOR;
347 1037100 : const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
348 : __m128i pos_to_offset =
349 1037100 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
350 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
351 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
352 : SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
353 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
354 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
355 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
356 : SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
357 1037100 : int32_t row = height;
358 : __m128i count;
359 : __m128i level[5];
360 :
361 : assert(!(height % 2));
362 :
363 : do {
364 4421060 : load_levels_8x2x5_sse2(levels, stride, offsets, level);
365 4420370 : count = get_coeff_contexts_kernel_sse2(level);
366 4421110 : count = _mm_add_epi8(count, pos_to_offset);
367 : _mm_store_si128((__m128i *)coeff_contexts, count);
368 4421110 : pos_to_offset = pos_to_offset_large;
369 4421110 : levels += 2 * stride;
370 4421110 : coeff_contexts += 16;
371 4421110 : row -= 2;
372 4421110 : } while (row);
373 1037160 : }
374 :
375 243991 : static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
376 : const int32_t width, const int32_t height,
377 : const ptrdiff_t *const offsets,
378 : int8_t *coeff_contexts) {
379 243991 : const int32_t stride = width + TX_PAD_HOR;
380 : const __m128i pos_to_offset_large =
381 243991 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
382 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
383 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
384 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
385 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
386 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
387 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
388 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
389 : __m128i count;
390 : __m128i level[5];
391 243991 : int32_t row = height;
392 :
393 : assert(!(width % 16));
394 :
395 : do {
396 : __m128i pos_to_offset =
397 1823180 : _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
398 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
399 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
400 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
401 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
402 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
403 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
404 : SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
405 1823180 : int32_t w = width;
406 :
407 : do {
408 1823180 : load_levels_16x1x5_sse2(levels, stride, offsets, level);
409 1823160 : count = get_coeff_contexts_kernel_sse2(level);
410 1823200 : count = _mm_add_epi8(count, pos_to_offset);
411 : _mm_store_si128((__m128i *)coeff_contexts, count);
412 1823200 : pos_to_offset = pos_to_offset_large;
413 1823200 : levels += 16;
414 1823200 : coeff_contexts += 16;
415 1823200 : w -= 16;
416 1823200 : } while (w);
417 :
418 1823190 : levels += TX_PAD_HOR;
419 1823190 : } while (--row);
420 244009 : }
421 :
422 245372 : static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
423 : const int32_t width, const int32_t height,
424 : const ptrdiff_t *const offsets,
425 : int8_t *coeff_contexts) {
426 245372 : const int32_t stride = width + TX_PAD_HOR;
427 : __m128i pos_to_offset[3];
428 : __m128i count;
429 : __m128i level[5];
430 245372 : int32_t row = height;
431 :
432 : assert(!(width % 16));
433 :
434 245372 : pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
435 245372 : pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
436 245372 : pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
437 :
438 : do {
439 1836460 : int32_t w = width;
440 :
441 : do {
442 1836460 : load_levels_16x1x5_sse2(levels, stride, offsets, level);
443 1836460 : count = get_coeff_contexts_kernel_sse2(level);
444 3673020 : count = _mm_add_epi8(count, pos_to_offset[0]);
445 : _mm_store_si128((__m128i *)coeff_contexts, count);
446 1836510 : levels += 16;
447 1836510 : coeff_contexts += 16;
448 1836510 : w -= 16;
449 1836510 : } while (w);
450 :
451 1836510 : pos_to_offset[0] = pos_to_offset[1];
452 1836510 : pos_to_offset[1] = pos_to_offset[2];
453 1836510 : levels += TX_PAD_HOR;
454 1836510 : } while (--row);
455 245421 : }
456 :
457 43903800 : void eb_av1_get_nz_map_contexts_sse2(
458 : const uint8_t *const levels,
459 : const int16_t *const scan,
460 : const uint16_t eob,
461 : TxSize tx_size,
462 : const TxClass tx_class,
463 : int8_t *const coeff_contexts
464 : ) {
465 43903800 : const int32_t last_idx = eob - 1;
466 43903800 : if (!last_idx) {
467 6463950 : coeff_contexts[0] = 0;
468 6463950 : return;
469 : }
470 :
471 37439900 : const int32_t real_width = tx_size_wide[tx_size];
472 37439900 : const int32_t real_height = tx_size_high[tx_size];
473 37439900 : const int32_t width = get_txb_wide(tx_size);
474 37460800 : const int32_t height = get_txb_high(tx_size);
475 :
476 37485100 : const int32_t stride = width + TX_PAD_HOR;
477 :
478 : ptrdiff_t offsets[3];
479 :
480 : /* coeff_contexts must be 16 byte aligned. */
481 : assert(!((intptr_t)coeff_contexts & 0xf));
482 :
483 37485100 : if (tx_class == TX_CLASS_2D) {
484 32036100 : offsets[0] = 0 * stride + 2;
485 32036100 : offsets[1] = 1 * stride + 1;
486 32036100 : offsets[2] = 2 * stride + 0;
487 :
488 32036100 : if (width == 4)
489 11074700 : get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
490 20961400 : else if (width == 8)
491 11873000 : get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
492 9088460 : else if (width == 16) {
493 6773810 : get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
494 : offsets, coeff_contexts);
495 : }
496 : else {
497 2314650 : get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
498 : offsets, coeff_contexts);
499 : }
500 : }
501 5448960 : else if (tx_class == TX_CLASS_HORIZ) {
502 2863710 : offsets[0] = 2;
503 2863710 : offsets[1] = 3;
504 2863710 : offsets[2] = 4;
505 2863710 : if (width == 4)
506 1455490 : get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
507 1408220 : else if (width == 8)
508 1164390 : get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
509 : else {
510 243831 : get_16n_coeff_contexts_hor(levels, width, height, offsets,
511 : coeff_contexts);
512 : }
513 : }
514 : else { // TX_CLASS_VERT
515 2585250 : offsets[0] = 2 * stride;
516 2585250 : offsets[1] = 3 * stride;
517 2585250 : offsets[2] = 4 * stride;
518 2585250 : if (width == 4)
519 1302920 : get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
520 1282330 : else if (width == 8)
521 1037120 : get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
522 : else {
523 245215 : get_16n_coeff_contexts_ver(levels, width, height, offsets,
524 : coeff_contexts);
525 : }
526 : }
527 :
528 37501900 : const int32_t bwl = get_txb_bwl(tx_size);
529 37472400 : const int32_t pos = scan[last_idx];
530 37472400 : if (last_idx <= (height << bwl) / 8)
531 6834960 : coeff_contexts[pos] = 1;
532 30637400 : else if (last_idx <= (height << bwl) / 4)
533 5374890 : coeff_contexts[pos] = 2;
534 : else
535 25262500 : coeff_contexts[pos] = 3;
536 : }
|