Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
12 : #define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
13 :
14 : #include <emmintrin.h> // SSE2
15 : #include <tmmintrin.h> // SSSE3
16 :
17 : #ifdef __cplusplus
18 : extern "C" {
19 : #endif
20 :
/*
 * Scale every signed 16-bit lane of `in` by two weights.
 *
 *   w0, w1     - integer weights; each is multiplied by 8 and broadcast to
 *                all eight lanes.
 *   in         - __m128i input vector (evaluated exactly once).
 *   out0, out1 - __m128i lvalues receiving the two products.
 *
 * The products are formed with _mm_mulhrs_epi16, which computes
 * (a * b + 0x4000) >> 15 per lane, so each output lane is
 * round(in * w / 4096).
 *
 * Every argument is parenthesized in the expansion so that an expression
 * passed as a weight (for example `a + b`) is scaled as a whole rather than
 * being split by operator precedence.
 */
#define btf_16_ssse3(w0, w1, in, out0, out1)          \
    do {                                              \
        const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
        const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
        const __m128i _in = (in);                     \
        (out0) = _mm_mulhrs_epi16(_in, _w0);          \
        (out1) = _mm_mulhrs_epi16(_in, _w1);          \
    } while (0)
29 :
/*
 * In-place butterfly on signed 16-bit lanes with saturation:
 *   in0 <- in0 + in1
 *   in1 <- in0 - in1
 * Both results are computed from snapshots of the operands taken on entry;
 * the sum is stored first, then the difference.
 */
#define btf_16_adds_subs_sse2(in0, in1)              \
    do {                                             \
        const __m128i _lhs = (in0);                  \
        const __m128i _rhs = (in1);                  \
        const __m128i _sum = _mm_adds_epi16(_lhs, _rhs); \
        const __m128i _dif = _mm_subs_epi16(_lhs, _rhs); \
        (in0) = _sum;                                \
        (in1) = _dif;                                \
    } while (0)
37 :
/*
 * In-place butterfly on signed 16-bit lanes with saturation:
 *   in1 <- in0 - in1
 *   in0 <- in0 + in1
 * Both results are computed from snapshots of the operands taken on entry;
 * the difference is stored first, then the sum.
 */
#define btf_16_subs_adds_sse2(in0, in1)              \
    do {                                             \
        const __m128i _lhs = (in0);                  \
        const __m128i _rhs = (in1);                  \
        const __m128i _dif = _mm_subs_epi16(_lhs, _rhs); \
        const __m128i _sum = _mm_adds_epi16(_lhs, _rhs); \
        (in1) = _dif;                                \
        (in0) = _sum;                                \
    } while (0)
45 :
/*
 * Out-of-place butterfly on signed 16-bit lanes with saturation:
 *   out0 <- in0 + in1
 *   out1 <- in0 - in1
 * The inputs are read once, before either output is written, so an output
 * may name the same object as an input.
 */
#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
    do {                                                \
        const __m128i _lhs = (in0);                     \
        const __m128i _rhs = (in1);                     \
        (out0) = _mm_adds_epi16(_lhs, _rhs);            \
        (out1) = _mm_subs_epi16(_lhs, _rhs);            \
    } while (0)
53 :
54 65251900 : static INLINE void round_shift_16bit_ssse3(__m128i *in, int32_t size, int32_t bit) {
55 65251900 : if (bit < 0) {
56 65252900 : const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
57 608051000 : for (int32_t i = 0; i < size; ++i)
58 1085600000 : in[i] = _mm_mulhrs_epi16(in[i], scale);
59 : }
60 0 : else if (bit > 0) {
61 0 : for (int32_t i = 0; i < size; ++i)
62 0 : in[i] = _mm_slli_epi16(in[i], bit);
63 : }
64 65251900 : }
65 :
// 1-D inverse transform kinds. IFLIPADST_1D shares the value of IADST_1D,
// so ITX_TYPES_1D (the last enumerator) counts three distinct kinds.
typedef enum ATTRIBUTE_PACKED {
    IDCT_1D      = 0,
    IADST_1D     = 1,
    IFLIPADST_1D = IADST_1D,
    IIDENTITY_1D = 2,
    ITX_TYPES_1D = 3,
} ITX_TYPE_1D;
74 :
75 : static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
76 : IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
77 : IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
78 : IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
79 : IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
80 : };
81 :
82 : static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
83 : IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
84 : IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
85 : IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
86 : IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
87 : };
88 :
89 : DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
90 : 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
91 : };
92 :
93 : DECLARE_ALIGNED(16, static const int16_t,
94 : av1_eob_to_eobxy_16x16_default[16]) = {
95 : 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
96 : 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
97 : };
98 :
99 : DECLARE_ALIGNED(16, static const int16_t,
100 : av1_eob_to_eobxy_32x32_default[32]) = {
101 : 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
102 : 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
103 : 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
104 : 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
105 : };
106 :
107 : DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
108 : 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
109 : 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
110 : };
111 :
112 : DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
113 : 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
114 : };
115 :
116 : DECLARE_ALIGNED(16, static const int16_t,
117 : av1_eob_to_eobxy_16x32_default[32]) = {
118 : 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
119 : 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
120 : 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
121 : 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
122 : };
123 :
124 : DECLARE_ALIGNED(16, static const int16_t,
125 : av1_eob_to_eobxy_32x16_default[16]) = {
126 : 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
127 : 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
128 : };
129 :
130 : DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
131 : 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
132 : 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
133 : 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
134 : 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
135 : };
136 :
137 : DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
138 : 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
139 : };
140 :
141 : DECLARE_ALIGNED(16, static const int16_t *,
142 : av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
143 : NULL,
144 : av1_eob_to_eobxy_8x8_default,
145 : av1_eob_to_eobxy_16x16_default,
146 : av1_eob_to_eobxy_32x32_default,
147 : av1_eob_to_eobxy_32x32_default,
148 : NULL,
149 : NULL,
150 : av1_eob_to_eobxy_8x16_default,
151 : av1_eob_to_eobxy_16x8_default,
152 : av1_eob_to_eobxy_16x32_default,
153 : av1_eob_to_eobxy_32x16_default,
154 : av1_eob_to_eobxy_32x32_default,
155 : av1_eob_to_eobxy_32x32_default,
156 : NULL,
157 : NULL,
158 : av1_eob_to_eobxy_8x32_default,
159 : av1_eob_to_eobxy_32x8_default,
160 : av1_eob_to_eobxy_16x32_default,
161 : av1_eob_to_eobxy_32x16_default,
162 : };
163 :
// Step table over indices 0..31: index 0 -> 0, 1..7 -> 1, 8..15 -> 2,
// 16..31 -> 3.
static const int32_t lowbd_txfm_all_1d_zeros_idx[32] = {
    0, 1, 1, 1, 1, 1, 1, 1, // 0-7
    2, 2, 2, 2, 2, 2, 2, 2, // 8-15
    3, 3, 3, 3, 3, 3, 3, 3, // 16-23
    3, 3, 3, 3, 3, 3, 3, 3, // 24-31
};
168 :
169 : // Transform block width in log2 for eob (size of 64 map to 32)
170 : static const int32_t tx_size_wide_log2_eob[TX_SIZES_ALL] = {
171 : 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
172 : };
173 :
174 18792940 : static INLINE void get_eobx_eoby_scan_default(int32_t *eobx, int32_t *eoby,
175 : TxSize tx_size, int32_t eob) {
176 18792940 : if (eob == 1) {
177 2765 : *eobx = 0;
178 2765 : *eoby = 0;
179 2765 : return;
180 : }
181 :
182 18790230 : const int32_t tx_w_log2 = tx_size_wide_log2_eob[tx_size];
183 18790230 : const int32_t eob_row = (eob - 1) >> tx_w_log2;
184 18790230 : const int32_t eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
185 18790230 : *eobx = eobxy & 0xFF;
186 18790230 : *eoby = eobxy >> 8;
187 : }
188 :
// Step table over indices 0..31: index 0 -> 0, 1..7 -> 7, 8..15 -> 15,
// 16..31 -> 31. It is only ever read in this header, so it is declared const.
static const int32_t eob_fill[32] = {
    0,  7,  7,  7,  7,  7,  7,  7,  // 0-7
    15, 15, 15, 15, 15, 15, 15, 15, // 8-15
    31, 31, 31, 31, 31, 31, 31, 31, // 16-23
    31, 31, 31, 31, 31, 31, 31, 31, // 24-31
};
193 :
194 1131751 : static INLINE void get_eobx_eoby_scan_h_identity(int32_t *eobx, int32_t *eoby,
195 : TxSize tx_size, int32_t eob) {
196 1131751 : eob -= 1;
197 1131751 : const int32_t txfm_size_col = tx_size_wide[tx_size];
198 1131751 : const int32_t eobx_max = AOMMIN(32, txfm_size_col) - 1;
199 1131751 : *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
200 1131751 : const int32_t temp_eoby = eob / (eobx_max + 1);
201 1131751 : assert(temp_eoby < 32);
202 1131751 : *eoby = eob_fill[temp_eoby];
203 1131751 : }
204 :
205 1262287 : static INLINE void get_eobx_eoby_scan_v_identity(int32_t *eobx, int32_t *eoby,
206 : TxSize tx_size, int32_t eob) {
207 1262287 : eob -= 1;
208 1262287 : const int32_t txfm_size_row = tx_size_high[tx_size];
209 1262287 : const int32_t eoby_max = AOMMIN(32, txfm_size_row) - 1;
210 1262287 : *eobx = eob / (eoby_max + 1);
211 1262287 : *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
212 1262287 : }
213 :
// Pointer to a 1-D transform routine that reads __m128i vectors from `input`
// and writes __m128i vectors to `output`; `cos_bit` is an 8-bit parameter
// passed through to the routine.
typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output, int8_t cos_bit);
216 :
217 : void eb_av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input,
218 : uint8_t *output_r, int32_t stride_r,
219 : uint8_t *output_w, int32_t stride_w,
220 : TxType tx_type,
221 : TxSize tx_size, int32_t eob);
222 : #ifdef __cplusplus
223 : } // extern "C"
224 : #endif
225 :
226 : #endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
|