Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbDefinitions.h"
7 : #include "EbIntrinMacros16bit_SSE2.h"
8 : #include <emmintrin.h>
9 :
10 : /*****************************
11 : * Defines
12 : *****************************/
13 :
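/* The two macros below compute one multiply-accumulate pair: MACRO_TRANS_2MAC_NO_SAVE
 * loads two 16-byte coefficient vectors from TransformAsmConst at OFFSET1 and OFFSET2,
 * forms 32-bit dot products against XMM_1 and XMM_2 with _mm_madd_epi16, adds the
 * rounding vector XMM_OFFSET, arithmetic-shifts right by SHIFT, and packs the result
 * back to 16 bits in XMM_3. MACRO_TRANS_2MAC additionally stores the low 64 bits of
 * XMM_3 to transform_coefficients + OFFSET3. */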
14 : #define MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
15 : XMM_3 = _mm_load_si128((__m128i *)(TransformAsmConst + OFFSET1));\
16 : XMM_4 = _mm_load_si128((__m128i *)(TransformAsmConst + OFFSET2));\
17 : XMM_3 = _mm_madd_epi16(XMM_3, XMM_1);\
18 : XMM_4 = _mm_madd_epi16(XMM_4, XMM_2);\
19 : XMM_3 = _mm_srai_epi32(_mm_add_epi32(XMM_4, _mm_add_epi32(XMM_3, XMM_OFFSET)), SHIFT);\
20 : XMM_3 = _mm_packs_epi32(XMM_3, XMM_3);
21 :
22 : #define MACRO_TRANS_2MAC(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT, OFFSET3)\
23 : MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
24 : _mm_storel_epi64((__m128i *)(transform_coefficients+OFFSET3), XMM_3);
25 :
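/* Offsets into the constant table referenced by the macros above (TransformAsmConst):
 * each successive TRANS8x8_OFFSET_* is 8 int16_t entries (one XMM register) past the
 * previous one, and the name encodes the coefficient pair stored in that row
 * (e.g. 83/36, 89/75, 64/64). */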
26 : #define TRANS8x8_OFFSET_83_36 0
27 : #define TRANS8x8_OFFSET_36_N83 (8 + TRANS8x8_OFFSET_83_36)
28 : #define TRANS8x8_OFFSET_89_75 (8 + TRANS8x8_OFFSET_36_N83)
29 : #define TRANS8x8_OFFSET_50_18 (8 + TRANS8x8_OFFSET_89_75)
30 : #define TRANS8x8_OFFSET_75_N18 (8 + TRANS8x8_OFFSET_50_18)
31 : #define TRANS8x8_OFFSET_N89_N50 (8 + TRANS8x8_OFFSET_75_N18)
32 : #define TRANS8x8_OFFSET_50_N89 (8 + TRANS8x8_OFFSET_N89_N50)
33 : #define TRANS8x8_OFFSET_18_75 (8 + TRANS8x8_OFFSET_50_N89)
34 : #define TRANS8x8_OFFSET_18_N50 (8 + TRANS8x8_OFFSET_18_75)
35 : #define TRANS8x8_OFFSET_75_N89 (8 + TRANS8x8_OFFSET_18_N50)
36 : #define TRANS8x8_OFFSET_256 (8 + TRANS8x8_OFFSET_75_N89)
37 : #define TRANS8x8_OFFSET_64_64 (8 + TRANS8x8_OFFSET_256)
38 : #define TRANS8x8_OFFSET_N18_N50 (8 + TRANS8x8_OFFSET_64_64)
39 : #define TRANS8x8_OFFSET_N75_N89 (8 + TRANS8x8_OFFSET_N18_N50)
40 : #define TRANS8x8_OFFSET_N36_N83 (8 + TRANS8x8_OFFSET_N75_N89)
41 : #define TRANS8x8_OFFSET_N83_N36 (8 + TRANS8x8_OFFSET_N36_N83)
42 : #define TRANS8x8_OFFSET_36_83 (8 + TRANS8x8_OFFSET_N83_N36)
43 : #define TRANS8x8_OFFSET_50_89 (8 + TRANS8x8_OFFSET_36_83)
44 : #define TRANS8x8_OFFSET_18_N75 (8 + TRANS8x8_OFFSET_50_89)
45 : #define TRANS8x8_OFFSET_N64_64 (8 + TRANS8x8_OFFSET_18_N75)
46 : #define TRANS8x8_OFFSET_64_N64 (8 + TRANS8x8_OFFSET_N64_64)
47 : #define TRANS8x8_OFFSET_N75_N18 (8 + TRANS8x8_OFFSET_64_N64)
48 : #define TRANS8x8_OFFSET_89_N50 (8 + TRANS8x8_OFFSET_N75_N18)
49 : #define TRANS8x8_OFFSET_83_N36 (8 + TRANS8x8_OFFSET_89_N50)
50 : #define TRANS8x8_OFFSET_N36_83 (8 + TRANS8x8_OFFSET_83_N36)
51 : #define TRANS8x8_OFFSET_N83_36 (8 + TRANS8x8_OFFSET_N36_83)
52 : #define TRANS8x8_OFFSET_89_N75 (8 + TRANS8x8_OFFSET_N83_36)
53 : #define TRANS8x8_OFFSET_50_N18 (8 + TRANS8x8_OFFSET_89_N75)
54 :
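/* Even/odd butterfly over the eight inputs XMM1..XMM8, eight 16-bit columns at a time:
 * even0..even3 add the mirrored argument pairs (XMM1+XMM8 ... XMM4+XMM5) and
 * odd0..odd3 take the corresponding differences. */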
55 : #define MACRO_CALC_EVEN_ODD(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8)\
56 : even0 = _mm_add_epi16(XMM1, XMM8);\
57 : even1 = _mm_add_epi16(XMM2, XMM7);\
58 : even2 = _mm_add_epi16(XMM3, XMM6);\
59 : even3 = _mm_add_epi16(XMM4, XMM5);\
60 : odd0 = _mm_sub_epi16(XMM1, XMM8);\
61 : odd1 = _mm_sub_epi16(XMM2, XMM7);\
62 : odd2 = _mm_sub_epi16(XMM3, XMM6);\
63 : odd3 = _mm_sub_epi16(XMM4, XMM5);
64 :
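/* MACRO_TRANS_4MAC_NO_SAVE produces eight 16-bit outputs in XMM_RET: the low four from
 * (XMM1*C1 + XMM3*C2 + XMM_OFFSET) >> SHIFT and the high four from
 * (XMM2*C1 + XMM4*C2 + XMM_OFFSET) >> SHIFT, where C1 and C2 are the coefficient
 * vectors loaded from MEM + OFFSET1 and MEM + OFFSET2 and '*' denotes _mm_madd_epi16. */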
65 : #define MACRO_TRANS_4MAC_NO_SAVE(XMM1, XMM2, XMM3, XMM4, XMM_RET, XMM_OFFSET, MEM, OFFSET1, OFFSET2, SHIFT)\
66 : XMM_RET = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM1, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
67 : _mm_madd_epi16(XMM3, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT),\
68 : _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM2, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
69 : _mm_madd_epi16(XMM4, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT));
70 :
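/* MACRO_TRANS_8MAC accumulates four _mm_madd_epi16 dot products per half (XMM1..XMM4
 * against the coefficient rows at OFST1..OFST4, then XMM5..XMM8 against the same rows),
 * rounds each half with XMM_OFST, shifts by SHIFT, packs the two halves to eight 16-bit
 * values, and writes them with the store intrinsic INSTR to DST + OFST5. */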
71 : #define MACRO_TRANS_8MAC(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
72 : sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
73 : sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
74 : sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
75 : sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
76 : sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
77 : sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);\
78 : sum = _mm_packs_epi32(sum1, sum3);\
79 : INSTR((__m128i *)(DST + OFST5), sum);
80 :
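/* The _PF_N2 and _PF_N4 variants keep only the first half of MACRO_TRANS_8MAC (the
 * second half is left commented out) and pack sum1 with itself, so only the low 64 bits
 * of the stored result are meaningful; presumably these serve the partial-frequency
 * (N/2 and N/4) transform paths that discard the high-frequency outputs. */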
81 : #define MACRO_TRANS_8MAC_PF_N2(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
82 : sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
83 : sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
84 : sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
85 : /*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
86 : /*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
87 : /*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
88 : /*sum = _mm_packs_epi32(sum1, sum3);*/\
89 : sum = _mm_packs_epi32(sum1, sum1);\
90 : INSTR((__m128i *)(DST + OFST5), sum);
91 : #define MACRO_TRANS_8MAC_PF_N4(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
92 : sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
93 : sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
94 : sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
95 : /*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
96 : /*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
97 : /*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
98 : /*sum = _mm_packs_epi32(sum1, sum3);*/\
99 : sum = _mm_packs_epi32(sum1, sum1);\
100 : INSTR((__m128i *)(DST + OFST5), sum);
101 :
102 : #ifdef __GNUC__
103 : #ifndef __cplusplus
104 : __attribute__((visibility("hidden")))
105 : #endif
106 : #endif
107 : EB_ALIGN(16) const int16_t dst_transform_asm_const_sse2[] = {
108 : 1, 0, 1, 0, 1, 0, 1, 0,
109 : 29, 55, 29, 55, 29, 55, 29, 55,
110 : 74, 84, 74, 84, 74, 84, 74, 84,
111 : 84, -29, 84, -29, 84, -29, 84, -29,
112 : -74, 55, -74, 55, -74, 55, -74, 55,
113 : 55, -84, 55, -84, 55, -84, 55, -84,
114 : 74, -29, 74, -29, 74, -29, 74, -29,
115 : 37, 37, 37, 37, 37, 37, 37, 37,
116 : 74, 74, 74, 74, 74, 74, 74, 74,
117 : 0, -37, 0, -37, 0, -37, 0, -37,
118 : 0, -74, 0, -74, 0, -74, 0, -74,
119 : //74, 0, 74, 0, 74, 0, 74, 0,
120 : //55, -29, 55, -29, 55, -29, 55, -29,
121 : };
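/* Each row of the table above replicates one coefficient pair across all eight 16-bit
 * lanes, so a single _mm_madd_epi16 against it applies that pair to four 32-bit
 * accumulators at once. */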
122 :
123 : #ifdef __GNUC__
124 : #ifndef __cplusplus
125 : __attribute__((visibility("hidden")))
126 : #endif
127 : #endif
128 : EB_ALIGN(16) const int16_t inv_transform_asm_const_sse2[] = {
129 : 2, 0, 2, 0, 2, 0, 2, 0,
130 : 4, 0, 4, 0, 4, 0, 4, 0,
131 : 8, 0, 8, 0, 8, 0, 8, 0,
132 : 9, 0, 9, 0, 9, 0, 9, 0,
133 : 64, 0, 64, 0, 64, 0, 64, 0,
134 : 256, 0, 256, 0, 256, 0, 256, 0,
135 : 512, 0, 512, 0, 512, 0, 512, 0,
136 : 1024, 0, 1024, 0, 1024, 0, 1024, 0,
137 : 2048, 0, 2048, 0, 2048, 0, 2048, 0,
138 : 7, 0, 0, 0, 0, 0, 0, 0,
139 : 12, 0, 0, 0, 0, 0, 0, 0,
140 : 64, 64, 64, 64, 64, 64, 64, 64,
141 : 90, 57, 90, 57, 90, 57, 90, 57,
142 : 89, 50, 89, 50, 89, 50, 89, 50,
143 : 87, 43, 87, 43, 87, 43, 87, 43,
144 : 83, 36, 83, 36, 83, 36, 83, 36,
145 : 80, 25, 80, 25, 80, 25, 80, 25,
146 : 75, 18, 75, 18, 75, 18, 75, 18,
147 : 70, 9, 70, 9, 70, 9, 70, 9,
148 : 64, -64, 64, -64, 64, -64, 64, -64,
149 : 87, -80, 87, -80, 87, -80, 87, -80,
150 : 75, -89, 75, -89, 75, -89, 75, -89,
151 : 57, -90, 57, -90, 57, -90, 57, -90,
152 : 36, -83, 36, -83, 36, -83, 36, -83,
153 : 9, -70, 9, -70, 9, -70, 9, -70,
154 : -18, -50, -18, -50, -18, -50, -18, -50,
155 : -43, -25, -43, -25, -43, -25, -43, -25,
156 : 80, -25, 80, -25, 80, -25, 80, -25,
157 : 50, 18, 50, 18, 50, 18, 50, 18,
158 : 9, 57, 9, 57, 9, 57, 9, 57,
159 : -36, 83, -36, 83, -36, 83, -36, 83,
160 : -70, 90, -70, 90, -70, 90, -70, 90,
161 : -89, 75, -89, 75, -89, 75, -89, 75,
162 : -87, 43, -87, 43, -87, 43, -87, 43,
163 : 70, 90, 70, 90, 70, 90, 70, 90,
164 : 18, 75, 18, 75, 18, 75, 18, 75,
165 : -43, 25, -43, 25, -43, 25, -43, 25,
166 : -83, -36, -83, -36, -83, -36, -83, -36,
167 : -87, -80, -87, -80, -87, -80, -87, -80,
168 : -50, -89, -50, -89, -50, -89, -50, -89,
169 : 9, -57, 9, -57, 9, -57, 9, -57,
170 : 57, -9, 57, -9, 57, -9, 57, -9,
171 : -18, -75, -18, -75, -18, -75, -18, -75,
172 : -80, -87, -80, -87, -80, -87, -80, -87,
173 : -25, 43, -25, 43, -25, 43, -25, 43,
174 : 50, 89, 50, 89, 50, 89, 50, 89,
175 : 90, 70, 90, 70, 90, 70, 90, 70,
176 : 43, -87, 43, -87, 43, -87, 43, -87,
177 : -50, -18, -50, -18, -50, -18, -50, -18,
178 : -90, 70, -90, 70, -90, 70, -90, 70,
179 : 57, 9, 57, 9, 57, 9, 57, 9,
180 : 89, -75, 89, -75, 89, -75, 89, -75,
181 : 25, -80, 25, -80, 25, -80, 25, -80,
182 : 25, 43, 25, 43, 25, 43, 25, 43,
183 : -75, 89, -75, 89, -75, 89, -75, 89,
184 : -70, 9, -70, 9, -70, 9, -70, 9,
185 : 90, -57, 90, -57, 90, -57, 90, -57,
186 : 18, 50, 18, 50, 18, 50, 18, 50,
187 : -80, 87, -80, 87, -80, 87, -80, 87,
188 : 9, 70, 9, 70, 9, 70, 9, 70,
189 : -89, -50, -89, -50, -89, -50, -89, -50,
190 : -25, -80, -25, -80, -25, -80, -25, -80,
191 : 43, 87, 43, 87, 43, 87, 43, 87,
192 : -75, -18, -75, -18, -75, -18, -75, -18,
193 : -57, -90, -57, -90, -57, -90, -57, -90,
194 : -9, -70, -9, -70, -9, -70, -9, -70,
195 : 25, 80, 25, 80, 25, 80, 25, 80,
196 : -43, -87, -43, -87, -43, -87, -43, -87,
197 : 57, 90, 57, 90, 57, 90, 57, 90,
198 : -25, -43, -25, -43, -25, -43, -25, -43,
199 : 70, -9, 70, -9, 70, -9, 70, -9,
200 : -90, 57, -90, 57, -90, 57, -90, 57,
201 : 80, -87, 80, -87, 80, -87, 80, -87,
202 : -43, 87, -43, 87, -43, 87, -43, 87,
203 : 90, -70, 90, -70, 90, -70, 90, -70,
204 : -57, -9, -57, -9, -57, -9, -57, -9,
205 : -25, 80, -25, 80, -25, 80, -25, 80,
206 : -57, 9, -57, 9, -57, 9, -57, 9,
207 : 80, 87, 80, 87, 80, 87, 80, 87,
208 : 25, -43, 25, -43, 25, -43, 25, -43,
209 : -90, -70, -90, -70, -90, -70, -90, -70,
210 : -70, -90, -70, -90, -70, -90, -70, -90,
211 : 43, -25, 43, -25, 43, -25, 43, -25,
212 : 87, 80, 87, 80, 87, 80, 87, 80,
213 : -9, 57, -9, 57, -9, 57, -9, 57,
214 : -80, 25, -80, 25, -80, 25, -80, 25,
215 : -9, -57, -9, -57, -9, -57, -9, -57,
216 : 70, -90, 70, -90, 70, -90, 70, -90,
217 : 87, -43, 87, -43, 87, -43, 87, -43,
218 : -87, 80, -87, 80, -87, 80, -87, 80,
219 : -57, 90, -57, 90, -57, 90, -57, 90,
220 : -9, 70, -9, 70, -9, 70, -9, 70,
221 : 43, 25, 43, 25, 43, 25, 43, 25,
222 : -90, -57, -90, -57, -90, -57, -90, -57,
223 : -87, -43, -87, -43, -87, -43, -87, -43,
224 : -80, -25, -80, -25, -80, -25, -80, -25,
225 : -70, -9, -70, -9, -70, -9, -70, -9,
226 : 90, 61, 90, 61, 90, 61, 90, 61,
227 : 90, 54, 90, 54, 90, 54, 90, 54,
228 : 88, 46, 88, 46, 88, 46, 88, 46,
229 : 85, 38, 85, 38, 85, 38, 85, 38,
230 : 82, 31, 82, 31, 82, 31, 82, 31,
231 : 78, 22, 78, 22, 78, 22, 78, 22,
232 : 73, 13, 73, 13, 73, 13, 73, 13,
233 : 67, 4, 67, 4, 67, 4, 67, 4,
234 : 90, -73, 90, -73, 90, -73, 90, -73,
235 : 82, -85, 82, -85, 82, -85, 82, -85,
236 : 67, -90, 67, -90, 67, -90, 67, -90,
237 : 46, -88, 46, -88, 46, -88, 46, -88,
238 : 22, -78, 22, -78, 22, -78, 22, -78,
239 : -4, -61, -4, -61, -4, -61, -4, -61,
240 : -31, -38, -31, -38, -31, -38, -31, -38,
241 : -54, -13, -54, -13, -54, -13, -54, -13,
242 : 88, -46, 88, -46, 88, -46, 88, -46,
243 : 67, -4, 67, -4, 67, -4, 67, -4,
244 : 31, 38, 31, 38, 31, 38, 31, 38,
245 : -13, 73, -13, 73, -13, 73, -13, 73,
246 : -54, 90, -54, 90, -54, 90, -54, 90,
247 : -82, 85, -82, 85, -82, 85, -82, 85,
248 : -90, 61, -90, 61, -90, 61, -90, 61,
249 : -78, 22, -78, 22, -78, 22, -78, 22,
250 : 85, 82, 85, 82, 85, 82, 85, 82,
251 : 46, 88, 46, 88, 46, 88, 46, 88,
252 : -13, 54, -13, 54, -13, 54, -13, 54,
253 : -67, -4, -67, -4, -67, -4, -67, -4,
254 : -90, -61, -90, -61, -90, -61, -90, -61,
255 : -73, -90, -73, -90, -73, -90, -73, -90,
256 : -22, -78, -22, -78, -22, -78, -22, -78,
257 : 38, -31, 38, -31, 38, -31, 38, -31,
258 : 22, -46, 22, -46, 22, -46, 22, -46,
259 : -54, -90, -54, -90, -54, -90, -54, -90,
260 : -90, -67, -90, -67, -90, -67, -90, -67,
261 : -61, 4, -61, 4, -61, 4, -61, 4,
262 : 13, 73, 13, 73, 13, 73, 13, 73,
263 : 78, 88, 78, 88, 78, 88, 78, 88,
264 : 78, -88, 78, -88, 78, -88, 78, -88,
265 : -82, 31, -82, 31, -82, 31, -82, 31,
266 : -73, 90, -73, 90, -73, 90, -73, 90,
267 : 13, 54, 13, 54, 13, 54, 13, 54,
268 : 85, -38, 85, -38, 85, -38, 85, -38,
269 : -22, -46, -22, -46, -22, -46, -22, -46,
270 : 73, -13, 73, -13, 73, -13, 73, -13,
271 : -31, 82, -31, 82, -31, 82, -31, 82,
272 : -38, 85, -38, 85, -38, 85, -38, 85,
273 : -90, 54, -90, 54, -90, 54, -90, 54,
274 : 67, 90, 67, 90, 67, 90, 67, 90,
275 : -54, 13, -54, 13, -54, 13, -54, 13,
276 : -78, -88, -78, -88, -78, -88, -78, -88,
277 : -22, 46, -22, 46, -22, 46, -22, 46,
278 : -90, -73, -90, -73, -90, -73, -90, -73,
279 : 4, -61, 4, -61, 4, -61, 4, -61,
280 : 61, -4, 61, -4, 61, -4, 61, -4,
281 : -46, 22, -46, 22, -46, 22, -46, 22,
282 : 82, 85, 82, 85, 82, 85, 82, 85,
283 : 31, -38, 31, -38, 31, -38, 31, -38,
284 : -88, -78, -88, -78, -88, -78, -88, -78,
285 : 90, 67, 90, 67, 90, 67, 90, 67,
286 : 54, -90, 54, -90, 54, -90, 54, -90,
287 : -85, 38, -85, 38, -85, 38, -85, 38,
288 : -4, 67, -4, 67, -4, 67, -4, 67,
289 : 88, -78, 88, -78, 88, -78, 88, -78,
290 : -46, -22, -46, -22, -46, -22, -46, -22,
291 : -61, 90, -61, 90, -61, 90, -61, 90,
292 : 82, -31, 82, -31, 82, -31, 82, -31,
293 : 13, -73, 13, -73, 13, -73, 13, -73,
294 : 46, 22, 46, 22, 46, 22, 46, 22,
295 : -90, 67, -90, 67, -90, 67, -90, 67,
296 : 38, -85, 38, -85, 38, -85, 38, -85,
297 : 54, 13, 54, 13, 54, 13, 54, 13,
298 : -90, 73, -90, 73, -90, 73, -90, 73,
299 : 31, -82, 31, -82, 31, -82, 31, -82,
300 : 61, 4, 61, 4, 61, 4, 61, 4,
301 : -88, 78, -88, 78, -88, 78, -88, 78,
302 : 38, 85, 38, 85, 38, 85, 38, 85,
303 : -4, 61, -4, 61, -4, 61, -4, 61,
304 : -67, -90, -67, -90, -67, -90, -67, -90,
305 : -31, -82, -31, -82, -31, -82, -31, -82,
306 : -78, -22, -78, -22, -78, -22, -78, -22,
307 : 90, 73, 90, 73, 90, 73, 90, 73,
308 : -61, -90, -61, -90, -61, -90, -61, -90,
309 : 4, 67, 4, 67, 4, 67, 4, 67,
310 : 54, -13, 54, -13, 54, -13, 54, -13,
311 : -88, -46, -88, -46, -88, -46, -88, -46,
312 : 85, -82, 85, -82, 85, -82, 85, -82,
313 : -38, -31, -38, -31, -38, -31, -38, -31,
314 : -13, -73, -13, -73, -13, -73, -13, -73,
315 : 22, 78, 22, 78, 22, 78, 22, 78,
316 : -46, -88, -46, -88, -46, -88, -46, -88,
317 : 54, 90, 54, 90, 54, 90, 54, 90
318 : };
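/* The leading rows above hold small per-lane constants (2, 4, 8, 9, 64, 256, 512, 1024,
 * 2048) plus the single-lane values 7 and 12, presumably rounding terms and shift
 * amounts for the inverse transforms; the remaining rows are coefficient pairs
 * replicated per lane in the order consumed by the _mm_madd_epi16 butterflies. */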
319 :
320 : #ifdef __GNUC__
321 : #ifndef __cplusplus
322 : __attribute__((visibility("hidden")))
323 : #endif
324 : #endif
325 : EB_ALIGN(16) const int16_t inv_dst_transform_asm_const_sse2[] = {
326 : 64, 0, 64, 0, 64, 0, 64, 0,
327 : 29, 84, 29, 84, 29, 84, 29, 84,
328 : 74, 55, 74, 55, 74, 55, 74, 55,
329 : 55, -29, 55, -29, 55, -29, 55, -29,
330 : 74, -84, 74, -84, 74, -84, 74, -84,
331 : 74, -74, 74, -74, 74, -74, 74, -74,
332 : 0, 74, 0, 74, 0, 74, 0, 74,
333 : 84, 55, 84, 55, 84, 55, 84, 55,
334 : -74, -29, -74, -29, -74, -29, -74, -29,
335 : };
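/* Inverse-DST counterpart of dst_transform_asm_const_sse2: the same 29/55/74/84 basis
 * values (plus 64 and 0), paired per lane in the order used on the reconstruction side. */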
336 :
337 : // Coefficients for inverse 32-point transform
338 : EB_EXTERN const int16_t coeff_tbl2[48 * 8] =
339 : {
340 : 64, 89, 64, 75, 64, 50, 64, 18, 64, -18, 64, -50, 64, -75, 64, -89,
341 : 83, 75, 36, -18, -36, -89, -83, -50, -83, 50, -36, 89, 36, 18, 83, -75,
342 : 64, 50, -64, -89, -64, 18, 64, 75, 64, -75, -64, -18, -64, 89, 64, -50,
343 : 36, 18, -83, -50, 83, 75, -36, -89, -36, 89, 83, -75, -83, 50, 36, -18,
344 : 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
345 : 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
346 : 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
347 : 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
348 : 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
349 : 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
350 : 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
351 : -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
352 : 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
353 : 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
354 : 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
355 : -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
356 : 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
357 : -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
358 : 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
359 : 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
360 : 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
361 : -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
362 : 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
363 : 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
364 : };
365 :
366 : #ifdef __GNUC__
367 : #ifndef __cplusplus
368 : __attribute__((visibility("hidden")))
369 : #endif
370 : #endif
371 : EB_EXTERN const int16_t coeff_tbl[48 * 8] =
372 : {
373 : 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50,
374 : 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89,
375 : 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75,
376 : 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18,
377 : 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
378 : 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
379 : 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
380 : 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
381 : 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
382 : 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
383 : 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
384 : -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
385 : 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
386 : 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
387 : 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
388 : -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
389 : 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
390 : -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
391 : 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
392 : 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
393 : 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
394 : -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
395 : 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
396 : 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
397 : };
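/* coeff_tbl differs from coeff_tbl2 only in how the first four rows interleave the
 * 64/89/83/75/50/36/18 coefficient set; the remaining twenty rows of the two tables
 * are identical. */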
398 :
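/* Transposes the top-left 16x16 region of a 32x32 block of int16_t samples: the region
 * is processed as four 8x8 tiles, each transposed in registers with three rounds of
 * 16-bit unpack operations, and source tile (i, j) is written to destination tile (j, i). */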
399 0 : void PfreqTranspose32Type1_SSE2(
400 : int16_t *src,
401 : uint32_t src_stride,
402 : int16_t *dst,
403 : uint32_t dst_stride)
404 : {
405 : uint32_t i, j;
406 0 : for (i = 0; i < 2; i++)
407 : {
408 0 : for (j = 0; j < 2; j++)
409 : {
410 : __m128i a0, a1, a2, a3, a4, a5, a6, a7;
411 : __m128i b0, b1, b2, b3, b4, b5, b6, b7;
412 :
413 0 : a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
414 0 : a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
415 0 : a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
416 0 : a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
417 0 : a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
418 0 : a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
419 0 : a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
420 0 : a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
421 :
422 0 : b0 = _mm_unpacklo_epi16(a0, a4);
423 0 : b1 = _mm_unpacklo_epi16(a1, a5);
424 0 : b2 = _mm_unpacklo_epi16(a2, a6);
425 0 : b3 = _mm_unpacklo_epi16(a3, a7);
426 0 : b4 = _mm_unpackhi_epi16(a0, a4);
427 0 : b5 = _mm_unpackhi_epi16(a1, a5);
428 0 : b6 = _mm_unpackhi_epi16(a2, a6);
429 0 : b7 = _mm_unpackhi_epi16(a3, a7);
430 :
431 0 : a0 = _mm_unpacklo_epi16(b0, b2);
432 0 : a1 = _mm_unpacklo_epi16(b1, b3);
433 0 : a2 = _mm_unpackhi_epi16(b0, b2);
434 0 : a3 = _mm_unpackhi_epi16(b1, b3);
435 0 : a4 = _mm_unpacklo_epi16(b4, b6);
436 0 : a5 = _mm_unpacklo_epi16(b5, b7);
437 0 : a6 = _mm_unpackhi_epi16(b4, b6);
438 0 : a7 = _mm_unpackhi_epi16(b5, b7);
439 :
440 0 : b0 = _mm_unpacklo_epi16(a0, a1);
441 0 : b1 = _mm_unpackhi_epi16(a0, a1);
442 0 : b2 = _mm_unpacklo_epi16(a2, a3);
443 0 : b3 = _mm_unpackhi_epi16(a2, a3);
444 0 : b4 = _mm_unpacklo_epi16(a4, a5);
445 0 : b5 = _mm_unpackhi_epi16(a4, a5);
446 0 : b6 = _mm_unpacklo_epi16(a6, a7);
447 0 : b7 = _mm_unpackhi_epi16(a6, a7);
448 :
449 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
450 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
451 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
452 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
453 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
454 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
455 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
456 0 : _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
457 : }
458 : }
459 0 : }
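/* A minimal scalar sketch of the routine above, shown for reference only (it is not
 * part of the original file, and the name PfreqTranspose32Type1_C is illustrative).
 * It could be used in a unit test to cross-check the SSE2 output; it relies only on
 * the int16_t/uint32_t types already used in this file. */
static void PfreqTranspose32Type1_C(
    int16_t  *src,
    uint32_t  src_stride,
    int16_t  *dst,
    uint32_t  dst_stride)
{
    uint32_t r, c;
    /* Only the top-left 16x16 quadrant of the 32x32 block is transposed,
       matching the 2x2 loop over 8x8 tiles in the SSE2 version. */
    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            dst[c * dst_stride + r] = src[r * src_stride + c];
}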