Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbTransforms_AVX2.h"
7 :
8 : #include <emmintrin.h>
9 : #include <immintrin.h>
10 :
11 : // Coefficients for forward 16/32-point transform
12 : #ifdef __GNUC__
13 : __attribute__((aligned(16)))
14 : #endif
15 : EB_ALIGN(32) const int16_t coeff_tbl_AVX2[48 * 16] =
16 : {
17 : 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50,
18 : 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89,
19 : 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75,
20 : 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18,
21 : 90, 87, 87, 57, 80, 9, 70, -43, 90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25, 57, -80, 43, -90, 25, -70, 9, -25,
22 : 80, 70, 9, -43, -70, -87, -87, 9, 80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57, -25, 90, 57, 25, 90, -80, 43, -57,
23 : 57, 43, -80, -90, -25, 57, 90, 25, 57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80, -9, -87, -87, 70, 43, 9, 70, -80,
24 : 25, 9, -70, -25, 90, 43, -80, -57, 25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90, 43, 70, 9, -80, -57, 87, 87, -90,
25 : 90, 90, 90, 82, 88, 67, 85, 46, 90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54, 82, 22, 78, -4, 73, -31, 67, -54,
26 : 61, -73, 54, -85, 46, -90, 38, -88, 61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13, 31, -78, 22, -61, 13, -38, 4, -13,
27 : 88, 85, 67, 46, 31, -13, -13, -67, 88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38, -54, -90, -82, -73, -90, -22, -78, 38,
28 : -46, 82, -4, 88, 38, 54, 73, -4, -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31, 90, -61, 85, -90, 61, -78, 22, -31,
29 : 82, 78, 22, -4, -54, -82, -90, -73, 82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22, -61, 13, 13, 85, 78, 67, 85, -22,
30 : 31, -88, -46, -61, -90, 31, -67, 90, 31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46, 4, 54, 73, -38, 88, -90, 38, -46,
31 : 73, 67, -31, -54, -90, -78, -22, 38, 73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4, 78, 85, 67, -22, -38, -90, -90, 4,
32 : -13, 90, 82, 13, 61, -88, -46, -31, -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61, -88, 82, -4, 46, 85, -73, 54, -61,
33 : 61, 54, -73, -85, -46, -4, 82, 88, 61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13, 31, -46, -88, -61, -13, 82, 90, 13,
34 : -4, -90, -90, 38, 22, 67, 85, -78, -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73, -38, -22, -78, 90, 54, -31, 67, -73,
35 : 46, 38, -90, -88, 38, 73, 54, -4, 46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31, -90, -67, 31, 90, 61, -46, -88, -31,
36 : 22, 85, 67, -78, -85, 13, 13, 61, 22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82, 73, -90, -82, 54, 4, 22, 78, -82,
37 : 31, 22, -78, -61, 90, 85, -61, -90, 31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46, 4, 73, 54, -38, -88, -4, 82, 46,
38 : -38, -78, -22, 90, 73, -82, -90, 54, -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88, 67, -13, -13, -31, -46, 67, 85, -88,
39 : 13, 4, -38, -13, 61, 22, -78, -31, 13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61, 88, 38, -90, -46, 85, 54, -73, -61,
40 : 54, 67, -31, -73, 4, 78, 22, -82, 54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90, -46, 85, 67, -88, -82, 90, 90, -90
41 : };
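/*
 * Illustrative sketch (not part of the build; helper name is hypothetical):
 * each __m256i entry of coeff_tbl_AVX2 holds eight int16 (c0, c1) pairs, and
 * transform32_AVX2_INTRIN below combines them with a broadcast input pair
 * (x0, x1) via _mm256_madd_epi16, so every 32-bit output lane is c0*x0 + c1*x1.
 * A scalar view of one such madd step:
 */
static void madd_pairs_scalar_sketch(const int16_t coeff[16], int16_t x0, int16_t x1,
                                     int32_t out[8])
{
    for (int k = 0; k < 8; k++)
        out[k] = (int32_t)coeff[2 * k] * x0 + (int32_t)coeff[2 * k + 1] * x1;
}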
42 :
43 : /*******************************************************************************
44 : * Requirement: area_size is 4, 8, or a multiple of 16 (area_size % 16 == 0)
45 : *******************************************************************************/
46 :
47 : // transpose 16x16 block of data
48 0 : void transpose16_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride)
49 : {
50 : uint32_t i;
51 0 : for (i = 0; i < 2; i++)
52 : {
53 : __m256i a0, a1, a2, a3, a4, a5, a6, a7;
54 : __m256i b0, b1, b2, b3, b4, b5, b6, b7;
55 :
56 0 : a0 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 0)*src_stride));
57 0 : a1 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 1)*src_stride));
58 0 : a2 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 2)*src_stride));
59 0 : a3 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 3)*src_stride));
60 0 : a4 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 4)*src_stride));
61 0 : a5 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 5)*src_stride));
62 0 : a6 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 6)*src_stride));
63 0 : a7 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 7)*src_stride));
64 :
65 0 : b0 = _mm256_unpacklo_epi16(a0, a4);
66 0 : b1 = _mm256_unpacklo_epi16(a1, a5);
67 0 : b2 = _mm256_unpacklo_epi16(a2, a6);
68 0 : b3 = _mm256_unpacklo_epi16(a3, a7);
69 0 : b4 = _mm256_unpackhi_epi16(a0, a4);
70 0 : b5 = _mm256_unpackhi_epi16(a1, a5);
71 0 : b6 = _mm256_unpackhi_epi16(a2, a6);
72 0 : b7 = _mm256_unpackhi_epi16(a3, a7);
73 :
74 0 : a0 = _mm256_unpacklo_epi16(b0, b2);
75 0 : a1 = _mm256_unpacklo_epi16(b1, b3);
76 0 : a2 = _mm256_unpackhi_epi16(b0, b2);
77 0 : a3 = _mm256_unpackhi_epi16(b1, b3);
78 0 : a4 = _mm256_unpacklo_epi16(b4, b6);
79 0 : a5 = _mm256_unpacklo_epi16(b5, b7);
80 0 : a6 = _mm256_unpackhi_epi16(b4, b6);
81 0 : a7 = _mm256_unpackhi_epi16(b5, b7);
82 :
83 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
84 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
85 0 : b2 = _mm256_unpacklo_epi16(a2, a3);
86 0 : b3 = _mm256_unpackhi_epi16(a2, a3);
87 0 : b4 = _mm256_unpacklo_epi16(a4, a5);
88 0 : b5 = _mm256_unpackhi_epi16(a4, a5);
89 0 : b6 = _mm256_unpacklo_epi16(a6, a7);
90 0 : b7 = _mm256_unpackhi_epi16(a6, a7);
91 :
92 0 : _mm_storeu_si128((__m128i *)(dst + 0 * dst_stride + 8 * i), _mm256_extracti128_si256(b0, 0));
93 0 : _mm_storeu_si128((__m128i *)(dst + 1 * dst_stride + 8 * i), _mm256_extracti128_si256(b1, 0));
94 0 : _mm_storeu_si128((__m128i *)(dst + 2 * dst_stride + 8 * i), _mm256_extracti128_si256(b2, 0));
95 0 : _mm_storeu_si128((__m128i *)(dst + 3 * dst_stride + 8 * i), _mm256_extracti128_si256(b3, 0));
96 0 : _mm_storeu_si128((__m128i *)(dst + 4 * dst_stride + 8 * i), _mm256_extracti128_si256(b4, 0));
97 0 : _mm_storeu_si128((__m128i *)(dst + 5 * dst_stride + 8 * i), _mm256_extracti128_si256(b5, 0));
98 0 : _mm_storeu_si128((__m128i *)(dst + 6 * dst_stride + 8 * i), _mm256_extracti128_si256(b6, 0));
99 0 : _mm_storeu_si128((__m128i *)(dst + 7 * dst_stride + 8 * i), _mm256_extracti128_si256(b7, 0));
100 0 : _mm_storeu_si128((__m128i *)(dst + 8 * dst_stride + 8 * i), _mm256_extracti128_si256(b0, 1));
101 0 : _mm_storeu_si128((__m128i *)(dst + 9 * dst_stride + 8 * i), _mm256_extracti128_si256(b1, 1));
102 0 : _mm_storeu_si128((__m128i *)(dst + 10 * dst_stride + 8 * i), _mm256_extracti128_si256(b2, 1));
103 0 : _mm_storeu_si128((__m128i *)(dst + 11 * dst_stride + 8 * i), _mm256_extracti128_si256(b3, 1));
104 0 : _mm_storeu_si128((__m128i *)(dst + 12 * dst_stride + 8 * i), _mm256_extracti128_si256(b4, 1));
105 0 : _mm_storeu_si128((__m128i *)(dst + 13 * dst_stride + 8 * i), _mm256_extracti128_si256(b5, 1));
106 0 : _mm_storeu_si128((__m128i *)(dst + 14 * dst_stride + 8 * i), _mm256_extracti128_si256(b6, 1));
107 0 : _mm_storeu_si128((__m128i *)(dst + 15 * dst_stride + 8 * i), _mm256_extracti128_si256(b7, 1));
108 : }
109 0 : }
110 :
111 : // transpose 32x32 block of data
112 0 : void transpose32_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride)
113 : {
114 : uint32_t i, j;
115 0 : for (i = 0; i < 4; i++)
116 : {
117 0 : for (j = 0; j < 2; j++)
118 : {
119 : __m256i a0, a1, a2, a3, a4, a5, a6, a7;
120 : __m256i b0, b1, b2, b3, b4, b5, b6, b7;
121 :
122 0 : a0 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 0)*src_stride + 16 * j));
123 0 : a1 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 1)*src_stride + 16 * j));
124 0 : a2 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 2)*src_stride + 16 * j));
125 0 : a3 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 3)*src_stride + 16 * j));
126 0 : a4 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 4)*src_stride + 16 * j));
127 0 : a5 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 5)*src_stride + 16 * j));
128 0 : a6 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 6)*src_stride + 16 * j));
129 0 : a7 = _mm256_loadu_si256((const __m256i *)(src + (8 * i + 7)*src_stride + 16 * j));
130 :
131 0 : b0 = _mm256_unpacklo_epi16(a0, a4);
132 0 : b1 = _mm256_unpacklo_epi16(a1, a5);
133 0 : b2 = _mm256_unpacklo_epi16(a2, a6);
134 0 : b3 = _mm256_unpacklo_epi16(a3, a7);
135 0 : b4 = _mm256_unpackhi_epi16(a0, a4);
136 0 : b5 = _mm256_unpackhi_epi16(a1, a5);
137 0 : b6 = _mm256_unpackhi_epi16(a2, a6);
138 0 : b7 = _mm256_unpackhi_epi16(a3, a7);
139 :
140 0 : a0 = _mm256_unpacklo_epi16(b0, b2);
141 0 : a1 = _mm256_unpacklo_epi16(b1, b3);
142 0 : a2 = _mm256_unpackhi_epi16(b0, b2);
143 0 : a3 = _mm256_unpackhi_epi16(b1, b3);
144 0 : a4 = _mm256_unpacklo_epi16(b4, b6);
145 0 : a5 = _mm256_unpacklo_epi16(b5, b7);
146 0 : a6 = _mm256_unpackhi_epi16(b4, b6);
147 0 : a7 = _mm256_unpackhi_epi16(b5, b7);
148 :
149 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
150 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
151 0 : b2 = _mm256_unpacklo_epi16(a2, a3);
152 0 : b3 = _mm256_unpackhi_epi16(a2, a3);
153 0 : b4 = _mm256_unpacklo_epi16(a4, a5);
154 0 : b5 = _mm256_unpackhi_epi16(a4, a5);
155 0 : b6 = _mm256_unpacklo_epi16(a6, a7);
156 0 : b7 = _mm256_unpackhi_epi16(a6, a7);
157 :
158 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 0)*dst_stride + 8 * i), _mm256_extracti128_si256(b0, 0));
159 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 1)*dst_stride + 8 * i), _mm256_extracti128_si256(b1, 0));
160 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 2)*dst_stride + 8 * i), _mm256_extracti128_si256(b2, 0));
161 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 3)*dst_stride + 8 * i), _mm256_extracti128_si256(b3, 0));
162 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 4)*dst_stride + 8 * i), _mm256_extracti128_si256(b4, 0));
163 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 5)*dst_stride + 8 * i), _mm256_extracti128_si256(b5, 0));
164 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 6)*dst_stride + 8 * i), _mm256_extracti128_si256(b6, 0));
165 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 7)*dst_stride + 8 * i), _mm256_extracti128_si256(b7, 0));
166 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 8)*dst_stride + 8 * i), _mm256_extracti128_si256(b0, 1));
167 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 9)*dst_stride + 8 * i), _mm256_extracti128_si256(b1, 1));
168 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 10)*dst_stride + 8 * i), _mm256_extracti128_si256(b2, 1));
169 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 11)*dst_stride + 8 * i), _mm256_extracti128_si256(b3, 1));
170 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 12)*dst_stride + 8 * i), _mm256_extracti128_si256(b4, 1));
171 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 13)*dst_stride + 8 * i), _mm256_extracti128_si256(b5, 1));
172 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 14)*dst_stride + 8 * i), _mm256_extracti128_si256(b6, 1));
173 0 : _mm_storeu_si128((__m128i *)(dst + (16 * j + 15)*dst_stride + 8 * i), _mm256_extracti128_si256(b7, 1));
174 : }
175 : }
176 0 : }
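/*
 * Illustrative scalar reference (not part of the build; helper name is
 * hypothetical): both AVX2 transposes above produce the same result as this
 * plain element-wise transpose of an n x n block (n = 16 or 32); they differ
 * only in how the 8x8 unpack/extract tiles are scheduled.
 */
static void transpose_scalar_sketch(const int16_t *src, uint32_t src_stride,
                                    int16_t *dst, uint32_t dst_stride, uint32_t n)
{
    for (uint32_t r = 0; r < n; r++)
        for (uint32_t c = 0; c < n; c++)
            dst[c * dst_stride + r] = src[r * src_stride + c];
}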
177 :
178 : // 32-point forward transform (32 rows)
179 0 : void transform32_AVX2_INTRIN(int16_t *src, uint32_t src_stride, int16_t *dst, uint32_t dst_stride, uint32_t shift)
180 : {
181 : uint32_t i;
182 : __m128i s0;
183 : __m256i o0;
184 0 : const __m256i *coeff32 = (const __m256i *)coeff_tbl_AVX2;
185 :
186 0 : shift &= 0x0000FFFF; // redundant masking; works around a Visual Studio 2012 AVX2 compiler error
187 0 : s0 = _mm_cvtsi32_si128(shift);
188 0 : o0 = _mm256_set1_epi32(1 << (shift - 1));
189 :
190 0 : for (i = 0; i < 16; i++)
191 : {
192 : __m256i x0, x1, x2, x3;
193 : __m256i y0, y1, y2, y3;
194 : __m256i a0, a1, a2, a3, a4, a5, a6, a7;
195 : __m256i b0, b1, b2, b3, b4, b5, b6, b7;
196 :
197 0 : x0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x00))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x00)), 0x1);
198 0 : x1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x08))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x08)), 0x1);
199 0 : x2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x10))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x10)), 0x1);
200 0 : x3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + 0x18))), _mm_loadu_si128((const __m128i *)(src + src_stride + 0x18)), 0x1);
201 :
202 : // 32-point butterfly
203 0 : x2 = _mm256_shuffle_epi8(x2, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
204 0 : x3 = _mm256_shuffle_epi8(x3, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
205 :
206 0 : y0 = _mm256_add_epi16(x0, x3);
207 0 : y1 = _mm256_add_epi16(x1, x2);
208 :
209 0 : y2 = _mm256_sub_epi16(x0, x3);
210 0 : y3 = _mm256_sub_epi16(x1, x2);
211 :
212 : // 16-point butterfly
213 0 : y1 = _mm256_shuffle_epi8(y1, _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
214 :
215 0 : x0 = _mm256_add_epi16(y0, y1);
216 0 : x1 = _mm256_sub_epi16(y0, y1);
217 :
218 0 : x2 = y2;
219 0 : x3 = y3;
220 :
221 0 : a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]);
222 0 : a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[2]));
223 0 : a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[4]));
224 0 : a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[6]));
225 :
226 0 : a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]);
227 0 : a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[3]));
228 0 : a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[5]));
229 0 : a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[7]));
230 :
231 0 : a2 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[8]);
232 0 : a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[10]));
233 0 : a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[12]));
234 0 : a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[14]));
235 :
236 0 : a3 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[9]);
237 0 : a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[11]));
238 0 : a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[13]));
239 0 : a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[15]));
240 :
241 0 : a4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[16]);
242 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[20]));
243 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[24]));
244 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[28]));
245 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32]));
246 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[36]));
247 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[40]));
248 0 : a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[44]));
249 :
250 0 : a5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[17]);
251 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[21]));
252 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[25]));
253 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[29]));
254 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33]));
255 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[37]));
256 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[41]));
257 0 : a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[45]));
258 :
259 0 : a6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[18]);
260 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[22]));
261 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[26]));
262 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[30]));
263 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34]));
264 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[38]));
265 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[42]));
266 0 : a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[46]));
267 :
268 0 : a7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[19]);
269 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[23]));
270 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[27]));
271 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[31]));
272 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35]));
273 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[39]));
274 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[43]));
275 0 : a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[47]));
276 :
277 0 : b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0);
278 0 : b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0);
279 0 : b2 = _mm256_sra_epi32(_mm256_add_epi32(a2, o0), s0);
280 0 : b3 = _mm256_sra_epi32(_mm256_add_epi32(a3, o0), s0);
281 0 : b4 = _mm256_sra_epi32(_mm256_add_epi32(a4, o0), s0);
282 0 : b5 = _mm256_sra_epi32(_mm256_add_epi32(a5, o0), s0);
283 0 : b6 = _mm256_sra_epi32(_mm256_add_epi32(a6, o0), s0);
284 0 : b7 = _mm256_sra_epi32(_mm256_add_epi32(a7, o0), s0);
285 :
286 0 : x0 = _mm256_packs_epi32(b0, b1);
287 0 : x1 = _mm256_packs_epi32(b2, b3);
288 0 : x2 = _mm256_packs_epi32(b4, b5);
289 0 : x3 = _mm256_packs_epi32(b6, b7);
290 :
291 0 : y0 = _mm256_unpacklo_epi16(x0, x1);
292 0 : y1 = _mm256_unpackhi_epi16(x0, x1);
293 0 : y2 = x2;
294 0 : y3 = x3;
295 0 : x0 = _mm256_unpacklo_epi16(y0, y2);
296 0 : x1 = _mm256_unpackhi_epi16(y0, y2);
297 0 : x2 = _mm256_unpacklo_epi16(y1, y3);
298 0 : x3 = _mm256_unpackhi_epi16(y1, y3);
299 :
300 0 : y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 0)), _mm256_extracti128_si256(x1, 0), 0x1);
301 0 : y1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 0)), _mm256_extracti128_si256(x3, 0), 0x1);
302 0 : y2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x0, 1)), _mm256_extracti128_si256(x1, 1), 0x1);
303 0 : y3 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(x2, 1)), _mm256_extracti128_si256(x3, 1), 0x1);
304 : _mm256_storeu_si256((__m256i *)(dst + 0x00), y0);
305 0 : _mm256_storeu_si256((__m256i *)(dst + 0x10), y1);
306 0 : _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x00), y2);
307 0 : _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0x10), y3);
308 :
309 0 : src += 2 * src_stride;
310 0 : dst += 2 * dst_stride;
311 : }
312 0 : }
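/*
 * Illustrative sketch (not part of the build; helper name is hypothetical):
 * after the butterfly stages and the madd accumulations above, every 32-bit
 * partial sum is rounded and shifted like this before being packed back to
 * 16 bits with signed saturation; o0 holds 1 << (shift - 1) and s0 holds shift.
 */
static int32_t forward_round_shift_sketch(int32_t sum, uint32_t shift)
{
    return (sum + (1 << (shift - 1))) >> shift; /* _mm256_add_epi32(sum, o0) then _mm256_sra_epi32(.., s0) */
}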
313 :
314 0 : void mat_mult4x4_out_buff_avx2_intrin(
315 : int16_t* coeff,
316 : const uint32_t coeff_stride,
317 : int16_t* coeff_out,
318 : const uint32_t coeff_out_stride,
319 : const uint16_t *maskingMatrix,
320 : const uint32_t maskingMatrixStride,
321 : const uint32_t compute_size,
322 : const int32_t offset,
323 : const int32_t shift_num,
324 : uint32_t* nonzerocoeff)
325 :
326 : {
327 0 : __m256i z = _mm256_setzero_si256();
328 : __m128i a, b;
329 : __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, offsetREG, coeffTempORG;
330 : (void)compute_size;
331 :
332 0 : coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256();
333 :
334 : // prepare Shift REG
335 0 : __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // shift count must sit in the low 64 bits for _mm256_sra_epi32
336 :
337 : //prepare the offset
338 0 : offsetREG = _mm256_set1_epi32(offset);
339 :
340 : //load maskingMatrix_new
341 0 : MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1);
342 :
343 : //load coefftemp
344 0 : a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeff_stride))); // 1st and 2nd row of the 4x4 block
345 0 : b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeff_stride)), _mm_loadl_epi64((__m128i*)(coeff + 3 * coeff_stride))); // 3rd and 4th row of the 4x4 block
346 0 : coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1); // the 4x4 block is now loaded
347 :
348 0 : coeffTempORG = coeffTemp;
349 : //Absolute val
350 0 : coeffTemp = _mm256_abs_epi16(coeffTemp);
351 :
352 0 : a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
353 0 : a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
354 :
355 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
356 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
357 :
358 0 : b0 = _mm256_add_epi32(b0, offsetREG);
359 0 : b1 = _mm256_add_epi32(b1, offsetREG);
360 :
361 : //Shift right by PMP_PRECISION_REG
362 0 : b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
363 0 : b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
364 :
365 : // pack back to 16-bit coefficients, count nonzero lanes, and restore signs
366 0 : ymm_computed = _mm256_packs_epi32(b0, b1); // pack the 32-bit results in b0/b1 to 16 bits with signed saturation
367 0 : z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // cmpgt yields -1 per nonzero lane, so subtracting accumulates the count
368 :
369 0 : ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs
370 :
371 0 : a = _mm256_extracti128_si256(ymm_computed, 0);
372 0 : b = _mm256_extracti128_si256(ymm_computed, 1);
373 0 : _mm_storel_epi64((__m128i *) coeff_out, a);
374 0 : _mm_storel_epi64((__m128i *)(coeff_out + 1 * coeff_out_stride), _mm_srli_si128(a, 8));
375 0 : _mm_storel_epi64((__m128i *)(coeff_out + 2 * coeff_out_stride), b);
376 0 : _mm_storel_epi64((__m128i *)(coeff_out + 3 * coeff_out_stride), _mm_srli_si128(b, 8));
377 :
378 0 : z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8));
379 0 : *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
380 0 : }
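/*
 * Illustrative scalar reference (not part of the build; helper name is
 * hypothetical) of the per-coefficient arithmetic in the masking multiply
 * above. mat_mult4x4_out_buff writes its result to coeff_out, while the
 * variants below store back into coeff in place; the AVX2 path additionally
 * saturates the result to the int16 range when packing.
 */
static int16_t mask_one_coeff_sketch(int16_t coeff, uint16_t mask,
                                     int32_t offset, int32_t shift_num,
                                     uint32_t *nonzero)
{
    const int32_t a = (coeff < 0) ? -coeff : coeff;         /* _mm256_abs_epi16 */
    int32_t t = (a * (int16_t)mask + offset) >> shift_num;  /* mullo/mulhi, add offset, arithmetic shift */
    if (t > 0)
        (*nonzero)++;                                       /* the cmpgt/sub counter */
    if (coeff < 0)
        t = -t;                                             /* _mm256_sign_epi16 */
    else if (coeff == 0)
        t = 0;
    return (int16_t)t;
}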
381 :
382 : /*****************************************************************************************************************************************************************/
383 0 : void mat_mult4x4_avx2_intrin(
384 : int16_t* coeff,
385 : const uint32_t coeff_stride,
386 : const uint16_t *maskingMatrix,
387 : const uint32_t maskingMatrixStride, //Matrix size
388 : const uint32_t compute_size, //Computation area size
389 : const int32_t offset, //(PMP_MAX >> 1)
390 : const int32_t shift_num, //PMP_PRECISION
391 : uint32_t* nonzerocoeff)
392 :
393 : {
394 0 : __m256i z = _mm256_setzero_si256();
395 : __m128i a, b;
396 : __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, offsetREG, coeffTempORG;
397 : (void)compute_size;
398 :
399 0 : coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = offsetREG = _mm256_setzero_si256();
400 :
401 : // prepare Shift REG
402 0 : __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // shift count must sit in the low 64 bits for _mm256_sra_epi32
403 :
404 : //prepare the offset
405 0 : offsetREG = _mm256_set1_epi32(offset);
406 :
407 : //load maskingMatrix_new
408 0 : MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)maskingMatrix), _mm_loadl_epi64((__m128i*)(maskingMatrix + maskingMatrixStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(maskingMatrix + 2 * maskingMatrixStride)), _mm_loadl_epi64((__m128i*)(maskingMatrix + 3 * maskingMatrixStride))), 0x1);
409 :
410 : //load coefftemp
411 0 : a = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)coeff), _mm_loadl_epi64((__m128i*)(coeff + coeff_stride))); // 1st and 2nd row of the 4x4 block
412 0 : b = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(coeff + 2 * coeff_stride)), _mm_loadl_epi64((__m128i*)(coeff + 3 * coeff_stride))); // 3rd and 4th row of the 4x4 block
413 0 : coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1); // the 4x4 block is now loaded
414 :
415 0 : coeffTempORG = coeffTemp;
416 : //Absolute val
417 0 : coeffTemp = _mm256_abs_epi16(coeffTemp);
418 :
419 0 : a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
420 0 : a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
421 :
422 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
423 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
424 :
425 0 : b0 = _mm256_add_epi32(b0, offsetREG);
426 0 : b1 = _mm256_add_epi32(b1, offsetREG);
427 :
428 : //Shift right by PMP_PRECISION_REG
429 0 : b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
430 0 : b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
431 :
432 : // pack back to 16-bit coefficients, count nonzero lanes, and restore signs
433 0 : ymm_computed = _mm256_packs_epi32(b0, b1); // pack the 32-bit results in b0/b1 to 16 bits with signed saturation
434 0 : z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // cmpgt yields -1 per nonzero lane, so subtracting accumulates the count
435 :
436 0 : ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs
437 :
438 0 : a = _mm256_extracti128_si256(ymm_computed, 0);
439 0 : b = _mm256_extracti128_si256(ymm_computed, 1);
440 0 : _mm_storel_epi64((__m128i *)coeff, a);
441 0 : _mm_storel_epi64((__m128i *)(coeff + coeff_stride), _mm_srli_si128(a, 8));
442 0 : _mm_storel_epi64((__m128i *)(coeff + 2 * coeff_stride), b);
443 0 : _mm_storel_epi64((__m128i *)(coeff + 3 * coeff_stride), _mm_srli_si128(b, 8));
444 :
445 0 : z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 8));
446 0 : *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
447 0 : }
448 : /*******************************************mat_mult8x8_avx2_intrin**************************************************/
449 0 : void mat_mult8x8_avx2_intrin(
450 : int16_t* coeff,
451 : const uint32_t coeff_stride,
452 : const uint16_t *maskingMatrix,
453 : const uint32_t maskingMatrixStride, //Matrix size
454 : const uint32_t compute_size, //Computation area size
455 : const int32_t offset, //(PMP_MAX >> 1)
456 : const int32_t shift_num, //PMP_PRECISION
457 : uint32_t* nonzerocoeff)
458 : {
459 : unsigned row;
460 0 : __m256i z = _mm256_setzero_si256();
461 : //__m128i a, b;
462 : __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, coeffTempORG;
463 :
464 0 : coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = _mm256_setzero_si256();
465 :
466 : // prepare Shift REG
467 0 : __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // shift count must sit in the low 64 bits for _mm256_sra_epi32
468 :
469 : //prepare the offset
470 0 : __m256i offsetREG = _mm256_set1_epi32(offset);
471 0 : row = 0;
472 : do {
473 : //load maskingMatrix_new
474 0 : MaskingMatrix = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride * row))), _mm_loadu_si128((__m128i*)(maskingMatrix + maskingMatrixStride * (row + 1))), 0x1);
475 :
476 : //load coefftemp
477 0 : coeffTemp = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(coeff + coeff_stride * row))),
478 : _mm_loadu_si128((__m128i*)(coeff + coeff_stride * (row + 1))), 0x1);
479 :
480 0 : coeffTempORG = coeffTemp;
481 : //Absolute val
482 0 : coeffTemp = _mm256_abs_epi16(coeffTemp);
483 :
484 : //Multiply
485 0 : a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
486 0 : a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
487 :
488 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
489 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
490 :
491 : //Add
492 0 : b0 = _mm256_add_epi32(b0, offsetREG);
493 0 : b1 = _mm256_add_epi32(b1, offsetREG);
494 :
495 : //Shift right by PMP_PRECISION_REG
496 0 : b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
497 0 : b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
498 :
499 : // pack back to 16-bit coefficients, count nonzero lanes, and restore signs
500 0 : ymm_computed = _mm256_packs_epi32(b0, b1); // pack the 32-bit results in b0/b1 to 16 bits with signed saturation
501 0 : z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // cmpgt yields -1 per nonzero lane, so subtracting accumulates the count
502 :
503 0 : ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs
504 :
505 0 : _mm_storeu_si128((__m128i *)(coeff + coeff_stride * row), _mm256_extracti128_si256(ymm_computed, 0));
506 0 : _mm_storeu_si128((__m128i *)(coeff + coeff_stride * (row + 1)), _mm256_extracti128_si256(ymm_computed, 1));
507 :
508 0 : row += 2;
509 0 : } while (row < compute_size);
510 :
511 0 : z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7));
512 0 : *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
513 0 : }
514 : /***************************************mat_mult_nxn_avx2_intrin****************************************************/
515 0 : void mat_mult_nxn_avx2_intrin(
516 : int16_t* coeff,
517 : const uint32_t coeff_stride,
518 : const uint16_t *maskingMatrix,
519 : const uint32_t maskingMatrixStride, //Matrix size
520 : const uint32_t compute_size, //Computation area size
521 : const int32_t offset, //(PMP_MAX >> 1)
522 : const int32_t shift_num, //PMP_PRECISION
523 : uint32_t* nonzerocoeff)
524 : {
525 : unsigned row, col;
526 0 : __m256i z = _mm256_setzero_si256();
527 : //__m128i a, b;
528 : __m256i coeffTemp, a0, a1, b0, b1, ymm_computed, MaskingMatrix, coeffTempORG;
529 :
530 0 : coeffTemp = a0 = a1 = b0 = b1 = ymm_computed = MaskingMatrix = _mm256_setzero_si256();
531 :
532 : // prepare Shift REG
533 0 : __m128i PMP_PRECISION_REG = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, (int16_t)shift_num); // shift count must sit in the low 64 bits for _mm256_sra_epi32
534 :
535 : //prepare the offset
536 0 : __m256i offsetREG = _mm256_set1_epi32(offset);
537 :
538 0 : row = 0;
539 : do {
540 0 : col = 0;
541 : do {
542 : //load coefftemp
543 0 : coeffTemp = _mm256_loadu_si256((__m256i *)(coeff + coeff_stride * row + col));
544 :
545 : //load maskingMatrix_new
546 0 : MaskingMatrix = _mm256_loadu_si256((__m256i *) (maskingMatrix + maskingMatrixStride * row + col));
547 :
548 0 : coeffTempORG = coeffTemp;
549 :
550 : //Absolute val
551 0 : coeffTemp = _mm256_abs_epi16(coeffTemp);
552 :
553 : //Multiply
554 0 : a0 = _mm256_mullo_epi16(coeffTemp, MaskingMatrix);
555 0 : a1 = _mm256_mulhi_epi16(coeffTemp, MaskingMatrix);
556 :
557 0 : b0 = _mm256_unpacklo_epi16(a0, a1);
558 0 : b1 = _mm256_unpackhi_epi16(a0, a1);
559 :
560 : //Add
561 0 : b0 = _mm256_add_epi32(b0, offsetREG);
562 0 : b1 = _mm256_add_epi32(b1, offsetREG);
563 :
564 : //Shift right by PMP_PRECISION_REG
565 0 : b0 = _mm256_sra_epi32(b0, PMP_PRECISION_REG);
566 0 : b1 = _mm256_sra_epi32(b1, PMP_PRECISION_REG);
567 :
568 : // pack back to 16-bit coefficients, count nonzero lanes, and restore signs
569 0 : ymm_computed = _mm256_packs_epi32(b0, b1); // pack the 32-bit results in b0/b1 to 16 bits with signed saturation
570 0 : z = _mm256_sub_epi16(z, _mm256_cmpgt_epi16(ymm_computed, _mm256_setzero_si256())); // cmpgt yields -1 per nonzero lane, so subtracting accumulates the count
571 :
572 0 : ymm_computed = _mm256_sign_epi16(ymm_computed, coeffTempORG); // restore the original coefficient signs
573 :
574 0 : _mm256_storeu_si256((__m256i *)(coeff + coeff_stride * row + col), ymm_computed);
575 :
576 0 : col += 16;
577 0 : } while (col < compute_size);
578 :
579 0 : row++;
580 0 : } while (row < compute_size);
581 :
582 0 : z = _mm256_sad_epu8(z, _mm256_srli_si256(z, 7));
583 0 : *nonzerocoeff = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_extracti128_si256(z, 0), _mm256_extracti128_si256(z, 1)));
584 0 : }
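/*
 * Illustrative sketch (not part of the build; helper name is hypothetical):
 * scalar equivalent of the n x n loop above. The AVX2 version walks the block
 * in 16-coefficient strips (col += 16), so compute_size is assumed to be a
 * multiple of 16, and it reduces its per-lane nonzero counters with SAD
 * instructions rather than the scalar tally used here; the per-coefficient
 * arithmetic is the mask_one_coeff_sketch() shown after the 4x4 variant.
 */
static void mat_mult_nxn_scalar_sketch(int16_t *coeff, uint32_t coeff_stride,
                                       const uint16_t *maskingMatrix,
                                       uint32_t maskingMatrixStride,
                                       uint32_t compute_size, int32_t offset,
                                       int32_t shift_num, uint32_t *nonzerocoeff)
{
    *nonzerocoeff = 0;
    for (uint32_t row = 0; row < compute_size; row++)
        for (uint32_t col = 0; col < compute_size; col++)
            coeff[coeff_stride * row + col] =
                mask_one_coeff_sketch(coeff[coeff_stride * row + col],
                                      maskingMatrix[maskingMatrixStride * row + col],
                                      offset, shift_num, nonzerocoeff);
}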
585 :
586 361686000 : static INLINE void EnergyComputation_kernel_avx2(const int32_t *const in,
587 : __m256i *const sum256)
588 : {
589 361686000 : const __m256i zero = _mm256_setzero_si256();
590 361686000 : const __m256i input = _mm256_load_si256((__m256i *)in);
591 361686000 : const __m256i in_lo = _mm256_unpacklo_epi32(input, zero);
592 361686000 : const __m256i in_hi = _mm256_unpackhi_epi32(input, zero);
593 361686000 : const __m256i energy_lo = _mm256_mul_epi32(in_lo, in_lo);
594 361686000 : const __m256i energy_hi = _mm256_mul_epi32(in_hi, in_hi);
595 361686000 : *sum256 = _mm256_add_epi64(*sum256, energy_lo);
596 361686000 : *sum256 = _mm256_add_epi64(*sum256, energy_hi);
597 361686000 : }
598 :
599 3345080 : static INLINE uint64_t hadd64_avx2(const __m256i sum256) {
600 3345080 : const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
601 3345080 : const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
602 3345080 : const __m128i sum128 = _mm_add_epi64(sum256_lo, sum256_hi);
603 3345080 : const __m128i sum128_hi = _mm_srli_si128(sum128, 8);
604 3345080 : const __m128i sum = _mm_add_epi64(sum128, sum128_hi);
605 :
606 3345080 : return _mm_extract_epi64(sum, 0);
607 : }
608 :
609 1726650 : static INLINE uint64_t EnergyComputation_avx2(const int32_t *const in,
610 : const uint32_t size)
611 : {
612 1726650 : const __m256i zero = _mm256_setzero_si256();
613 1726650 : uint32_t i = 0;
614 1726650 : __m256i sum = zero;
615 :
616 : do {
617 205972000 : EnergyComputation_kernel_avx2(in + i, &sum);
618 205972000 : i += 8;
619 205972000 : } while (i < size);
620 :
621 1726710 : return hadd64_avx2(sum);
622 : }
623 :
624 1618460 : static INLINE uint64_t EnergyComputation64_avx2(const int32_t *in,
625 : const uint32_t height)
626 : {
627 1618460 : const __m256i zero = _mm256_setzero_si256();
628 1618460 : uint32_t i = height;
629 1618460 : __m256i sum = zero;
630 :
631 : do {
632 39105000 : EnergyComputation_kernel_avx2(in + 0 * 8, &sum);
633 39109200 : EnergyComputation_kernel_avx2(in + 1 * 8, &sum);
634 39108500 : EnergyComputation_kernel_avx2(in + 2 * 8, &sum);
635 39107800 : EnergyComputation_kernel_avx2(in + 3 * 8, &sum);
636 39105000 : in += 64;
637 39105000 : } while (--i);
638 :
639 1618490 : return hadd64_avx2(sum);
640 : }
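/*
 * Illustrative scalar reference (not part of the build; helper name is
 * hypothetical): both energy helpers above accumulate the sum of squared
 * int32 coefficients into 64-bit lanes and reduce them with hadd64_avx2().
 * EnergyComputation_avx2 covers `size` contiguous values, while
 * EnergyComputation64_avx2 covers the first 32 values of each 64-wide row
 * (its callers pass output + 32, i.e. the right half of a 64-wide block).
 */
static uint64_t energy_scalar_sketch(const int32_t *in, uint32_t count)
{
    uint64_t sum = 0;
    for (uint32_t i = 0; i < count; i++)
        sum += (uint64_t)((int64_t)in[i] * in[i]);
    return sum;
}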
641 :
642 1618480 : static INLINE void clean_256_bytes_avx2(int32_t *buf, const uint32_t height) {
643 1618480 : const __m256i zero = _mm256_setzero_si256();
644 1618480 : uint32_t h = height;
645 :
646 : do {
647 : _mm256_store_si256((__m256i *)(buf + 0 * 8), zero);
648 39132500 : _mm256_store_si256((__m256i *)(buf + 1 * 8), zero);
649 39132500 : _mm256_store_si256((__m256i *)(buf + 2 * 8), zero);
650 39132500 : _mm256_store_si256((__m256i *)(buf + 3 * 8), zero);
651 39132500 : buf += 64;
652 39132500 : } while (--h);
653 1618480 : }
654 :
655 149964000 : static INLINE void copy_32_bytes_avx2(const int32_t *src, int32_t *dst) {
656 149964000 : const __m256i val = _mm256_load_si256((__m256i *)(src + 0 * 8));
657 : _mm256_store_si256((__m256i *)(dst + 0 * 8), val);
658 149964000 : }
659 :
660 1618500 : static INLINE void copy_256x_bytes_avx2(const int32_t *src, int32_t *dst,
661 : const uint32_t height) {
662 1618500 : uint32_t h = height;
663 :
664 : do {
665 37495800 : copy_32_bytes_avx2(src + 0 * 8, dst + 0 * 8);
666 37496000 : copy_32_bytes_avx2(src + 1 * 8, dst + 1 * 8);
667 37495300 : copy_32_bytes_avx2(src + 2 * 8, dst + 2 * 8);
668 37494600 : copy_32_bytes_avx2(src + 3 * 8, dst + 3 * 8);
669 37495800 : src += 64;
670 37495800 : dst += 32;
671 37495800 : } while (--h);
672 1618480 : }
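/*
 * Illustrative sketch (not part of the build; helper name is hypothetical):
 * copy_256x_bytes_avx2 advances the source by 64 int32 per row but the
 * destination by only 32, i.e. it compacts the left half of each 64-wide row
 * into a contiguous 32-wide layout, as in this scalar equivalent.
 */
static void repack_left_half_sketch(const int32_t *src, int32_t *dst, uint32_t height)
{
    for (uint32_t r = 0; r < height; r++)
        for (uint32_t c = 0; c < 32; c++)
            dst[r * 32 + c] = src[r * 64 + c];
}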
673 :
674 855360 : uint64_t HandleTransform16x64_avx2(int32_t *output) {
675 : //bottom 16x32 area.
676 : const uint64_t three_quad_energy =
677 855360 : EnergyComputation_avx2(output + 16 * 32, 16 * 32);
678 :
679 : // Zero out the bottom 16x32 area.
680 855375 : memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
681 :
682 855375 : return three_quad_energy;
683 : }
684 :
685 557010 : uint64_t HandleTransform32x64_avx2(int32_t *output) {
686 : //bottom 32x32 area.
687 : const uint64_t three_quad_energy =
688 557010 : EnergyComputation_avx2(output + 32 * 32, 32 * 32);
689 :
690 : // Zero out the bottom 32x32 area.
691 557017 : memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
692 :
693 557017 : return three_quad_energy;
694 : }
695 :
696 790603 : uint64_t HandleTransform64x16_avx2(int32_t *output) {
697 : // top-right 32x16 area.
698 : const uint64_t three_quad_energy =
699 790603 : EnergyComputation64_avx2(output + 32, 16);
700 :
701 : // Zero out right 32x16 area.
702 790607 : clean_256_bytes_avx2(output + 32, 16);
703 :
704 : // Re-pack non-zero coeffs in the first 32x16 indices.
705 790610 : copy_256x_bytes_avx2(output + 64, output + 32, 15);
706 :
707 790603 : return three_quad_energy;
708 : }
709 :
710 513585 : uint64_t HandleTransform64x32_avx2(int32_t *output) {
711 : // top-right 32x32 area.
712 : const uint64_t three_quad_energy =
713 513585 : EnergyComputation64_avx2(output + 32, 32);
714 :
715 : // Zero out right 32x32 area.
716 513589 : clean_256_bytes_avx2(output + 32, 32);
717 :
718 : // Re-pack non-zero coeffs in the first 32x32 indices.
719 513590 : copy_256x_bytes_avx2(output + 64, output + 32, 31);
720 :
721 513584 : return three_quad_energy;
722 : }
723 :
724 314333 : uint64_t HandleTransform64x64_avx2(int32_t *output) {
725 : uint64_t three_quad_energy;
726 :
727 : // top-right 32x32 area.
728 314333 : three_quad_energy = EnergyComputation64_avx2(output + 32, 32);
729 : //bottom 64x32 area.
730 314332 : three_quad_energy += EnergyComputation_avx2(output + 32 * 64, 64 * 32);
731 :
732 : // Zero out top-right 32x32 area.
733 314331 : clean_256_bytes_avx2(output + 32, 32);
734 :
735 : // Zero out the bottom 64x32 area.
736 314332 : memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
737 :
738 : // Re-pack non-zero coeffs in the first 32x32 indices.
739 314332 : copy_256x_bytes_avx2(output + 64, output + 32, 31);
740 :
741 314329 : return three_quad_energy;
742 : }
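/*
 * Illustrative scalar sketch (not part of the build; helper name is
 * hypothetical) of the 64x16 case above: the Nx64 handlers only measure and
 * zero the bottom half, while the 64xN handlers also repack the surviving
 * left 32 columns into a contiguous 32-wide layout, as spelled out here.
 * The 64x32 and 64x64 variants follow the same pattern with 32 rows, and
 * 64x64 additionally folds in the energy of the bottom 64x32 area before
 * zeroing it with memset.
 */
static uint64_t handle_transform64x16_scalar_sketch(int32_t *output)
{
    uint64_t energy = 0;
    for (uint32_t r = 0; r < 16; r++)
        for (uint32_t c = 32; c < 64; c++) {        /* right 32 columns */
            energy += (uint64_t)((int64_t)output[r * 64 + c] * output[r * 64 + c]);
            output[r * 64 + c] = 0;
        }
    for (uint32_t r = 0; r < 16; r++)               /* repack rows to a 32-wide stride */
        for (uint32_t c = 0; c < 32; c++)
            output[r * 32 + c] = output[r * 64 + c];
    return energy;
}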
|