Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbAvcStyleMcp_SSE2.h"
7 : #include "EbMcp_SSE2.h" // THIS SHOULD BE _SSE2 in the future
8 : #include "emmintrin.h"
9 0 : void avc_style_copy_sse2(
10 : EbByte ref_pic,
11 : uint32_t src_stride,
12 : EbByte dst,
13 : uint32_t dst_stride,
14 : uint32_t pu_width,
15 : uint32_t pu_height,
16 : EbByte temp_buf,
17 : EbBool skip,
18 : uint32_t frac_pos)
19 : {
20 : (void)temp_buf;
21 : (void)frac_pos;
22 0 : if (skip) {
23 : //do the last row too.
24 0 : EB_MEMCPY(dst + (pu_height - 1)*dst_stride, ref_pic + (pu_height - 1)*src_stride, pu_width);
25 :
26 0 : src_stride <<= 1;
27 0 : dst_stride <<= 1;
28 0 : pu_height >>= 1;
29 : }
30 :
31 0 : picture_copy_kernel_sse2(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height);
32 0 : }
33 :
34 : //This function should be removed and replace by avc_style_copy_sse2
35 :
/*
 * Computes the rounded byte-wise average of two source pictures into dst:
 *     dst[x][y] = (src0[x][y] + src1[x][y] + 1) >> 1
 * using the SSE2 PAVGB instruction (_mm_avg_epu8).
 *
 * Two rows are processed per loop iteration, so area_height is assumed to be
 * even (NOTE(review): not checked here -- confirm all callers pass even
 * heights).  The width dispatch covers 24/32/48 explicitly plus a 64-byte
 * fallback for wide blocks, and 16/4/8 plus a 12-byte fallback for narrow
 * blocks; any other width would over-read/over-write -- presumably callers
 * only use these block sizes.
 *
 * All loads/stores are unaligned (_mm_loadu_si128/_mm_storeu_si128), so no
 * alignment requirement is placed on the buffers or strides.
 */
void picture_average_kernel_sse2_intrin(
    EbByte                  src0,
    uint32_t                   src0_stride,
    EbByte                  src1,
    uint32_t                   src1_stride,
    EbByte                  dst,
    uint32_t                   dst_stride,
    uint32_t                   area_width,
    uint32_t                   area_height)
{
    __m128i xmm_avg1, xmm_avg2, xmm_avg3, xmm_avg4, xmm_avg5, xmm_avg6, xmm_avg7, xmm_avg8;
    uint32_t y;

    if (area_width > 16)
    {
        if (area_width == 24)
        {
            // 24 bytes per row = one 16-byte vector + one 8-byte tail.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + 16)), _mm_loadl_epi64((__m128i*)(src1 + 16)));
                xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
                xmm_avg4 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + src0_stride + 16)), _mm_loadl_epi64((__m128i*)(src1 + src1_stride + 16)));

                _mm_storeu_si128((__m128i*) dst, xmm_avg1);
                _mm_storel_epi64((__m128i*) (dst + 16), xmm_avg2);
                _mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg3);
                _mm_storel_epi64((__m128i*) (dst + dst_stride + 16), xmm_avg4);

                // Advance by two rows (one iteration handles a row pair).
                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else if (area_width == 32)
        {
            // 32 bytes per row = two 16-byte vectors.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 16)), _mm_loadu_si128((__m128i*)(src1 + 16)));
                xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
                xmm_avg4 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 16)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 16)));

                _mm_storeu_si128((__m128i*) dst, xmm_avg1);
                _mm_storeu_si128((__m128i*) (dst + 16), xmm_avg2);
                _mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg3);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 16), xmm_avg4);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else if (area_width == 48)
        {
            // 48 bytes per row = three 16-byte vectors.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 16)), _mm_loadu_si128((__m128i*)(src1 + 16)));
                xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 32)), _mm_loadu_si128((__m128i*)(src1 + 32)));

                xmm_avg4 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
                xmm_avg5 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 16)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 16)));
                xmm_avg6 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 32)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 32)));

                _mm_storeu_si128((__m128i*) dst, xmm_avg1);
                _mm_storeu_si128((__m128i*) (dst + 16), xmm_avg2);
                _mm_storeu_si128((__m128i*) (dst + 32), xmm_avg3);
                _mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg4);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 16), xmm_avg5);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 32), xmm_avg6);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else
        {
            // Fallback for wide blocks: processes exactly 64 bytes per row
            // (four 16-byte vectors) -- presumably area_width == 64 here.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 16)), _mm_loadu_si128((__m128i*)(src1 + 16)));
                xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 32)), _mm_loadu_si128((__m128i*)(src1 + 32)));
                xmm_avg4 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 48)), _mm_loadu_si128((__m128i*)(src1 + 48)));

                xmm_avg5 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
                xmm_avg6 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 16)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 16)));
                xmm_avg7 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 32)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 32)));
                xmm_avg8 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 48)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 48)));

                _mm_storeu_si128((__m128i*) dst, xmm_avg1);
                _mm_storeu_si128((__m128i*) (dst + 16), xmm_avg2);
                _mm_storeu_si128((__m128i*) (dst + 32), xmm_avg3);
                _mm_storeu_si128((__m128i*) (dst + 48), xmm_avg4);

                _mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg5);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 16), xmm_avg6);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 32), xmm_avg7);
                _mm_storeu_si128((__m128i*) (dst + dst_stride + 48), xmm_avg8);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
    }
    else
    {
        if (area_width == 16)
        {
            // One full 16-byte vector per row.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));

                _mm_storeu_si128((__m128i*) dst, xmm_avg1);
                _mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg2);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else if (area_width == 4)
        {
            // 4 bytes per row, moved through the low dword of the vector.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)src0), _mm_cvtsi32_si128(*(uint32_t *)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + src0_stride)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + src1_stride)));

                *(uint32_t *)dst = _mm_cvtsi128_si32(xmm_avg1);
                *(uint32_t *)(dst + dst_stride) = _mm_cvtsi128_si32(xmm_avg2);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else if (area_width == 8)
        {
            // 8 bytes per row via the low qword of the vector.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)src0), _mm_loadl_epi64((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + src0_stride)), _mm_loadl_epi64((__m128i*)(src1 + src1_stride)));

                _mm_storel_epi64((__m128i*) dst, xmm_avg1);
                _mm_storel_epi64((__m128i*) (dst + dst_stride), xmm_avg2);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
        else
        {
            // Fallback for narrow blocks: processes exactly 12 bytes per row
            // (one 8-byte qword + one 4-byte dword) -- presumably width == 12.
            for (y = 0; y < area_height; y += 2) {
                xmm_avg1 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)src0), _mm_loadl_epi64((__m128i*)src1));
                xmm_avg2 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + 8)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + 8)));

                xmm_avg3 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + src0_stride)), _mm_loadl_epi64((__m128i*)(src1 + src1_stride)));
                xmm_avg4 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + src0_stride + 8)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + src1_stride + 8)));

                _mm_storel_epi64((__m128i*) dst, xmm_avg1);
                *(uint32_t *)(dst + 8) = _mm_cvtsi128_si32(xmm_avg2);
                _mm_storel_epi64((__m128i*) (dst + dst_stride), xmm_avg3);
                *(uint32_t *)(dst + dst_stride + 8) = _mm_cvtsi128_si32(xmm_avg4);

                src0 += src0_stride << 1;
                src1 += src1_stride << 1;
                dst += dst_stride << 1;
            }
        }
    }
}
204 0 : void picture_average_kernel1_line_sse2_intrin(
205 : EbByte src0,
206 : EbByte src1,
207 : EbByte dst,
208 : uint32_t area_width)
209 : {
210 : __m128i xmm_avg1, xmm_avg2, xmm_avg3, xmm_avg4;
211 :
212 0 : if (area_width > 16)
213 : {
214 0 : if (area_width == 32)
215 : {
216 : //for (y = 0; y < area_height; y += 2)
217 : {
218 0 : xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
219 0 : xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 16)), _mm_loadu_si128((__m128i*)(src1 + 16)));
220 : //xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
221 : //xmm_avg4 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 16)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 16)));
222 :
223 : _mm_storeu_si128((__m128i*) dst, xmm_avg1);
224 0 : _mm_storeu_si128((__m128i*) (dst + 16), xmm_avg2);
225 : //_mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg3);
226 : //_mm_storeu_si128((__m128i*) (dst + dst_stride + 16), xmm_avg4);
227 :
228 : //src0 += src0_stride << 1;
229 : //src1 += src1_stride << 1;
230 : //dst += dst_stride << 1;
231 : }
232 : }
233 : else
234 : {
235 : //for (y = 0; y < area_height; y += 2)
236 : {
237 0 : xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
238 0 : xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 16)), _mm_loadu_si128((__m128i*)(src1 + 16)));
239 0 : xmm_avg3 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 32)), _mm_loadu_si128((__m128i*)(src1 + 32)));
240 0 : xmm_avg4 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + 48)), _mm_loadu_si128((__m128i*)(src1 + 48)));
241 :
242 : //xmm_avg5 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
243 : //xmm_avg6 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 16)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 16)));
244 : //xmm_avg7 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 32)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 32)));
245 : //xmm_avg8 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride + 48)), _mm_loadu_si128((__m128i*)(src1 + src1_stride + 48)));
246 :
247 : _mm_storeu_si128((__m128i*) dst, xmm_avg1);
248 0 : _mm_storeu_si128((__m128i*) (dst + 16), xmm_avg2);
249 0 : _mm_storeu_si128((__m128i*) (dst + 32), xmm_avg3);
250 0 : _mm_storeu_si128((__m128i*) (dst + 48), xmm_avg4);
251 :
252 : //_mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg5);
253 : //_mm_storeu_si128((__m128i*) (dst + dst_stride + 16), xmm_avg6);
254 : //_mm_storeu_si128((__m128i*) (dst + dst_stride + 32), xmm_avg7);
255 : //_mm_storeu_si128((__m128i*) (dst + dst_stride + 48), xmm_avg8);
256 :
257 : //src0 += src0_stride << 1;
258 : //src1 += src1_stride << 1;
259 : //dst += dst_stride << 1;
260 : }
261 : }
262 : }
263 : else
264 : {
265 0 : if (area_width == 16)
266 : {
267 : //for (y = 0; y < area_height; y += 2)
268 : {
269 0 : xmm_avg1 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)src0), _mm_loadu_si128((__m128i*)src1));
270 : //xmm_avg2 = _mm_avg_epu8(_mm_loadu_si128((__m128i*)(src0 + src0_stride)), _mm_loadu_si128((__m128i*)(src1 + src1_stride)));
271 :
272 : _mm_storeu_si128((__m128i*) dst, xmm_avg1);
273 : //_mm_storeu_si128((__m128i*) (dst + dst_stride), xmm_avg2);
274 :
275 : //src0 += src0_stride << 1;
276 : //src1 += src1_stride << 1;
277 : //dst += dst_stride << 1;
278 : }
279 : }
280 0 : else if (area_width == 4)
281 : {
282 : //for (y = 0; y < area_height; y += 2)
283 : {
284 0 : xmm_avg1 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)src0), _mm_cvtsi32_si128(*(uint32_t *)src1));
285 : //xmm_avg2 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + src0_stride)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + src1_stride)));
286 :
287 0 : *(uint32_t *)dst = _mm_cvtsi128_si32(xmm_avg1);
288 : //*(uint32_t *)(dst + dst_stride) = _mm_cvtsi128_si32(xmm_avg2);
289 :
290 : //src0 += src0_stride << 1;
291 : //src1 += src1_stride << 1;
292 : //dst += dst_stride << 1;
293 : }
294 : }
295 0 : else if (area_width == 8)
296 : {
297 : //for (y = 0; y < area_height; y += 2)
298 : {
299 0 : xmm_avg1 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)src0), _mm_loadl_epi64((__m128i*)src1));
300 : //xmm_avg2 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + src0_stride)), _mm_loadl_epi64((__m128i*)(src1 + src1_stride)));
301 :
302 0 : _mm_storel_epi64((__m128i*) dst, xmm_avg1);
303 : //_mm_storel_epi64((__m128i*) (dst + dst_stride), xmm_avg2);
304 :
305 : //src0 += src0_stride << 1;
306 : //src1 += src1_stride << 1;
307 : //dst += dst_stride << 1;
308 : }
309 : }
310 : else
311 : {
312 : //for (y = 0; y < area_height; y += 2)
313 : {
314 0 : xmm_avg1 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)src0), _mm_loadl_epi64((__m128i*)src1));
315 0 : xmm_avg2 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + 8)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + 8)));
316 :
317 : //xmm_avg3 = _mm_avg_epu8(_mm_loadl_epi64((__m128i*)(src0 + src0_stride)), _mm_loadl_epi64((__m128i*)(src1 + src1_stride)));
318 : //xmm_avg4 = _mm_avg_epu8(_mm_cvtsi32_si128(*(uint32_t *)(src0 + src0_stride + 8)), _mm_cvtsi32_si128(*(uint32_t *)(src1 + src1_stride + 8)));
319 :
320 0 : _mm_storel_epi64((__m128i*) dst, xmm_avg1);
321 0 : *(uint32_t *)(dst + 8) = _mm_cvtsi128_si32(xmm_avg2);
322 : //_mm_storel_epi64((__m128i*) (dst + dst_stride), xmm_avg3);
323 : //*(uint32_t *)(dst + dst_stride + 8) = _mm_cvtsi128_si32(xmm_avg4);
324 :
325 : //src0 += src0_stride << 1;
326 : //src1 += src1_stride << 1;
327 : //dst += dst_stride << 1;
328 : }
329 : }
330 : }
331 0 : }
|